In [299]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from DealMatch.trainer_unsupervised import Trainer
from DealMatch.data_unsupervised import get_targets_data, get_investors_data, get_matching_keys, clean_targets, clean_investors


In [300]:
# Training data: cleaned targets. The first CSV column becomes the index and the
# leftover 'index' column is dropped.
df_targets_clean = (
    pd.read_csv('../targets.csv', index_col=0)
    .drop(columns='index')
)

# Data used further down to query the fitted nearest-neighbours model.
test = pd.read_excel('../DealMatch/targets_clean_test.xlsx')

In [301]:
# get X: the feature frame fed to the pipelines below.
# Note: this is a second name for the same DataFrame object, not a copy.
X = df_targets_clean

In [302]:
# Numerical pipeline (unchanged): fill missing values with 0, then scale robustly.
num_features = ['target_ebit', 'target_ebitda', 'target_revenue']
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
        ('scaler', RobustScaler()),
    ]
)

In [303]:
# Custom transformer: converts the sparse TF-IDF output to a dense array so it can be
# combined with the output of the numerical transformation.
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that turns a sparse matrix into a dense array.

    Inherits from ``BaseEstimator`` so that ``get_params``/``set_params`` exist and
    the transformer can be cloned and inspected like any other scikit-learn
    estimator when it is nested inside a ``Pipeline`` or ``ColumnTransformer``.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray()`` (e.g. scipy sparse matrices) are converted
        with it; anything else is passed through ``np.asarray`` so input that is
        already dense no longer raises ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [304]:
# TF-IDF pipeline: vectorise the text column, then densify the sparse result.
# The column is given as a plain string (not a list) so the vectoriser receives
# a 1-D column of documents rather than a 2-D frame.
tfidf_features = 'strs'
tfidf_transformer = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('dense', DenseTransformer()),
    ]
)

In [305]:
# Full preprocessing: numerical and TF-IDF branches side by side;
# every column not listed in either branch is dropped.
preproc = ColumnTransformer(
    transformers=[
        ('num_tr', num_transformer, num_features),
        ('tfidf', tfidf_transformer, tfidf_features),
    ],
    remainder='drop',
)

In [306]:
# Sanity check: fit the preprocessing step on X and display the shape of its output.
preproc.fit_transform(X).shape

(1111, 4632)

In [307]:
# Full pipeline without the final model: preprocessing followed by PCA.
# (The model was taken out of the pipeline because the attribute 'predict' couldn't be used.)
full = Pipeline(
    steps=[
        ('preproc', preproc),
        ('pca', PCA(n_components=0.95)),
    ]
)

In [308]:
# Sanity check: fit preprocessing + PCA on X and display the shape after reduction.
full.fit_transform(X).shape

(1111, 2)

In [309]:
# Fit the preprocessing + PCA pipeline on X -> this fitted object is the one to save.
preproc_fitted = full.fit(X)

# Run X through the fitted pipeline to get the matrix the neighbours model is trained on.
preproc_transformed = preproc_fitted.transform(X)

# Fit the nearest-neighbours model on the transformed data -> also to be saved.
fitted_nn = (
    NearestNeighbors(n_neighbors=10)
    .fit(preproc_transformed)
)

In [310]:
# Push the test data through the already-fitted preprocessing + PCA pipeline.
test_transformed = preproc_fitted.transform(test)

# Query the trained neighbours model; the cell displays (distances, indices).
fitted_nn.kneighbors(X=test_transformed, return_distance=True)

(array([[0.11123943, 0.27253504, 0.29032934, 0.45281042, 0.48279175,
         0.48902642, 0.57433349, 0.60868525, 0.63310101, 0.65931131]]),
 array([[451, 648, 729,  72, 886, 877, 136, 893,  66, 319]]))