In [5]:
from DealMatch.trainer_unsupervised import Trainer
from DealMatch.data_unsupervised import get_targets_data, get_investors_data, get_matching_keys, clean_targets, clean_investors
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.base import TransformerMixin
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier


In [2]:
df_targets_clean = pd.read_csv('../targets.csv', index_col=0).drop(columns='index')
test = pd.read_excel('../DealMatch/targets_clean_test.xlsx')

In [3]:
# get X
X = df_targets_clean

In [4]:
# numerical pipeline --> no changes
num_features = ['target_ebit','target_ebitda','target_revenue']
num_transformer = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
                            ('scaler', RobustScaler())])

In [34]:
# custom class transform sparse data from TFIDF to Dense so it fits the numerical transformation
class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.toarray()

In [6]:
# tfidf pipe --> included dense transformer
tfidf_features = 'strs'
tfidf_transformer = Pipeline([('tfidf', TfidfVectorizer()), ('dense', DenseTransformer())])

In [7]:
# full preproc
preproc = ColumnTransformer(transformers=[
            ('num_tr', num_transformer, num_features),
            ('tfidf',tfidf_transformer, tfidf_features)
        ], remainder='drop')

In [8]:
preproc.fit_transform(X).shape

(1111, 4632)

In [9]:
#full pipe excluding model (had to take out the model because couldn't use attribute 'predict')
full = Pipeline([('preproc', preproc),
                                          ('pca',
                                           PCA(0.95))])

In [10]:
full.fit_transform(X).shape

(1111, 2)

In [11]:
#fitted preproc model -> to save
preproc_fitted = full.fit(X)

#transformed X to train nneighbors
preproc_transformed = preproc_fitted.transform(X)

#fitted model -> to save
fitted_nn = NearestNeighbors(n_neighbors=10).fit(preproc_transformed)

In [13]:
# transform test data with preproc + pca pipeline
test_transformed = preproc_fitted.transform(test)

#run prediction on trained model
distance, indices = fitted_nn.kneighbors(test_transformed)

In [22]:
indices[0]

array([451, 648, 729,  72, 886, 877, 136, 893,  66, 319])

In [28]:
out = X[X.index.isin(indices[0])]

In [30]:
X

Unnamed: 0,deal_id,deal_name,deal_type_name,target_company_id,target_name,target_description,target_revenue,target_ebitda,target_ebit,country_name,region_name,sector_name,strs
0,173,1301 Sun,OTHER,871,Sun [Target],•\t350MWp of solar photovoltaic project assets...,,,,,,Energy,energy solar power energie photovoltaik pv...
4,129,1220 Supple,OTHER,874,Supple [Target],Solaranlage in Kreta,57.70,,,,,Energy,energy renewable energy other solar power ...
16,407,1677 Heat,MAJORITY,806,SPH Sustainable Process Heat GmbH,PROJECT HEAT hat eine neue Wärmepumpentechnolo...,0.43,-0.78,-0.78,,,Industrial products and services,industrial products and services pumps and co...
22,1013,845 Apollo,OTHER,812,SUMMIQ AG,Fundraising für Renewable Holding,0.00,0.00,0.00,Germany,Bavaria,Financial Services,financial services other diversified financia...
26,752,390 Saragossa,OTHER,507,KSW Bioenergie GmbH,Errichtung eines CO2-neutralen Bio-Energie Kra...,0.00,0.00,0.00,Germany,North Rhine-Westphalia,Energy,bioenergy energy bioenergie biomasse biotr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
342265,126,1213 Gutenberg| Navigator Capital,,667,Navigator Capital GmbH,"Ihre Buchbinderei in Darmstadt – Verbindungen,...",5.20,0.38,0.11,Germany,Saxony-Anhalt,Professional Services (B2B),other services printing and binding professi...
342474,196,1350 Thunder | mutares AG,,648,Mutares SE & Co. KGaA,Der Spezialist für ausgefallenen Bedarf. Egal ...,2.90,0.45,0.38,Germany,North Rhine-Westphalia,Construction,construction construction suppliers trade di...
343117,689,2141 Saale,MAJORITY,331,G.S.M. Gas-Heizungen Sanitärinstallationen GmbH,Sanitär-Heizung-Klima-Unternehmen,4.30,,0.70,Germany,Berlin,Industrial products and services,industrial products and services ventilation ...
345053,690,2143 Highstreet,MAJORITY,885,TLF LabelFinder GmbH,Suchmaschine für Modemarken,0.32,0.23,,Germany,Berlin,Internet/ecommerce,internetecommerce search engines and other in...


In [2]:
import pandas as pd
Y = df_investors_clean = pd.read_csv('../DealMatch/investors.csv', index_col=0)


In [7]:
test_y = Trainer(X=None, Y=Y)

In [125]:
test_y.set_pipeline_investors()

                                            name  \
0                                      10X group   
1                        123 Investment Managers   
2                                   137 Ventures   
3                                   138 Pyramids   
4                                  17Capital LLP   
...                                          ...   
3179                        winelike invest GmbH   
3180                          yabeo Capital GmbH   
3181  zfhn Zukunftsfonds Heilbronn GmbH & Co. KG   
3182                     zur Mühlen ApS & Co. KG   
3183                         zwei.7 Holding GmbH   

                                                name_de  
0                 startup app daten internet agnostisch  
1     tourismus altenpflege gastronomie labor senior...  
2                                            agnostisch  
3                                            agnostisch  
4                                            agnostisch  
...                        

  explained_variance_ = (S ** 2) / (n_samples - 1)


In [126]:
test_y.pipeline_investors

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('tfidf', TfidfVectorizer(),
                                                  ['name_de'])])),
                ('pca', PCA())])

In [115]:
test_y.Y

Unnamed: 0,name,name_de
0,10X group,startup app daten internet agnostisch
1,123 Investment Managers,tourismus altenpflege gastronomie labor senior...
2,137 Ventures,agnostisch
3,138 Pyramids,agnostisch
4,17Capital LLP,agnostisch
...,...,...
3179,winelike invest GmbH,immobilie
3180,yabeo Capital GmbH,medien software fintech pharma energie reinigu...
3181,zfhn Zukunftsfonds Heilbronn GmbH & Co. KG,recycling technologie industrie optik
3182,zur Mühlen ApS & Co. KG,fleisch metzgerei lebensmittel


In [69]:
preprocessor = ColumnTransformer([('tfidf',TfidfVectorizer(),'name_de')], remainder='drop')

pipeline_investors = Pipeline([('preproc',preprocessor),
                            ('dense', DenseTransformer()),
                            ('pca',PCA(0.95))
        ])

In [70]:
fitted_preproc = pipeline_investors.fit(Y)

In [71]:
y_transformed = fitted_preproc.transform(Y)

In [72]:
y_transformed.shape

(3184, 385)

In [73]:
nn_investors = NearestNeighbors(n_neighbors=10).fit(y_transformed)

In [74]:
nn_investors

NearestNeighbors(n_neighbors=10)

In [144]:
import joblib
from DealMatch.custom_transformer import *

In [179]:
investors_pipe = joblib.load("../DealMatch/pipeline_investors.pkl")
investors_pipe

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('name_de',
                                                  Pipeline(steps=[('tfidf',
                                                                   TfidfVectorizer()),
                                                                  ('dense',
                                                                   <DealMatch.custom_transformer.DenseTransformer object at 0x1573dd280>)]),
                                                  'name_de')])),
                ('pca', PCA(n_components=0.95))])

In [146]:
joblib.load("../DealMatch/pipeline.pkl")

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('num_tr',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=0,
                                                                                 strategy='constant')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['target_ebit',
                                                   'target_ebitda',
                                                   'target_revenue']),
                                                 ('tfidf',
                                                  Pipeline(steps=[('tfidf',
                                                                   TfidfVectorizer()),
                                             

In [170]:
investors = pd.read_csv('../DealMatch/investors.csv')

In [171]:
check = investors[investors['name']=='Augur Capital AG']

In [172]:
check.drop(columns={'Unnamed: 0'}, inplace=True)
check

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  check.drop(columns={'Unnamed: 0'}, inplace=True)


Unnamed: 0,name,name_de
340,Augur Capital AG,solaranlage medizintechnik bildung umschulunge...


In [173]:
Y

Unnamed: 0,name,name_de
0,10X group,startup app daten internet agnostisch
1,123 Investment Managers,tourismus altenpflege gastronomie labor senior...
2,137 Ventures,agnostisch
3,138 Pyramids,agnostisch
4,17Capital LLP,agnostisch
...,...,...
3179,winelike invest GmbH,immobilie
3180,yabeo Capital GmbH,medien software fintech pharma energie reinigu...
3181,zfhn Zukunftsfonds Heilbronn GmbH & Co. KG,recycling technologie industrie optik
3182,zur Mühlen ApS & Co. KG,fleisch metzgerei lebensmittel


In [175]:
y_transformed = investors_pipe.transform(Y)
y_transformed.shape

(3184, 385)

In [176]:
nn_investors = NearestNeighbors(n_neighbors=10).fit(y_transformed)

In [178]:
y_transformed_1 = investors_pipe.transform(check)
y_transformed_1.shape

(1, 385)

In [161]:
nn_investors = joblib.load("../DealMatch/nn_investors.pkl")

In [162]:
nn_investors.kneighbors(y_transformed)

(array([[2.98023224e-08, 7.93708436e-01, 9.33961051e-01, ...,
         1.01037541e+00, 1.01971895e+00, 1.04815412e+00],
        [2.58095683e-08, 9.24882324e-01, 9.28591441e-01, ...,
         9.94840348e-01, 9.96029108e-01, 9.99266516e-01],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        ...,
        [1.49011612e-08, 7.85505338e-01, 8.00219324e-01, ...,
         1.02206928e+00, 1.02206928e+00, 1.02206928e+00],
        [0.00000000e+00, 5.99644741e-01, 9.46276494e-01, ...,
         1.07744598e+00, 1.08045269e+00, 1.08813997e+00],
        [0.00000000e+00, 9.04790848e-01, 9.56989744e-01, ...,
         1.00687700e+00, 1.01975114e+00, 1.02149649e+00]]),
 array([[   0, 1081, 1091, ...,   82, 1608, 3111],
        [   1,  303, 1776, ..., 1402, 1601, 2156],
        [1768, 1759, 1779, ..., 1736,  327, 1813],
        ...,
        [3181, 3148, 1067, ..., 1460, 2664,  205],
        [3182, 3109, 1553, ..., 1601,  498,  743