<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [410]:
from DealMatch.trainer_unsupervised import Trainer
from DealMatch.data_unsupervised import get_targets_data, get_investors_data, get_matching_keys, clean_targets, clean_investors
from DealMatch.predict_unsupervised import matching_investors, best_investors, make_prediction_investors
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.base import TransformerMixin
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from DealMatch.custom_transformer import DenseTransformer
from sklearn.preprocessing import OneHotEncoder
import joblib

In [9]:
df_targets_clean = pd.read_csv('../DealMatch/targets.csv', index_col=0).drop(columns='index')
test = pd.read_excel('../DealMatch/targets_clean_test.xlsx')

In [6]:
# get X
X = df_targets_clean

In [163]:
X

Unnamed: 0,deal_id,deal_name,deal_type_name,target_company_id,target_name,target_description,target_revenue,target_ebitda,target_ebit,country_name,region_name,sector_name,strs
0,173,1301 Sun,OTHER,871,Sun [Target],•\t350MWp of solar photovoltaic project assets...,,,,,,Energy,energy solar power energie photovoltaik pv...
4,129,1220 Supple,OTHER,874,Supple [Target],Solaranlage in Kreta,57.70,,,,,Energy,energy renewable energy other solar power ...
16,407,1677 Heat,MAJORITY,806,SPH Sustainable Process Heat GmbH,PROJECT HEAT hat eine neue Wärmepumpentechnolo...,0.43,-0.78,-0.78,,,Industrial products and services,industrial products and services pumps and co...
22,1013,845 Apollo,OTHER,812,SUMMIQ AG,Fundraising für Renewable Holding,0.00,0.00,0.00,Germany,Bavaria,Financial Services,financial services other diversified financia...
26,752,390 Saragossa,OTHER,507,KSW Bioenergie GmbH,Errichtung eines CO2-neutralen Bio-Energie Kra...,0.00,0.00,0.00,Germany,North Rhine-Westphalia,Energy,bioenergy energy bioenergie biomasse biotr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
342265,126,1213 Gutenberg| Navigator Capital,,667,Navigator Capital GmbH,"Ihre Buchbinderei in Darmstadt – Verbindungen,...",5.20,0.38,0.11,Germany,Saxony-Anhalt,Professional Services (B2B),other services printing and binding professi...
342474,196,1350 Thunder | mutares AG,,648,Mutares SE & Co. KGaA,Der Spezialist für ausgefallenen Bedarf. Egal ...,2.90,0.45,0.38,Germany,North Rhine-Westphalia,Construction,construction construction suppliers trade di...
343117,689,2141 Saale,MAJORITY,331,G.S.M. Gas-Heizungen Sanitärinstallationen GmbH,Sanitär-Heizung-Klima-Unternehmen,4.30,,0.70,Germany,Berlin,Industrial products and services,industrial products and services ventilation ...
345053,690,2143 Highstreet,MAJORITY,885,TLF LabelFinder GmbH,Suchmaschine für Modemarken,0.32,0.23,,Germany,Berlin,Internet/ecommerce,internetecommerce search engines and other in...


In [10]:
test.head()

Unnamed: 0,deal_id,deal_name,deal_type_name,target_company_id,target_name,target_description,target_revenue,target_ebitda,target_ebit,country_name,region_name,sector_name,strs
0,194,1303 Sonne,OTHER,825,Sonne,•\t350MWp of solar photovoltaic project assets...,15,3,3,Germany,Bavaria,Energy,"Solarkraft, energie, photovoltaik, pv, solar, ..."


In [7]:
# numerical pipeline --> no changes
num_features = ['target_ebit','target_ebitda','target_revenue']
num_transformer = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
                            ('scaler', RobustScaler())])

In [35]:
# categorical pipeline
cat_features = ['deal_type_name', 'country_name', 'region_name', 'sector_name']
cat_transformer = Pipeline([('imputer',
                             SimpleImputer(missing_values=np.nan,
                                           strategy='constant',
                                           fill_value='no_region')),
                            ('ohe',
                             OneHotEncoder(handle_unknown='ignore',
                                           sparse=False))])

In [13]:
# tfidf pipe --> included dense transformer
tfidf_features = 'strs'
tfidf_transformer = Pipeline([('tfidf', TfidfVectorizer()), ('dense', DenseTransformer())])

In [14]:
# preproc for unsupervised
preproc_unsupervised = ColumnTransformer(transformers=[
            ('num_tr', num_transformer, num_features),
            ('tfidf',tfidf_transformer, tfidf_features)
            ], remainder='drop')

In [42]:
# preproc for supervised
preproc_supervised = ColumnTransformer([
            ('num_tr', num_transformer,
             ['target_ebit', 'target_ebitda', 'target_revenue']),
            ('cat_tr', cat_transformer,
             ['deal_type_name', 'country_name', 'region_name', 'sector_name'])
             ], remainder='drop')

### full_unsupervised = Pipeline([('preproc', preproc_unsupervised), ('pca', PCA(0.95))])

In [43]:
full_supervised = Pipeline([('preproc', preproc_supervised)])

In [25]:
#fitted preproc model -> to save
preproc_uns_fitted = full_unsupervised.fit(X)

#transformed X to train nneighbors
preproc_uns_transformed = preproc_uns_fitted.transform(X)

#fitted model -> to save
fitted_nn = NearestNeighbors(n_neighbors=10).fit(preproc_uns_transformed)

In [184]:
fitted_nn

NearestNeighbors(n_neighbors=10)

In [51]:
# transform test data with preproc + pca pipeline (unsupervised)
test_uns_transformed = preproc_uns_fitted.transform(test)

#run prediction on trained model
nearest_targets = fitted_nn.kneighbors(test_uns_transformed)

In [44]:
#fitted preproc model -> to save
preproc_s_fitted = full_supervised.fit(X)

#transformed X to train nneighbors
preproc_s_transformed = preproc_s_fitted.transform(X)

In [45]:
SimpleImputer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

In [46]:
full_supervised.get_feature_names_out()

preproc_s_transformed = pd.DataFrame(preproc_s_transformed,
             columns=full_supervised.get_feature_names_out())

In [47]:
preproc_s_transformed

Unnamed: 0,num_tr__target_ebit,num_tr__target_ebitda,num_tr__target_revenue,cat_tr__deal_type_name_DISTRESSED,cat_tr__deal_type_name_MAJORITY,cat_tr__deal_type_name_MINORITY,cat_tr__deal_type_name_OTHER,cat_tr__deal_type_name_VC,cat_tr__deal_type_name_no_region,cat_tr__country_name_Austria,...,cat_tr__sector_name_Pharmaceuticals,cat_tr__sector_name_Professional Services (B2B),cat_tr__sector_name_Real Estate,cat_tr__sector_name_Retailing,cat_tr__sector_name_Semiconductors & Semiconductor Equipment,cat_tr__sector_name_Software & Services,cat_tr__sector_name_Telecommunication Hardware,cat_tr__sector_name_Telecommunication Services,cat_tr__sector_name_Transportation,cat_tr__sector_name_Utilities
0,-0.50,-0.467290,-0.603689,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.50,-0.467290,5.846842,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.28,-1.196262,-0.555618,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.50,-0.467290,-0.603689,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.50,-0.467290,-0.603689,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,-0.39,-0.112150,-0.022359,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1107,-0.12,-0.046729,-0.279486,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1108,0.20,-0.467290,-0.122974,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1109,-0.50,-0.252336,-0.567915,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# transform test data with preproc for matching table (supervised)
test_s_transformed = preproc_s_fitted.transform(test)

full_supervised.get_feature_names_out()

test_s_transformed = pd.DataFrame(test_s_transformed,
             columns=full_supervised.get_feature_names_out())

In [50]:
test_s_transformed

Unnamed: 0,num_tr__target_ebit,num_tr__target_ebitda,num_tr__target_revenue,cat_tr__deal_type_name_DISTRESSED,cat_tr__deal_type_name_MAJORITY,cat_tr__deal_type_name_MINORITY,cat_tr__deal_type_name_OTHER,cat_tr__deal_type_name_VC,cat_tr__deal_type_name_no_region,cat_tr__country_name_Austria,...,cat_tr__sector_name_Pharmaceuticals,cat_tr__sector_name_Professional Services (B2B),cat_tr__sector_name_Real Estate,cat_tr__sector_name_Retailing,cat_tr__sector_name_Semiconductors & Semiconductor Equipment,cat_tr__sector_name_Software & Services,cat_tr__sector_name_Telecommunication Hardware,cat_tr__sector_name_Telecommunication Services,cat_tr__sector_name_Transportation,cat_tr__sector_name_Utilities
0,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
targets = df_targets_clean.copy()

name = []
target_id = []
description = []
distance = []

for x,y in zip(nearest_targets[1][0],nearest_targets[0][0]):
    target_id.append(targets['target_company_id'].iloc[x])
    name.append(targets['target_name'].iloc[x])
    description.append(targets['strs'].iloc[x])
    distance.append(y)


df_companies = pd.DataFrame({'target_company_id': target_id,
                             'name':name,
                             'description':description,
                             'distance':distance})

df_companies

Unnamed: 0,target_company_id,name,description,distance
0,680,OAK Industria Arredamenti S.p.A.,furniture industry manufacturing other couch...,0.111239
1,130,Bentheimer-Holz GmbH,agriculture forestry industrial products and...,0.272535
2,585,MAKRA Norbert Kraft,chemicals chemistry other cosmetic caring ...,0.290329
3,183,Campanet GmbH,consumer services other leisure consumer se...,0.45281
4,152,Buchberger Baugeräte Handel GmbH,construction construction suppliers trade di...,0.482792
5,546,LC272301 Consumer [Target],apparel consumer goods apparel luxury goods...,0.489026
6,854,Sommer & Denich GmbH,logistics transportation baustellenverkehr ...,0.574333
7,937,VOIGT Ingenieure GmbH Berlin,construction construction services and engine...,0.608685
8,725,Physiotherm Holding GmbH,body care consumer goods apparel health car...,0.633101
9,257,ELS European Labelling System GmbH & Co. KG,industrial equipment and machinery industrial...,0.659311


In [123]:
matching_table = pd.read_excel('matching_table_final_final.xlsx')

In [124]:
matching_table.head()

Unnamed: 0,id,project_name,target_id,target_name,target_company_id,investor_id,investor_name,deal_stage_id,is_lost,exit_oriented_x,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,32980,1923 Creativ,10309.0,cm.supplies GmbH,1021.0,2508,MKCP Beteiligungsgesellschaft mbH,4,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,34787,1923 Creativ,10309.0,cm.supplies GmbH,1021.0,2726,Axos MidCap GmbH-Triginta Capital GmbH,4,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,32257,1923 Creativ,10309.0,cm.supplies GmbH,1021.0,2050,COREST GmbH,2,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,29699,1923 Creativ,10309.0,cm.supplies GmbH,1021.0,2597,KAUT-BULLINGER Office + Solution GmbH,2,0,,...,,,,,,,,,,
4,30467,1923 Creativ,10309.0,cm.supplies GmbH,1021.0,3913,Printus Family Office,2,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
df_match_investors = matching_table[(matching_table['target_company_id'].isin(df_companies['target_company_id'])) &
                    (matching_table['deal_stage_id']>=2) & 
                    (matching_table['exit_oriented_x'].notna())].drop_duplicates(subset="target_company_id",keep="first")

In [136]:
df_best_investors_uns = df_match_investors[['target_company_id', 'target_name', 'investor_id', 'investor_name']]
df_best_investors_uns.reset_index(inplace=True)
df_best_investors_uns.drop(columns='index', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [156]:
df_best_investors_uns = pd.merge(df_best_investors_uns, df_companies[['target_company_id', 'distance']],
                                on="target_company_id")

In [157]:
# final df including distance from input target to similar targets
df_best_investors_uns

Unnamed: 0,target_company_id,target_name,investor_id,investor_name,distance
0,585.0,MAKRA Norbert Kraft,1601,Elias Invest GmbH,0.290329
1,130.0,Bentheimer-Holz GmbH,2794,Augur Capital AG,0.272535
2,725.0,Physiotherm Holding GmbH,2204,Infinment GmbH,0.633101
3,257.0,ELS European Labelling System GmbH & Co. KG,2794,Augur Capital AG,0.659311
4,680.0,OAK Industria Arredamenti S.p.A.,9871,L Catterton Management Limited,0.111239
5,937.0,VOIGT Ingenieure GmbH Berlin,2713,Palero Capital GmbH,0.608685
6,854.0,Sommer & Denich GmbH,2618,HANNOVER Finanz GmbH,0.574333
7,183.0,Campanet GmbH,2592,Nidum Capital AG,0.45281
8,546.0,LC272301 Consumer [Target],2526,Findos Investor GmbH,0.489026


In [139]:
investors_clean = pd.read_csv('../DealMatch/investors.csv')

In [141]:
investors_clean.name.nunique()

3184

In [146]:
# get the required data for second NN (investors)
df_investor_unsupervised = pd.merge(df_best_investors_uns, investors_clean, left_on="investor_name", right_on="name")
df_investor_unsupervised = df_investor_unsupervised[['investor_name', 'name_de']]

In [147]:
df_investor_unsupervised

Unnamed: 0,investor_name,name_de
0,Elias Invest GmbH,immobilie
1,Augur Capital AG,solaranlage medizintechnik bildung umschulunge...
2,Augur Capital AG,solaranlage medizintechnik bildung umschulunge...
3,Infinment GmbH,möbel ecommerce handel immobilie heizsystem au...
4,Palero Capital GmbH,medizintechnik bau maschinenbau beratung gesch...
5,HANNOVER Finanz GmbH,personaldienstleister diy ecommerce handel mas...
6,Nidum Capital AG,ecommerce lebensmittel maschinenbau gesund loh...
7,Findos Investor GmbH,it dienstleistung automobil software b2b konsu...


In [175]:
tfidf_features_inv = 'name_de'
tfidf_transformer_inv = Pipeline([('tfidf', TfidfVectorizer()), ('dense', DenseTransformer())])

In [176]:
# preproc investors for unsupervised
preproc_investors = ColumnTransformer([('tfidf',tfidf_transformer_inv, tfidf_features_inv)], remainder='drop')

In [177]:
full_unsupervised_investors = Pipeline([
                    ('preproc',preproc_investors),
                    ('pca', PCA(n_components=0.95))])

In [183]:
full_unsupervised

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('num_tr',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=0,
                                                                                 strategy='constant')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['target_ebit',
                                                   'target_ebitda',
                                                   'target_revenue']),
                                                 ('tfidf',
                                                  Pipeline(steps=[('tfidf',
                                                                   TfidfVectorizer()),
                                             

In [178]:
full_unsupervised_investors

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('tfidf',
                                                  Pipeline(steps=[('tfidf',
                                                                   TfidfVectorizer()),
                                                                  ('dense',
                                                                   <DealMatch.custom_transformer.DenseTransformer object at 0x182637e50>)]),
                                                  'name_de')])),
                ('pca', PCA(n_components=0.95))])

In [179]:
X_inv = investors_clean

In [180]:
X_inv

Unnamed: 0.1,Unnamed: 0,name,name_de
0,0,10X group,startup app daten internet agnostisch
1,1,123 Investment Managers,tourismus altenpflege gastronomie labor senior...
2,2,137 Ventures,agnostisch
3,3,138 Pyramids,agnostisch
4,4,17Capital LLP,agnostisch
...,...,...,...
3179,3179,winelike invest GmbH,immobilie
3180,3180,yabeo Capital GmbH,medien software fintech pharma energie reinigu...
3181,3181,zfhn Zukunftsfonds Heilbronn GmbH & Co. KG,recycling technologie industrie optik
3182,3182,zur Mühlen ApS & Co. KG,fleisch metzgerei lebensmittel


In [187]:
#fitted preproc model -> to save
preproc_uns_fitted_inv = full_unsupervised_investors.fit(X_inv)

#transformed X to train nneighbors
preproc_uns_transformed_inv = preproc_uns_fitted_inv.transform(X_inv)

#fitted model -> to save
fitted_nn_inv = NearestNeighbors(n_neighbors=5).fit(preproc_uns_transformed_inv)

In [188]:
fitted_nn_inv

NearestNeighbors()

In [189]:
test_inv = df_investor_unsupervised

In [191]:
# transform test data with preproc + pca pipeline (unsupervised)
test_uns_inv_transformed = preproc_uns_fitted_inv.transform(test_inv)

#run prediction on trained model
nearest_investors = fitted_nn_inv.kneighbors(test_uns_inv_transformed)

In [324]:
name_investor = []
description_investor = []
distance_investor_investor = []
distance_target_target = []


for n in nearest_investors[1]:
    for i in n:
        name_investor.append(investors_clean['name'].iloc[i])
        description_investor.append(investors_clean['name_de'].iloc[i])
        distance_target_target.append(df_best_investors_uns[df_best_investors_uns.investor_name==investors_clean['name'].iloc[i]]['distance'].min())

for e in nearest_investors[0]:
    for l in e:
        distance_investor_investor.append(l)



In [341]:
df_investors = pd.DataFrame({'name':name_investor,
            'description':description_investor,
            'distance_investor<=>investor':distance_investor_investor,
                'distance_target<=>target':distance_target_target})

df_investors['distance_target<=>target'].ffill(inplace=True) 
df_investors['distance_target<=>target'].replace(np.nan, 0, inplace=True)
df_investors['distance_target<=>investor'] = df_investors['distance_target<=>target'] + df_investors['distance_investor<=>investor']


In [346]:
df_investors.sort_values('distance_target<=>investor', inplace=True)

In [348]:
df_investors.drop_duplicates(inplace=True)

In [352]:
df_investors.reset_index(drop=True)

Unnamed: 0,name,description,distance_investor<=>investor,distance_target<=>target,distance_target<=>investor
0,HWS Immobilien und Vermögensverwaltung GmbH,immobilie,0.0,0.0,0.0
1,Augustus Capital Management GmbH,immobilie,0.0,0.0,0.0
2,Laborgh Investment GmbH,immobilie,0.0,0.0,0.0
3,Van Deursen Group,immobilie,0.0,0.0,0.0
4,Vermögensverwaltung Vits G.b.R.,immobilie,0.0,0.0,0.0
5,Augur Capital AG,solaranlage medizintechnik bildung umschulunge...,2.580957e-08,0.272535,0.272535
6,Nidum Capital AG,ecommerce lebensmittel maschinenbau gesund loh...,2.35608e-08,0.45281,0.45281
7,Findos Investor GmbH,it dienstleistung automobil software b2b konsu...,0.0,0.489026,0.489026
8,HANNOVER Finanz GmbH,personaldienstleister diy ecommerce handel mas...,0.0,0.574333,0.574333
9,Palero Capital GmbH,medizintechnik bau maschinenbau beratung gesch...,0.0,0.608685,0.608685


In [359]:
investor_profiles = pd.read_excel("../raw_data/df_final_investors_preprocessed.xlsx", index_col=0)
investor_profiles.head()

Unnamed: 0,investor_id,exit_oriented_x,management_takeover_x,agnostic_x,no_fee,pays_1,pays_2,pays_3,fee_sharing_10,fee_sharing_25,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,1,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,1
1,7,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,7,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,7,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,7,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [361]:
investor_profiles.drop_duplicates(inplace=True)

In [363]:
investor_profiles.shape

(1577, 101)

In [364]:
matching_table.head(1)

Unnamed: 0,id,project_name,target_id,target_name,target_company_id,investor_id,investor_name,deal_stage_id,is_lost,exit_oriented_x,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,32980,1923 Creativ,10309.0,cm.supplies GmbH,1021.0,2508,MKCP Beteiligungsgesellschaft mbH,4,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [372]:
investor_profiles = pd.merge(investor_profiles, matching_table[['investor_id', 'investor_name']], on="investor_id")
investor_profiles.drop_duplicates(inplace=True)
investor_profiles.shape

(1576, 102)

In [374]:
cols = investor_profiles.columns.to_list()
cols = cols[-1:] + cols[:-1]
investor_profiles = investor_profiles[cols]
investor_profiles.head()

Unnamed: 0,investor_name,investor_id,exit_oriented_x,management_takeover_x,agnostic_x,no_fee,pays_1,pays_2,pays_3,fee_sharing_10,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,Greencoat Capital,1,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,1
5,AL Capital Holding GmbH & Co. KG,7,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
55,3TS Capital Partners,31,1,0,1,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,1
66,83North Ltd.,41,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,1
68,Earlybird Venture Capital GmbH & Co KG,43,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,1,1


In [377]:
investor_profiles.rename(columns={'investor_name':'name'}, inplace=True)

In [378]:
investor_profiles.to_csv('investor_profiles_to_merge.csv')

In [384]:
# komischer scheiß
#matching_table[(matching_table['investor_name'].isin(df_investors['name']))].drop_duplicates(subset="investor_id",keep="first")

In [398]:
# add investment profiles

df_investors_supervised = pd.merge(df_investors['name'], investor_profiles, on="name")
df_investors_supervised.drop_duplicates(inplace=True)
df_investors_supervised

Unnamed: 0,name,investor_id,exit_oriented_x,management_takeover_x,agnostic_x,no_fee,pays_1,pays_2,pays_3,fee_sharing_10,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,Augur Capital AG,2794,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Nidum Capital AG,2592,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Findos Investor GmbH,2526,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HANNOVER Finanz GmbH,2618,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Palero Capital GmbH,2713,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
5,Infinment GmbH,2204,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
6,Family Trust Investor FTI GmbH,2765,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,4elements holding GmbH,9992,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
8,Aventur Beratungs- und Beteiligungs GmbH,2731,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
9,Inven Capital,3076,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [399]:
test_s_transformed

Unnamed: 0,num_tr__target_ebit,num_tr__target_ebitda,num_tr__target_revenue,cat_tr__deal_type_name_DISTRESSED,cat_tr__deal_type_name_MAJORITY,cat_tr__deal_type_name_MINORITY,cat_tr__deal_type_name_OTHER,cat_tr__deal_type_name_VC,cat_tr__deal_type_name_no_region,cat_tr__country_name_Austria,...,cat_tr__sector_name_Pharmaceuticals,cat_tr__sector_name_Professional Services (B2B),cat_tr__sector_name_Real Estate,cat_tr__sector_name_Retailing,cat_tr__sector_name_Semiconductors & Semiconductor Equipment,cat_tr__sector_name_Software & Services,cat_tr__sector_name_Telecommunication Hardware,cat_tr__sector_name_Telecommunication Services,cat_tr__sector_name_Transportation,cat_tr__sector_name_Utilities
0,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [401]:
final_supervised_input = pd.concat([test_s_transformed,df_investors_supervised],axis=1).ffill().sum(level=0, axis=1)
final_supervised_input

  final_supervised_input = pd.concat([test_s_transformed,df_investors_supervised],axis=1).ffill().sum(level=0, axis=1)


Unnamed: 0,num_tr__target_ebit,num_tr__target_ebitda,num_tr__target_revenue,cat_tr__deal_type_name_DISTRESSED,cat_tr__deal_type_name_MAJORITY,cat_tr__deal_type_name_MINORITY,cat_tr__deal_type_name_OTHER,cat_tr__deal_type_name_VC,cat_tr__deal_type_name_no_region,cat_tr__country_name_Austria,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
5,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
6,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
7,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
8,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
9,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [415]:
filepath = '../raw_data/preprocessed_data_1march_2.xlsx'

supervised_train_data = pd.read_excel(filepath, index_col=0)

In [416]:
# verify same shape and column order as for the trained supervised model
# includes columns to drop: deal_id, result, investor_id
supervised_train_data.shape

(16999, 177)

In [405]:
supervised_train_data.columns.to_list()

['deal_id',
 'result',
 'target_revenue',
 'target_ebitda',
 'target_ebit',
 'target_DISTRESSED',
 'target_MAJORITY',
 'target_MINORITY',
 'target_OTHER',
 'target_VC',
 'target_Eastern_Europe',
 'target_Northern_Africa',
 'target_Northern_America',
 'target_Northern_Europe',
 'target_Southern_Europe',
 'target_Western_Europe',
 'target_Austria',
 'target_Belgium',
 'target_Czechia',
 'target_Egypt',
 'target_Germany',
 'target_Italy',
 'target_Netherlands',
 'target_Norway',
 'target_Poland',
 'target_Portugal',
 'target_Romania',
 'target_Slovakia',
 'target_Spain',
 'target_Switzerland',
 'target_United_States_of_America',
 'target_Baden-Württemberg',
 'target_Bavaria',
 'target_Berlin',
 'target_Brandenburg',
 'target_Bremen',
 'target_Hamburg',
 'target_Hesse',
 'target_Lower_Saxony',
 'target_Mecklenburg-Vorpommern',
 'target_North_Rhine-Westphalia',
 'target_Rhineland-Palatinate',
 'target_Saarland',
 'target_Saxony',
 'target_Saxony-Anhalt',
 'target_Schleswig-Holstein',
 'targ

In [417]:
final_supervised_input.shape

(19, 177)

In [407]:
final_supervised_input.columns.to_list()

['num_tr__target_ebit',
 'num_tr__target_ebitda',
 'num_tr__target_revenue',
 'cat_tr__deal_type_name_DISTRESSED',
 'cat_tr__deal_type_name_MAJORITY',
 'cat_tr__deal_type_name_MINORITY',
 'cat_tr__deal_type_name_OTHER',
 'cat_tr__deal_type_name_VC',
 'cat_tr__deal_type_name_no_region',
 'cat_tr__country_name_Austria',
 'cat_tr__country_name_Belgium',
 'cat_tr__country_name_Bulgaria',
 'cat_tr__country_name_Czechia',
 'cat_tr__country_name_Egypt',
 'cat_tr__country_name_Germany',
 'cat_tr__country_name_Italy',
 'cat_tr__country_name_Netherlands',
 'cat_tr__country_name_Poland',
 'cat_tr__country_name_Portugal',
 'cat_tr__country_name_Romania',
 'cat_tr__country_name_Slovakia',
 'cat_tr__country_name_Spain',
 'cat_tr__country_name_Switzerland',
 'cat_tr__country_name_United States of America',
 'cat_tr__country_name_no_region',
 'cat_tr__region_name_Baden-Württemberg',
 'cat_tr__region_name_Bavaria',
 'cat_tr__region_name_Berlin',
 'cat_tr__region_name_Brandenburg',
 'cat_tr__region_name

In [418]:
final_supervised_input.drop(columns=['name', 'investor_id'], inplace=True)

In [427]:
final_supervised_input.head()

Unnamed: 0,num_tr__target_ebit,num_tr__target_ebitda,num_tr__target_revenue,cat_tr__deal_type_name_DISTRESSED,cat_tr__deal_type_name_MAJORITY,cat_tr__deal_type_name_MINORITY,cat_tr__deal_type_name_OTHER,cat_tr__deal_type_name_VC,cat_tr__deal_type_name_no_region,cat_tr__country_name_Austria,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,2.5,2.336449,1.073225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [424]:
MODEL_SUPERVISED = 'model_supervised_MLP1.joblib'

In [425]:
pipe_sup = joblib.load(MODEL_SUPERVISED)

In [426]:
pipe_sup.predict_proba(final_supervised_input)

Feature names must be in the same order as they were in fit.



array([[0.35407364, 0.64592636],
       [0.77626533, 0.22373467],
       [0.76364958, 0.23635042],
       [0.56025021, 0.43974979],
       [0.61595786, 0.38404214],
       [0.49151135, 0.50848865],
       [0.23104075, 0.76895925],
       [0.8062763 , 0.1937237 ],
       [0.74299725, 0.25700275],
       [0.92224643, 0.07775357],
       [0.87510115, 0.12489885],
       [0.93141003, 0.06858997],
       [0.47826421, 0.52173579],
       [0.69445337, 0.30554663],
       [0.54634865, 0.45365135],
       [0.9043012 , 0.0956988 ],
       [0.90721585, 0.09278415],
       [0.95865944, 0.04134056],
       [0.90671715, 0.09328285]])