<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-exploration" data-toc-modified-id="Data-exploration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data exploration</a></span><ul class="toc-item"><li><span><a href="#Investor-profiles" data-toc-modified-id="Investor-profiles-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Investor profiles</a></span></li><li><span><a href="#Investor-locations" data-toc-modified-id="Investor-locations-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Investor locations</a></span></li><li><span><a href="#Regions-table" data-toc-modified-id="Regions-table-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Regions table</a></span></li></ul></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Preprocessing</a></span><ul class="toc-item"><li><span><a href="#Location-of-investment" data-toc-modified-id="Location-of-investment-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Location of investment</a></span></li><li><span><a href="#One-hot-encoding-of-investor-location" data-toc-modified-id="One-hot-encoding-of-investor-location-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>One-hot encoding of investor location</a></span></li><li><span><a href="#One-hot-encoding-of-investment-location" data-toc-modified-id="One-hot-encoding-of-investment-location-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>One-hot encoding of investment location</a></span></li></ul></li><li><span><a href="#Data-visualisation" data-toc-modified-id="Data-visualisation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Data visualisation</a></span><ul class="toc-item"><li><span><a href="#Location-of-investors" data-toc-modified-id="Location-of-investors-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Location of investors</a></span></li></ul></li></ul></div>

<b>Dataframes:</b>

- <code>investor_locations</code>: raw file of investor locations
- <code>regions_df</code>: dataframe of country, region and country code
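- <code>investor_df</code>: cleaned <code>investor_locations</code>, matched to <code>regions_df</code> by country name (location of investment) and by country code (location of investor), keeping the region of the investor and the region of the investment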
- <code>encoded_df</code>: final dataframe with OHE regions

In [42]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [44]:
# Directory holding every input file used by this notebook. Change it here
# (once) if the data moves; the individual paths below are derived from it.
DATA_DIR = './sla-preprocessing-data'

# Input files. The names file_path1..file_path7 are kept because later cells
# refer to them.
file_path1 = f'{DATA_DIR}/invest_profile_transaction.xlsx'
file_path2 = f'{DATA_DIR}/investor_locations.xlsx'
file_path3 = f'{DATA_DIR}/country_regions.xlsx'
file_path4 = f'{DATA_DIR}/country_regions_full.xlsx'
file_path5 = f'{DATA_DIR}/regions_202202261659.csv'
file_path6 = f'{DATA_DIR}/investor_deals.xlsx'
file_path7 = f'{DATA_DIR}/countries_regions_codes.xlsx'

# Data exploration
## Investor profiles

In [45]:
# Read the investor profile / transaction workbook and preview its first rows.
investor_profiles = pd.read_excel(io=file_path1)
investor_profiles.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,name,description,short_description,products_description,company_category_id,number_of_employees,ebit,ebitda,...,name.2,id.4,invest_profile_id.1,stake_type_id,created_at.3,updated_at.3,deleted_at.3,is_deleted.3,id.5,name.3
0,0,1,Greencoat Capital,,,,1,,,,...,GROWTH,,,,NaT,NaT,,,,
1,1,5,Daniel Vogel,,,,1,,,,...,SUCCESSION,1.0,5.0,1.0,2021-10-14 13:47:15,2021-10-14 13:47:15,,0.0,1.0,MAJORITY
2,2,6,Richmond View Ventures GmbH,,,,1,,,,...,SEED,2.0,6.0,0.0,2021-10-14 13:47:15,2021-10-14 13:47:15,,0.0,0.0,MINORITY
3,3,7,AL Capital Holding GmbH & Co. KG,,,,1,,,,...,SUCCESSION,3.0,7.0,1.0,2021-10-14 13:47:16,2021-10-14 13:47:16,,0.0,1.0,MAJORITY
4,4,7,AL Capital Holding GmbH & Co. KG,,,,1,,,,...,CARVE_OUT,3.0,7.0,1.0,2021-10-14 13:47:16,2021-10-14 13:47:16,,0.0,1.0,MAJORITY


In [46]:
# Columns removed from the investor profiles frame before exploration.
profile_columns_to_drop = [
    'Unnamed: 0',
    'crm_id', 'crm_created_at', 'crm_updated_at', 'crm_synced_at',
    'verified_at', 'verified_by',
    'created_at', 'updated_at', 'deleted_at', 'is_deleted',
    'id.1', 'created_at.1', 'updated_at.1', 'deleted_at.1', 'is_deleted.1',
    'id.2', 'is_excluded',
    'created_at.2', 'updated_at.2', 'deleted_at.2', 'is_deleted.2',
    'id.3', 'id.4',
    'created_at.3', 'updated_at.3', 'deleted_at.3', 'is_deleted.3',
    'id.5',
]

# Rebinds investor_profiles, so this cell can only be run once per load of the
# raw file (a second run raises KeyError because the columns are already gone).
investor_profiles = investor_profiles.drop(columns=profile_columns_to_drop)

## Investor locations

In [47]:
# Read the raw investor locations workbook.
investor_locations = pd.read_excel(io=file_path2)

In [48]:
# Columns removed from the investor locations frame; the remaining columns are
# listed by the next cell.
location_columns_to_drop = [
    'Unnamed: 0', 'website', 'database_import_id',
    'crm_id', 'crm_created_at', 'crm_updated_at', 'crm_synced_at',
    'verified_at', 'verified_by',
    'deleted_at', 'is_deleted', 'is_excluded.1',
    'created_at.3', 'updated_at.3', 'deleted_at.3', 'is_deleted.3',
    'id.4',
    'iso_alpha2', 'iso_alpha3', 'name_de', 'name_fr', 'name_es',
    'phone_country_code', 'continent_id.1',
    'created_at.1', 'updated_at.1', 'deleted_at.1', 'is_deleted.1',
    'id.2', 'id.1',
    'created_at.2', 'updated_at.2', 'deleted_at.2', 'is_deleted.2',
    'id.3',
]

# Rebinds investor_locations, so this cell can only be run once per load of
# the raw file (a second run raises KeyError because the columns are gone).
investor_locations = investor_locations.drop(columns=location_columns_to_drop)


In [49]:
# List the columns that remain after the drop above; 'name_en' and
# 'origin_country_id' are the join keys used in the preprocessing section.
investor_locations.columns

Index(['id', 'name', 'description', 'short_description',
       'products_description', 'company_category_id', 'number_of_employees',
       'ebit', 'ebitda', 'revenue', 'bs_total', 'origin_country_id',
       'origin_region_id', 'zipcode', 'company_source_id', 'company_state_id',
       'fees_rate', 'fees_cap', 'fees_floor', 'fees_fa', 'fees_info', 'inroad',
       'exit_oriented', 'management_takeover', 'seriousness', 'agnostic',
       'margin', 'parent_company_id', 'ma_history', 'created_at', 'updated_at',
       'company_inroad_id', 'company_id', 'name.1', 'max_revenue',
       'min_revenue', 'max_ebitda', 'min_ebitda', 'max_equity', 'min_equity',
       'searchmandate', 'invest_profile_id', 'continent_id', 'is_excluded',
       'invest_profile_location_id', 'country_id', 'name_en'],
      dtype='object')

## Regions table

In [50]:
# Country -> region lookup table (columns: country, region, country-code).
regions_raw = pd.read_excel(file_path7)
regions_df = regions_raw.drop(columns='Unnamed: 0')  # discard the saved index column
regions_df.head()

Unnamed: 0,country,region,country-code
0,Afghanistan,Southern Asia,4
1,Åland Islands,Northern Europe,248
2,Albania,Southern Europe,8
3,Algeria,Northern Africa,12
4,American Samoa,Polynesia,16


# Preprocessing

## Location of investment

In [51]:
# Build investor_df: one row per distinct (investor attributes,
# region_of_investment, region_of_investor) combination.
#
# The lookup columns are renamed *before* each merge instead of overwriting
# investor_df.columns with a hard-coded positional list, so a change in the
# upstream column set cannot silently mislabel columns.

# Lookup used for the investment target: matched on the English country name.
investment_regions = regions_df[['country', 'region']].rename(
    columns={'country': 'country_of_investment',
             'region': 'region_of_investment'})

# Lookup used for the investor's own country: matched on the numeric country code.
investor_regions = regions_df.rename(
    columns={'country': 'country_of_investor',
             'region': 'region_of_investor'})

# Helper / raw columns that are not carried into the final frame.
columns_to_drop = [
    'origin_country_id', 'origin_region_id', 'zipcode',
    'continent_id', 'is_excluded', 'country_id', 'products_description',
    'short_description', 'ebit', 'ebitda', 'revenue', 'bs_total',
    'company_source_id', 'parent_company_id', 'name.1', 'ma_history',
    'fees_floor', 'fees_cap', 'number_of_employees', 'description',
    'fees_info', 'fees_fa', 'country_of_investor', 'country-code',
    'country_of_investment', 'invest_profile_location_id',
]

# NOTE: both merges are inner joins, so rows whose 'name_en' has no matching
# 'country', or whose 'origin_country_id' has no matching 'country-code', are
# dropped without warning.
investor_df = (
    investor_locations
    .merge(investment_regions, left_on='name_en',
           right_on='country_of_investment', how='inner')
    .drop(columns='name_en')
    .merge(investor_regions, left_on='origin_country_id',
           right_on='country-code', how='inner')
    .sort_values(by='id')
    .drop(columns=columns_to_drop)
    # Rows that differed only in the dropped columns are now exact duplicates.
    .drop_duplicates()
    .reset_index(drop=True)
)

In [52]:
# Display the cleaned frame (pandas truncates the rendering of large frames);
# an investor id appears once per distinct region_of_investment.
investor_df

Unnamed: 0,id,name,company_category_id,company_state_id,fees_rate,inroad,exit_oriented,management_takeover,seriousness,agnostic,...,max_revenue,min_revenue,max_ebitda,min_ebitda,max_equity,min_equity,searchmandate,invest_profile_id,region_of_investment,region_of_investor
0,1,Greencoat Capital,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,50.0,10.0,,,100.0,25.0,0.0,1.0,Western Europe,Northern Europe
1,1,Greencoat Capital,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,50.0,10.0,,,100.0,25.0,0.0,1.0,Northern Europe,Northern Europe
2,1,Greencoat Capital,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,50.0,10.0,,,100.0,25.0,0.0,1.0,Southern Europe,Northern Europe
3,5,Daniel Vogel,1,0,0.02,4.0,1.0,0.0,2.0,0.0,...,,,,,,1.0,0.0,5.0,Western Europe,Western Europe
4,7,AL Capital Holding GmbH & Co. KG,1,0,0.02,0.0,1.0,0.0,2.0,1.0,...,,,,,15.0,2.0,0.0,7.0,Western Europe,Western Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12219,10273,Francis Cepero MBI,1,0,2.00,,,,2.0,,...,,,,0.0,,,0.0,10241.0,Western Europe,Western Europe
12220,10274,Andreas Wohlfahrt MBI,1,0,2.00,,,,2.0,,...,,,,0.5,,,0.0,10242.0,Western Europe,Western Europe
12221,10275,Muammer Çakmakçı MBI,1,0,2.00,,,,2.0,,...,20.0,10.0,,0.0,,,0.0,10243.0,Western Europe,Western Europe
12222,10276,Julian Mick MBI,1,0,2.00,,,,2.0,,...,,,1.5,0.3,,,0.0,10244.0,Western Europe,Western Europe


## One-hot encoding of investor location

In [53]:
# One-hot encode the investor's own region.
# (OneHotEncoder is imported in the imports cell at the top of the notebook.)

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the new keyword first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

investor_encoded = ohe.fit_transform(investor_df[['region_of_investor']])

# Output column names, in the same order as the encoder's sorted categories
# (ohe.categories_[0]). The assignment below is positional, so this list must
# be kept in step with the regions present in the data.
# NOTE(review): 'investor_Northern America' and 'investor_Western Asia' contain
# a space while the other names use underscores; they are left unchanged here
# because later cells may reference them.
investor_region_columns = [
    'investor_Australia_and_New_Zealand',
    'investor_Eastern_Africa',
    'investor_Eastern_Asia',
    'investor_Eastern_Europe',
    'investor_Northern_Africa',
    'investor_Northern America',
    'investor_Northern_Europe',
    'investor_South_America',
    'investor_South_Eastern_Asia',
    'investor_Southern_Africa',
    'investor_Southern_Asia',
    'investor_Southern_Europe',
    'investor_Western_Africa',
    'investor_Western Asia',
    'investor_Western_Europe',
]

# Fail with a readable message (instead of an unpacking error) when the data
# contains a different number of regions than there are column names.
if investor_encoded.shape[1] != len(investor_region_columns):
    raise ValueError(
        f"Expected {len(investor_region_columns)} investor regions but the "
        f"encoder found {investor_encoded.shape[1]}: {list(ohe.categories_[0])}"
    )

encoded_df = investor_df.copy()
for column_name, indicator in zip(investor_region_columns, investor_encoded.T):
    encoded_df[column_name] = indicator

encoded_df.head()

Unnamed: 0,id,name,company_category_id,company_state_id,fees_rate,inroad,exit_oriented,management_takeover,seriousness,agnostic,...,investor_Northern America,investor_Northern_Europe,investor_South_America,investor_South_Eastern_Asia,investor_Southern_Africa,investor_Southern_Asia,investor_Southern_Europe,investor_Western_Africa,investor_Western Asia,investor_Western_Europe
0,1,Greencoat Capital,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Greencoat Capital,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,Greencoat Capital,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,Daniel Vogel,1,0,0.02,4.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,7,AL Capital Holding GmbH & Co. KG,1,0,0.02,0.0,1.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## One-hot encoding of investment location

In [54]:
# One-hot encode the region of investment and collapse to one row per investor.
# This cell rebinds encoded_df, so re-running it requires re-running the
# previous (investor location) cell first.

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the new keyword first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

investor_encoded2 = ohe.fit_transform(investor_df[['region_of_investment']])

# Output column names, in the same order as the encoder's sorted categories
# (ohe.categories_[0]). The assignment below is positional, so this list must
# be kept in step with the regions present in the data.
investment_region_columns = [
    'investment_Australia_and_New_Zealand',
    'investment_Caribbean',
    'investment_Central_America',
    'investment_Central_Asia',
    'investment_Eastern_Africa',
    'investment_Eastern_Asia',
    'investment_Eastern_Europe',
    'investment_Latin_America_and_the_Caribbean',
    'investment_Melanesia',
    'investment_Micronesia',
    'investment_Middle_Africa',
    'investment_Northern_Africa',
    'investment_Northern_America',
    'investment_Northern_Europe',
    'investment_Polynesia',
    'investment_South_America',
    'investment_South-eastern_Asia',
    'investment_Southern_Africa',
    'investment_Southern_Asia',
    'investment_Southern_Europe',
    'investment_Western_Africa',
    'investment_Western_Asia',
    'investment_Western_Europe',
]

# Fail with a readable message (instead of an unpacking error) when the data
# contains a different number of regions than there are column names.
if investor_encoded2.shape[1] != len(investment_region_columns):
    raise ValueError(
        f"Expected {len(investment_region_columns)} investment regions but the "
        f"encoder found {investor_encoded2.shape[1]}: {list(ohe.categories_[0])}"
    )

for column_name, indicator in zip(investment_region_columns, investor_encoded2.T):
    encoded_df[column_name] = indicator

# Collapse to one row per investor id. Taking the max of each 0/1 indicator
# marks a region as 1 if any of the investor's rows had it. The string region
# columns are dropped first because they are not needed after encoding.
# NOTE(review): max() is applied to every remaining column, not just the
# indicators; if an id has rows that differ in other attributes, the result
# mixes values from different rows - confirm this is intended.
encoded_df = (
    encoded_df
    .drop(columns=['region_of_investment', 'region_of_investor'])
    .groupby('id')
    .max()
    .reset_index()
)

encoded_df.head()

Unnamed: 0,id,name,company_category_id,company_state_id,fees_rate,inroad,exit_oriented,management_takeover,seriousness,agnostic,...,investment_Northern_Europe,investment_Polynesia,investment_South_America,investment_South-eastern_Asia,investment_Southern_Africa,investment_Southern_Asia,investment_Southern_Europe,investment_Western_Africa,investment_Western_Asia,investment_Western_Europe
0,1,Greencoat Capital,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,5,Daniel Vogel,1,0,0.02,4.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,7,AL Capital Holding GmbH & Co. KG,1,0,0.02,0.0,1.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,28,Piccard Capital Partner,1,0,0.02,0.0,1.0,0.0,2.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4,30,Quandriga GmbH,1,0,,0.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
