In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
from scipy import stats
from IPython.core.interactiveshell import InteractiveShell

In [59]:
# Libraries Settings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=Warning)
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
np.set_printoptions(suppress=True)

def set_seed(seed=42):
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    
pd.set_option("display.width", 100)
pd.set_option("display.max_columns", 60)
pd.set_option("display.max_rows", 25)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

In [60]:
df_deals = pd.read_excel ('data/Deals (investment).xlsx')
df_deals.head()

Unnamed: 0,1_post_date,2_post_title,3_amount,4_Stake,5_funding_round,6_investors,7_source,Country/Town,Country(HQ),Category,main_sector,Check vs Companies,"Deals Information, Level of Completeness",duplicated_conc,Duplication,Disclosed,Founded,Total Disclosed Funding,DA Classification_African Company (Yes = 0; No = 1),"funding round, DA","Industry, DA",Year,Month,Quarter,Half,Number of Investors,Investor 1,Investor 2,Investor 3,Investor 4,Investor 5,Investor 6,Investor 7,Investor 8,Investor 9,Investor 10,Investor 11,Investor 12,Investor 13,Investor 14,Investor 15,Cummulative Deals (Disclosed),Cummulative Deals (Undisclosed),Cummulative Amount,Investor 1.1,Investor 2.1,Investor 3.1,Investor 4.1,Investor 5.1,Investor 6.1,Investor 7.1,Investor 8.1,Investor 9.1,Investor 10.1,Investor 11.1,Investor 12.1,Investor 13.1,Investor 14.1,Investor 15.1
0,2008-09-01,biNu,600000,,Seed,Artesian VC,https://www.crunchbase.com/search/funding_roun...,,,,,,0.5,,,,,,,,,2008.0,9.0,3.0,2.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,#REF!,,,,,,,,,,,,,,
1,2008-11-01,biNu,400000,,Angel,Undisclosed,https://www.crunchbase.com/search/funding_roun...,Australia,Australia,Mobile Internet,Information Technology,biNu,0.9,,,1.0,2008.0,14220000.0,1.0,2. Seed,Other Technologies & Information Technology,2008.0,11.0,4.0,2.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,#REF!,,,,,,,,,,,,,,
2,2009-12-03,AllLife,6000000,,Private Equity,LeapFrog Investments,http://www.prnewswire.co.uk/news-releases/leap...,South Africa,South Africa,"Insurance, InsurTech",Financial Services,AllLife,0.9,,,1.0,2004.0,12700000.0,0.0,6. Private Equity,Financial Services,2009.0,12.0,4.0,2.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,#REF!,,,,,,,,,,,,,,
3,2009-12-11,Bridge International Academies,1800000,,Grant,Omidyar Network,http://foundationcenter.org/pnd/news/story.jht...,,,,,,0.643,,,1.0,,27800000.0,,1. Grant,,2009.0,12.0,4.0,2.0,1.0,Omidyar Network,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,Omidyar Network,,,,,,,,,,,,,,
4,2010-04-01,biNu,320000,,Angel,Undisclosed,https://www.crunchbase.com/search/funding_roun...,,,,,,0.5,,,,,,,,,2010.0,4.0,2.0,1.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,#REF!,,,,,,,,,,,,,,


In [61]:
def check_df(dataframe):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(5))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

check_df(df_deals)

##################### Shape #####################
(2059, 59)
##################### Types #####################
1_post_date        datetime64[ns]
2_post_title               object
3_amount                   object
4_Stake                   float64
5_funding_round            object
                        ...      
Investor 11.1              object
Investor 12.1              object
Investor 13.1              object
Investor 14.1              object
Investor 15.1             float64
Length: 59, dtype: object
##################### Head #####################
  1_post_date                    2_post_title 3_amount  4_Stake 5_funding_round  \
0  2008-09-01                            biNu   600000      NaN            Seed   
1  2008-11-01                            biNu   400000      NaN           Angel   
2  2009-12-03                         AllLife  6000000      NaN  Private Equity   
3  2009-12-11  Bridge International Academies  1800000      NaN           Grant   
4  2010-04-01            

In [62]:
cols = df_deals.columns

In [63]:
cols

Index(['1_post_date', '2_post_title', '3_amount', '4_Stake', '5_funding_round', '6_investors',
       '7_source', 'Country/Town', 'Country(HQ)', 'Category', 'main_sector', 'Check vs Companies',
       'Deals Information, Level of Completeness', 'duplicated_conc', 'Duplication', 'Disclosed ',
       'Founded', 'Total  Disclosed Funding',
       'DA Classification_African Company (Yes = 0; No = 1)', 'funding round, DA', 'Industry, DA',
       'Year', 'Month', 'Quarter', 'Half', 'Number of Investors', 'Investor 1', 'Investor 2',
       'Investor 3', 'Investor 4', 'Investor 5', 'Investor 6', 'Investor 7', 'Investor 8',
       'Investor 9', 'Investor 10', 'Investor 11', 'Investor 12', 'Investor 13', 'Investor 14',
       'Investor 15', 'Cummulative Deals (Disclosed)', 'Cummulative Deals (Undisclosed)',
       'Cummulative Amount', 'Investor 1.1', 'Investor 2.1', 'Investor 3.1', 'Investor 4.1',
       'Investor 5.1', 'Investor 6.1', 'Investor 7.1', 'Investor 8.1', 'Investor 9.1',
       'Inv

In [64]:
for cat_col in cols:
    print(f"{cat_col}: {df_deals[cat_col].nunique()} uniqueness variable")

1_post_date: 695 uniqueness variable
2_post_title: 800 uniqueness variable
3_amount: 281 uniqueness variable
4_Stake: 7 uniqueness variable
5_funding_round: 27 uniqueness variable
6_investors: 717 uniqueness variable
7_source: 1070 uniqueness variable
Country/Town: 34 uniqueness variable
Country(HQ): 34 uniqueness variable
Category: 660 uniqueness variable
main_sector: 48 uniqueness variable
Check vs Companies: 727 uniqueness variable
Deals Information, Level of Completeness: 6 uniqueness variable
duplicated_conc: 1120 uniqueness variable
Duplication: 1 uniqueness variable
Disclosed : 2 uniqueness variable
Founded: 23 uniqueness variable
Total  Disclosed Funding: 280 uniqueness variable
DA Classification_African Company (Yes = 0; No = 1): 2 uniqueness variable
funding round, DA: 8 uniqueness variable
Industry, DA: 13 uniqueness variable
Year: 14 uniqueness variable
Month: 12 uniqueness variable
Quarter: 4 uniqueness variable
Half: 2 uniqueness variable
Number of Investors: 12 uniquenes

In [65]:
df_deals = df_deals.drop(['Investor 13.1','Investor 14.1','Investor 15.1','Investor 15','Duplication'], axis=1)

In [66]:
df_deals.shape

(2059, 54)

In [67]:
df_deals.head(20)

Unnamed: 0,1_post_date,2_post_title,3_amount,4_Stake,5_funding_round,6_investors,7_source,Country/Town,Country(HQ),Category,main_sector,Check vs Companies,"Deals Information, Level of Completeness",duplicated_conc,Disclosed,Founded,Total Disclosed Funding,DA Classification_African Company (Yes = 0; No = 1),"funding round, DA","Industry, DA",Year,Month,Quarter,Half,Number of Investors,Investor 1,Investor 2,Investor 3,Investor 4,Investor 5,Investor 6,Investor 7,Investor 8,Investor 9,Investor 10,Investor 11,Investor 12,Investor 13,Investor 14,Cummulative Deals (Disclosed),Cummulative Deals (Undisclosed),Cummulative Amount,Investor 1.1,Investor 2.1,Investor 3.1,Investor 4.1,Investor 5.1,Investor 6.1,Investor 7.1,Investor 8.1,Investor 9.1,Investor 10.1,Investor 11.1,Investor 12.1
0,2008-09-01,biNu,600000,,Seed,Artesian VC,https://www.crunchbase.com/search/funding_roun...,,,,,,0.5,,,,,,,,2008.0,9.0,3.0,2.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,#REF!,,,,,,,,,,,
1,2008-11-01,biNu,400000,,Angel,Undisclosed,https://www.crunchbase.com/search/funding_roun...,Australia,Australia,Mobile Internet,Information Technology,biNu,0.9,,1.0,2008.0,14220000.0,1.0,2. Seed,Other Technologies & Information Technology,2008.0,11.0,4.0,2.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,#REF!,,,,,,,,,,,
2,2009-12-03,AllLife,6000000,,Private Equity,LeapFrog Investments,http://www.prnewswire.co.uk/news-releases/leap...,South Africa,South Africa,"Insurance, InsurTech",Financial Services,AllLife,0.9,,1.0,2004.0,12700000.0,0.0,6. Private Equity,Financial Services,2009.0,12.0,4.0,2.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,#REF!,,,,,,,,,,,
3,2009-12-11,Bridge International Academies,1800000,,Grant,Omidyar Network,http://foundationcenter.org/pnd/news/story.jht...,,,,,,0.643,,1.0,,27800000.0,,1. Grant,,2009.0,12.0,4.0,2.0,1.0,Omidyar Network,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Omidyar Network,,,,,,,,,,,
4,2010-04-01,biNu,320000,,Angel,Undisclosed,https://www.crunchbase.com/search/funding_roun...,,,,,,0.5,,,,,,,,2010.0,4.0,2.0,1.0,1.0,#REF!,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,#REF!,,,,,,,,,,,
5,2010-08-20,Kopo Kopo,75000,,Seed,First Light Ventures,https://www.crunchbase.com/organization/kopo-k...,Kenya,Kenya,"Enterprise Software, Financial Services, Infor...",Financial Services,Kopo Kopo,0.9,40410Kopo Kopo75000,1.0,2011.0,6504900.0,0.0,2. Seed,Financial Services,2010.0,8.0,3.0,2.0,1.0,First Light Ventures,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,First Light Ventures,,,,,,,,,,,
6,2010-09-30,Paga,700000,,Seed,Tim Draper,https://www.crunchbase.com/funding_round/paga-...,Nigeria,Nigeria,"Money Transfer, Airtime, Bill Payments, and SM...",Financial Services,Paga,0.9,40451Paga700000,1.0,2009.0,32700000.0,0.0,2. Seed,Financial Services,2010.0,9.0,3.0,2.0,1.0,First Light Ventures,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,First Light Ventures,,,,,,,,,,,
7,2010-12-01,Wuzzuf,150000,,Grant,Undisclosed,https://www.crunchbase.com/funding_round/wuzzu...,Egypt,Egypt,"Information Services, Information Technology, ...",Recruitment,Wuzzuf,0.9,40513Wuzzuf150000,1.0,2009.0,7850000.0,0.0,1. Grant,"Employment, Talent & Labour",2010.0,12.0,4.0,2.0,1.0,First Light Ventures,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,First Light Ventures,,,,,,,,,,,
8,2011-05-01,African Leadership Academy,1500000,,Grant,Omidyar Network,https://www.crunchbase.com/funding_round/afric...,South Africa,South Africa,"E-Learning, Education, Tutoring",Education,African Leadership Academy,0.9,40664African Leadership Academy1500000,1.0,2008.0,55800000.0,0.0,1. Grant,Education,2011.0,5.0,2.0,1.0,1.0,First Light Ventures,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,First Light Ventures,,,,,,,,,,,
9,2011-10-01,M-KOPA,Undisclosed,,Series A,Undisclosed,https://www.crunchbase.com/funding_round/m-kop...,Kenya,Kenya,"Energy, Financial Services, Mobile",Energy & Environment Resources,M-Kopa,0.9,40817M-KOPAUndisclosed,0.0,2011.0,161800000.0,0.0,3. Early Venture,,2011.0,10.0,4.0,2.0,1.0,First Light Ventures,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,First Light Ventures,,,,,,,,,,,


In [70]:
#find duplicate rows across all columns
duplicateRows = df_deals[df_deals.duplicated()]

In [71]:
duplicateRows

Unnamed: 0,1_post_date,2_post_title,3_amount,4_Stake,5_funding_round,6_investors,7_source,Country/Town,Country(HQ),Category,main_sector,Check vs Companies,"Deals Information, Level of Completeness",duplicated_conc,Disclosed,Founded,Total Disclosed Funding,DA Classification_African Company (Yes = 0; No = 1),"funding round, DA","Industry, DA",Year,Month,Quarter,Half,Number of Investors,Investor 1,Investor 2,Investor 3,Investor 4,Investor 5,Investor 6,Investor 7,Investor 8,Investor 9,Investor 10,Investor 11,Investor 12,Investor 13,Investor 14,Cummulative Deals (Disclosed),Cummulative Deals (Undisclosed),Cummulative Amount,Investor 1.1,Investor 2.1,Investor 3.1,Investor 4.1,Investor 5.1,Investor 6.1,Investor 7.1,Investor 8.1,Investor 9.1,Investor 10.1,Investor 11.1,Investor 12.1
489,2018-04-19,Shezlong,350000,,Seed,"500 Startups, Endure Capital, HIMangel",https://www.menabytes.com/shezlong-350k/,Egypt,Egypt,"Health Care, Medical, mHealth",Healthcare & Pharma,Shezlong,0.900,43209Shezlong350000,1.000,2015.000,950000.000,0.000,2. Seed,Health & Pharmaceuticals,2018.000,4.000,2.000,1.000,3.000,Seedstars,Endure Capital,HIMangel,0,0,0,0,0,0,0,0,0,0,0,,,,Seedstars,Endure Capital,HIMangel,,,,,,,,,
558,2018-06-05,Onesha,50000,,Seed,Pangea Accelerator,http://disrupt-africa.com/2018/06/pangea-inves...,Kenya,Kenya,"Architecture, Brand Marketing, Digital Marketi...",Media & Entertainment,Onesha,0.900,43256Onesha50000,1.000,2016.000,100000.000,0.000,2. Seed,Media & Entertainment,2018.000,6.000,2.000,1.000,1.000,Seedstars,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Seedstars,,,,,,,,,,,
1279,NaT,,,,,,,,,,,,,,,,,,,,1900.000,1.000,1.000,1.000,1.000,Johnson & Johnson,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Johnson & Johnson,,,,,,,,,,,
1280,NaT,,,,,,,,,,,,,,,,,,,,1900.000,1.000,1.000,1.000,1.000,Johnson & Johnson,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Johnson & Johnson,,,,,,,,,,,
1281,NaT,,,,,,,,,,,,,,,,,,,,1900.000,1.000,1.000,1.000,1.000,Johnson & Johnson,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Johnson & Johnson,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,NaT,,,,,,,,,,,,,,,,,,,,,,,,,Johnson & Johnson,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Johnson & Johnson,,,,,,,,,,,
2055,NaT,,,,,,,,,,,,,,,,,,,,,,,,,Johnson & Johnson,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Johnson & Johnson,,,,,,,,,,,
2056,NaT,,,,,,,,,,,,,,,,,,,,,,,,,Johnson & Johnson,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Johnson & Johnson,,,,,,,,,,,
2057,NaT,,,,,,,,,,,,,,,,,,,,,,,,,Johnson & Johnson,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,Johnson & Johnson,,,,,,,,,,,


In [72]:
#drop duplicate rows unique deals
Unique_deals = df_deals.drop_duplicates()

In [73]:
No_deals = Unique_deals.value_counts()

In [74]:
No_deals

Series([], dtype: int64)