In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
from scipy import stats
from IPython.core.interactiveshell import InteractiveShell

In [2]:
# Libraries Settings
# Silence FutureWarning, then every Warning subclass. FutureWarning derives
# from Warning, so the second filter already covers the first.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=Warning)
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

def set_seed(seed=42):
    """Seed NumPy's global random generator and export ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int, default 42
        Passed to ``np.random.seed`` and stored, as a string, in the
        ``PYTHONHASHSEED`` environment variable.

    Notes
    -----
    Python reads ``PYTHONHASHSEED`` when the interpreter starts, so assigning
    it here does not change string hashing in the already-running kernel; it
    is only inherited by processes spawned afterwards.
    """
    # Local import: ``os`` is not among the notebook's top-level imports, so
    # the original body raised NameError as soon as the function was called.
    import os

    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    
# Pandas display preferences, applied one (option, value) pair at a time:
# output width, column and row display limits, and a three-decimal float format.
for _option_name, _option_value in (
    ("display.width", 100),
    ("display.max_columns", 60),
    ("display.max_rows", 25),
    ("display.float_format", lambda x: "%.3f" % x),
):
    pd.set_option(_option_name, _option_value)

In [6]:
# Read the companies workbook into a DataFrame, then preview its first rows.
df = pd.read_excel(io="data/Companies.xlsx")
df.head(n=5)

Unnamed: 0,1_Company_name,2_Company_description,3_Status,4_Year_founded,5_Headquarters,6_Other_offices,7.1_Country HQ,7.2_Assigned_Country (Africa),8_Countries_of_operation_(Business Model/Market),9_Country_Parent Incorporation,10_Total_Venture_Funding (Disclosed),10.1_Funding Rounds (Named),10.2_Number of Venture Funding rounds (#),11_Investors,12_Company_Valuation,13_Company_Valuation_Date,14_Last_funding_round_raised_date,15_Last_funding_round_raised_type,16_Last funding round raised amount,17_Largest_round,18_Amount,19_Founders,20_Female_Co-Founder (Yes = 1; No = 0),21_Main_sector,22_Categories,23_Business_Model,24_Accelerators_Attended,25_Number_of_employees,26_Website,27_Linkedin,...,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113,Unnamed: 114,Unnamed: 115,Average time of rounds(days),Year of recieving Series A
0,Jumia Group,Jumia is connecting consumers and businesses a...,Active,2012.0,"Lagos, Nigeria","Nigeria, Egypt, Morocco, Kenya, Ivory Coast, S...",Nigeria,Nigeria,"Nigeria, Egypt, Morocco, Kenya, Ivory Coast, S...",,,,,,,,,Series C,409806000,Series C,409806000,"Jeremy Hodara, Sacha Poignonnec",,E-Commerce & Retail,"E-Commerce, Internet, Shopping Online",,,1001 - 5000,https://group.jumia.com/,https://www.linkedin.com/company/jumia-group/,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,529.0,2012.0
1,Andela,Andela invests in Africa’s most talented softw...,Active,2014.0,"New York, united States","Nigeria, Kenya, Uganda, Rwanda",United States,Nigeria,"Nigeria, Kenya, Uganda, Rwanda",,,,,"Chan Zuckerberg Initiative, CRE Venture Capita...",,,2019-01-23 00:00:00,Series D,100000000,Series D,100000000,"Brice Steven Nkengsa, Christina Sass, Ian Carn...",1.0,Commercial & Professional Services,"Edtech, BPO, Recruitment, Software, Training",B2B,Extreme Accelerator,500+,https://andela.com,https://www.linkedin.com/company/andela/,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,395.0,2015.0
2,Takealot,takealot.com is one of SA's largest online ret...,Acquired,2011.0,"Cape Town, South Africa",,South Africa,South Africa,South Africa,,,,,,,,,Venture Round,69000000,Venture Round,100000000,Kim Reid,,E-Commerce & Retail,"E-Commerce, Logistics, Retail",,,1001-5000,https://www.takealot.com/,https://www.linkedin.com/company/takealot/,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1054.0,
3,Zola Electric,ZOLA Electric (formerly Off Grid Electric) is ...,Active,2011.0,"Arusha, Tanzania",,Tanzania,Tanzania,Tanzania,,,,,"GE Ventures, Helios Investment Partners, Omidy...",,,2018-07-26 00:00:00,Debt Financing,20000000,Series D,55000000,"Erica Mackey, Joshua Pierce, Xavier Helgesen",1.0,Energy & Environment Resources,"Electrical Distribution, Energy, Renewable Ene...",B2B/B2C,-,1001-5000,http://offgrid-electric.com/,https://www.linkedin.com/company/zolaelectric/,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,d.light,d.light is a global leader in delivering affor...,Active,2007.0,"Nairobi, Kenya",,Kenya,Kenya,Kenya,,,,,,,,,Venture Round,41000000,Debt Financing,50000000,"Jacob Okoth, Adrian Bock",,Energy & Environment Resources,"Energy, Renewable Energy, Solar",,,201-500,http://www.dlight.com,https://www.linkedin.com/company/d-light-desig...,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,195.0,


In [7]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(3084, 118)

##### Data cleaning

In [9]:
# Replace every cell equal to the string "?" with NaN so pandas counts it as
# missing. The result is reassigned instead of using inplace=True, which keeps
# the statement chainable and avoids mutating the frame behind the reader's back.
df = df.replace("?", np.nan)

In [10]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    Sections are printed in this order: shape, column dtypes, the first
    ``head`` rows, per-column null counts, and selected quantiles of the
    numeric columns (transposed so each column is one row).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, default 5
        Number of leading rows shown in the "Head" section.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # numeric_only=True restricts the computation to numeric columns. Since
    # pandas 2.0 the default is False, which makes quantile() raise TypeError
    # as soon as the frame holds object (string) columns.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

# Print the structural overview (shape, dtypes, head, nulls, quantiles) of df.
check_df(df)

##################### Shape #####################
(3084, 118)
##################### Types #####################
1_Company_name                   object
2_Company_description            object
3_Status                         object
4_Year_founded                  float64
5_Headquarters                   object
                                 ...   
Unnamed: 113                    float64
Unnamed: 114                    float64
Unnamed: 115                    float64
Average time of rounds(days)    float64
Year of recieving Series A      float64
Length: 118, dtype: object
##################### Head #####################
   1_Company_name                              2_Company_description  3_Status  4_Year_founded  \
0     Jumia Group  Jumia is connecting consumers and businesses a...    Active        2012.000   
1          Andela  Andela invests in Africa’s most talented softw...    Active        2014.000   
2        Takealot  takealot.com is one of SA's largest online ret...  Acquired

In [12]:
# Keep a reference to the frame's column labels (a pandas Index).
cols = df.columns

In [13]:
# Report, for each column, how many distinct non-null values it holds.
for column_label in cols:
    distinct_count = df[column_label].nunique()
    print(f"{column_label}: {distinct_count} uniqueness variable")

1_Company_name: 3084 uniqueness variable
2_Company_description: 2829 uniqueness variable
3_Status: 5 uniqueness variable
4_Year_founded: 37 uniqueness variable
5_Headquarters: 423 uniqueness variable
6_Other_offices: 54 uniqueness variable
7.1_Country HQ: 58 uniqueness variable
7.2_Assigned_Country (Africa): 33 uniqueness variable
8_Countries_of_operation_(Business Model/Market): 147 uniqueness variable
9_Country_Parent Incorporation: 15 uniqueness variable
10_Total_Venture_Funding (Disclosed): 9 uniqueness variable
10.1_Funding Rounds (Named): 3 uniqueness variable
10.2_Number of Venture Funding rounds (#): 4 uniqueness variable
11_Investors: 157 uniqueness variable
12_Company_Valuation: 0 uniqueness variable
13_Company_Valuation_Date: 0 uniqueness variable
14_Last_funding_round_raised_date: 107 uniqueness variable
15_Last_funding_round_raised_type: 43 uniqueness variable
16_Last funding round raised amount: 198 uniqueness variable
17_Largest_round: 30 uniqueness variable
18_Amount: 1

In [23]:
# Materialise the column labels as a plain Python list.
my_list = df.columns.tolist()

In [24]:
# Display the list of column labels.
my_list

['1_Company_name',
 '2_Company_description',
 '3_Status',
 '4_Year_founded',
 '5_Headquarters',
 '6_Other_offices',
 '7.1_Country HQ',
 '7.2_Assigned_Country (Africa)',
 '8_Countries_of_operation_(Business Model/Market)',
 '9_Country_Parent Incorporation',
 '10_Total_Venture_Funding (Disclosed)',
 '10.1_Funding Rounds (Named)',
 '10.2_Number of Venture Funding rounds (#)',
 '11_Investors',
 '12_Company_Valuation',
 '13_Company_Valuation_Date',
 '14_Last_funding_round_raised_date',
 '15_Last_funding_round_raised_type',
 '16_Last funding round raised amount',
 '17_Largest_round',
 '18_Amount',
 '19_Founders',
 '20_Female_Co-Founder (Yes = 1; No  = 0)',
 '21_Main_sector',
 '22_Categories',
 '23_Business_Model',
 '24_Accelerators_Attended',
 '25_Number_of_employees',
 '26_Website',
 '27_Linkedin',
 '28_Twitter',
 '29_Facebook',
 'Companies Information, Level of Completeness',
 'Duplication',
 'Companies with a Deal Check',
 'Main Sector / Industry Check',
 'DA Acquired or Exited or Dead',


In [25]:
# Drop the funding-summary, per-year, placeholder "Unnamed: N" and trailing
# helper columns. The year labels are integers and the "Unnamed" labels are
# generated from their numeric ranges (66-80 and 82-115) rather than listed
# one by one.
df = df.drop(
    columns=[
        "Total Number of Funding Rounds",
        "Total Number of Venture Funding Rounds",
        "Total Funding todate (disclosed)",
        "Total Venture funding todate (disclosed)",
        "Acquired",
        "1. Grant",
        "2. Seed",
        "3. Early Venture",
        "4. Late Venture",
        "5. Debt Financing",
        "6.Private Equity",
        "7. Offerings",
        "8. Exits",
        *range(2010, 2020),
        "Investors",
        *(f"Unnamed: {number}" for number in range(66, 81)),
        "Duplication.1",
        *(f"Unnamed: {number}" for number in range(82, 116)),
        "Average time of rounds(days)",
        "Year of recieving Series A ",  # trailing space is part of the label
    ]
)

In [26]:
# Display the column labels currently present in df.
df.columns

Index(['1_Company_name', '2_Company_description', '3_Status', '4_Year_founded', '5_Headquarters',
       '6_Other_offices', '7.1_Country HQ', '7.2_Assigned_Country (Africa)',
       '8_Countries_of_operation_(Business Model/Market)', '9_Country_Parent Incorporation',
       '10_Total_Venture_Funding (Disclosed)', '10.1_Funding Rounds (Named)',
       '10.2_Number of Venture Funding rounds (#)', '11_Investors', '12_Company_Valuation',
       '13_Company_Valuation_Date', '14_Last_funding_round_raised_date',
       '15_Last_funding_round_raised_type', '16_Last funding round raised amount',
       '17_Largest_round', '18_Amount', '19_Founders', '20_Female_Co-Founder (Yes = 1; No  = 0)',
       '21_Main_sector', '22_Categories', '23_Business_Model', '24_Accelerators_Attended',
       '25_Number_of_employees', '26_Website', '27_Linkedin', '28_Twitter', '29_Facebook',
       'Companies Information, Level of Completeness', 'Duplication',
       'Companies with a Deal Check', 'Main Sector / Ind