# Problem definition

Analyzing funding received by start-ups in in different sectors in
India from 2018 to 2021

# Importations

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Data

In [3]:
data_2018 = pd.read_csv('startup_funding2018.csv')
data_2019 = pd.read_csv('startup_funding2019.csv') 
data_2020 = pd.read_csv('startup_funding2020.csv') 
data_2021 = pd.read_csv('startup_funding2021.csv')

# Dataset overview

### Inspecting 2018 data

In [4]:
data_2018.head()

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...


Checking Data types

In [5]:
data_2018.info() # Checking the size of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   526 non-null    object
 1   Industry       526 non-null    object
 2   Round/Series   526 non-null    object
 3   Amount         526 non-null    object
 4   Location       526 non-null    object
 5   About Company  526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


In [6]:
data_2018["Amount"] # Checking the Amount column

0           250000
1      ₹40,000,000
2      ₹65,000,000
3          2000000
4                —
          ...     
521      225000000
522              —
523           7500
524    ₹35,000,000
525       35000000
Name: Amount, Length: 526, dtype: object

### Dropping the ₹ sign 

In [7]:
data_2018["Amount"] = data_2018.Amount.apply(lambda x:str(x).replace("₹", ""))
data_2018["Amount"]

0          250000
1      40,000,000
2      65,000,000
3         2000000
4               —
          ...    
521     225000000
522             —
523          7500
524    35,000,000
525      35000000
Name: Amount, Length: 526, dtype: object

In [9]:
data_2018.isna().sum() # Checking for missing values

Company Name     0
Industry         0
Round/Series     0
Amount           0
Location         0
About Company    0
dtype: int64

### Inspecting 2019 dataset

In [10]:
data_2019.head()

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",


In [11]:
data_2019.info() # Checking the dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  89 non-null     object 
 1   Founded        60 non-null     float64
 2   HeadQuarter    70 non-null     object 
 3   Sector         84 non-null     object 
 4   What it does   89 non-null     object 
 5   Founders       86 non-null     object 
 6   Investor       89 non-null     object 
 7   Amount($)      89 non-null     object 
 8   Stage          43 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.4+ KB


In [12]:
data_2019.isna().sum()

Company/Brand     0
Founded          29
HeadQuarter      19
Sector            5
What it does      0
Founders          3
Investor          0
Amount($)         0
Stage            46
dtype: int64

In [13]:
data_2020.head()

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage,Unnamed: 9
0,Aqgromalin,2019,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,"$200,000",,
1,Krayonnz,2019,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,"$100,000",Pre-seed,
2,PadCare Labs,2018,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,Undisclosed,Pre-seed,
3,NCOME,2020,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital","$400,000",,
4,Gramophone,2016,Indore,AgriTech,Gramophone is an AgTech platform enabling acce...,"Ashish Rajan Singh, Harshit Gupta, Nishant Mah...","Siana Capital Management, Info Edge","$340,000",,


In [14]:
data_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company/Brand  1055 non-null   object
 1   Founded        843 non-null    object
 2   HeadQuarter    961 non-null    object
 3   Sector         1042 non-null   object
 4   What it does   1055 non-null   object
 5   Founders       1043 non-null   object
 6   Investor       1017 non-null   object
 7   Amount($)      1052 non-null   object
 8   Stage          591 non-null    object
 9   Unnamed: 9     2 non-null      object
dtypes: object(10)
memory usage: 82.5+ KB


In [15]:
data_2021.head()

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",
2,Lead School,2012.0,Mumbai,EdTech,LEAD School offers technology based school tra...,"Smita Deorah, Sumeet Mehta","GSV Ventures, Westbridge Capital","$30,000,000",Series D
3,Bizongo,2015.0,Mumbai,B2B E-commerce,Bizongo is a business-to-business online marke...,"Aniket Deb, Ankit Tomar, Sachin Agrawal","CDC Group, IDG Capital","$51,000,000",Series C
4,FypMoney,2021.0,Gurugram,FinTech,"FypMoney is Digital NEO Bank for Teenagers, em...",Kapil Banwari,"Liberatha Kallat, Mukesh Yadav, Dinesh Nagpal","$2,000,000",Seed


In [16]:
data_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  1209 non-null   object 
 1   Founded        1208 non-null   float64
 2   HeadQuarter    1208 non-null   object 
 3   Sector         1209 non-null   object 
 4   What it does   1209 non-null   object 
 5   Founders       1205 non-null   object 
 6   Investor       1147 non-null   object 
 7   Amount($)      1206 non-null   object 
 8   Stage          781 non-null    object 
dtypes: float64(1), object(8)
memory usage: 85.1+ KB


Creating new columns["Founded","Founders","Stage", "Investor"] for 2018 dataset to match 2019, 2020, and 2021 datasets

In [49]:
#using 'df["New_column_name"] = pd.Series([Values])'
new_2018 = data_2018["Founded"] = pd.Series([0, 0, 0, 0])
new_2018 = data_2018["Stage"] = pd.Series(["0","0","0","0"])
new_2018 = data_2018["Founders"] = pd.Series(["0","0","0","0"])
new_2018 = data_2018["Investor"] = pd.Series(["0","0","0","0"])
data_2018.head(50)

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company,Founded,Stage,Founders,Investor
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",0.0,0.0,0.0,0.0
1,Happy Cow Dairy,"Agriculture, Farming",Seed,40000000,"Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,0.0,0.0,0.0,0.0
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,65000000,"Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,0.0,0.0,0.0,0.0
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,0.0,0.0,0.0,0.0
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,,,,
5,Hasura,"Cloud Infrastructure, PaaS, SaaS",Seed,1600000,"Bengaluru, Karnataka, India",Hasura is a platform that allows developers to...,,,,
6,Tripshelf,"Internet, Leisure, Marketplace",Seed,16000000,"Kalkaji, Delhi, India",Tripshelf is an online market place for holida...,,,,
7,Hyperdata.IO,Market Research,Angel,50000000,"Hyderabad, Andhra Pradesh, India",Hyperdata combines advanced machine learning w...,,,,
8,Freightwalla,"Information Services, Information Technology",Seed,—,"Mumbai, Maharashtra, India",Freightwalla is an international forwarder tha...,,,,
9,Microchip Payments,Mobile Payments,Seed,—,"Bangalore, Karnataka, India",Microchip payments is a mobile-based payment a...,,,,


In [24]:
#fill in NaN values with zero
new_2018 = data_2018.fillna(0)
new_2018

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company,Founded,Stage,Founders,Investor
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",0.0,0,0,0
1,Happy Cow Dairy,"Agriculture, Farming",Seed,40000000,"Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,0.0,0,0,0
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,65000000,"Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,0.0,0,0,0
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,0.0,0,0,0
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
521,Udaan,"B2B, Business Development, Internet, Marketplace",Series C,225000000,"Bangalore, Karnataka, India","Udaan is a B2B trade platform, designed specif...",0.0,0,0,0
522,Happyeasygo Group,"Tourism, Travel",Series A,—,"Haryana, Haryana, India",HappyEasyGo is an online travel domain.,0.0,0,0,0
523,Mombay,"Food and Beverage, Food Delivery, Internet",Seed,7500,"Mumbai, Maharashtra, India",Mombay is a unique opportunity for housewives ...,0.0,0,0,0
524,Droni Tech,Information Technology,Seed,35000000,"Mumbai, Maharashtra, India",Droni Tech manufacture UAVs and develop softwa...,0.0,0,0,0


In [None]:
#adding Column using assign() method
#df.assign(new_column_name=value).head()

In [None]:
# adding column using insert() method
data_2018.insert(loc=1, column='Founded', value='0', allow_duplicates=True)
data_2018.insert(loc=5, column='Founders', value='0', allow_duplicates=True)
data_2018.insert(loc=8, column='Stage', value='0', allow_duplicates=True)
data_2018.insert(loc=6, column='Investor', value='0', allow_duplicate=True)

In [25]:
new_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company Name   526 non-null    object 
 1   Industry       526 non-null    object 
 2   Round/Series   526 non-null    object 
 3   Amount         526 non-null    object 
 4   Location       526 non-null    object 
 5   About Company  526 non-null    object 
 6   Founded        526 non-null    float64
 7   Stage          526 non-null    object 
 8   Founders       526 non-null    object 
 9   Investor       526 non-null    object 
dtypes: float64(1), object(9)
memory usage: 41.2+ KB


In [38]:
# Inspecting the values in "Amount" column
new_2018['Amount']

0          250000
1      40,000,000
2      65,000,000
3         2000000
4               —
          ...    
521     225000000
522             —
523          7500
524    35,000,000
525      35000000
Name: Amount, Length: 526, dtype: object

In [36]:
# Convert Amount value from integer to float

In [None]:
#new_2018['Amount']=new_2018['Amount'].astype(float)

#new_2018['Amount']=new_2018['Amount'].to_numeric(new_2018['Amount'], downcast='float')

In [48]:
#new_2018[new_2018.columns[3:]] = new_2018[new_2018.columns[3:]].replace('[\$,]', '', regex=True).astype(float)

In [None]:
#import babel.numbers
#import decimal
#babel.numbers.format_currency( decimal.Decimal( "188518982.18" ), "GBP" )

# Hypotheses

Null Hypothesis, H0: 
    There is no relationship between Amount sourced and Industry
    
Alternative Hypothesis, H1:
    There is a relationship between Amount sourced and Industry

# Questions

1. Which Industry received the highest funding?
2. Is there a relationship between the Age of Company and Amount of funding sourced?
3. Which investors give the highest funding?
4. Is location a factor in sourcing funding?
5. Which type of funding is mostly accessed?