In [1]:
# Suppress warnings
# NOTE(review): filterwarnings('ignore') without a category silences every warning
# for the rest of the notebook — consider narrowing it to the specific categories
# that are known to be noise.

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import the numpy and pandas packages

import numpy as np
import pandas as pd

## Checkpoint 1: Data Cleaning

Import and read companies.txt and rounds2.csv. Store them in variables called `companies` and `rounds2`.

In [3]:
# Load the tab-separated company list, decoded as ISO-8859-1.
companies = pd.read_csv(
    "companies.txt",
    encoding="ISO-8859-1",
    sep="\t",
)
companies.head()

FileNotFoundError: File b'companies.txt' does not exist

In [None]:
# Load the funding rounds (comma-separated), decoded as ISO-8859-1.
rounds2 = pd.read_csv(
    filepath_or_buffer="rounds2.csv",
    encoding="ISO-8859-1",
)
rounds2.head()

In [None]:
# Normalise company_permalink: the UTF-8 encode / ASCII decode round trip discards
# every non-ASCII character ('ignore'), after which the value is lower-cased.

rounds2['company_permalink'] = (
    rounds2['company_permalink']
    .str.encode('utf-8')
    .str.decode('ascii', 'ignore')
    .str.lower()
)

In [None]:
# Number of distinct company permalinks that appear in rounds2

rounds2['company_permalink'].nunique()

In [None]:
# Normalise permalink: the UTF-8 encode / ASCII decode round trip discards every
# non-ASCII character ('ignore'), after which the value is lower-cased.

companies['permalink'] = (
    companies['permalink']
    .str.encode('utf-8')
    .str.decode('ascii', 'ignore')
    .str.lower()
)

In [None]:
# Number of distinct permalinks in companies

companies['permalink'].nunique()

In [None]:
# Inspecting the companies dataframe (columns, dtypes and non-null counts)

companies.info()

# Author's observations from the output:
# - companies has 66368 rows, and only two columns (permalink and status) are fully populated.
# - status contains duplicate values, whereas permalink does not contain any duplicates.
# - permalink can therefore be treated as the unique key of a company.

In [None]:
# Inspecting the rounds2 dataframe (columns, dtypes and non-null counts)
rounds2.info()

In [None]:
# Permalinks that occur in rounds2.company_permalink but not in companies.permalink

set(rounds2['company_permalink']).difference(companies['permalink'])

# We get the empty set, hence
# all companies in rounds2 are also present in companies

In [None]:
# Give the company key of rounds2 the same column name (permalink) that companies uses

rounds2 = rounds2.rename(columns={'company_permalink': 'permalink'})

In [None]:
# Inner join of the funding rounds with the company details, keyed on permalink

master_frame = rounds2.merge(companies, on='permalink', how='inner')

In [None]:
# Inspecting master_frame after the merge (row count, columns, dtypes and non-null counts)

master_frame.info()

In [None]:
# Removing unwanted columns from master_frame.
# One drop call is atomic: if any label is missing, nothing is removed. The previous
# nine separate in-place drops could leave master_frame half-trimmed after a failure.

unwanted_columns = [
    'funding_round_permalink',
    'funding_round_code',
    'funded_at',
    'name',
    'homepage_url',
    'status',
    'state_code',
    'region',
    'founded_at',
]
master_frame = master_frame.drop(unwanted_columns, axis=1)

master_frame.info()

## Checkpoint 2: Funding Type Analysis

In [None]:
# Number of missing values in each column of master_frame

pd.isnull(master_frame).sum()

In [None]:
# Removing rows with a null value in raised_amount_usd.
# notnull() works for any column dtype (np.isnan raises on object columns), and
# .copy() gives master_frame its own data so that columns added to it later are not
# written into a filtered slice of the previous frame.

master_frame = master_frame[master_frame['raised_amount_usd'].notnull()].copy()
master_frame.isnull().sum()

In [None]:
# Group the rows of master_frame by their funding_round_type

master_frame_by_funding_amount = master_frame.groupby(by='funding_round_type')

In [None]:
# Average raised_amount_usd of every funding round type, largest average first

mean_raised_by_type = master_frame_by_funding_amount['raised_amount_usd'].mean()
mean_raised_by_type.sort_values(ascending=False)

# Author's observation: venture has the highest among (private_equity, venture, angel, seed)
# and is in the range of 5 million to 15 million

## Checkpoint 3: Country Analysis

In [None]:
# master_frame_for_venture: only the rows whose funding_round_type is 'venture'

is_venture = master_frame['funding_round_type'] == 'venture'
master_frame_for_venture = master_frame.loc[is_venture]
master_frame_for_venture.head()

In [None]:
# Group the venture rounds by country_code

master_frame_by_country = master_frame_for_venture.groupby(by='country_code')

In [None]:
# top9: total venture investment received per country, limited to the nine largest totals
# (the result is a Series indexed by country_code)

country_totals = master_frame_by_country['raised_amount_usd'].sum()
top9 = country_totals.sort_values(ascending=False).head(9)
top9

# Top three English-speaking countries are: USA, GBR, IND

## Checkpoint 4: Sector Analysis 1

### Reading mapping.csv and storing in variable `mapping`

In [None]:
# Load the category-to-sector mapping table from mapping.csv
mapping = pd.read_csv(filepath_or_buffer="mapping.csv")
mapping.head()

In [None]:
# Keep only the rows that have a category_list value (the author notes that the
# row with NaN is the first one)

mapping = mapping.loc[mapping['category_list'].notnull()]

In [None]:
# Reshape mapping from wide to long: every non-id column name becomes a main_sector
# value and its cell value is stored in `present`

reshaped_mapping = mapping.melt(
    id_vars=["category_list"],
    var_name="main_sector",
    value_name="present",
)

In [None]:
# Keep only the rows whose `present` value is 1, dropping the rows with 0.
# .copy() makes mapping_data an independent frame, so editing its columns later does
# not write into a filtered slice of reshaped_mapping.

mapping_data = reshaped_mapping[reshaped_mapping['present'] == 1].copy()

In [None]:
# Correcting category names in which "na" was replaced by "0" (e.g. "Fi0nce" should be "Finance").
# A blanket replace('0', 'na') also rewrites genuine zeros, e.g. the one in a version
# number ("2.0" would become "2.na"), so a zero that follows a digit or a dot is kept.
# NOTE(review): a leading "0" is restored as "Na" on the assumption that category names
# are title-cased and are matched case-sensitively later — confirm against mapping.csv.

fixed_categories = (
    mapping_data['category_list']
    .str.replace(r'^0', 'Na', regex=True)
    .str.replace(r'(?<![\d.])0', 'na', regex=True)
)
# assign() returns a new frame instead of writing into a filtered slice.
mapping_data = mapping_data.assign(category_list=fixed_categories)
mapping_data.head()

In [None]:
# The `present` flag is no longer needed, so remove that column

mapping_data = mapping_data.drop(['present'], axis='columns')

In [None]:
# Rename category_list to primary_sector

mapping_data = mapping_data.rename(columns={'category_list': 'primary_sector'})
mapping_data.head()

In [None]:
# primary_sector = the first entry of the '|'-separated category_list

category_parts = master_frame['category_list'].str.split('|')
master_frame['primary_sector'] = category_parts.str.get(0)
master_frame.head()

In [None]:
# Attach main_sector to every round via an inner join on primary_sector
# (rounds whose primary_sector has no match in mapping_data are dropped by the inner join)

merged_frame = master_frame.merge(mapping_data, on='primary_sector', how='inner')
merged_frame.head()

## Checkpoint 5: Sector Analysis 2

### Creating new dataframes D1, D2 and D3 for each of the three countries 
### containing the observations of funding type venture falling within the 5-15 million USD range

In [None]:
# Venture rounds whose size lies within the 5-15 million USD range (both ends
# inclusive), one frame per country: D1 = USA, D2 = GBR, D3 = IND.
FUNDING_TYPE = 'venture'
MIN_RAISED_USD = 5000000
MAX_RAISED_USD = 15000000


def _country_investments(frame, country_code):
    """Return an independent copy of the rows of `frame` for one country.

    A row is kept when its funding_round_type equals FUNDING_TYPE, its
    country_code equals `country_code`, and its raised_amount_usd lies between
    MIN_RAISED_USD and MAX_RAISED_USD (inclusive). The result is copied so that
    callers can add columns without writing into a slice of `frame`.
    """
    selected = (
        (frame['funding_round_type'] == FUNDING_TYPE)
        & (frame['country_code'] == country_code)
        & (frame['raised_amount_usd'] >= MIN_RAISED_USD)
        & (frame['raised_amount_usd'] <= MAX_RAISED_USD)
    )
    return frame.loc[selected, :].copy()


D1 = _country_investments(merged_frame, 'USA')
D2 = _country_investments(merged_frame, 'GBR')
D3 = _country_investments(merged_frame, 'IND')

In [None]:
# For every row of D1, add the number of investments (number_of_inv) and the total
# amount invested (amnt_invested) of the row's main_sector, then order by the count

d1_sector_groups = D1.groupby('main_sector')
D1['number_of_inv'] = d1_sector_groups['main_sector'].transform('count')
D1['amnt_invested'] = d1_sector_groups['raised_amount_usd'].transform('sum')
D1 = D1.sort_values(by='number_of_inv', ascending=False)
D1.head()

In [None]:
# For every row of D2, add the number of investments (number_of_inv) and the total
# amount invested (amnt_invested) of the row's main_sector, then order by the count

d2_sector_groups = D2.groupby('main_sector')
D2['number_of_inv'] = d2_sector_groups['main_sector'].transform('count')
D2['amnt_invested'] = d2_sector_groups['raised_amount_usd'].transform('sum')
D2 = D2.sort_values(by='number_of_inv', ascending=False)
D2.head()

In [None]:
# For every row of D3, add the number of investments (number_of_inv) and the total
# amount invested (amnt_invested) of the row's main_sector, then order by the count

d3_sector_groups = D3.groupby('main_sector')
D3['number_of_inv'] = d3_sector_groups['main_sector'].transform('count')
D3['amnt_invested'] = d3_sector_groups['raised_amount_usd'].transform('sum')
D3 = D3.sort_values(by='number_of_inv', ascending=False)
D3.head()

### Finding total number of investments

In [None]:
# Number of investments in D1 (rows with a non-null permalink)

D1['permalink'].count()

In [None]:
# Number of investments in D2 (rows with a non-null permalink)

D2['permalink'].count()

In [None]:
# Number of investments in D3 (rows with a non-null permalink)

D3['permalink'].count()

### Total amount of investments

In [None]:
# Sum of raised_amount_usd over all investments in D1

D1['raised_amount_usd'].sum()

In [None]:
# Sum of raised_amount_usd over all investments in D2

D2['raised_amount_usd'].sum()

In [None]:
# Sum of raised_amount_usd over all investments in D3

D3['raised_amount_usd'].sum()

### Top sector based on count of investments

In [None]:
# Sectors of D1 ranked by their number of investments, most frequent first

d1_sector_counts = D1.groupby('main_sector')['raised_amount_usd'].count().reset_index(name='count')
d1_sector_counts.sort_values(by='count', ascending=False)

In [None]:
# Sectors of D2 ranked by their number of investments, most frequent first

d2_sector_counts = D2.groupby('main_sector')['raised_amount_usd'].count().reset_index(name='count')
d2_sector_counts.sort_values(by='count', ascending=False)

In [None]:
# Sectors of D3 ranked by their number of investments, most frequent first

d3_sector_counts = D3.groupby('main_sector')['raised_amount_usd'].count().reset_index(name='count')
d3_sector_counts.sort_values(by='count', ascending=False)

### Which company received the highest investment

In [None]:
# Companies of D1 in the 'Others' sector, ranked by the total amount they received.
# NOTE(review): 'Others' is hard-coded as the top sector count-wise — confirm it
# against the sector counts of D1.

d1s1 = D1[D1['main_sector'] == 'Others']
top_company_d1s1 = (
    d1s1.groupby('permalink')['raised_amount_usd']
    .sum()
    .reset_index(name='amount')
    .sort_values(by='amount', ascending=False)
)
top_company_d1s1.head(3)





In [None]:
# Companies of D2 in the 'Others' sector, ranked by the total amount they received.
# NOTE(review): 'Others' is hard-coded as the top sector count-wise — confirm it
# against the sector counts of D2.

d2s1 = D2[D2['main_sector'] == 'Others']
top_company_d2s1 = (
    d2s1.groupby('permalink')['raised_amount_usd']
    .sum()
    .reset_index(name='amount')
    .sort_values(by='amount', ascending=False)
)
top_company_d2s1.head(3)

In [None]:
# Companies of D3 in the 'Others' sector, ranked by the total amount they received.
# NOTE(review): 'Others' is hard-coded as the top sector count-wise — confirm it
# against the sector counts of D3.

d3s1 = D3[D3['main_sector'] == 'Others']
top_company_d3s1 = (
    d3s1.groupby('permalink')['raised_amount_usd']
    .sum()
    .reset_index(name='amount')
    .sort_values(by='amount', ascending=False)
)
top_company_d3s1.head(3)

In [None]:
# Companies of D1 in the 'Social, Finance, Analytics, Advertising' sector, ranked by
# the total amount they received.
# NOTE(review): the sector is hard-coded as the second best count-wise — confirm it
# against the sector counts of D1.
d1s2 = D1[D1['main_sector'] == 'Social, Finance, Analytics, Advertising']
top_company_d1s2 = (
    d1s2.groupby('permalink')['raised_amount_usd']
    .sum()
    .reset_index(name='amount')
    .sort_values(by='amount', ascending=False)
)
top_company_d1s2.head(3)

In [None]:
# Companies of D2 in the 'Social, Finance, Analytics, Advertising' sector, ranked by
# the total amount they received.
# NOTE(review): the sector is hard-coded as the second best count-wise — confirm it
# against the sector counts of D2.

d2s2 = D2[D2['main_sector'] == 'Social, Finance, Analytics, Advertising']
top_company_d2s2 = (
    d2s2.groupby('permalink')['raised_amount_usd']
    .sum()
    .reset_index(name='amount')
    .sort_values(by='amount', ascending=False)
)
top_company_d2s2.head(3)


In [None]:
# Companies of D3 in the 'Social, Finance, Analytics, Advertising' sector, ranked by
# the total amount they received.
# NOTE(review): the sector is hard-coded as the second best count-wise — confirm it
# against the sector counts of D3.

d3s2 = D3[D3['main_sector'] == 'Social, Finance, Analytics, Advertising']
top_company_d3s2 = (
    d3s2.groupby('permalink')['raised_amount_usd']
    .sum()
    .reset_index(name='amount')
    .sort_values(by='amount', ascending=False)
)
top_company_d3s2.head(3)