In [1]:
import pandas as pd

# Load the dataset
file_path = "F:\\school\\Azubi Africa\\LP1 Data Analytics Project\\LP-1-Project\\data\\aba3.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

   Unnamed: 0 Company_Brand  Founded HeadQuarter              Sector  \
0           0    Aqgromalin   2019.0     Chennai            AgriTech   
1           1      Krayonnz   2019.0   Bangalore              EdTech   
2           2  PadCare Labs   2018.0        Pune  Hygiene management   
3           3         NCOME   2020.0   New Delhi              Escrow   
4           4    Gramophone   2016.0      Indore            AgriTech   

                                        What_it_does  \
0                       Cultivating Ideas for Profit   
1  An academy-guardian-scholar centric ecosystem ...   
2   Converting bio-hazardous waste to harmless waste   
3                       Escrow-as-a-service platform   
4  Gramophone is an AgTech platform enabling acce...   

                                            Founders  \
0                    Prasanna Manogaran, Bharani C L   
1                   Saurabh Dixit, Gurudutt Upadhyay   
2                                    Ajinkya Dhariya   
3     

**Data Cleaning and Preprocessing**

In [2]:
# Display the first few rows of the dataframe to understand its structure
print("Original Dataframe:")
print(data.head())

# 1. Standardize Column Names
data.columns = data.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)
data.rename(columns={'CompanyName': 'Company_Name', 
                   'HeadQuarter': 'Head_Quarter',
                   'AboutCompany': 'About_Company'}, inplace=True)

# 2. Handle Missing Values
# a. Replace 'Undisclosed' with NaN in 'Amount' column
data['Amount'] = data['Amount'].replace('Undisclosed')

# b. Convert 'Amount' column to numeric (float)
data['Amount'] = pd.to_numeric(data['Amount'], errors='coerce')

# 3. Clean 'Industry' Column
data['Industry'] = data['Industry'].str.strip()

# 4. Clean 'Head_Quarter' Column
data['Head_Quarter'] = data['Head_Quarter'].str.strip()
data['Head_Quarter'] = data['Head_Quarter'].str.replace('Small Towns, Andhra Pradesh', 'Andhra Pradesh', regex=False)
data['Head_Quarter'] = data['Head_Quarter'].str.replace('Faridabad, Haryana', 'Faridabad', regex=False)

# 5. Clean 'RoundSeries' Column
data['RoundSeries'] = data['RoundSeries'].str.strip()

# 6. Correcting inconsistencies in 'Company_Name'
data['Company_Name'] = data['Company_Name'].str.strip()

# Replace specific company names to maintain consistency
data['Company_Name'] = data['Company_Name'].replace({
    'PUREandFRES-Milk': 'Puresh Daily',
    'FanPlay': 'FanPlay'
})

# 7. Remove Duplicates
data.drop_duplicates(inplace=True)

# Print some info and the head of the cleaned data
print("\nCleaned Dataframe Info:")
print(data.info())

print("\nCleaned Dataframe:")
print(data.head())


Original Dataframe:
   Unnamed: 0 Company_Brand  Founded HeadQuarter              Sector  \
0           0    Aqgromalin   2019.0     Chennai            AgriTech   
1           1      Krayonnz   2019.0   Bangalore              EdTech   
2           2  PadCare Labs   2018.0        Pune  Hygiene management   
3           3         NCOME   2020.0   New Delhi              Escrow   
4           4    Gramophone   2016.0      Indore            AgriTech   

                                        What_it_does  \
0                       Cultivating Ideas for Profit   
1  An academy-guardian-scholar centric ecosystem ...   
2   Converting bio-hazardous waste to harmless waste   
3                       Escrow-as-a-service platform   
4  Gramophone is an AgTech platform enabling acce...   

                                            Founders  \
0                    Prasanna Manogaran, Bharani C L   
1                   Saurabh Dixit, Gurudutt Upadhyay   
2                                    Ajink

  data['Amount'] = data['Amount'].replace('Undisclosed')


TypeError: arg must be a list, tuple, 1-d array, or Series

In [None]:
# Path where you want to save the file
path_to_save = "F:\\school\\Azubi Africa\\LP1 Data Analytics Project\\LP-1-Project\\data"

# Save the cleaned dataset to a new CSV file.
data.to_csv(f"{path_to_save}\\Aba3_cleaned.csv", index=False)


In [None]:
import pandas as pd

# Load the dataset
file_path = "F:\\school\\Azubi Africa\\LP1 Data Analytics Project\\LP-1-Project\\data\\Aba3_cleaned.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(df.head())

In [None]:
print(df.columns)


**Analyze Funding Rounds**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Group by 'Round/Series' and calculate the total funding amount
funding_by_round = df.groupby('RoundSeries')['Amount'].sum().sort_values(ascending=False)

# Plot the total funding amount by round
plt.figure(figsize=(10, 6))
sns.barplot(x=funding_by_round.index, y=funding_by_round.values, palette='viridis')
plt.title('Total Funding Amount by RoundSeries')
plt.xlabel('RoundSeries')
plt.ylabel('Total Funding Amount (in USD)')
plt.xticks(rotation=45)
plt.show()

**Analyze Industries**

In [None]:
# Group by 'Industry' and calculate the total funding amount
funding_by_industry = df.groupby('Industry')['Amount'].sum().sort_values(ascending=False)

# Plot the total funding amount by industry
plt.figure(figsize=(12, 8))
sns.barplot(x=funding_by_industry.index, y=funding_by_industry.values, palette='magma')
plt.title('Total Funding Amount by Industry')
plt.xlabel('Industry')
plt.ylabel('Total Funding Amount (in USD)')
plt.xticks(rotation=90)
plt.show()

**Geographical Distribution**

In [None]:
# Group by 'HeadQuarter' and calculate the total funding amount
funding_by_city = df.groupby('HeadQuarter')['Amount'].sum().sort_values(ascending=False)

# Plot the total funding amount by city
plt.figure(figsize=(12, 8))
sns.barplot(x=funding_by_city.index, y=funding_by_city.values, palette='plasma')
plt.title('Total Funding Amount by City')
plt.xlabel('City')
plt.ylabel('Total Funding Amount (in USD)')
plt.xticks(rotation=90)
plt.show()

**Analyze Investors**

In [None]:
# Split the 'Investor' column (since it contains multiple investors separated by commas)
df['Investor'] = df['Investor'].str.split(', ')

# Explode the 'Investor' column to have one investor per row
df_exploded = df.explode('Investor')

# Group by 'Investor' and calculate the total funding amount
funding_by_investor = df_exploded.groupby('Investor')['Amount'].sum().sort_values(ascending=False).head(20)

# Plot the total funding amount by investor
plt.figure(figsize=(12, 8))
sns.barplot(x=funding_by_investor.index, y=funding_by_investor.values, palette='inferno')
plt.title('Total Funding Amount by Investor (Top 20)')
plt.xlabel('Investor')
plt.ylabel('Total Funding Amount (in USD)')
plt.xticks(rotation=90)
plt.show()

**Analyze Company Age**

In [None]:
# Calculate the age of each company (assuming the current year is 2023)
df['Company_Age'] = 2023 - df['Founded'].dt.year

# Plot the distribution of company ages
plt.figure(figsize=(10, 6))
sns.histplot(df['Company_Age'], bins=20, kde=True, color='blue')
plt.title('Distribution of Company Ages')
plt.xlabel('Company Age (Years)')
plt.ylabel('Frequency')
plt.show()

# Plot funding amount vs company age
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['Company_Age'], y=df['Amount'], alpha=0.6, color='green')
plt.title('Funding Amount vs Company Age')
plt.xlabel('Company Age (Years)')
plt.ylabel('Funding Amount (in USD)')
plt.show()

**Analyze Founders**

In [None]:
# Split the 'Founders' column (since it contains multiple founders separated by commas)
df['Founders'] = df['Founders'].str.split(', ')

# Explode the 'Founders' column to have one founder per row
df_exploded_founders = df.explode('Founders')

# Group by 'Founders' and calculate the total funding amount
funding_by_founder = df_exploded_founders.groupby('Founders')['Amount'].sum().sort_values(ascending=False).head(20)

# Plot the total funding amount by founder
plt.figure(figsize=(12, 8))
sns.barplot(x=funding_by_founder.index, y=funding_by_founder.values, palette='cividis')
plt.title('Total Funding Amount by Founder (Top 20)')
plt.xlabel('Founder')
plt.ylabel('Total Funding Amount (in USD)')
plt.xticks(rotation=90)
plt.show()

**Analyze Funding Trends Over Time**

In [None]:
# Extract the year from the 'Founded' column
df['Year'] = df['Founded'].dt.year

# Group by 'Year' and calculate the total funding amount
funding_by_year = df.groupby('Year')['Amount'].sum()

# Plot the total funding amount by year
plt.figure(figsize=(10, 6))
sns.lineplot(x=funding_by_year.index, y=funding_by_year.values, marker='o', color='purple')
plt.title('Total Funding Amount by Year')
plt.xlabel('Year')
plt.ylabel('Total Funding Amount (in USD)')
plt.show()

**Analyze Company Size**

In [None]:
# Assuming 'Company_Size' is a column in the dataset (if not, you may need to derive it)
# For example, you could categorize companies based on funding amount
df['Company_Size'] = pd.cut(df['Amount'], bins=[0, 1e6, 10e6, 50e6, 100e6, float('inf')], 
                          labels=['Small', 'Medium', 'Large', 'Very Large', 'Mega'])

# Plot the distribution of company sizes
plt.figure(figsize=(10, 6))
sns.countplot(x=df['Company_Size'], palette='rocket')
plt.title('Distribution of Company Sizes')
plt.xlabel('Company Size')
plt.ylabel('Count')
plt.show()