In [1]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [8]:
# Load the raw startup dataset. The path is relative, so the CSV is looked up
# under ./Resources from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer=Path("./Resources/startup_data.csv"))

# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,Company_Name,Dependent-Company Status,year of founding,Age of company in years,Internet Activity Score,Short Description of company profile,Industry of company,Focus functions of company,Investors,Employee Count,...,Percent_skill_Data Science,Percent_skill_Business Strategy,Percent_skill_Product Management,Percent_skill_Sales,Percent_skill_Domain,Percent_skill_Law,Percent_skill_Consulting,Percent_skill_Finance,Percent_skill_Investment,Renown score
0,Company1,Success,No Info,No Info,-1.0,Video distribution,,operation,KPCB Holdings|Draper Fisher Jurvetson (DFJ)|Kl...,3.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,Company2,Success,2011,3,125.0,,Market Research|Marketing|Crowdfunding,"Marketing, sales",,,...,8.823529412,21.76470588,10.88235294,2.941176471,0.0,0,0,0,0,8
2,Company3,Success,2011,3,455.0,Event Data Analytics API,Analytics|Cloud Computing|Software Development,operations,TechStars|Streamlined Ventures|Amplify Partner...,14.0,...,3.846153846,17.09401709,9.401709402,0.0,2.777777778,0,0,0,0,9
3,Company4,Success,2009,5,-99.0,The most advanced analytics for mobile,Mobile|Analytics,Marketing & Sales,Michael Birch|Max Levchin|Sequoia Capital|Keit...,45.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,5
4,Company5,Success,2010,4,496.0,The Location-Based Marketing Platform,Analytics|Marketing|Enterprise Software,Marketing & Sales,DFJ Frontier|Draper Nexus Ventures|Gil Elbaz|A...,39.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,6


In [9]:
# Review the data types associated with the columns.
# Columns reported as `object` are not stored with a numeric dtype; the captured
# output shows that this includes 'year of founding' and
# 'Age of company in years'.
df.dtypes

Company_Name                 object
Dependent-Company Status     object
year of founding             object
Age of company in years      object
Internet Activity Score     float64
                             ...   
Percent_skill_Law            object
Percent_skill_Consulting     object
Percent_skill_Finance        object
Percent_skill_Investment     object
Renown score                 object
Length: 116, dtype: object

In [25]:
# Narrow the raw frame down to the columns kept as model features.
#
# Candidate columns that are currently left out (names spelled exactly as in the
# CSV header so they can be re-enabled by adding them to the list below):
#   'Industry of company', 'Employee Count', 'Has the team size grown',
#   'Last Funding Date', 'Continent of company', 'Number of Co-founders',
#   'Team size all employees', 'Number of  Sales Support material',
#   'Worked in top companies', 'Product or service company?',
#   'Catering to product/service across verticals',
#   'Subscription based business', 'B2C or B2B venture?',
#   'google page rank of company website',
#   'Last round of funding received (in milionUSD)'
startup_df = df[
    [
        "Age of company in years",
        "Internet Activity Score",
        "Focus functions of company",
        "Last Funding Amount",
        "Country of company",
        "Number of Investors in Seed",
        "Number of Investors in Angel and or VC",
        "Number of of advisors",
        "Team size Senior leadership",
        "Presence of a top angel or venture fund in previous round of investment",
        "Number of of repeat investors",
        "Have been part of successful startups in the past?",
        "Local or global player",
        "Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",
        "Proprietary or patent position (competitive position)",
        "Barriers of entry for the competitors",
        "Disruptiveness of technology",
        "Number of Direct competitors",
        "Time to 1st investment (in months)",
        "Avg time to investment - average across all rounds, measured from previous investment",
    ]
]

# Show the resulting feature frame.
startup_df

Unnamed: 0,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,Presence of a top angel or venture fund in previous round of investment,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,No Info,-1.0,operation,450000.0,United States,2,0,2,2,Yes,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,3,125.0,"Marketing, sales",,United States,5,0,0,4,No,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,3,455.0,operations,2350000.0,United States,15,0,0,7,No,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,Yes,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,No,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,3,-5.0,,,United States,0,0,0,1,No,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,,,consumer web,,,0,4,0,1,No,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,4,0.0,CAMPAIGN MANAGEMENT,,United States,No Info,No Info,0,1,No Info,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,No Info,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


In [26]:
# Keep the rows that have no missing (NaN) values under an explicit name, so the
# cleaned frame is not thrown away right after being displayed.
# `dropna` returns a new frame: `startup_df`, which the following cells read, is
# left untouched and keeps its original row count.
# NOTE: "No Info" placeholders are ordinary strings rather than NaN, so rows
# containing them are still present in the result.
# NOTE(review): if incomplete rows should be excluded from training, point the
# later cells at this frame (and filter the target to the same rows) - confirm.
startup_complete_df = startup_df.dropna()
startup_complete_df

Unnamed: 0,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,Presence of a top angel or venture fund in previous round of investment,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,No Info,-1.0,operation,450000.0,United States,2,0,2,2,Yes,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
2,3,455.0,operations,2350000.0,United States,15,0,0,7,No,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,Yes,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,No,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
5,4,106.0,analytics,1000000.0,United States,2,0,0,4,No,2,No,Local,No,Yes,No,High,0,12,9.322222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,4,4.0,Operations,12000.0,United States,No Info,No Info,0,2,No Info,No Info,No,Local,No,No Info,Yes,No Info,No Info,No Info,No Info
462,7,18.0,Operations,440000.0,United States,No Info,No Info,0,3,No Info,No Info,No,Local,No,No Info,Yes,No Info,No Info,No Info,No Info
466,8,0.0,sales,30000000.0,United States,0,2,0,1,No,0,No Info,local,Yes,No Info,Yes,No Info,No Info,No Info,No Info
470,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,No Info,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


In [24]:
# TODO: placeholder for a column-renaming step. The string below is only a
# marker that the notebook echoes back; no column is renamed by this cell.
'''RENAME COLUMNS'''

'RENAME COLUMNS'

In [27]:
# Collect, in column order, the names of every column whose dtype is `object`;
# these are the columns treated as categorical further down.
# NOTE(review): per the captured output, count-like columns such as
# 'Age of company in years' also end up here because they contain "No Info"
# strings - confirm that one-hot encoding them is intended.
categorical_variables = [
    column_name
    for column_name, column_dtype in startup_df.dtypes.items()
    if column_dtype == "object"
]

# Show the collected column names.
categorical_variables

['Age of company in years',
 'Focus functions of company',
 'Country of company',
 'Number of Investors in Seed',
 'Number of Investors in Angel and or VC',
 'Presence of a top angel or venture fund in previous round of investment',
 'Number of of repeat investors',
 'Have been part of successful startups in the past?',
 'Local or global player',
 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
 'Proprietary or patent position (competitive position)',
 'Barriers of entry for the competitors',
 'Disruptiveness of technology',
 'Number of Direct competitors',
 'Time to 1st investment (in months)',
 'Avg time to investment - average across all rounds, measured from previous investment']

In [28]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and later
# releases dropped the old spelling. Try the current keyword first and fall
# back to the legacy one so the cell runs on either side of that change.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn: `sparse_output` is not an accepted keyword.
    enc = OneHotEncoder(sparse=False)

In [29]:
# Learn the categories of every categorical column, then expand those columns
# into their one-hot representation.
encoded_data = enc.fit(startup_df[categorical_variables]).transform(
    startup_df[categorical_variables]
)

# Show the encoded array.
encoded_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [30]:
# Create a DataFrame with the encoded variables.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`. Use the newer method when the installed version
# provides it and fall back to the legacy one otherwise.
if hasattr(enc, "get_feature_names_out"):
    encoded_columns = enc.get_feature_names_out(categorical_variables)
else:
    encoded_columns = enc.get_feature_names(categorical_variables)

encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoded_columns,
    # Reuse the source index so every encoded row stays aligned with the row of
    # `startup_df` it was produced from.
    index=startup_df.index,
)

# Display the DataFrame
encoded_df

Unnamed: 0,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,Age of company in years_17,Age of company in years_2,Age of company in years_3,Age of company in years_4,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
