# Unicorn Predictor

In [1]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report,balanced_accuracy_score

In [2]:
# Read the startup_data.csv file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(
    Path("./Resources/startup_data.csv")
)

# Review the DataFrame
df.head()

Unnamed: 0,Company_Name,Dependent-Company Status,year of founding,Age of company in years,Internet Activity Score,Short Description of company profile,Industry of company,Focus functions of company,Investors,Employee Count,...,Percent_skill_Data Science,Percent_skill_Business Strategy,Percent_skill_Product Management,Percent_skill_Sales,Percent_skill_Domain,Percent_skill_Law,Percent_skill_Consulting,Percent_skill_Finance,Percent_skill_Investment,Renown score
0,Company1,Success,No Info,No Info,-1.0,Video distribution,,operation,KPCB Holdings|Draper Fisher Jurvetson (DFJ)|Kl...,3.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,Company2,Success,2011,3,125.0,,Market Research|Marketing|Crowdfunding,"Marketing, sales",,,...,8.823529412,21.76470588,10.88235294,2.941176471,0.0,0,0,0,0,8
2,Company3,Success,2011,3,455.0,Event Data Analytics API,Analytics|Cloud Computing|Software Development,operations,TechStars|Streamlined Ventures|Amplify Partner...,14.0,...,3.846153846,17.09401709,9.401709402,0.0,2.777777778,0,0,0,0,9
3,Company4,Success,2009,5,-99.0,The most advanced analytics for mobile,Mobile|Analytics,Marketing & Sales,Michael Birch|Max Levchin|Sequoia Capital|Keit...,45.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,5
4,Company5,Success,2010,4,496.0,The Location-Based Marketing Platform,Analytics|Marketing|Enterprise Software,Marketing & Sales,DFJ Frontier|Draper Nexus Ventures|Gil Elbaz|A...,39.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,6


In [3]:
# Review the data types associated with the columns
df.dtypes

Company_Name                 object
Dependent-Company Status     object
year of founding             object
Age of company in years      object
Internet Activity Score     float64
                             ...   
Percent_skill_Law            object
Percent_skill_Consulting     object
Percent_skill_Finance        object
Percent_skill_Investment     object
Renown score                 object
Length: 116, dtype: object

In [4]:
# Keep relevant columns in the dataframe
startup_df = df[['Dependent-Company Status', # y target
                 'Age of company in years',
                 'Internet Activity Score', 
                 # 'Industry of company', 
                 'Focus functions of company', 
                 # 'Employee Count', 
                 # 'Has the team size grown', 
                 # 'Last Funding Date', 
                 'Last Funding Amount', 
                 'Country of company', 
                 # 'Continent of company', 
                 'Number of Investors in Seed', 
                 'Number of Investors in Angel and or VC', 
                 # 'Number of Co-founders', 
                 'Number of of advisors', 
                 'Team size Senior leadership', 
                 # 'Team size all employees', 
                 'Presence of a top angel or venture fund in previous round of investment', 
                 'Number of of repeat investors', 
                 # 'Number of  Sales Support material', 
                 # 'Worked in top companies', 
                 'Have been part of successful startups in the past?',
                 # 'Product or service company?', 
                 # 'Catering to product/service across verticals',
                 # 'Subscription based business', 
                 'Local or global player', 
                 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
                 # 'B2C or B2B venture?',
                 'Proprietary or patent position (competitive position)',
                 'Barriers of entry for the competitors',
                 # 'google page rank of company website',
                 'Disruptiveness of technology',
                 'Number of Direct competitors',
                 # 'Last round of funding received (in milionUSD)',
                 'Time to 1st investment (in months)', 
                 'Avg time to investment - average across all rounds, measured from previous investment']]

#preview dataframe
startup_df              

Unnamed: 0,Dependent-Company Status,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,...,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,Success,No Info,-1.0,operation,450000.0,United States,2,0,2,2,...,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,Success,3,125.0,"Marketing, sales",,United States,5,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,Success,3,455.0,operations,2350000.0,United States,15,0,0,7,...,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,Success,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,Success,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,...,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,Failed,3,-5.0,,,United States,0,0,0,1,...,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,Failed,,,consumer web,,,0,4,0,1,...,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,Failed,4,0.0,CAMPAIGN MANAGEMENT,,United States,No Info,No Info,0,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,Failed,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


### Rename Columns

In [5]:
#replace null values
startup_df.fillna(0)

Unnamed: 0,Dependent-Company Status,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,...,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,Success,No Info,-1.0,operation,450000.0,United States,2,0,2,2,...,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,Success,3,125.0,"Marketing, sales",0.0,United States,5,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,Success,3,455.0,operations,2350000.0,United States,15,0,0,7,...,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,Success,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,Success,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,...,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,Failed,3,-5.0,0,0.0,United States,0,0,0,1,...,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,Failed,0,0.0,consumer web,0.0,0,0,4,0,1,...,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,Failed,4,0.0,CAMPAIGN MANAGEMENT,0.0,United States,No Info,No Info,0,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,Failed,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


In [6]:
#rename unclear column name descriptions

startup_df.rename(columns = {
                 'Age of company in years': 'Age of Company',
                 'Number of of repeat investors': 'Number of repeat investors',
                 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive': 'Capital intensive business',
                 'Proprietary or patent position (competitive position)':'Proprietary or patent position',
                 'Barriers of entry for the competitors':'Barriers of entry for the competitors'})

Unnamed: 0,Dependent-Company Status,Age of Company,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,...,Number of repeat investors,Have been part of successful startups in the past?,Local or global player,Capital intensive business,Proprietary or patent position,Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,Success,No Info,-1.0,operation,450000.0,United States,2,0,2,2,...,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,Success,3,125.0,"Marketing, sales",,United States,5,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,Success,3,455.0,operations,2350000.0,United States,15,0,0,7,...,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,Success,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,Success,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,...,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,Failed,3,-5.0,,,United States,0,0,0,1,...,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,Failed,,,consumer web,,,0,4,0,1,...,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,Failed,4,0.0,CAMPAIGN MANAGEMENT,,United States,No Info,No Info,0,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,Failed,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


In [7]:
# Create a list of categorical variables 
categorical_variables = list(startup_df.dtypes[startup_df.dtypes == "object"].index)

# Display the categorical variables list
categorical_variables

['Dependent-Company Status',
 'Age of company in years',
 'Focus functions of company',
 'Country of company',
 'Number of Investors in Seed',
 'Number of Investors in Angel and or VC',
 'Presence of a top angel or venture fund in previous round of investment',
 'Number of of repeat investors',
 'Have been part of successful startups in the past?',
 'Local or global player',
 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
 'Proprietary or patent position (competitive position)',
 'Barriers of entry for the competitors',
 'Disruptiveness of technology',
 'Number of Direct competitors',
 'Time to 1st investment (in months)',
 'Avg time to investment - average across all rounds, measured from previous investment']

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [9]:
# Encode the categorical variables using OneHotEncoder
encoded_data = enc.fit_transform(startup_df[categorical_variables])
encoded_data

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [10]:
# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_variables)
)

# Display the encoded DataFrame
encoded_df.fillna(0)

Unnamed: 0,Dependent-Company Status_Failed,Dependent-Company Status_Success,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,Age of company in years_17,Age of company in years_2,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Create a DataFrame with the columnns containing numerical variables from the original dataset
numerical_variables_df = startup_df.drop(columns = categorical_variables).copy()

# Review the DataFrame
numerical_variables_df.fillna(0)

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership
0,-1.0,450000.0,2,2
1,125.0,0.0,0,4
2,455.0,2350000.0,0,7
3,-99.0,10250000.0,0,4
4,496.0,5500000.0,1,8
...,...,...,...,...
467,-5.0,0.0,0,1
468,0.0,0.0,0,1
469,0.0,0.0,0,1
470,0.0,100000.0,2,1


In [12]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
encoded_df = pd.concat(
    [
        numerical_variables_df,
        encoded_df
    ],
    axis=1
)

# Review the Dataframe
encoded_df.fillna(0)

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership,Dependent-Company Status_Failed,Dependent-Company Status_Success,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,-1.0,450000.0,2,2,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125.0,0.0,0,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,455.0,2350000.0,0,7,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-99.0,10250000.0,0,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,496.0,5500000.0,1,8,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,-5.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,100000.0,2,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Define the target set y using the Dependent-Company Status column
y = encoded_df["Dependent-Company Status_Success"]

# Display a sample of y
y

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
467    0.0
468    0.0
469    0.0
470    0.0
471    0.0
Name: Dependent-Company Status_Success, Length: 472, dtype: float64

In [14]:
# Define features set X by selecting all columns but Dependent-Company Status_Success
X = encoded_df.drop(columns=["Dependent-Company Status_Success","Dependent-Company Status_Failed"])

# Review the features DataFrame
X.fillna(0, inplace=True)
X

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,-1.0,450000.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125.0,0.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,455.0,2350000.0,0,7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-99.0,10250000.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,496.0,5500000.0,1,8,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,-5.0,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,100000.0,2,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Review the distinct values from y
y_train.value_counts()

1.0    235
0.0    119
Name: Dependent-Company Status_Success, dtype: int64

In [17]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Incorporating encoded fake data df for example purposes

In [18]:
#display fake data df for example usage
fakedata_df = pd.read_csv(Path('Resources/fakedatadf.csv'))
fakedata_df

Unnamed: 0,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,Presence of a top angel or venture fund in previous round of investment,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),Avg time to investment - average across all rounds measured from previous investment
0,3,50,operation,450000,United States,2,0,3,4,No,4,Yes,Global,Yes,No,Yes,Medium,1,7,9.234


In [19]:
#encode the fakedata_df to match up with trained encoded data df for testing purposes
encoded_fakedata_df = enc.fit_transform(startup_df[categorical_variables])

In [76]:
#create df for newly encoded df based on example testing information
encoded_fakedata_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_variables))

#preview the test encoded data
encoded_fakedata_df.head()

Unnamed: 0,Dependent-Company Status_Failed,Dependent-Company Status_Success,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,Age of company in years_17,Age of company in years_2,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [79]:
#seperating the y and X in the encoded_fakedata_df  to get matching container sizes for predictions

encoded_fakedata_df = encoded_fakedata_df.drop(columns=["Dependent-Company Status_Success","Dependent-Company Status_Failed"]).copy()
encoded_fakedata_df

Unnamed: 0,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,Age of company in years_17,Age of company in years_2,Age of company in years_3,Age of company in years_4,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [80]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
encoded_fakedata_df = pd.concat(
    [
        numerical_variables_df,
        encoded_fakedata_df
    ],
    axis=1
)

# Review the Dataframe
encoded_fakedata_df.fillna(0)

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,-1.0,450000.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125.0,0.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,455.0,2350000.0,0,7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-99.0,10250000.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,496.0,5500000.0,1,8,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,-5.0,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,100000.0,2,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [81]:
#fill in/replace missing values
encoded_fakedata_df.fillna(0, inplace=True)

In [82]:
#scale the encoded_fakedata_df with the X_scaler to use for predictions on y target in the models below

X_fakedata_df_scaled = X_scaler.transform(encoded_fakedata_df)
#preview array that can be used in the y prediction for "real input data" 
X_fakedata_df_scaled

array([[-0.46510968, -0.453277  ,  0.51570613, ..., -0.05322463,
        -0.05322463, -0.4831661 ],
       [ 0.12678054, -0.50531399, -0.50134106, ..., -0.05322463,
        -0.05322463, -0.4831661 ],
       [ 1.6769692 , -0.23356525, -0.50134106, ..., -0.05322463,
        -0.05322463, -0.4831661 ],
       ...,
       [-0.46041214, -0.50531399, -0.50134106, ..., -0.05322463,
        -0.05322463,  2.06968164],
       [-0.46041214, -0.49375021,  0.51570613, ..., -0.05322463,
        -0.05322463,  2.06968164],
       [-0.40873918, -0.49143746, -0.50134106, ..., -0.05322463,
        -0.05322463,  2.06968164]])

## after this section above (cleaning data) we input the information into the various different ML models below

# Neural Network Model (2 layers 1 output)

In [26]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train.iloc[0])

# Review the number of features
number_input_features

509

In [27]:
# Define the number of neurons in the output layer
number_output_neurons = 1

In [28]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 =  (number_input_features + number_output_neurons) // 2 

# Review the number hidden nodes in the first layer
hidden_nodes_layer1

255

In [29]:
# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 =  (hidden_nodes_layer1 + number_output_neurons) // 2 

# Review the number hidden nodes in the second layer
hidden_nodes_layer2

128

In [30]:
# Create the Sequential model instance
nn = Sequential()

In [31]:
# Add the first hidden layer
nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

In [32]:
# Add the second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

In [33]:
# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=1, activation="sigmoid"))

In [34]:
# Display the Sequential model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 255)               130050    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32768     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 162,947
Trainable params: 162,947
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Compile the Sequential model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [36]:
# Fit the model using 50 epochs and the training data
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [84]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

4/4 - 0s - loss: 0.4928 - accuracy: 0.8644
Loss: 0.49278637766838074, Accuracy: 0.8644067645072937


# RandomClassifier Model

In [85]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [86]:
# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

In [87]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_pred)

0.8119047619047619

In [88]:
# Print confusion matrix
confusion_matrix(y_test,y_pred)

array([[32, 16],
       [ 3, 67]], dtype=int64)

In [89]:
# Print classification reports for the train vs test
print(f" Random Classifier Report")
print(classification_report(y_pred, y_test))

 Random Classifier Report
              precision    recall  f1-score   support

         0.0       0.67      0.91      0.77        35
         1.0       0.96      0.81      0.88        83

    accuracy                           0.84       118
   macro avg       0.81      0.86      0.82       118
weighted avg       0.87      0.84      0.84       118



# Gaussian Process Classifier Model

In [90]:
#import new model: Gaussian Process Classifier from sklearn
from sklearn.gaussian_process import GaussianProcessClassifier

In [91]:
#define the model and parameters
gaussian_model = GaussianProcessClassifier(max_iter_predict=1000,n_restarts_optimizer=3, random_state=1)

In [92]:
#fit the model
gaussian_model.fit(X_train_scaled, y_train)

GaussianProcessClassifier(max_iter_predict=1000, n_restarts_optimizer=3,
                          random_state=1)

In [93]:
#set prediction based on X_test_scaled
y_pred = gaussian_model.predict(X_test_scaled)

In [94]:
#check balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.7193452380952381

In [95]:
#display confusion matrix based on y_pred
confusion_matrix(y_test,y_pred)

array([[43,  5],
       [32, 38]], dtype=int64)

In [96]:
#display classification report on the Gaussian Classifier model
print(f" Gaussian Classifiction Report")
print(classification_report(y_pred, y_test))

 Gaussian Classifiction Report
              precision    recall  f1-score   support

         0.0       0.90      0.57      0.70        75
         1.0       0.54      0.88      0.67        43

    accuracy                           0.69       118
   macro avg       0.72      0.73      0.69       118
weighted avg       0.77      0.69      0.69       118



# AdaBoost Model

In [97]:
#import AdaBoost model and supporting 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris

In [98]:
#set X and y
X, y = load_iris(return_X_y=True)

In [99]:
#define model with parameters
adaboost_model = AdaBoostClassifier(n_estimators=100)

In [100]:
#define cross_val_score with model
scores = cross_val_score(adaboost_model, X, y, cv=5)

In [101]:
#display average of scores
scores.mean()

0.9466666666666665

In [102]:
#fit AdaBoost model
adaboost_model.fit(X_train_scaled, y_train)

AdaBoostClassifier(n_estimators=100)

In [103]:
#test prediction based on X_test_scaled
y_pred = adaboost_model.predict(X_test_scaled)

In [104]:
#check balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.8738095238095238

In [105]:
#display confusion matrix
confusion_matrix(y_test,y_pred)

array([[40,  8],
       [ 6, 64]], dtype=int64)

In [106]:
#display AdaBoost Classifier report

print(f"AdaBoost Classifiction Report")
print(classification_report(y_pred, y_test))

AdaBoost Classifiction Report
              precision    recall  f1-score   support

         0.0       0.83      0.87      0.85        46
         1.0       0.91      0.89      0.90        72

    accuracy                           0.88       118
   macro avg       0.87      0.88      0.88       118
weighted avg       0.88      0.88      0.88       118



## Classification Tree

In [107]:
# Import Modules: Classification Tree
from sklearn import tree

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image


In [108]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [109]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

* Making Predictions Using the Tree Model

In [110]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [111]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_pred)

0.8738095238095238

In [112]:
# Print confusion matrix
confusion_matrix(y_test,y_pred)

array([[40,  8],
       [ 6, 64]], dtype=int64)

In [113]:
# Displaying classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.72      0.75      0.73        48
         1.0       0.82      0.80      0.81        70

    accuracy                           0.78       118
   macro avg       0.77      0.78      0.77       118
weighted avg       0.78      0.78      0.78       118



In [114]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_pred)

0.8738095238095238

 * Visualizing the Decision Tree

In [115]:
# # Draw graph
# graph = pydotplus.graph_from_dot_data(dot_data)

# # Show graph
# Image(graph.create_png())

In [116]:
# # Saving the tree as PDF
# file_path = ("./Resources/startup_data.csv")
# graph.write_pdf(file_path)

# # Saving the tree as PNG
# file_path = ("./Resources/startup_data.csv")
# graph.write_png(file_path)

# #put this code in images

## 1.5. Stochastic Gradient Descent¶


In [117]:
#import new Gradient Descent Model from SKlearn
from sklearn.linear_model import SGDClassifier

#define model 
#fit model 
clf = SGDClassifier()
clf.fit(X_train_scaled, y_train)


SGDClassifier()

In [118]:
#make prediction with trained model based on X_test_scaled
y_pred = clf.predict(X_test_scaled)

In [119]:
y_pred

array([1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1.,
       1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0.,
       1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
       0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1.])

In [120]:
#check balanced accuracy score

balanced_accuracy_score(y_test, y_pred)

0.850297619047619

In [121]:
#display confusion matrix

confusion_matrix(y_test,y_pred)

array([[35, 13],
       [ 2, 68]], dtype=int64)

In [122]:
#display classification report for Gradient Descent Model
print(f"Gradient Descent Model")
print(classification_report(y_test,y_pred))

Gradient Descent Model
              precision    recall  f1-score   support

         0.0       0.95      0.73      0.82        48
         1.0       0.84      0.97      0.90        70

    accuracy                           0.87       118
   macro avg       0.89      0.85      0.86       118
weighted avg       0.88      0.87      0.87       118



# Conclusion Model Chosen

nn 0.8644,
random classifier 0.84,
Gaussian 0.69,
AdaBoost 0.88,
Classification Tree 0.78,
Gradient 0.87

In conclusion, the ADABOOST Model slightly outperforms the rest of the models and we will be choosing AdaBoost as the ML model used to predict user input information