In [1]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report,balanced_accuracy_score

In [2]:
# Read the startup_data.csv file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(
    Path("./Resources/startup_data.csv")
)

# Review the DataFrame
df.head()

Unnamed: 0,Company_Name,Dependent-Company Status,year of founding,Age of company in years,Internet Activity Score,Short Description of company profile,Industry of company,Focus functions of company,Investors,Employee Count,...,Percent_skill_Data Science,Percent_skill_Business Strategy,Percent_skill_Product Management,Percent_skill_Sales,Percent_skill_Domain,Percent_skill_Law,Percent_skill_Consulting,Percent_skill_Finance,Percent_skill_Investment,Renown score
0,Company1,Success,No Info,No Info,-1.0,Video distribution,,operation,KPCB Holdings|Draper Fisher Jurvetson (DFJ)|Kl...,3.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,Company2,Success,2011,3,125.0,,Market Research|Marketing|Crowdfunding,"Marketing, sales",,,...,8.823529412,21.76470588,10.88235294,2.941176471,0.0,0,0,0,0,8
2,Company3,Success,2011,3,455.0,Event Data Analytics API,Analytics|Cloud Computing|Software Development,operations,TechStars|Streamlined Ventures|Amplify Partner...,14.0,...,3.846153846,17.09401709,9.401709402,0.0,2.777777778,0,0,0,0,9
3,Company4,Success,2009,5,-99.0,The most advanced analytics for mobile,Mobile|Analytics,Marketing & Sales,Michael Birch|Max Levchin|Sequoia Capital|Keit...,45.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,5
4,Company5,Success,2010,4,496.0,The Location-Based Marketing Platform,Analytics|Marketing|Enterprise Software,Marketing & Sales,DFJ Frontier|Draper Nexus Ventures|Gil Elbaz|A...,39.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,6


In [3]:
# Review the data types associated with the columns
df.dtypes

Company_Name                 object
Dependent-Company Status     object
year of founding             object
Age of company in years      object
Internet Activity Score     float64
                             ...   
Percent_skill_Law            object
Percent_skill_Consulting     object
Percent_skill_Finance        object
Percent_skill_Investment     object
Renown score                 object
Length: 116, dtype: object

In [4]:
# Keep relevant columns in the dataframe
startup_df = df[['Dependent-Company Status', # y target
                 'Age of company in years',
                 'Internet Activity Score', 
                 # 'Industry of company', 
                 'Focus functions of company', 
                 # 'Employee Count', 
                 # 'Has the team size grown', 
                 # 'Last Funding Date', 
                 'Last Funding Amount', 
                 'Country of company', 
                 # 'Continent of company', 
                 'Number of Investors in Seed', 
                 'Number of Investors in Angel and or VC', 
                 # 'Number of Co-founders', 
                 'Number of of advisors', 
                 'Team size Senior leadership', 
                 # 'Team size all employees', 
                 'Presence of a top angel or venture fund in previous round of investment', 
                 'Number of of repeat investors', 
                 # 'Number of  Sales Support material', 
                 # 'Worked in top companies', 
                 'Have been part of successful startups in the past?',
                 # 'Product or service company?', 
                 # 'Catering to product/service across verticals',
                 # 'Subscription based business', 
                 'Local or global player', 
                 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
                 # 'B2C or B2B venture?',
                 'Proprietary or patent position (competitive position)',
                 'Barriers of entry for the competitors',
                 # 'google page rank of company website',
                 'Disruptiveness of technology',
                 'Number of Direct competitors',
                 # 'Last round of funding received (in milionUSD)',
                 'Time to 1st investment (in months)', 
                 'Avg time to investment - average across all rounds, measured from previous investment']]
startup_df              

Unnamed: 0,Dependent-Company Status,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,...,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,Success,No Info,-1.0,operation,450000.0,United States,2,0,2,2,...,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,Success,3,125.0,"Marketing, sales",,United States,5,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,Success,3,455.0,operations,2350000.0,United States,15,0,0,7,...,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,Success,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,Success,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,...,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,Failed,3,-5.0,,,United States,0,0,0,1,...,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,Failed,,,consumer web,,,0,4,0,1,...,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,Failed,4,0.0,CAMPAIGN MANAGEMENT,,United States,No Info,No Info,0,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,Failed,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


### Rename Columns

In [5]:
startup_df.fillna(0)

Unnamed: 0,Dependent-Company Status,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,...,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,Success,No Info,-1.0,operation,450000.0,United States,2,0,2,2,...,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,Success,3,125.0,"Marketing, sales",0.0,United States,5,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,Success,3,455.0,operations,2350000.0,United States,15,0,0,7,...,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,Success,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,Success,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,...,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,Failed,3,-5.0,0,0.0,United States,0,0,0,1,...,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,Failed,0,0.0,consumer web,0.0,0,0,4,0,1,...,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,Failed,4,0.0,CAMPAIGN MANAGEMENT,0.0,United States,No Info,No Info,0,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,Failed,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


In [6]:
# Create a list of categorical variables 
categorical_variables = list(startup_df.dtypes[startup_df.dtypes == "object"].index)

# Display the categorical variables list
categorical_variables

['Dependent-Company Status',
 'Age of company in years',
 'Focus functions of company',
 'Country of company',
 'Number of Investors in Seed',
 'Number of Investors in Angel and or VC',
 'Presence of a top angel or venture fund in previous round of investment',
 'Number of of repeat investors',
 'Have been part of successful startups in the past?',
 'Local or global player',
 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
 'Proprietary or patent position (competitive position)',
 'Barriers of entry for the competitors',
 'Disruptiveness of technology',
 'Number of Direct competitors',
 'Time to 1st investment (in months)',
 'Avg time to investment - average across all rounds, measured from previous investment']

In [7]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [8]:
# Encode the categorical variables using OneHotEncoder
encoded_data = enc.fit_transform(startup_df[categorical_variables])
encoded_data

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [9]:
# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_variables)
)

# Display the DataFrame
encoded_df.fillna(0)

Unnamed: 0,Dependent-Company Status_Failed,Dependent-Company Status_Success,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,Age of company in years_17,Age of company in years_2,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Create a DataFrame with the columnns containing numerical variables from the original dataset
numerical_variables_df = startup_df.drop(columns = categorical_variables)

# Review the DataFrame
numerical_variables_df.fillna(0)

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership
0,-1.0,450000.0,2,2
1,125.0,0.0,0,4
2,455.0,2350000.0,0,7
3,-99.0,10250000.0,0,4
4,496.0,5500000.0,1,8
...,...,...,...,...
467,-5.0,0.0,0,1
468,0.0,0.0,0,1
469,0.0,0.0,0,1
470,0.0,100000.0,2,1


In [11]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
encoded_df = pd.concat(
    [
        numerical_variables_df,
        encoded_df
    ],
    axis=1
)

# Review the Dataframe
encoded_df.fillna(0)

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership,Dependent-Company Status_Failed,Dependent-Company Status_Success,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,-1.0,450000.0,2,2,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125.0,0.0,0,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,455.0,2350000.0,0,7,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-99.0,10250000.0,0,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,496.0,5500000.0,1,8,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,-5.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,100000.0,2,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# Define the target set y using the Dependent-Company Status column
y = encoded_df["Dependent-Company Status_Success"]

# Display a sample of y
y

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
467    0.0
468    0.0
469    0.0
470    0.0
471    0.0
Name: Dependent-Company Status_Success, Length: 472, dtype: float64

In [13]:
# Define features set X by selecting all columns but Dependent-Company Status_Success
X = encoded_df.drop(columns=["Dependent-Company Status_Success"])

# Review the features DataFrame
X.fillna(0, inplace=True)
X

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership,Dependent-Company Status_Failed,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,-1.0,450000.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125.0,0.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,455.0,2350000.0,0,7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-99.0,10250000.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,496.0,5500000.0,1,8,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,-5.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,100000.0,2,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [15]:
# Review the distinct values from y
y_train.value_counts()

1.0    235
0.0    119
Name: Dependent-Company Status_Success, dtype: int64

In [16]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Neural Network Model

In [17]:
# Define the the number of inputs (features) to the model
# number_input_features = len(X_train.iloc[0])

# # Review the number of features
# number_input_features

In [18]:
# # Define the number of neurons in the output layer
# number_output_neurons = 1

In [19]:
# # Define the number of hidden nodes for the first hidden layer
# hidden_nodes_layer1 =  (number_input_features + number_output_neurons) // 2 

# # Review the number hidden nodes in the first layer
# hidden_nodes_layer1

In [20]:
# # Define the number of hidden nodes for the second hidden layer
# hidden_nodes_layer2 =  (hidden_nodes_layer1 + number_output_neurons) // 2 

# # Review the number hidden nodes in the second layer
# hidden_nodes_layer2

In [21]:
# # Create the Sequential model instance
# nn = Sequential()

In [22]:
# # Add the first hidden layer
# nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

In [23]:
# # Add the second hidden layer
# nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

In [24]:
# # Add the output layer to the model specifying the number of output neurons and activation function
# nn.add(Dense(units=1, activation="sigmoid"))

In [25]:
# # Display the Sequential model summary
# nn.summary()

In [26]:
# # Compile the Sequential model
# nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [27]:
# # Fit the model using 50 epochs and the training data
# fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

In [28]:
# # Evaluate the model loss and accuracy metrics using the evaluate method and the test data
# model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)

# # Display the model loss and accuracy results
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### RandomClassifier Model

In [44]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [45]:
# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

In [46]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_pred)

0.96875

In [47]:
# Print confusion matrix
confusion_matrix(y_test,y_pred)

array([[45,  3],
       [ 0, 70]])

In [48]:
# Print classification reports
print(f"Classifiction Report")
print(classification_report(y_pred, y_test))

Classifiction Report
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        45
         1.0       1.00      0.96      0.98        73

    accuracy                           0.97       118
   macro avg       0.97      0.98      0.97       118
weighted avg       0.98      0.97      0.97       118

