In [162]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report,balanced_accuracy_score

In [163]:
# Read the startup_data.csv file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(
    Path("./Resources/startup_data.csv")
)

# Review the DataFrame
df.head()

Unnamed: 0,Company_Name,Dependent-Company Status,year of founding,Age of company in years,Internet Activity Score,Short Description of company profile,Industry of company,Focus functions of company,Investors,Employee Count,...,Percent_skill_Data Science,Percent_skill_Business Strategy,Percent_skill_Product Management,Percent_skill_Sales,Percent_skill_Domain,Percent_skill_Law,Percent_skill_Consulting,Percent_skill_Finance,Percent_skill_Investment,Renown score
0,Company1,Success,No Info,No Info,-1.0,Video distribution,,operation,KPCB Holdings|Draper Fisher Jurvetson (DFJ)|Kl...,3.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,Company2,Success,2011,3,125.0,,Market Research|Marketing|Crowdfunding,"Marketing, sales",,,...,8.823529412,21.76470588,10.88235294,2.941176471,0.0,0,0,0,0,8
2,Company3,Success,2011,3,455.0,Event Data Analytics API,Analytics|Cloud Computing|Software Development,operations,TechStars|Streamlined Ventures|Amplify Partner...,14.0,...,3.846153846,17.09401709,9.401709402,0.0,2.777777778,0,0,0,0,9
3,Company4,Success,2009,5,-99.0,The most advanced analytics for mobile,Mobile|Analytics,Marketing & Sales,Michael Birch|Max Levchin|Sequoia Capital|Keit...,45.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,5
4,Company5,Success,2010,4,496.0,The Location-Based Marketing Platform,Analytics|Marketing|Enterprise Software,Marketing & Sales,DFJ Frontier|Draper Nexus Ventures|Gil Elbaz|A...,39.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,6


In [164]:
# Review the data types associated with the columns
df.dtypes

Company_Name                 object
Dependent-Company Status     object
year of founding             object
Age of company in years      object
Internet Activity Score     float64
                             ...   
Percent_skill_Law            object
Percent_skill_Consulting     object
Percent_skill_Finance        object
Percent_skill_Investment     object
Renown score                 object
Length: 116, dtype: object

In [165]:
# Keep relevant columns in the dataframe
startup_df = df[['Dependent-Company Status', # y target
                 'Age of company in years',
                 'Internet Activity Score', 
                 # 'Industry of company', 
                 'Focus functions of company', 
                 # 'Employee Count', 
                 # 'Has the team size grown', 
                 # 'Last Funding Date', 
                 'Last Funding Amount', 
                 'Country of company', 
                 # 'Continent of company', 
                 'Number of Investors in Seed', 
                 'Number of Investors in Angel and or VC', 
                 # 'Number of Co-founders', 
                 'Number of of advisors', 
                 'Team size Senior leadership', 
                 # 'Team size all employees', 
                 'Presence of a top angel or venture fund in previous round of investment', 
                 'Number of of repeat investors', 
                 # 'Number of  Sales Support material', 
                 # 'Worked in top companies', 
                 'Have been part of successful startups in the past?',
                 # 'Product or service company?', 
                 # 'Catering to product/service across verticals',
                 # 'Subscription based business', 
                 'Local or global player', 
                 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
                 # 'B2C or B2B venture?',
                 'Proprietary or patent position (competitive position)',
                 'Barriers of entry for the competitors',
                 # 'google page rank of company website',
                 'Disruptiveness of technology',
                 'Number of Direct competitors',
                 # 'Last round of funding received (in milionUSD)',
                 'Time to 1st investment (in months)', 
                 'Avg time to investment - average across all rounds, measured from previous investment']]
startup_df              

Unnamed: 0,Dependent-Company Status,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,...,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,Success,No Info,-1.0,operation,450000.0,United States,2,0,2,2,...,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,Success,3,125.0,"Marketing, sales",,United States,5,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,Success,3,455.0,operations,2350000.0,United States,15,0,0,7,...,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,Success,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,Success,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,...,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,Failed,3,-5.0,,,United States,0,0,0,1,...,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,Failed,,,consumer web,,,0,4,0,1,...,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,Failed,4,0.0,CAMPAIGN MANAGEMENT,,United States,No Info,No Info,0,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,Failed,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


### Rename Columns

In [166]:
startup_df.fillna(0)

Unnamed: 0,Dependent-Company Status,Age of company in years,Internet Activity Score,Focus functions of company,Last Funding Amount,Country of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of of advisors,Team size Senior leadership,...,Number of of repeat investors,Have been part of successful startups in the past?,Local or global player,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Proprietary or patent position (competitive position),Barriers of entry for the competitors,Disruptiveness of technology,Number of Direct competitors,Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment"
0,Success,No Info,-1.0,operation,450000.0,United States,2,0,2,2,...,4,No,Global,Yes,No,Yes,Low,0,No Info,11.56
1,Success,3,125.0,"Marketing, sales",0.0,United States,5,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,0,10,9
2,Success,3,455.0,operations,2350000.0,United States,15,0,0,7,...,0,No,Local,No,Yes,Yes,Medium,0,2,7.344444444
3,Success,5,-99.0,Marketing & Sales,10250000.0,United States,6,0,0,4,...,0,Yes,Local,No,Yes,Yes,Medium,2,1,8.7
4,Success,4,496.0,Marketing & Sales,5500000.0,United States,7,0,1,8,...,0,No,Local,Yes,Yes,Yes,Medium,0,13,9.822222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,Failed,3,-5.0,0,0.0,United States,0,0,0,1,...,0,No Info,local,No,No Info,Yes,No Info,No Info,No Info,No Info
468,Failed,0,0.0,consumer web,0.0,0,0,4,0,1,...,0,No Info,global,Yes,No Info,Yes,No Info,No Info,No Info,No Info
469,Failed,4,0.0,CAMPAIGN MANAGEMENT,0.0,United States,No Info,No Info,0,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info
470,Failed,5,0.0,SOCIALMEDIA CAMPAIGN,100000.0,United States,No Info,No Info,2,1,...,No Info,No Info,GLOBAL,No,No Info,Yes,No Info,No Info,No Info,No Info


In [167]:
# Create a list of categorical variables 
categorical_variables = list(startup_df.dtypes[startup_df.dtypes == "object"].index)

# Display the categorical variables list
categorical_variables

['Dependent-Company Status',
 'Age of company in years',
 'Focus functions of company',
 'Country of company',
 'Number of Investors in Seed',
 'Number of Investors in Angel and or VC',
 'Presence of a top angel or venture fund in previous round of investment',
 'Number of of repeat investors',
 'Have been part of successful startups in the past?',
 'Local or global player',
 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
 'Proprietary or patent position (competitive position)',
 'Barriers of entry for the competitors',
 'Disruptiveness of technology',
 'Number of Direct competitors',
 'Time to 1st investment (in months)',
 'Avg time to investment - average across all rounds, measured from previous investment']

In [168]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [169]:
# Encode the categorical variables using OneHotEncoder
encoded_data = enc.fit_transform(startup_df[categorical_variables])
encoded_data

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [170]:
# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_variables)
)

# Display the DataFrame
encoded_df.fillna(0)

Unnamed: 0,Dependent-Company Status_Failed,Dependent-Company Status_Success,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,Age of company in years_15,Age of company in years_17,Age of company in years_2,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [171]:
# Create a DataFrame with the columnns containing numerical variables from the original dataset
numerical_variables_df = startup_df.drop(columns = categorical_variables)

# Review the DataFrame
numerical_variables_df.fillna(0)

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership
0,-1.0,450000.0,2,2
1,125.0,0.0,0,4
2,455.0,2350000.0,0,7
3,-99.0,10250000.0,0,4
4,496.0,5500000.0,1,8
...,...,...,...,...
467,-5.0,0.0,0,1
468,0.0,0.0,0,1
469,0.0,0.0,0,1
470,0.0,100000.0,2,1


In [172]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
encoded_df = pd.concat(
    [
        numerical_variables_df,
        encoded_df
    ],
    axis=1
)

# Review the Dataframe
encoded_df.fillna(0)

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership,Dependent-Company Status_Failed,Dependent-Company Status_Success,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,-1.0,450000.0,2,2,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125.0,0.0,0,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,455.0,2350000.0,0,7,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-99.0,10250000.0,0,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,496.0,5500000.0,1,8,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,-5.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,100000.0,2,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [173]:
# Define the target set y using the Dependent-Company Status column
y = encoded_df["Dependent-Company Status_Success"]

# Display a sample of y
y

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
467    0.0
468    0.0
469    0.0
470    0.0
471    0.0
Name: Dependent-Company Status_Success, Length: 472, dtype: float64

In [174]:
# Define features set X by selecting all columns but Dependent-Company Status_Success
X = encoded_df.drop(columns=["Dependent-Company Status_Success"])

# Review the features DataFrame
X.fillna(0, inplace=True)
X

Unnamed: 0,Internet Activity Score,Last Funding Amount,Number of of advisors,Team size Senior leadership,Dependent-Company Status_Failed,Age of company in years_1,Age of company in years_10,Age of company in years_11,Age of company in years_12,Age of company in years_14,...,"Avg time to investment - average across all rounds, measured from previous investment_9.077777778","Avg time to investment - average across all rounds, measured from previous investment_9.233333333","Avg time to investment - average across all rounds, measured from previous investment_9.322222222","Avg time to investment - average across all rounds, measured from previous investment_9.375","Avg time to investment - average across all rounds, measured from previous investment_9.45","Avg time to investment - average across all rounds, measured from previous investment_9.466666667","Avg time to investment - average across all rounds, measured from previous investment_9.688888889","Avg time to investment - average across all rounds, measured from previous investment_9.822222222","Avg time to investment - average across all rounds, measured from previous investment_96","Avg time to investment - average across all rounds, measured from previous investment_No Info"
0,-1.0,450000.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125.0,0.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,455.0,2350000.0,0,7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-99.0,10250000.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,496.0,5500000.0,1,8,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,-5.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
469,0.0,0.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,100000.0,2,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [175]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [176]:
# Review the distinct values from y
y_train.value_counts()

1.0    235
0.0    119
Name: Dependent-Company Status_Success, dtype: int64

In [177]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## after this section above (cleaning data) we input the information into the various different ML models below

# Neural Network Model (2 layers 1 output)

In [178]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train.iloc[0])

# Review the number of features
number_input_features

510

In [179]:
# Define the number of neurons in the output layer
number_output_neurons = 1

In [180]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 =  (number_input_features + number_output_neurons) // 2 

# Review the number hidden nodes in the first layer
hidden_nodes_layer1

255

In [181]:
# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 =  (hidden_nodes_layer1 + number_output_neurons) // 2 

# Review the number hidden nodes in the second layer
hidden_nodes_layer2

128

In [182]:
# Create the Sequential model instance
nn = Sequential()

In [183]:
# Add the first hidden layer
nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

In [184]:
# Add the second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

In [185]:
# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=1, activation="sigmoid"))

In [186]:
# Display the Sequential model summary
nn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 255)               130305    
_________________________________________________________________
dense_7 (Dense)              (None, 128)               32768     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 129       
Total params: 163,202
Trainable params: 163,202
Non-trainable params: 0
_________________________________________________________________


In [187]:
# Compile the Sequential model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [188]:
# Fit the model using 50 epochs and the training data
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [189]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

4/4 - 0s - loss: 0.1904 - accuracy: 0.9407
Loss: 0.1904088407754898, Accuracy: 0.9406779408454895


# RandomClassifier Model

In [190]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [191]:
# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

In [192]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_pred)

0.9791666666666667

In [193]:
# Print confusion matrix
confusion_matrix(y_test,y_pred)

array([[46,  2],
       [ 0, 70]])

In [194]:
# Print classification reports
print(f"Classifiction Report")
print(classification_report(y_pred, y_test))

Classifiction Report
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        46
         1.0       1.00      0.97      0.99        72

    accuracy                           0.98       118
   macro avg       0.98      0.99      0.98       118
weighted avg       0.98      0.98      0.98       118



# Gaussian Process Classifier Model

In [195]:
from sklearn.gaussian_process import GaussianProcessClassifier

In [196]:
gaussian_model = GaussianProcessClassifier(max_iter_predict=1000,n_restarts_optimizer=3, random_state=1)

In [197]:
gaussian_model.fit(X_train_scaled, y_train)

GaussianProcessClassifier(max_iter_predict=1000, n_restarts_optimizer=3,
                          random_state=1)

In [198]:
y_pred = gaussian_model.predict(X_test_scaled)

In [199]:
balanced_accuracy_score(y_test, y_pred)

0.7550595238095239

In [200]:
confusion_matrix(y_test,y_pred)

array([[43,  5],
       [27, 43]])

In [201]:
print(f" Gaussian Classifiction Report")
print(classification_report(y_pred, y_test))

 Gaussian Classifiction Report
              precision    recall  f1-score   support

         0.0       0.90      0.61      0.73        70
         1.0       0.61      0.90      0.73        48

    accuracy                           0.73       118
   macro avg       0.76      0.76      0.73       118
weighted avg       0.78      0.73      0.73       118



# AdaBoost Model

In [202]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris

In [203]:
X, y = load_iris(return_X_y=True)

In [204]:
adaboost_model = AdaBoostClassifier(n_estimators=100)

In [205]:
scores = cross_val_score(adaboost_model, X, y, cv=5)

In [206]:
scores.mean()

0.9466666666666665

In [207]:
adaboost_model.fit(X_train_scaled, y_train)

AdaBoostClassifier(n_estimators=100)

In [208]:
y_pred = adaboost_model.predict(X_test_scaled)

In [209]:
balanced_accuracy_score(y_test, y_pred)

1.0

In [210]:
confusion_matrix(y_test,y_pred)

array([[48,  0],
       [ 0, 70]])

In [211]:
print(f" AdaBoost Classifiction Report")
print(classification_report(y_pred, y_test))

 AdaBoost Classifiction Report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        48
         1.0       1.00      1.00      1.00        70

    accuracy                           1.00       118
   macro avg       1.00      1.00      1.00       118
weighted avg       1.00      1.00      1.00       118



## Classification Tree

In [212]:
from sklearn import tree
X = [[0,0], [1,1]]
Y = [0,1]


In [213]:
training_data ()

TypeError: 'DataFrame' object is not callable

In [214]:
# Use CART which is a Classification and Regression Tree-- uses true false to unmix label-- has purest possible distribtuion 

In [215]:
def partition (rows, questions):
    """""Partitions a dataset.
    
    For each row in the dataset check if it matches the question. 
    If so add it to the 'true rows', otherwise add it to the 'false rows'.
    """
    true_rows,false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_row.append(row)
    return true_rows, false_rows

In [216]:
training_data = pd.read_csv(
    Path("./Resources/startup_data.csv")
)

In [217]:
#Column Labels
header = df[['Dependent-Company Status', # y target
                 'Age of company in years',
                 'Internet Activity Score', 
                 # 'Industry of company', 
                 'Focus functions of company', 
                 # 'Employee Count', 
                 # 'Has the team size grown', 
                 # 'Last Funding Date', 
                 'Last Funding Amount', 
                 'Country of company', 
                 # 'Continent of company', 
                 'Number of Investors in Seed', 
                 'Number of Investors in Angel and or VC', 
                 # 'Number of Co-founders', 
                 'Number of of advisors', 
                 'Team size Senior leadership', 
                 # 'Team size all employees', 
                 'Presence of a top angel or venture fund in previous round of investment', 
                 'Number of of repeat investors', 
                 # 'Number of  Sales Support material', 
                 # 'Worked in top companies', 
                 'Have been part of successful startups in the past?',
                 # 'Product or service company?', 
                 # 'Catering to product/service across verticals',
                 # 'Subscription based business', 
                 'Local or global player', 
                 'Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive',
                 # 'B2C or B2B venture?',
                 'Proprietary or patent position (competitive position)',
                 'Barriers of entry for the competitors',
                 # 'google page rank of company website',
                 'Disruptiveness of technology',
                 'Number of Direct competitors',
                 # 'Last round of funding received (in milionUSD)',
                 'Time to 1st investment (in months)', 
                 'Avg time to investment - average across all rounds, measured from previous investment']]             

In [218]:
def unique_values (rows, col):
    """ Find the unique values for a column in a dataset"""
    return set([row[col]for row in row])

In [219]:
unique_values(training_data,0)
print(unique_values)


NameError: name 'row' is not defined

In [220]:
def class_counts(rows):
    """Counts the numer of each type of example in a dataset"""
    counts = {}  # a dictionary of label -> count.
    for row in rows:
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [221]:
 class_counts(training_data)

{'e': 20,
 's': 33,
 'g': 5,
 'y': 10,
 't': 8,
 'n': 7,
 'd': 2,
 'C': 1,
 'p': 3,
 'l': 3,
 '?': 13,
 'a': 2,
 'r': 3,
 'i': 1,
 ')': 4,
 'w': 1}

In [None]:
def is_numeric(value):
    """Test if a value is numeric."""
    return isinstance(value, int) or isinstance(value, float)

In [None]:
class Question:
    """A Question is used to partition a dataset.

    This class just records a 'column number' (e.g., 0 for Color) and a
    'column value' (e.g., Green). The 'match' method is used to compare
    the feature value in an example to the feature value stored in the
    question. See the demo below.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        # Compare the feature value in an example to the
        # feature value in this question.
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [224]:
#question for numerical attibute 
q = Question(1,3)

NameError: name 'Question' is not defined

In [None]:
q = Question(0, 'Marketing')
q

In [222]:
#partition data to see if sucessful companies have 
true_rows, false_rows = partition(training_data, Questions (0))

NameError: name 'Questions' is not defined