# Machine learning models

We will start with importing the modules and loading the 3 dataframes:

In [22]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn import metrics

In [2]:
bin_df = pd.read_csv('data/dataframes/df_after_cols_reduction.csv').iloc[:,1:]
pca_2d_df = pd.read_csv('data/dataframes/pca_2d_df.csv').iloc[:,1:]
pca_3d_df = pd.read_csv('data/dataframes/pca_3d_df.csv').iloc[:,1:]

print(f'Binary dataframe shape: {bin_df.shape}')
print(f'PCA 2D dataframe shape: {pca_2d_df.shape}')
print(f'PCA 3D dataframe shape: {pca_3d_df.shape}')


Binary dataframe shape: (10070, 1927)
PCA 2D dataframe shape: (10070, 26)
PCA 3D dataframe shape: (10070, 32)


In [3]:
cat_cols = ['company_name', 'company_about','founded', 'business model','employees','product stage','status','funding stage','succeeded']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price', 'geo_market_per']
tag_cols = [col for col in bin_df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in bin_df.columns if col.startswith('targetmarket_')]
sector_list = [col for col in bin_df.columns if col.startswith("sector_")]
target_ind_list = [col  for col in bin_df.columns if col.startswith("industry_")]
technology_list = [col  for col in bin_df.columns if col.startswith("technology_")]


bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list
pca_2d_cols = [col for col in pca_2d_df.columns if col not in cat_cols and col not in num_cols]
pca_3d_cols = [col for col in pca_3d_df.columns if col not in cat_cols and col not in num_cols]

In [7]:
print(f"Categorical cols : {len(cat_cols)}")
print(f"Numerical cols : {len(num_cols)}")
print(f"Tag cols : {len(tag_cols)}")
print(f"Targetmarket cols : {len(targetmarket_cols)}")
print(f"Sector cols : {len(sector_list)}")
print(f"Industry cols : {len(target_ind_list)}")
print(f"Technology cols : {len(technology_list)}")
print('---- Totals ----')
print(f"Total binary cols : {len(bin_cols)}")
print(f"Toatl PCA 2D cols : {len(pca_2d_cols)}")
print(f"Total PCA 3D cols : {len(pca_3d_cols)}")



Categorical cols : 9
Numerical cols : 5
Tag cols : 1599
Targetmarket cols : 117
Sector cols : 41
Industry cols : 81
Technology cols : 75
---- Totals ----
Total binary cols : 1913
Toatl PCA 2D cols : 12
Total PCA 3D cols : 18


In [44]:
models_scores_dic = {}

In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(bin_df[num_cols + bin_cols], bin_df['succeeded'],test_size = 0.2, stratify=bin_df['succeeded'])
Xtrain_pca, Xtest_pca, ytrain_pca, ytest_pca = train_test_split(pca_df[num_cols + pca_cols], bin_df['succeeded'],test_size = 0.2, stratify=bin_df['succeeded'])

print(f"Xtrain shape: {Xtrain.shape}")
print(f"ytrain shape: {ytrain.shape}")

print(f"Xtrain_pca shape: {Xtrain_pca.shape}")
print(f"ytrain_pca shape: {ytrain_pca.shape}")



Xtrain shape: (8056, 1786)
ytrain shape: (8056,)
Xtrain_pca shape: (8056, 17)
ytrain_pca shape: (8056,)


In [7]:
lr = LogisticRegression(max_iter = 150)
lr.fit(Xtrain, ytrain)

LogisticRegression(max_iter=150)

In [8]:
ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

In [9]:
print("Train results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

Train results:
accuracy is: 0.7848808341608738
precision is: 0.4001384562132226
recall is: 1.0
f1 is: 0.5715698393077874
---------------------
Test results:
accuracy is: 0.7994041708043694
precision is: 0.417027417027417
recall is: 1.0
f1 is: 0.5885947046843177


Train results:  
accuracy is: 0.7919563058589871  
precision is: 0.4092351075079309  
recall is: 1.0  
f1 is: 0.5807903951975988  

Test results:  
accuracy is: 0.79493545183714  
precision is: 0.41251778093883357  
recall is: 1.0  
f1 is: 0.5840886203423967  

In [74]:
# create a new dataframe with succeeded = 1 and succeeded = 0

bin_df_succeeded = bin_df[bin_df['succeeded'] == 1]
bin_df_failed = bin_df[bin_df['succeeded'] == 0]

size = bin_df_succeeded.shape[0]
bin_df_fialed_sampled = bin_df_failed.sample(n = size , random_state = 42)

equal_df = pd.concat([bin_df_succeeded, bin_df_fialed_sampled])

print(equal_df.shape)
print(equal_df['succeeded'].sum())



(3240, 1856)
1620.0


In [11]:
Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[num_cols + bin_cols], equal_df['succeeded'],test_size = 0.2)
lr = LogisticRegression()
lr.fit(Xtrain, ytrain)


LogisticRegression()

In [12]:
ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

In [13]:
print("Train results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

Train results:
accuracy is: 0.8814878892733564
precision is: 0.8098542678695351
recall is: 1.0
f1 is: 0.8949386503067485
---------------------
Test results:
accuracy is: 0.8737024221453287
precision is: 0.792022792022792
recall is: 1.0
f1 is: 0.8839427662957074


In [76]:
Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[bin_cols], equal_df['succeeded'],test_size = 0.2)

print(f"Xtrain shape: {Xtrain.shape}")
print(f"ytrain shape: {ytrain.shape}")
print(f"Xtest shape: {Xtest.shape}")
print(f"ytest shape: {ytest.shape}")

lr = LogisticRegression(max_iter=1500)
lr.fit(Xtrain, ytrain)

ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

print("\n\nTrain results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

Xtrain shape: (2592, 1842)
ytrain shape: (2592,)
Xtest shape: (648, 1842)
ytest shape: (648,)


Train results:
accuracy is: 0.8969907407407407
precision is: 0.9033515198752923
recall is: 0.8901689708141322
f1 is: 0.8967117988394585
---------------------
Test results:
accuracy is: 0.6882716049382716
precision is: 0.6847133757961783
recall is: 0.6761006289308176
f1 is: 0.680379746835443


In [15]:
success_rate = 4200000

pca_df.loc[(pca_df["status"]==1)&(pca_df['total_raised']>=success_rate), 'succeeded'] = 1
pca_df.loc[(pca_df["status"]==0)|(pca_df['total_raised']<success_rate), 'succeeded'] = 0

# print(pca_df.head())

pca_df_succeeded = pca_df[pca_df['succeeded'] == 1]
pca_df_failed = pca_df[pca_df['succeeded'] == 0]

size = pca_df_succeeded.shape[0]
pca_df_fialed_sampled = pca_df_failed.sample(n = size , random_state = 42)

equal_df = pd.concat([pca_df_succeeded, pca_df_fialed_sampled])

print(equal_df.shape)
print(equal_df['succeeded'].sum())

pca_cols = [col for col in equal_df.columns if col not in cat_cols and col not in num_cols]
Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[pca_cols], equal_df['succeeded'],test_size = 0.2)

print(f"Xtrain shape: {Xtrain.shape}")
print(f"ytrain shape: {ytrain.shape}")
print(f"Xtest shape: {Xtest.shape}")
print(f"ytest shape: {ytest.shape}")

lr = LogisticRegression(max_iter=1500)
lr.fit(Xtrain, ytrain)

ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

print("\n\nTrain results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

(2890, 26)
1445.0
Xtrain shape: (2312, 12)
ytrain shape: (2312,)
Xtest shape: (578, 12)
ytest shape: (578,)


Train results:
accuracy is: 0.6600346020761245
precision is: 0.6632173095014111
recall is: 0.6222418358340689
f1 is: 0.6420765027322405
---------------------
Test results:
accuracy is: 0.657439446366782
precision is: 0.7050359712230215
recall is: 0.6282051282051282
f1 is: 0.6644067796610169


In [16]:
from lib.unsupervised_learning import *

In [68]:
bin_df_succeeded = bin_df[bin_df['succeeded'] == 1]
bin_df_failed = bin_df[bin_df['succeeded'] == 0]

size = bin_df_succeeded.shape[0]
bin_df_fialed_sampled = bin_df_failed.sample(n = size , random_state = 42)

equal_df = pd.concat([bin_df_succeeded, bin_df_fialed_sampled])

print(equal_df.shape)
print(equal_df['succeeded'].sum())

# print(get_best_init_params_for_k_means(equal_df[bin_cols],8,['k-means++','random'],range(5,50,5),42))

(4054, 1902)
2027.0


In [66]:
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=15, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=150)

scores = cross_val_score(clf, equal_df[bin_cols], equal_df['succeeded'], cv=cv, scoring='f1_macro')
# print(scores)
models_scores_dic['LogisticRegression'] = scores.max()
print(f'Best score after cross-validation: {scores.max()}')


Best score after cross-validation: 0.6201803664005068


In [88]:
scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(clf, equal_df[bin_cols], equal_df['succeeded'],cv = cv, scoring=scoring, return_train_score=True)

In [85]:
for d in ['test','train']:
    print(f"{d}_precision_macro: {scores[f'{d}_precision_macro'].max()}")
    print(f"{d}_recall_macro: {scores[f'{d}_recall_macro'].max()}")
    print(f"{d}_f1_macro: {scores[f'{d}_f1_macro'].max()}")
    print("---------------------")

test_precision_macro: 0.7301166692084795
test_recall_macro: 0.7299785663253155
test_f1_macro: 0.7299067535220494
---------------------
train_precision_macro: 0.9012110170399743
train_recall_macro: 0.9012798250691363
train_f1_macro: 0.9012260996313126
---------------------


In [92]:
df_outliers = pd.read_csv("data/dataframes/final_cleaned.csv").iloc[:,1:]


cat_cols = ['company_name', 'company_about','founded', 'business model','employees','product stage','status','funding stage','succeeded']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price', 'geo_market_per']
tag_cols = [col for col in df_outliers.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df_outliers.columns if col.startswith('targetmarket_')]
sector_list = [col for col in df_outliers.columns if col.startswith("sector_")]
target_ind_list = [col  for col in df_outliers.columns if col.startswith("industry_")]
technology_list = [col  for col in df_outliers.columns if col.startswith("technology_")]

bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list

df_outliers_succeeded = df_outliers[df_outliers['succeeded'] == 1]
df_outliers_failed = df_outliers[df_outliers['succeeded'] == 0]

size = min(df_outliers_succeeded.shape[0], df_outliers_failed.shape[0])

df_outliers_failed_sampled = df_outliers_failed.sample(n = size , random_state = 42)
df_outliers_succeded_sampled = df_outliers_succeeded.sample(n = size , random_state = 42)

equal_df = pd.concat([df_outliers_succeded_sampled, df_outliers_failed_sampled])

print(equal_df.shape)
print(equal_df['succeeded'].sum())

from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=15, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=150)

scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(clf, equal_df[bin_cols], equal_df['succeeded'],cv = cv, scoring=scoring, return_train_score=True)

for d in ['test','train']:
    print(f"{d}_precision_macro: {scores[f'{d}_precision_macro'].max()}")
    print(f"{d}_recall_macro: {scores[f'{d}_recall_macro'].max()}")
    print(f"{d}_f1_macro: {scores[f'{d}_f1_macro'].max()}")
    print("---------------------")

(6122, 2865)
3061.0
test_precision_macro: 0.6196110115619267
test_recall_macro: 0.619569563432617
test_f1_macro: 0.6195489904402395
---------------------
train_precision_macro: 0.805733760624328
train_recall_macro: 0.8050939426279249
train_f1_macro: 0.8050638668618363
---------------------


In [10]:
bin_df_succeeded = bin_df[bin_df['succeeded'] == 1]
bin_df_failed = bin_df[bin_df['succeeded'] == 0]

size = min(bin_df_succeeded.shape[0], bin_df_failed.shape[0])

bin_df_failed_sampled = bin_df_failed.sample(n = size , random_state = 42)
bin_df_succeded_sampled = bin_df_succeeded.sample(n = size , random_state = 42)

equal_df = pd.concat([bin_df_succeded_sampled, bin_df_failed_sampled])

print(equal_df.shape)
print(equal_df['succeeded'].sum())

(6122, 1927)
3061.0


In [20]:
from sklearn.neighbors import KNeighborsClassifier

XTrain, XTest, yTrain, yTest = train_test_split(equal_df[bin_cols], equal_df['succeeded'], test_size=0.2, random_state=42)

k = 20
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(XTrain, yTrain)


KNeighborsClassifier(n_neighbors=20)

In [21]:
y_pred = clf.predict(XTest)
print(f'Accuracy: {metrics.accuracy_score(yTest, y_pred)}')
print(f'Precision: {metrics.precision_score(yTest, y_pred)}')
print(f'Recall: {metrics.recall_score(yTest, y_pred)}')
print(f'F1: {metrics.f1_score(yTest, y_pred)}')
print(f'Confusion matrix: \n{metrics.confusion_matrix(yTest, y_pred)}')


Accuracy: 0.5551020408163265
Precision: 0.6652719665271967
Recall: 0.2548076923076923
F1: 0.3684820393974507
Confusion matrix: 
[[521  80]
 [465 159]]


In [46]:
parameters = {'n_neighbors':range(1,50,2) }
knn = KNeighborsClassifier()
clf = GridSearchCV(knn, parameters,scoring=metrics.make_scorer(metrics.f1_score, greater_is_better=True))
clf.fit(XTrain, yTrain)

print(f'Best score: {clf.best_score_}, Best parameters: {clf.best_params_}')
models_scores_dic['KNN'] = clf.best_score_

Best score: 0.5149597810590395, Best parameters: {'n_neighbors': 1}


In [47]:
from sklearn import svm
import tqdm.notebook as tqdm
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

with tqdm.tqdm(total = len(kernels)) as pbar:
    for k in kernels:
        clf = svm.SVC(kernel=k, C=1, random_state=42)
        clf.fit(XTrain, yTrain)
        y_pred = clf.predict(XTest)
        print(k)
        print(f'Accuracy: {metrics.accuracy_score(yTest, y_pred)}')
        print(f'Precision: {metrics.precision_score(yTest, y_pred)}')
        print(f'Recall: {metrics.recall_score(yTest, y_pred)}')
        print(f'F1: {metrics.f1_score(yTest, y_pred)}\n')
        pbar.update(1)
        models_scores_dic[f'SVC_{k}'] = metrics.f1_score(yTest, y_pred)

  0%|          | 0/4 [00:00<?, ?it/s]

linear
Accuracy: 0.5893877551020408
Precision: 0.5990180032733224
Recall: 0.5865384615384616
F1: 0.5927125506072874

poly
Accuracy: 0.6024489795918367
Precision: 0.6940509915014165
Recall: 0.3926282051282051
F1: 0.5015353121801434

rbf
Accuracy: 0.636734693877551
Precision: 0.6460032626427407
Recall: 0.6346153846153846
F1: 0.6402586903799514

sigmoid
Accuracy: 0.6261224489795918
Precision: 0.643598615916955
Recall: 0.5961538461538461
F1: 0.6189683860232945



In [48]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(XTrain, yTrain)
y_pred = gnb.predict(XTest)
print(f'Accuracy: {metrics.accuracy_score(yTest, y_pred)}')
print(f'Precision: {metrics.precision_score(yTest, y_pred)}')
print(f'Recall: {metrics.recall_score(yTest, y_pred)}')
print(f'F1: {metrics.f1_score(yTest, y_pred)}\n')
models_scores_dic['GNB'] = metrics.f1_score(yTest, y_pred)


Accuracy: 0.5004081632653061
Precision: 0.505736137667304
Recall: 0.8477564102564102
F1: 0.6335329341317366



In [49]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

decision_tree = tree.DecisionTreeClassifier()
dt = decision_tree.fit(XTrain, yTrain)
y_pred = dt.predict(XTest)
print(f'Accuracy: {metrics.accuracy_score(yTest, y_pred)}')
print(f'Precision: {metrics.precision_score(yTest, y_pred)}')
print(f'Recall: {metrics.recall_score(yTest, y_pred)}')
print(f'F1: {metrics.f1_score(yTest, y_pred)}\n')
models_scores_dic['DT'] = metrics.f1_score(yTest, y_pred)
# renderTree(dt, bin_cols)

Accuracy: 0.5714285714285714
Precision: 0.5779527559055118
Recall: 0.5881410256410257
F1: 0.5830023828435267



In [50]:
forest = RandomForestClassifier(n_estimators=500, random_state=42)
trained_forest = forest.fit(XTrain, yTrain)
y_pred = trained_forest.predict(XTest)
print(f'Accuracy: {metrics.accuracy_score(yTest, y_pred)}')
print(f'Precision: {metrics.precision_score(yTest, y_pred)}')
print(f'Recall: {metrics.recall_score(yTest, y_pred)}')
print(f'F1: {metrics.f1_score(yTest, y_pred)}\n')
models_scores_dic['RF'] = metrics.f1_score(yTest, y_pred)


Accuracy: 0.6351020408163265
Precision: 0.6376360808709176
Recall: 0.657051282051282
F1: 0.6471981057616417



In [51]:
from sklearn.neural_network import MLPClassifier
for solve in ['lbfgs', 'sgd', 'adam']:
    clf = MLPClassifier(solver=solve, alpha=1e-5, random_state=42, max_iter=1000)
    clf.fit(XTrain, yTrain)
    y_pred = clf.predict(XTest)
    print(f'Accuracy: {metrics.accuracy_score(yTest, y_pred)}')
    print(f'Precision: {metrics.precision_score(yTest, y_pred)}')
    print(f'Recall: {metrics.recall_score(yTest, y_pred)}')
    print(f'F1: {metrics.f1_score(yTest, y_pred)}\n')
    models_scores_dic[f'MLP_{solve}'] = metrics.f1_score(yTest, y_pred)

Accuracy: 0.5787755102040816
Precision: 0.5909090909090909
Recall: 0.5625
F1: 0.5763546798029557





Accuracy: 0.6114285714285714
Precision: 0.6213114754098361
Recall: 0.6073717948717948
F1: 0.6142625607779578

Accuracy: 0.5812244897959183
Precision: 0.5857805255023184
Recall: 0.6073717948717948
F1: 0.5963808025177026



In [63]:
max(list(models_scores_dic.values()))

0.6471981057616417