# Machine learning models

We will start with importing the modules and loading the 3 dataframes:

In [1]:
import pandas as pd
from lib.unsupervised_learning import *

from sklearn import svm, tree, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

import tqdm.notebook as tqdm

In [2]:
bin_df = pd.read_csv('data/dataframes/df_after_cols_reduction.csv').iloc[:,1:]
pca_2d_df = pd.read_csv('data/dataframes/pca_2d_df.csv').iloc[:,1:]
pca_3d_df = pd.read_csv('data/dataframes/pca_3d_df.csv').iloc[:,1:]

print(f'Binary dataframe shape: {bin_df.shape}')
print(f'PCA 2D dataframe shape: {pca_2d_df.shape}')
print(f'PCA 3D dataframe shape: {pca_3d_df.shape}')


Binary dataframe shape: (10070, 1927)
PCA 2D dataframe shape: (10070, 26)
PCA 3D dataframe shape: (10070, 32)


In [3]:
cat_cols = ['company_name', 'company_about','founded', 'business model','employees','product stage','status','funding stage','succeeded']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price', 'geo_market_per']
tag_cols = [col for col in bin_df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in bin_df.columns if col.startswith('targetmarket_')]
sector_list = [col for col in bin_df.columns if col.startswith("sector_")]
target_ind_list = [col  for col in bin_df.columns if col.startswith("industry_")]
technology_list = [col  for col in bin_df.columns if col.startswith("technology_")]


bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list
pca_2d_cols = [col for col in pca_2d_df.columns if col not in cat_cols and col not in num_cols]
pca_3d_cols = [col for col in pca_3d_df.columns if col not in cat_cols and col not in num_cols]

In [4]:
print(f"Categorical cols : {len(cat_cols)}")
print(f"Numerical cols : {len(num_cols)}")
print(f"Tag cols : {len(tag_cols)}")
print(f"Targetmarket cols : {len(targetmarket_cols)}")
print(f"Sector cols : {len(sector_list)}")
print(f"Industry cols : {len(target_ind_list)}")
print(f"Technology cols : {len(technology_list)}")
print('---- Totals ----')
print(f"Total binary cols : {len(bin_cols)}")
print(f"Toatl PCA 2D cols : {len(pca_2d_cols)}")
print(f"Total PCA 3D cols : {len(pca_3d_cols)}")



Categorical cols : 9
Numerical cols : 5
Tag cols : 1599
Targetmarket cols : 117
Sector cols : 41
Industry cols : 81
Technology cols : 75
---- Totals ----
Total binary cols : 1913
Toatl PCA 2D cols : 12
Total PCA 3D cols : 18


In this noteook, we will train few machine learning models on our datasets to find the best model to predict the target variable.  
The models we will use are:
- [Logistic Regression](#lr)
- [K-Nearest Neighbours](#knn)
- [Decision Tree](#dt)
- [Random Forest](#rf)
- [Support Vector Machine](#svm)
- [Gaussian Naive Bayes](#gnb)
- [Multi-layer Perceptron](#mlp)  

We will save the scores and models in a directory to compare the results.

In [5]:
# models and scores dictionary

models_scores_dic = {}

First we need to prepare our data.  
As we saw in the visualizations, we have almost 70% of successful companies in the dataset.  
In order to avoid biased results, we need to train the models on an evenly distributed succeeded column in the dataset.  
i.e the train data should contain 50% succesfull companies and 50% failed companies.


In [6]:
bin_df_succeeded = bin_df[bin_df['succeeded'] == 1]
bin_df_failed = bin_df[bin_df['succeeded'] == 0]

size = min(bin_df_succeeded.shape[0], bin_df_failed.shape[0])

bin_df_failed_sampled = bin_df_failed.sample(n = size , random_state = 42)
bin_df_succeded_sampled = bin_df_succeeded.sample(n = size , random_state = 42)

equal_df = pd.concat([bin_df_succeded_sampled, bin_df_failed_sampled])

print(f'Binary equal dataframe shape: {equal_df.shape}')
print(f"Binary equal dataframe succeeded companies: {equal_df['succeeded'].sum()}")

Binary equal dataframe shape: (6122, 1927)
Binary equal dataframe succeeded companies: 3061.0


We will do the same for both pca dataframes:

In [7]:
print("\n2D PCA data:")

pca_2d_df_succeeded = pca_2d_df[pca_2d_df['succeeded'] == 1]
pca_2d_df_failed = pca_2d_df[pca_2d_df['succeeded'] == 0]

size = min(pca_2d_df_succeeded.shape[0], pca_2d_df_failed.shape[0])

pca_2d_df_succeded_sampled = pca_2d_df_succeeded.sample(n = size , random_state = 42)
pca_2d_df_failed_sampled = pca_2d_df_failed.sample(n = size , random_state = 42)

pca_2d_equal_df = pd.concat([pca_2d_df_succeded_sampled, pca_2d_df_failed_sampled])

print(f'2D PCA equal dataframe shape: {pca_2d_equal_df.shape}')
print(f"2D PCA equal dataframe succeeded companies: {pca_2d_equal_df['succeeded'].sum()}")

print("\n3D PCA data:")

pca_3d_df_succeeded = pca_3d_df[pca_3d_df['succeeded'] == 1]
pca_3d_df_failed = pca_3d_df[pca_3d_df['succeeded'] == 0]

size = min(pca_3d_df_succeeded.shape[0], pca_3d_df_failed.shape[0])

pca_3d_df_succeded_sampled = pca_3d_df_succeeded.sample(n = size , random_state = 42)
pca_3d_df_failed_sampled = pca_3d_df_failed.sample(n = size , random_state = 42)

pca_3d_equal_df = pd.concat([pca_3d_df_succeded_sampled, pca_3d_df_failed_sampled])

print(f'3D PCA equal dataframe shape: {pca_3d_equal_df.shape}')
print(f"3D PCA equal dataframe succeeded companies: {pca_3d_equal_df['succeeded'].sum()}")


2D PCA data:
2D PCA equal dataframe shape: (6122, 26)
2D PCA equal dataframe succeeded companies: 3061.0

3D PCA data:
3D PCA equal dataframe shape: (6122, 32)
3D PCA equal dataframe succeeded companies: 3061.0


In [30]:
bin_XTrain, bin_XTest, bin_yTrain, bin_yTest = train_test_split(equal_df[bin_cols], equal_df['succeeded'], test_size=0.2, random_state=42, stratify=equal_df['succeeded'])
pca_2d_XTrain, pca_2d_XTest, pca_2d_yTrain, pca_2d_yTest = train_test_split(pca_2d_equal_df[pca_2d_cols], pca_2d_equal_df['succeeded'], test_size=0.2, random_state=42, stratify=pca_2d_equal_df['succeeded'])
pca_3d_XTrain, pca_3d_XTest, pca_3d_yTrain, pca_3d_yTest = train_test_split(pca_3d_equal_df[pca_3d_cols], pca_3d_equal_df['succeeded'], test_size=0.2, random_state=42, stratify=pca_3d_equal_df['succeeded'])


print(f"bin_XTrain shape: {bin_XTrain.shape}")
print(f"bin_yTrain shape: {bin_yTrain.shape}")

print(f"bin_XTest shape: {bin_XTest.shape}")
print(f"bin_yTest shape: {bin_yTest.shape}")

bin_XTrain shape: (4897, 1913)
bin_yTrain shape: (4897,)
bin_XTest shape: (1225, 1913)
bin_yTest shape: (1225,)


In [28]:
def train_logistic_regression(df, cols):
    cv = ShuffleSplit(n_splits=15, test_size=0.2, random_state=42)
    clf = LogisticRegression(max_iter=150)

    scores = cross_val_score(clf, df[cols], df['succeeded'], cv=cv, scoring='f1_macro')
    return scores.max()

In [33]:
def train_knn(XTrain, yTrain):

    parameters = {'n_neighbors':range(2,50,2), 'weights':['uniform', 'distance']}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, parameters,scoring=metrics.make_scorer(metrics.f1_score, greater_is_better=True))
    clf.fit(XTrain, yTrain)
    return clf.best_score_

In [34]:
def train_svm(XTrain, yTrain):
    parameters = {'C':[0.1,1,10,100,1000], 'kernel':['linear', 'poly', 'rbf', 'sigmoid']}
    s = svm.SVC()
    clf = GridSearchCV(s, parameters,scoring=metrics.make_scorer(metrics.f1_score, greater_is_better=True))
    clf.fit(XTrain, yTrain)
    return clf.best_score_

In [35]:
def train_gnb(XTrain, yTrain, XTest, yTest):
    gnb = GaussianNB()
    gnb.fit(XTrain, yTrain)
    y_pred = gnb.predict(XTest)
    return metrics.f1_score(yTest, y_pred)

In [36]:
def train_dt(XTrain, yTrain, XTest, yTest):
    dt = tree.DecisionTreeClassifier()
    dt.fit(XTrain, yTrain)
    y_pred = dt.predict(XTest)
    return dt.score(yTest, y_pred)

In [37]:
def train_rf(XTrain, yTrain, XTest, yTest):
    rf = RandomForestClassifier(n_estimators=500, random_state=42)
    rf.fit(XTrain, yTrain)
    y_pred = rf.predict(XTest)
    return metrics.f1_score(yTest, y_pred)

In [38]:
def train_mlp(XTrain, yTrain, XTest, yTest):
    best_score = []
    for solve in ['lbfgs', 'sgd', 'adam']:
        mlp = MLPClassifier(solver=solve, hidden_layer_sizes=(100,), max_iter=500, random_state=42)
        mlp.fit(XTrain, yTrain)
        y_pred = mlp.predict(XTest)
        best_score.append(metrics.f1_score(yTest, y_pred))
    return max(best_score)

In [39]:
dfs_scores = {}
dfs ={'bin_df': (bin_df,bin_cols), 'pca_2d_df': (pca_2d_df,pca_2d_cols), 'pca_3d_df': (pca_3d_df,pca_3d_cols)}
for key, df in dfs.items():
    scores = {} 
    XTrain, XTest, yTrain, yTest = train_test_split(df[0][df[1]], df[0]['succeeded'], test_size=0.2, random_state=42, stratify=df[0]['succeeded'])

    scores['LogisticRegression'] = train_logistic_regression(df[0], df[1])
    scores['KNN'] = train_knn(XTrain, yTrain)
    # scores['SVM'] = train_svm(XTrain, yTrain)
    scores['GNB'] = train_gnb(XTrain, yTrain, XTest, yTest)
    scores['DT'] = train_dt(XTrain, yTrain, XTest, yTest)
    # scores['RF'] = train_rf(XTrain, yTrain, XTest, yTest)
    # scores['MLP'] = train_mlp(XTrain, yTrain, XTest, yTest)

    dfs_scores[key] = scores




ValueError: Expected 2D array, got 1D array instead:
array=[0. 1. 1. ... 0. 0. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [32]:
dfs_scores

{'bin_df': {'LogisticRegression': 0.6040409344741021},
 'pca_2d_df': {'LogisticRegression': 0.41885964912280704},
 'pca_3d_df': {'LogisticRegression': 0.4499967766428377}}

In [None]:
# lr = LogisticRegression(max_iter = 150)
# lr.fit(Xtrain, ytrain)

In [None]:
# ytrain_pred = lr.predict(Xtrain)
# ytest_pred = lr.predict(Xtest)

In [None]:
# print("Train results:")
# print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
# print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
# print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
# print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
# print("---------------------")
# print("Test results:")
# print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
# print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
# print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
# print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

In [None]:
# # create a new dataframe with succeeded = 1 and succeeded = 0

# bin_df_succeeded = bin_df[bin_df['succeeded'] == 1]
# bin_df_failed = bin_df[bin_df['succeeded'] == 0]

# size = bin_df_succeeded.shape[0]
# bin_df_fialed_sampled = bin_df_failed.sample(n = size , random_state = 42)

# equal_df = pd.concat([bin_df_succeeded, bin_df_fialed_sampled])

# print(equal_df.shape)
# print(equal_df['succeeded'].sum())



In [None]:
# Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[num_cols + bin_cols], equal_df['succeeded'],test_size = 0.2)
# lr = LogisticRegression()
# lr.fit(Xtrain, ytrain)


In [None]:
# ytrain_pred = lr.predict(Xtrain)
# ytest_pred = lr.predict(Xtest)

In [None]:
# print("Train results:")
# print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
# print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
# print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
# print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
# print("---------------------")
# print("Test results:")
# print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
# print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
# print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
# print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

In [None]:
# Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[bin_cols], equal_df['succeeded'],test_size = 0.2)

# print(f"Xtrain shape: {Xtrain.shape}")
# print(f"ytrain shape: {ytrain.shape}")
# print(f"Xtest shape: {Xtest.shape}")
# print(f"ytest shape: {ytest.shape}")

# lr = LogisticRegression(max_iter=1500)
# lr.fit(Xtrain, ytrain)

# ytrain_pred = lr.predict(Xtrain)
# ytest_pred = lr.predict(Xtest)

# print("\n\nTrain results:")
# print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
# print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
# print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
# print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
# print("---------------------")
# print("Test results:")
# print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
# print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
# print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
# print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

In [None]:
# bin_df_succeeded = bin_df[bin_df['succeeded'] == 1]
# bin_df_failed = bin_df[bin_df['succeeded'] == 0]

# size = bin_df_succeeded.shape[0]
# bin_df_fialed_sampled = bin_df_failed.sample(n = size , random_state = 42)

# equal_df = pd.concat([bin_df_succeeded, bin_df_fialed_sampled])

# print(equal_df.shape)
# print(equal_df['succeeded'].sum())

# # print(get_best_init_params_for_k_means(equal_df[bin_cols],8,['k-means++','random'],range(5,50,5),42))

Best score after cross-validation: 0.6201803664005068


Best score: 0.514350687019465, Best parameters: {'n_neighbors': 1}


In [11]:


# kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# with tqdm.tqdm(total = len(kernels)) as pbar:
#     for k in kernels:
#         clf = svm.SVC(kernel=k, C=1, random_state=42)
#         clf.fit(bin_XTrain, bin_yTrain)
#         y_pred = clf.predict(bin_XTest)
#         print(k)
#         print(f'Accuracy: {metrics.accuracy_score(bin_yTest, y_pred)}')
#         print(f'Precision: {metrics.precision_score(bin_yTest, y_pred)}')
#         print(f'Recall: {metrics.recall_score(bin_yTest, y_pred)}')
#         print(f'F1: {metrics.f1_score(bin_yTest, y_pred)}\n')
#         pbar.update(1)
#         models_scores_dic[f'SVC_{k}'] = metrics.f1_score(bin_yTest, y_pred)

  0%|          | 0/4 [00:00<?, ?it/s]

linear
Accuracy: 0.5771428571428572
Precision: 0.5830388692579506
Recall: 0.5392156862745098
F1: 0.5602716468590833

poly
Accuracy: 0.603265306122449
Precision: 0.6863905325443787
Recall: 0.3790849673202614
F1: 0.48842105263157887

rbf
Accuracy: 0.6342857142857142
Precision: 0.6366666666666667
Recall: 0.6241830065359477
F1: 0.6303630363036303

sigmoid
Accuracy: 0.6122448979591837
Precision: 0.6261510128913443
Recall: 0.5555555555555556
F1: 0.5887445887445888



Accuracy: 0.4922448979591837
Precision: 0.49490835030549896
Recall: 0.7941176470588235
F1: 0.6097867001254705



Accuracy: 0.5689795918367347
Precision: 0.5664556962025317
Recall: 0.5849673202614379
F1: 0.5755627009646302



Accuracy: 0.6236734693877551
Precision: 0.6215780998389694
Recall: 0.630718954248366
F1: 0.6261151662611516



In [15]:

    # clf = MLPClassifier(solver=solve, alpha=1e-5, random_state=42, max_iter=1000)
    # clf.fit(bin_XTrain, bin_yTrain)
    # y_pred = clf.predict(bin_XTest)
    # print(f'Accuracy: {metrics.accuracy_score(bin_yTest, y_pred)}')
    # print(f'Precision: {metrics.precision_score(bin_yTest, y_pred)}')
    # print(f'Recall: {metrics.recall_score(bin_yTest, y_pred)}')
    # print(f'F1: {metrics.f1_score(bin_yTest, y_pred)}\n')
    # models_scores_dic[f'MLP_{solve}'] = metrics.f1_score(bin_yTest, y_pred)

Accuracy: 0.5583673469387755
Precision: 0.5606837606837607
Recall: 0.5359477124183006
F1: 0.5480367585630743





Accuracy: 0.603265306122449
Precision: 0.6022727272727273
Recall: 0.6062091503267973
F1: 0.6042345276872965

Accuracy: 0.5746938775510204
Precision: 0.5737439222042139
Recall: 0.5784313725490197
F1: 0.5760781122864117



In [16]:
max(list(models_scores_dic.values()))

0.6303630363036303

In [19]:
cv = ShuffleSplit(n_splits=15, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=150)

scores = cross_val_score(clf, pca_3d_df[pca_3d_cols], pca_3d_df['succeeded'], cv=cv, scoring='f1_macro')
# print(scores)
# models_scores_dic['LogisticRegression'] = scores.max()
print(f'Best score after cross-validation: {scores.max()}')

Best score after cross-validation: 0.4499967766428377


In [26]:
gnb = GaussianNB()
gnb.fit(pca_2d_XTrain,pca_2d_yTrain)
y_pred = gnb.predict(pca_2d_XTest)
print(f'Accuracy: {metrics.accuracy_score(pca_2d_yTest, y_pred)}')
print(f'Precision: {metrics.precision_score(pca_2d_yTest, y_pred)}')
print(f'Recall: {metrics.recall_score(pca_2d_yTest, y_pred)}')
print(f'F1: {metrics.f1_score(pca_2d_yTest, y_pred)}\n')
# models_scores_dic['GNB'] = metrics.f1_score(pca_2d_yTest, y_pred)

Accuracy: 0.67974180734856
Precision: 0.7164093767867353
Recall: 0.8937232524964337
F1: 0.7953030783878133



In [25]:
gnb = GaussianNB()
gnb.fit(pca_3d_XTrain,pca_3d_yTrain)
y_pred = gnb.predict(pca_3d_XTest)
print(f'Accuracy: {metrics.accuracy_score(pca_3d_yTest, y_pred)}')
print(f'Precision: {metrics.precision_score(pca_3d_yTest, y_pred)}')
print(f'Recall: {metrics.recall_score(pca_3d_yTest, y_pred)}')
print(f'F1: {metrics.f1_score(pca_3d_yTest, y_pred)}\n')

Accuracy: 0.625124131082423
Precision: 0.7501933488012374
Recall: 0.6918687589158345
F1: 0.719851576994434

