# Machine learning models

We will start with importing the modules and loading the 3 dataframes:

In [1]:
import pandas as pd
from lib.unsupervised_learning import *

from sklearn import svm, tree, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

import tqdm.notebook as tqdm

In [2]:
bin_df = pd.read_csv('data/dataframes/df_after_cols_reduction.csv').iloc[:,1:]
pca_2d_df = pd.read_csv('data/dataframes/pca_2d_df.csv').iloc[:,1:]
pca_3d_df = pd.read_csv('data/dataframes/pca_3d_df.csv').iloc[:,1:]

print(f'Binary dataframe shape: {bin_df.shape}')
print(f'PCA 2D dataframe shape: {pca_2d_df.shape}')
print(f'PCA 3D dataframe shape: {pca_3d_df.shape}')


Binary dataframe shape: (10070, 1927)
PCA 2D dataframe shape: (10070, 26)
PCA 3D dataframe shape: (10070, 32)


In [3]:
cat_cols = ['company_name', 'company_about','founded', 'business model','employees','product stage','status','funding stage','succeeded']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price', 'geo_market_per']
tag_cols = [col for col in bin_df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in bin_df.columns if col.startswith('targetmarket_')]
sector_list = [col for col in bin_df.columns if col.startswith("sector_")]
target_ind_list = [col  for col in bin_df.columns if col.startswith("industry_")]
technology_list = [col  for col in bin_df.columns if col.startswith("technology_")]


bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list
pca_2d_cols = [col for col in pca_2d_df.columns if col not in cat_cols and col not in num_cols]
pca_3d_cols = [col for col in pca_3d_df.columns if col not in cat_cols and col not in num_cols]

In [4]:
print(f"Categorical cols : {len(cat_cols)}")
print(f"Numerical cols : {len(num_cols)}")
print(f"Tag cols : {len(tag_cols)}")
print(f"Targetmarket cols : {len(targetmarket_cols)}")
print(f"Sector cols : {len(sector_list)}")
print(f"Industry cols : {len(target_ind_list)}")
print(f"Technology cols : {len(technology_list)}")
print('---- Totals ----')
print(f"Total binary cols : {len(bin_cols)}")
print(f"Toatl PCA 2D cols : {len(pca_2d_cols)}")
print(f"Total PCA 3D cols : {len(pca_3d_cols)}")



Categorical cols : 9
Numerical cols : 5
Tag cols : 1599
Targetmarket cols : 117
Sector cols : 41
Industry cols : 81
Technology cols : 75
---- Totals ----
Total binary cols : 1913
Toatl PCA 2D cols : 12
Total PCA 3D cols : 18


In this noteook, we will train few machine learning models on our datasets to find the best model to predict the target variable.  
The models we will use are:
- [Logistic Regression](#lr)
- [K-Nearest Neighbours](#knn)
- [Support Vector Machine](#svm)
- [Gaussian Naive Bayes](#gnb)
- [Decision Tree](#dt)
- [Random Forest](#rf)
- [Multi-layer Perceptron](#mlp)  


First we need to prepare our data.  
As we saw in the visualizations, we have almost 70% of successful companies in the dataset.  
In order to avoid biased results, we need to train the models on an evenly distributed succeeded column in the dataset.  
i.e the train data should contain 50% succesfull companies and 50% failed companies.


In [5]:
bin_df_succeeded = bin_df[bin_df['succeeded'] == 1]
bin_df_failed = bin_df[bin_df['succeeded'] == 0]

size = min(bin_df_succeeded.shape[0], bin_df_failed.shape[0])

bin_df_failed_sampled = bin_df_failed.sample(n = size , random_state = 42)
bin_df_succeded_sampled = bin_df_succeeded.sample(n = size , random_state = 42)

equal_df = pd.concat([bin_df_succeded_sampled, bin_df_failed_sampled])

print(f'Binary equal dataframe shape: {equal_df.shape}')
print(f"Binary equal dataframe succeeded companies: {equal_df['succeeded'].sum()}")

Binary equal dataframe shape: (6122, 1927)
Binary equal dataframe succeeded companies: 3061.0


We will do the same for both pca dataframes:

In [6]:
print("\n2D PCA data:")

pca_2d_df_succeeded = pca_2d_df[pca_2d_df['succeeded'] == 1]
pca_2d_df_failed = pca_2d_df[pca_2d_df['succeeded'] == 0]

size = min(pca_2d_df_succeeded.shape[0], pca_2d_df_failed.shape[0])

pca_2d_df_succeded_sampled = pca_2d_df_succeeded.sample(n = size , random_state = 42)
pca_2d_df_failed_sampled = pca_2d_df_failed.sample(n = size , random_state = 42)

pca_2d_equal_df = pd.concat([pca_2d_df_succeded_sampled, pca_2d_df_failed_sampled])

print(f'2D PCA equal dataframe shape: {pca_2d_equal_df.shape}')
print(f"2D PCA equal dataframe succeeded companies: {pca_2d_equal_df['succeeded'].sum()}")

print("\n3D PCA data:")

pca_3d_df_succeeded = pca_3d_df[pca_3d_df['succeeded'] == 1]
pca_3d_df_failed = pca_3d_df[pca_3d_df['succeeded'] == 0]

size = min(pca_3d_df_succeeded.shape[0], pca_3d_df_failed.shape[0])

pca_3d_df_succeded_sampled = pca_3d_df_succeeded.sample(n = size , random_state = 42)
pca_3d_df_failed_sampled = pca_3d_df_failed.sample(n = size , random_state = 42)

pca_3d_equal_df = pd.concat([pca_3d_df_succeded_sampled, pca_3d_df_failed_sampled])

print(f'3D PCA equal dataframe shape: {pca_3d_equal_df.shape}')
print(f"3D PCA equal dataframe succeeded companies: {pca_3d_equal_df['succeeded'].sum()}")


2D PCA data:
2D PCA equal dataframe shape: (6122, 26)
2D PCA equal dataframe succeeded companies: 3061.0

3D PCA data:
3D PCA equal dataframe shape: (6122, 32)
3D PCA equal dataframe succeeded companies: 3061.0


In [7]:
bin_XTrain, bin_XTest, bin_yTrain, bin_yTest = train_test_split(equal_df[bin_cols], equal_df['succeeded'], test_size=0.2, random_state=42, stratify=equal_df['succeeded'])
pca_2d_XTrain, pca_2d_XTest, pca_2d_yTrain, pca_2d_yTest = train_test_split(pca_2d_equal_df[pca_2d_cols], pca_2d_equal_df['succeeded'], test_size=0.2, random_state=42, stratify=pca_2d_equal_df['succeeded'])
pca_3d_XTrain, pca_3d_XTest, pca_3d_yTrain, pca_3d_yTest = train_test_split(pca_3d_equal_df[pca_3d_cols], pca_3d_equal_df['succeeded'], test_size=0.2, random_state=42, stratify=pca_3d_equal_df['succeeded'])


print(f"bin_XTrain shape: {bin_XTrain.shape}")
print(f"bin_yTrain shape: {bin_yTrain.shape}")

print(f"bin_XTest shape: {bin_XTest.shape}")
print(f"bin_yTest shape: {bin_yTest.shape}")

bin_XTrain shape: (4897, 1913)
bin_yTrain shape: (4897,)
bin_XTest shape: (1225, 1913)
bin_yTest shape: (1225,)


We will create functions to train different models and return the f1 score :

### Logistic Regression
we will use ShuffleSplit cross validation to train the model. 
<a id='lr'></a>

In [18]:
def train_logistic_regression(df, cols):
    cv = ShuffleSplit(n_splits=15, test_size=0.2, random_state=42)
    clf = LogisticRegression(max_iter=150)
    scoring = ['f1_macro', 'precision_macro', 'recall_macro','accuracy']
    # scores = cross_val_score(clf, df[cols], df['succeeded'], cv=cv, scoring='f1_macro')
    scores = cross_validate(clf, df[cols], df['succeeded'], cv=cv, scoring=scoring)
    f1 = max(list(scores['test_f1_macro']))
    accuracy = max(list(scores['test_precision_macro']))
    precision = max(list(scores['test_recall_macro']))
    recall = max(list(scores['test_accuracy']))
    return  {'test_f1_macro': f1, 'test_accuracy': accuracy, 'test_precision_macro': precision, 'test_recall_macro': recall}

### K Nearest Neighbours
we will use GridSearch to find best parameters for KNN algorithm. 
<a id='knn'></a>

In [19]:
def train_knn(XTrain, yTrain):

    parameters = {'n_neighbors':range(2,50,2), 'weights':['uniform', 'distance']}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, parameters,scoring=metrics.make_scorer(metrics.f1_score, greater_is_better=True))
    clf.fit(XTrain, yTrain)
    y_pred = clf.predict(XTest)
    f1 = metrics.f1_score(yTest, y_pred)
    accuracy = metrics.accuracy_score(yTest, y_pred)
    precision = metrics.precision_score(yTest, y_pred)
    recall = metrics.recall_score(yTest, y_pred)
    return  {'test_f1_macro': f1, 'test_accuracy': accuracy, 'test_precision_macro': precision, 'test_recall_macro': recall}

### Support Vector Machine
we will use GridSearch to find best parameters for SVC algorithm. 
<a id='svc'></a>

In [20]:
def train_svm(XTrain, yTrain):
    parameters = {'C':[0.1,1,10], 'kernel':['linear', 'rbf']}
    s = svm.SVC()
    clf = GridSearchCV(s, parameters,scoring=metrics.make_scorer(metrics.f1_score, greater_is_better=True))
    clf.fit(XTrain, yTrain)
    y_pred = clf.predict(XTest)
    f1 = metrics.f1_score(yTest, y_pred)
    accuracy = metrics.accuracy_score(yTest, y_pred)
    precision = metrics.precision_score(yTest, y_pred)
    recall = metrics.recall_score(yTest, y_pred)
    return  {'test_f1_macro': f1, 'test_accuracy': accuracy, 'test_precision_macro': precision, 'test_recall_macro': recall}

### Gaussian Naive Bayes
<a id='gnb'></a>

In [21]:
def train_gnb(XTrain, yTrain, XTest, yTest):
    gnb = GaussianNB()
    gnb.fit(XTrain, yTrain)
    y_pred = gnb.predict(XTest)
    f1 = metrics.f1_score(yTest, y_pred)
    accuracy = metrics.accuracy_score(yTest, y_pred)
    precision = metrics.precision_score(yTest, y_pred)
    recall = metrics.recall_score(yTest, y_pred)
    return  {'test_f1_macro': f1, 'test_accuracy': accuracy, 'test_precision_macro': precision, 'test_recall_macro': recall}

### Decision Tree
<a id='dt'></a>

In [22]:
def train_dt(XTrain, yTrain, XTest, yTest):
    dt = tree.DecisionTreeClassifier()
    dt.fit(XTrain, yTrain)
    y_pred = dt.predict(XTest)
    f1 = metrics.f1_score(yTest, y_pred)
    accuracy = metrics.accuracy_score(yTest, y_pred)
    precision = metrics.precision_score(yTest, y_pred)
    recall = metrics.recall_score(yTest, y_pred)
    return  {'test_f1_macro': f1, 'test_accuracy': accuracy, 'test_precision_macro': precision, 'test_recall_macro': recall}

### Random Forest Classifier
<a id='rf'></a>

In [23]:
def train_rf(XTrain, yTrain, XTest, yTest):
    rf = RandomForestClassifier(n_estimators=300, random_state=42)
    rf.fit(XTrain, yTrain)
    y_pred = rf.predict(XTest)
    f1 = metrics.f1_score(yTest, y_pred)
    accuracy = metrics.accuracy_score(yTest, y_pred)
    precision = metrics.precision_score(yTest, y_pred)
    recall = metrics.recall_score(yTest, y_pred)
    return  {'test_f1_macro': f1, 'test_accuracy': accuracy, 'test_precision_macro': precision, 'test_recall_macro': recall}

### Neural Network - Multi-layer Perceptron
<a id='mlp'></a>

In [24]:
def train_mlp(XTrain, yTrain, XTest, yTest):
    
    mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
    mlp.fit(XTrain, yTrain)
    y_pred = mlp.predict(XTest)
    f1 = metrics.f1_score(yTest, y_pred)
    accuracy = metrics.accuracy_score(yTest, y_pred)
    precision = metrics.precision_score(yTest, y_pred)
    recall = metrics.recall_score(yTest, y_pred)
    return  {'test_f1_macro': f1, 'test_accuracy': accuracy, 'test_precision_macro': precision, 'test_recall_macro': recall}

Now, we will run every algorithm on the each dataset and compare the results:

In [26]:
# runtime ~ 47 minutes
import tqdm.notebook as tqdm
import timeit
from datetime import timedelta
dfs_scores = {}
t0 = timeit.default_timer()
dfs ={'bin_df': (bin_df,bin_cols), 'pca_2d_df': (pca_2d_df,pca_2d_cols), 'pca_3d_df': (pca_3d_df,pca_3d_cols)}
with tqdm.tqdm(total=len(dfs)*7) as pbar:
    for key, df in dfs.items():
        scores = {} 
        
        XTrain, XTest, yTrain, yTest = train_test_split(df[0][df[1]], df[0]['succeeded'], test_size=0.2, random_state=42, stratify=df[0]['succeeded'])

        scores['LogisticRegression'] = train_logistic_regression(df[0], df[1])
        print(f'Elapsed time: {timedelta(seconds = timeit.default_timer() - t0)}, scores: {scores["LogisticRegression"]}')
        pbar.update(1)

        scores['KNN'] = train_knn(XTrain, yTrain)
        print(f'Elapsed time: {timedelta(seconds = timeit.default_timer() - t0)}, scores: {scores["KNN"]}')
        pbar.update(1)

        scores['SVM'] = train_svm(XTrain, yTrain)
        print(f'Elapsed time: {timedelta(seconds = timeit.default_timer() - t0)}, scores: {scores["SVM"]}')
        pbar.update(1)

        scores['GNB'] = train_gnb(XTrain, yTrain, XTest, yTest)
        print(f'Elapsed time: {timedelta(seconds = timeit.default_timer() - t0)}, scores: {scores["GNB"]}')
        pbar.update(1)

        scores['DT'] = train_dt(XTrain, yTrain, XTest, yTest)
        print(f'Elapsed time: {timedelta(seconds = timeit.default_timer() - t0)}, scores: {scores["DT"]}')
        pbar.update(1)

        scores['RF'] = train_rf(XTrain, yTrain, XTest, yTest)
        print(f'Elapsed time: {timedelta(seconds = timeit.default_timer() - t0)}, scores: {scores["RF"]}')
        pbar.update(1)

        scores['MLP'] = train_mlp(XTrain, yTrain, XTest, yTest)
        print(f'Elapsed time: {timedelta(seconds = timeit.default_timer() - t0)}, scores: {scores["MLP"]}')
        pbar.update(1)

        dfs_scores[key] = scores


  0%|          | 0/21 [00:00<?, ?it/s]

Elapsed time: 0:00:54.948733, scores: {'test_f1_macro': 0.6040409344741021, 'test_accuracy': 0.6253977513400062, 'test_precision_macro': 0.599258192770106, 'test_recall_macro': 0.6951340615690169}
Elapsed time: 0:03:30.154019, scores: {'test_f1_macro': 0.7912159947558177, 'test_accuracy': 0.6837140019860973, 'test_precision_macro': 0.7319587628865979, 'test_recall_macro': 0.8609129814550642}


Now we 

In [None]:
model_scores = {'LogisticRegression': [], 'KNN': [], 'SVM': [], 'GNB': [], 'DT': [], 'RF': [], 'MLP': []}
model_list = {}
for key, scores in dfs_scores.items():
    for model, score in scores.items():
        model_scores[model].append(score[1])
        model_list[model] = score[0]

In [None]:
model_scores

In [None]:
import plotly.graph_objects as go
x = ['Binary Data', 'PCA 2D Data', 'PCA 3D Data']

fig = go.Figure(data=[
    go.Bar(name='Logistic Regression', x=x, y=list(model_scores.values())[0]),
    go.Bar(name='Knn', x=x, y=list(model_scores.values())[1]),
    go.Bar(name='SVM', x=x, y=list(model_scores.values())[2]),
    go.Bar(name='GNB', x=x, y=list(model_scores.values())[3]),
    go.Bar(name='DT', x=x, y=list(model_scores.values())[4]),
    go.Bar(name='RF', x=x, y=list(model_scores.values())[5]),
    go.Bar(name='MLP', x=x, y=list(model_scores.values())[6])
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

In [None]:
for key, model in model_list.items():
    y_pred = model.predict(bin_XTest)
    print(f"{key} Accuracy Score: {metrics.accuracy_score(bin_yTest, y_pred)}")


In [None]:
model_list