# Training and comparison

Objective: Trainining of multiple models using the data and comparison of the results

In [92]:
#Importing most used modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import phik
sns.set_palette('viridis')
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from yellowbrick.classifier import ConfusionMatrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

Loading the datasets

In [27]:
#Loading the datasets
test_sets = ['numeric','categoricals_binned','one_hot_encoded','outliers_removed',
             'one_hot_encoded_rescaled','oversampled+','oversampled-','smoted+','smoted-']
datasets = {}
for key in test_sets:
    datasets[key] = pd.read_csv(f'datasets/{key}.csv')

In [47]:
#Defining the X,y spliter for ease of use
def df_splitter(dataset, target = 'Response'):
    X = dataset.drop(columns=[target])
    y = dataset[target].values
    return X, y

In [29]:
# Defining the printing function
def display_scores(scores):
    print("Scores:", scores)
    print("\nMean:", scores.mean())
    print("Standard deviation:", scores.std())

## Models

## Preliminary tests
Since we have some datasets options, we will try a first run of the models so we can choose a model to try improving the hyperparameters.

In [82]:
first_results = []

#### K-Neighbors Classifier

In [60]:
knn_clf = KNeighborsClassifier(n_neighbors=5, n_jobs = -1)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1216)
    knn_test = cross_val_score(rnd_clf,X_train,y_train, n_jobs=-1, cv=3, scoring='accuracy') #Default cv value = 5
    first_results.append({'model':'KNN', 'dataset':key, 'score':knn_test.mean()})
    print(f'\n-----\nDataset: {key}')
    display_scores(knn_test)


-----
Dataset: numeric
Scores: [0.75421918 0.75803361 0.75552771]

Mean: 0.7559268322726007
Standard deviation: 0.0015826025552473936

-----
Dataset: categoricals_binned
Scores: [0.74891296 0.75058962 0.74985259]

Mean: 0.7497850601178416
Standard deviation: 0.000686157042040265

-----
Dataset: one_hot_encoded
Scores: [0.74847078 0.75117925 0.7500737 ]

Mean: 0.7499079090315698
Standard deviation: 0.0011119241923675974

-----
Dataset: outliers_removed
Scores: [0.75093633 0.75054311 0.7498689 ]

Mean: 0.7504494484897847
Standard deviation: 0.0004407788139344932

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.74847078 0.75117925 0.7500737 ]

Mean: 0.7499079090315698
Standard deviation: 0.0011119241923675974

-----
Dataset: oversampled+
Scores: [0.8315718  0.83563636 0.83292121]

Mean: 0.8333764594456149
Standard deviation: 0.00169028631529191

-----
Dataset: oversampled-
Scores: [0.83114068 0.83078742 0.83290628]

Mean: 0.831611459205101
Standard deviation: 0.000926863045997699

---

#### Naive Bayes (Gaussian)

In [89]:
nbg_clf = GaussianNB()
for key in test_sets:
    X, y = df_splitter(datasets[key])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1216)
    nbg_test = cross_val_score(nbg_clf,X_train,y_train, n_jobs=-1, cv=3, scoring='accuracy') #Default cv value = 5
    first_results.append({'model':'Naive Bayes', 'dataset':key, 'score':nbg_test.mean()})
    print(f'\n-----\nDataset: {key}')
    display_scores(nbg_test)


-----
Dataset: numeric
Scores: [0.75827253 0.74447229 0.74572524]

Mean: 0.7494900188548305
Standard deviation: 0.006231205635468579

-----
Dataset: categoricals_binned
Scores: [0.75893581 0.7446934  0.74579894]

Mean: 0.7498093814905276
Standard deviation: 0.006469122716941241

-----
Dataset: one_hot_encoded
Scores: [0.75849363 0.74439858 0.74498821]

Mean: 0.7492934725426205
Standard deviation: 0.006509942143090918

-----
Dataset: outliers_removed
Scores: [0.75191011 0.74724698 0.74844558]

Mean: 0.7492008912000309
Standard deviation: 0.0019772143632113518

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.73292063 0.72059257 0.71506486]

Mean: 0.7228593523823728
Standard deviation: 0.007463727823177525

-----
Dataset: oversampled+
Scores: [0.53626491 0.53575758 0.52809697]

Mean: 0.5333731512742083
Standard deviation: 0.0037365684454601896

-----
Dataset: oversampled-
Scores: [0.54047795 0.53178279 0.53345816]

Mean: 0.5352396360508814
Standard deviation: 0.0037666669149133527

--

#### Support Vector Machine

In [95]:
svm = SVC(kernel = 'poly', random_state = 1216, C = 1.0)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1216)
    svc = cross_val_score(nbg_clf,X_train,y_train, n_jobs=-1, cv=3, scoring='accuracy') #Default cv value = 5
    first_results.append({'model':'SVC', 'dataset':key, 'score':svc.mean()})
    print(f'\n-----\nDataset: {key}')
    display_scores(svc)


-----
Dataset: numeric
Scores: [0.75827253 0.74447229 0.74572524]

Mean: 0.7494900188548305
Standard deviation: 0.006231205635468579

-----
Dataset: categoricals_binned
Scores: [0.75893581 0.7446934  0.74579894]

Mean: 0.7498093814905276
Standard deviation: 0.006469122716941241

-----
Dataset: one_hot_encoded
Scores: [0.75849363 0.74439858 0.74498821]

Mean: 0.7492934725426205
Standard deviation: 0.006509942143090918

-----
Dataset: outliers_removed
Scores: [0.75191011 0.74724698 0.74844558]

Mean: 0.7492008912000309
Standard deviation: 0.0019772143632113518

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.73292063 0.72059257 0.71506486]

Mean: 0.7228593523823728
Standard deviation: 0.007463727823177525

-----
Dataset: oversampled+
Scores: [0.53626491 0.53575758 0.52809697]

Mean: 0.5333731512742083
Standard deviation: 0.0037365684454601896

-----
Dataset: oversampled-
Scores: [0.54047795 0.53178279 0.53345816]

Mean: 0.5352396360508814
Standard deviation: 0.0037666669149133527

--

#### Decision Trees

In [56]:
dtc = DecisionTreeClassifier(random_state=1216)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1216)
    nbg_test = cross_val_score(nbg_clf,X_train,y_train, n_jobs=-1, cv=3, scoring='accuracy') #Default cv value = 5
    print(f'\n-----\nDataset: {key}')
    display_scores(nbg_test)

#### Extra Trees

#### Random Forest

In [30]:
knn = KNeighborsClassifier(n_jobs=-1) #KNN Classifier

In [39]:
rnd_clf = RandomForestClassifier(n_estimators=200,
                                 max_leaf_nodes=None,
                                 bootstrap=True,
                                 oob_score=True,n_jobs = -1,
                                 random_state = 1216)

In [48]:
for key in test_sets:
    X, y = df_splitter(datasets[key])
    knn_accs = cross_val_score(rnd_clf,X,y, n_jobs=-1, cv=2, scoring='accuracy') #Default cv value = 5
    print(f'\n-----\nDataset: {key}')
    display_scores(knn_accs)


-----
Dataset: numeric
Scores: [0.75771393 0.75586651]

Mean: 0.7567902205102001
Standard deviation: 0.000923705829173338

-----
Dataset: categoricals_binned
Scores: [0.75327228 0.75107111]

Mean: 0.7521716913643332
Standard deviation: 0.001100585668802334

-----
Dataset: one_hot_encoded
Scores: [0.75024567 0.74957745]

Mean: 0.7499115600801856
Standard deviation: 0.0003341063637435915

-----
Dataset: outliers_removed
Scores: [0.74999001 0.74938074]

Mean: 0.7496853777464565
Standard deviation: 0.00030463463818647396

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.75024567 0.74965607]

Mean: 0.7499508667112142
Standard deviation: 0.000294799732714901

-----
Dataset: oversampled+
Scores: [0.83407028 0.86688387]

Mean: 0.8504770770304864
Standard deviation: 0.016406795438678123

-----
Dataset: oversampled-
Scores: [0.83052139 0.86655104]

Mean: 0.8485362136024388
Standard deviation: 0.018014821822768867

-----
Dataset: smoted+
Scores: [0.67972487 0.80924676]

Mean: 0.744485816978253

In [None]:
rnd_clf.fit(X_train, y_train)
rnd_clf_accs = cross_val_score(rnd_clf, X_train, y_train, scoring="accuracy", cv=5)

display_scores(rnd_clf_accs)

In [42]:
X_train, y_train = df_splitter(datasets['oversampled+'], target = 'Response')

In [43]:
rnd_clf.fit(X_train, y_train)
rnd_clf_accs = cross_val_score(rnd_clf, X_train, y_train, scoring="accuracy", cv=5)

display_scores(rnd_clf_accs)

Scores: [0.89650937 0.8983774  0.9272739  0.93276876 0.9358071 ]

Mean: 0.9181473073970784
Standard deviation: 0.01713475629743149


In [41]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()

gb.fit(X_train, y_train)
gb_accs = cross_val_score(gb, X_train, y_train, scoring="accuracy", cv=5)

display_scores(gb_accs)

Scores: [0.50284421 0.65220764 0.8363178  0.84433383 0.83592992]

Mean: 0.7343266811789435
Standard deviation: 0.136495163536079


XGBoost

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1216)