# Scopo del notebook di Training

Lo scopo di questa parte è di selezionare la miglior famiglia di modelli e i corrispondenti iper-parametri, per poi addestrare il modello da usare nelle fasi successive.

# Setting

In [None]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (silent stand-in for ``warnings.warn``)."""
    return None


# Replace warnings.warn for the whole session, so every warning issued through
# it (from any library) is dropped without being shown.
warnings.warn = warn

In [None]:
# IMPORTAZIONE LIBRERIE
import utils
import datetime
import importlib
importlib.reload(utils) # ricarica lo script utils.py

# PACCHETTI DI BASE
import numpy as np
import pandas as pd

# SKLEARN
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Seed passed to the estimators and CV splitters that accept one.
RANDOM_STATE = 123

# Cross-validation layout: number of folds and number of repetitions.
N_SPLITS = 5
N_REPEATS = 5

# Scorers evaluated for every grid-search candidate.
SCORING = [
    'accuracy',
    'precision',
    'f1',
    'recall',
    'roc_auc',
]

# Dataset Loading

In [None]:
# Load the prepared training set from its zipped pickle.
X_train = pd.read_pickle(
    filepath_or_buffer="../Data/Prepared/train_prep.pkl.zip",
    compression='zip',
)

# Split off the label: pop() removes 'Target' from X_train and returns it.
y_train = X_train.pop('Target')

In [None]:
# Display the training features ('Target' was popped into y_train when loading).
X_train

# Dummy Classifier

## Fit model

In [None]:
# Wall-clock timestamps around the fit; the next cell turns them into a duration.
now = datetime.datetime.now()

# Baseline model configured with the "most_frequent" strategy.
clf_DUMMY = DummyClassifier(strategy="most_frequent",
                            random_state=RANDOM_STATE)

# Fit on the training data (fit() returns the estimator itself).
clf_DUMMY.fit(X_train, y_train)

time_stop = datetime.datetime.now()

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the dummy-classifier fit.
time_duration_DUMMY = time_stop - now
print(time_duration_DUMMY)

## Performances

In [None]:
# Cross-validated baseline scores for the dummy classifier, computed with the
# same CV layout and scorers used for the tuned models so that the summary
# table compares like with like.
# The previous hard-coded zeros were wrong: a most-frequent-class predictor is
# right at least as often as the majority class occurs, so its accuracy is
# never 0, and its recall/precision depend on which class is the majority.
from sklearn.model_selection import cross_validate

cv_DUMMY = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                                   n_repeats=N_REPEATS,
                                   random_state=RANDOM_STATE)

# cross_validate fits clones of the estimator, so the already fitted
# clf_DUMMY (saved later in the notebook) is left untouched.
cv_scores_DUMMY = cross_validate(clf_DUMMY,
                                 X_train,
                                 y_train,
                                 scoring=SCORING,
                                 cv=cv_DUMMY)

# Mean over all folds/repetitions, one value per scorer.
# NOTE: when the positive class is never predicted, precision and F1 are
# undefined; scikit-learn then scores them as 0 (and emits a warning).
mean_test_accuracy_DUMMY = cv_scores_DUMMY['test_accuracy'].mean()
mean_test_precision_DUMMY = cv_scores_DUMMY['test_precision'].mean()
mean_test_f1_DUMMY = cv_scores_DUMMY['test_f1'].mean()
mean_test_recall_DUMMY = cv_scores_DUMMY['test_recall'].mean()
mean_test_roc_auc_DUMMY = cv_scores_DUMMY['test_roc_auc'].mean()

# Logistic regression

## Fit model

In [None]:
# Wall-clock timestamps around the search; the next cell turns them into a duration.
now = datetime.datetime.now()

clf_LOG_REG = LogisticRegression(random_state=RANDOM_STATE)

# Parameter grid explored by GridSearchCV: each penalty is combined only with
# the solvers listed next to it.
tuned_parameters_LOG_REG = [
  {'penalty': ['l1'],
   'C':[0.001, .009, 0.01, .09, 1, 5, 10, 25],
   'solver': ['liblinear', 'saga']},
  {'penalty': ['l2'],
   'C':[0.001, .009, 0.01, .09, 1, 5, 10, 25],
   'solver': ['lbfgs', 'sag', 'newton-cg']},
 ]

cv = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                             n_repeats=N_REPEATS,
                             random_state=RANDOM_STATE)

# Every candidate is scored on all SCORING metrics; refit='recall' makes recall
# the metric that selects the final estimator.
grid_clf_LOG_REG = GridSearchCV(clf_LOG_REG,
                            param_grid = tuned_parameters_LOG_REG,
                            scoring = SCORING,
                            refit='recall',
                            cv = cv,
                            verbose=3)


# Fit on the training data
grid_clf_LOG_REG = grid_clf_LOG_REG.fit(X_train, y_train)

time_stop = datetime.datetime.now()

# Show the selected hyper-parameters. This used to be a bare expression in the
# middle of the cell, which displays nothing; it is now printed, and done after
# the timer is stopped so it is not part of the measured duration.
print(grid_clf_LOG_REG.best_params_)

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the logistic-regression grid search.
time_duration_LOG_REG = time_stop - now
print(time_duration_LOG_REG)

## Performances

In [None]:
# Order the cross-validated candidates by their recall rank (rank 1 first) ...
testing_metrics_LOG_REG = pd.DataFrame(grid_clf_LOG_REG.cv_results_).sort_values(by=['rank_test_recall'])

# ... and keep, for every scorer, its mean test score and its rank.
testing_metrics_LOG_REG = testing_metrics_LOG_REG[[
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_roc_auc',
    'rank_test_accuracy',
    'rank_test_precision',
    'rank_test_f1',
    'rank_test_recall',
    'rank_test_roc_auc',
]]
testing_metrics_LOG_REG

In [None]:
# Unpack the five summary scores returned by utils.metrics for this model
# (presumably those of the best-recall row -- confirm in utils.py).
(
    mean_test_accuracy_LOG_REG,
    mean_test_precision_LOG_REG,
    mean_test_f1_LOG_REG,
    mean_test_recall_LOG_REG,
    mean_test_roc_auc_LOG_REG,
) = utils.metrics(testing_metrics_LOG_REG)

In [None]:
# best_score_LOG_REG = grid_clf_LOG_REG.best_score_
# best_score_LOG_REG

# Linear Discriminant Analysis

## Fit model

In [None]:
# Wall-clock timestamps around the search; the next cell turns them into a duration.
now = datetime.datetime.now()

# Repeated stratified k-fold with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE,
)

clf_LIN_DISC = LinearDiscriminantAnalysis()

# Single empty grid entry: only the default configuration is cross-validated.
tuned_parameters_LDA = [{}]

# Scored on all SCORING metrics; recall selects the estimator that is refit.
grid_clf_LIN_DISC = GridSearchCV(
    estimator=clf_LIN_DISC,
    param_grid=tuned_parameters_LDA,
    scoring=SCORING,
    refit='recall',
    cv=cv,
    verbose=3,
)

# Fit on the training data (fit() returns the search object itself).
grid_clf_LIN_DISC.fit(X_train, y_train)

time_stop = datetime.datetime.now()

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the linear-discriminant-analysis search.
time_duration_LDA = time_stop - now
print(time_duration_LDA)

## Performances

In [None]:
# pd.DataFrame(grid_clf_LIN_DISC.cv_results_)

# Order the cross-validated candidates by their recall rank (rank 1 first) ...
testing_metrics_LIN_DISC = pd.DataFrame(grid_clf_LIN_DISC.cv_results_).sort_values(by=['rank_test_recall'])

# ... and keep, for every scorer, its mean test score and its rank.
testing_metrics_LIN_DISC = testing_metrics_LIN_DISC[[
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_roc_auc',
    'rank_test_accuracy',
    'rank_test_precision',
    'rank_test_f1',
    'rank_test_recall',
    'rank_test_roc_auc',
]]
testing_metrics_LIN_DISC

In [None]:
# Unpack the five summary scores returned by utils.metrics for this model
# (presumably those of the best-recall row -- confirm in utils.py).
(
    mean_test_accuracy_LIN_DISC,
    mean_test_precision_LIN_DISC,
    mean_test_f1_LIN_DISC,
    mean_test_recall_LIN_DISC,
    mean_test_roc_auc_LIN_DISC,
) = utils.metrics(testing_metrics_LIN_DISC)

In [None]:
# best_score_LIN_DISC = grid_clf_LIN_DISC.best_score_
# best_score_LIN_DISC

# K-Nearest Neighbors

## Fit model

In [None]:
# Wall-clock timestamps around the search; the next cell turns them into a duration.
now = datetime.datetime.now()

# Repeated stratified k-fold with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE,
)

clf_KNN = KNeighborsClassifier()

# Candidate neighbourhood sizes, vote weightings and distance metrics.
tuned_parameters_KNN = [
    {
        'n_neighbors': [3, 5, 11, 19],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
]

# Scored on all SCORING metrics; recall selects the estimator that is refit.
grid_clf_KNN = GridSearchCV(
    estimator=clf_KNN,
    param_grid=tuned_parameters_KNN,
    scoring=SCORING,
    refit='recall',
    cv=cv,
    verbose=3,
)

# Fit on the training data (fit() returns the search object itself).
grid_clf_KNN.fit(X_train, y_train)

time_stop = datetime.datetime.now()

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the KNN grid search.
time_duration_KNN = time_stop - now
print(time_duration_KNN)

## Performances

In [None]:
# Order the cross-validated candidates by their recall rank (rank 1 first) ...
testing_metrics_KNN = pd.DataFrame(grid_clf_KNN.cv_results_).sort_values(by=['rank_test_recall'])

# ... and keep, for every scorer, its mean test score and its rank.
testing_metrics_KNN = testing_metrics_KNN[[
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_roc_auc',
    'rank_test_accuracy',
    'rank_test_precision',
    'rank_test_f1',
    'rank_test_recall',
    'rank_test_roc_auc',
]]
testing_metrics_KNN

In [None]:
# Unpack the five summary scores returned by utils.metrics for this model
# (presumably those of the best-recall row -- confirm in utils.py).
(
    mean_test_accuracy_KNN,
    mean_test_precision_KNN,
    mean_test_f1_KNN,
    mean_test_recall_KNN,
    mean_test_roc_auc_KNN,
) = utils.metrics(testing_metrics_KNN)

In [None]:
# best_score_KNN = grid_clf_KNN.best_score_  # è quello che ha rank test recall pari a 1
# best_score_KNN

# Decision tree

## Fit model

In [None]:
# Wall-clock timestamps around the search; the next cell turns them into a duration.
now = datetime.datetime.now()

# Repeated stratified k-fold with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE,
)

clf_DEC_TREE = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Candidate split criteria and maximum tree depths.
tuned_parameters_DEC_TREE = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, 12],
    },
]

# Scored on all SCORING metrics; recall selects the estimator that is refit.
grid_clf_DEC_TREE = GridSearchCV(
    estimator=clf_DEC_TREE,
    param_grid=tuned_parameters_DEC_TREE,
    scoring=SCORING,
    refit='recall',
    cv=cv,
    verbose=3,
)

# Fit on the training data (fit() returns the search object itself).
grid_clf_DEC_TREE.fit(X_train, y_train)

time_stop = datetime.datetime.now()

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the decision-tree grid search.
time_duration_DEC_TREE = time_stop - now
print(time_duration_DEC_TREE)

## Performances

In [None]:
# pd.DataFrame(grid_clf_DEC_TREE.cv_results_)

# Order the cross-validated candidates by their recall rank (rank 1 first) ...
testing_metrics_DEC_TREE = pd.DataFrame(grid_clf_DEC_TREE.cv_results_).sort_values(by=['rank_test_recall'])

# ... and keep, for every scorer, its mean test score and its rank.
testing_metrics_DEC_TREE = testing_metrics_DEC_TREE[[
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_roc_auc',
    'rank_test_accuracy',
    'rank_test_precision',
    'rank_test_f1',
    'rank_test_recall',
    'rank_test_roc_auc',
]]
testing_metrics_DEC_TREE

In [None]:
# Unpack the five summary scores returned by utils.metrics for this model
# (presumably those of the best-recall row -- confirm in utils.py).
(
    mean_test_accuracy_DEC_TREE,
    mean_test_precision_DEC_TREE,
    mean_test_f1_DEC_TREE,
    mean_test_recall_DEC_TREE,
    mean_test_roc_auc_DEC_TREE,
) = utils.metrics(testing_metrics_DEC_TREE)

In [None]:
# best_score_DEC_TREE = grid_clf_DEC_TREE.best_score_
# best_score_DEC_TREE

# Random Forest

## Fit model

In [None]:
# Wall-clock timestamps around the search; the next cell turns them into a duration.
now = datetime.datetime.now()

clf_RAND_FOR = RandomForestClassifier(random_state=RANDOM_STATE)

# Parameter grid explored by GridSearchCV.
# 'auto' was dropped from max_features: for RandomForestClassifier it was an
# alias of 'sqrt' (so it only duplicated those candidates), it was deprecated
# in scikit-learn 1.1 and removed in 1.3, where every 'auto' candidate fails
# to fit -- a failure that stays hidden because warnings are silenced here.
tuned_parameters_RAND_FOR = [
  {'n_estimators': [200, 500],
   'max_features': ['sqrt', 'log2'],
   'max_depth' : [4,5,6,7,8],
   'criterion' :['gini', 'entropy']}
 ]

cv = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                             n_repeats=N_REPEATS,
                             random_state=RANDOM_STATE)

# Every candidate is scored on all SCORING metrics; refit='recall' makes recall
# the metric that selects the final estimator.
grid_clf_RAND_FOR = GridSearchCV(clf_RAND_FOR,
                            param_grid = tuned_parameters_RAND_FOR,
                            scoring = SCORING,
                            refit='recall',
                            cv = cv,
                            verbose=3)

# Fit on the training data
grid_clf_RAND_FOR = grid_clf_RAND_FOR.fit(X_train, y_train)

time_stop = datetime.datetime.now()

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the random-forest grid search.
time_duration_RAND_FOR = time_stop - now
print(time_duration_RAND_FOR)

## Performances

In [None]:
# Order the cross-validated candidates by their recall rank (rank 1 first) ...
testing_metrics_RAND_FOR = pd.DataFrame(grid_clf_RAND_FOR.cv_results_).sort_values(by=['rank_test_recall'])

# ... and keep, for every scorer, its mean test score and its rank.
testing_metrics_RAND_FOR = testing_metrics_RAND_FOR[[
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_roc_auc',
    'rank_test_accuracy',
    'rank_test_precision',
    'rank_test_f1',
    'rank_test_recall',
    'rank_test_roc_auc',
]]
testing_metrics_RAND_FOR

In [None]:
# Unpack the five summary scores returned by utils.metrics for this model
# (presumably those of the best-recall row -- confirm in utils.py).
(
    mean_test_accuracy_RAND_FOR,
    mean_test_precision_RAND_FOR,
    mean_test_f1_RAND_FOR,
    mean_test_recall_RAND_FOR,
    mean_test_roc_auc_RAND_FOR,
) = utils.metrics(testing_metrics_RAND_FOR)

In [None]:
# best_score_RAND_FOR = grid_clf_RAND_FOR.best_score_
# best_score_RAND_FOR

# Naive Bayes

## Fit model

In [None]:
# Wall-clock timestamps around the search; the next cell turns them into a duration.
now = datetime.datetime.now()

# Repeated stratified k-fold with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE,
)

clf_NB = GaussianNB()

# 100 log-spaced var_smoothing candidates, from 10**0 down to 10**-9.
tuned_parameters_NB = [
    {'var_smoothing': np.logspace(start=0, stop=-9, num=100)},
]

# Scored on all SCORING metrics; recall selects the estimator that is refit.
grid_clf_NB = GridSearchCV(
    estimator=clf_NB,
    param_grid=tuned_parameters_NB,
    scoring=SCORING,
    refit='recall',
    cv=cv,
    verbose=3,
)

# Fit on the training data (fit() returns the search object itself).
grid_clf_NB.fit(X_train, y_train)

time_stop = datetime.datetime.now()

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the naive-Bayes grid search.
time_duration_NB = time_stop - now
print(time_duration_NB)

## Performances

In [None]:
# Order the cross-validated candidates by their recall rank (rank 1 first) ...
testing_metrics_NB = pd.DataFrame(grid_clf_NB.cv_results_).sort_values(by=['rank_test_recall'])

# ... and keep, for every scorer, its mean test score and its rank.
testing_metrics_NB = testing_metrics_NB[[
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_roc_auc',
    'rank_test_accuracy',
    'rank_test_precision',
    'rank_test_f1',
    'rank_test_recall',
    'rank_test_roc_auc',
]]
testing_metrics_NB

In [None]:
# Unpack the five summary scores returned by utils.metrics for this model
# (presumably those of the best-recall row -- confirm in utils.py).
(
    mean_test_accuracy_NB,
    mean_test_precision_NB,
    mean_test_f1_NB,
    mean_test_recall_NB,
    mean_test_roc_auc_NB,
) = utils.metrics(testing_metrics_NB)

In [None]:
# best_score_NB = grid_clf_NB.best_score_
# best_score_NB

# Support Vector Machine

## Fit model

In [None]:
# Wall-clock timestamps around the search; the next cell turns them into a duration.
now = datetime.datetime.now()

# Repeated stratified k-fold with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE,
)

# probability=True enables predict_proba on the fitted classifier.
clf_SVM = SVC(probability=True, random_state=RANDOM_STATE)

# Candidate regularisation strengths, kernel coefficients and kernels.
tuned_parameters_SVM = [
    {
        'C': [0.1, 1, 10, 100],
        'gamma': [0.0001, 0.001, 0.1, 1],
        'kernel': ['rbf', 'sigmoid'],
    },
]

# Scored on all SCORING metrics; recall selects the estimator that is refit.
grid_clf_SVM = GridSearchCV(
    estimator=clf_SVM,
    param_grid=tuned_parameters_SVM,
    scoring=SCORING,
    refit='recall',
    cv=cv,
    verbose=3,
)

# Fit on the training data (fit() returns the search object itself).
grid_clf_SVM.fit(X_train, y_train)

time_stop = datetime.datetime.now()

In [None]:
# Elapsed wall-clock time between the `now` and `time_stop` timestamps taken
# around the SVM grid search.
time_duration_SVM = time_stop - now
print(time_duration_SVM)

## Performances

In [None]:
# Order the cross-validated candidates by their recall rank (rank 1 first) ...
testing_metrics_SVM = pd.DataFrame(grid_clf_SVM.cv_results_).sort_values(by=['rank_test_recall'])

# ... and keep, for every scorer, its mean test score and its rank.
testing_metrics_SVM = testing_metrics_SVM[[
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_roc_auc',
    'rank_test_accuracy',
    'rank_test_precision',
    'rank_test_f1',
    'rank_test_recall',
    'rank_test_roc_auc',
]]
testing_metrics_SVM

In [None]:
# Unpack the five summary scores returned by utils.metrics for this model
# (presumably those of the best-recall row -- confirm in utils.py).
(
    mean_test_accuracy_SVM,
    mean_test_precision_SVM,
    mean_test_f1_SVM,
    mean_test_recall_SVM,
    mean_test_roc_auc_SVM,
) = utils.metrics(testing_metrics_SVM)

In [None]:
# best_score_SVM = grid_clf_SVM.best_score_
# best_score_SVM

# Riepilogo risultati modelli
Creazione di una tabella riassuntiva dei migliori risultati ottenuti in termini di parametri che ciascun modello utilizza e in termini di performance di accuracy, precision, recall, F1-score e ROC AUC, oltre al tempo di addestramento.
Decisione del modello finale da utilizzare che poi dovrà essere implementato nella fase di testing. Salvataggio del modello.

## Tabella riassuntiva delle metriche

In [None]:
# Row labels; every list below follows this same model order.
models_names = [
    'Dummy Classifier',
    'Logistic Regression',
    'Linear Discriminant Analysis',
    'KNN',
    'Decision Tree',
    'Random Forest',
    'Naive Bayes',
    'Support Vector Machine',
]

# One list per metric, one entry per model.
accuracy = [
    mean_test_accuracy_DUMMY, mean_test_accuracy_LOG_REG,
    mean_test_accuracy_LIN_DISC, mean_test_accuracy_KNN,
    mean_test_accuracy_DEC_TREE, mean_test_accuracy_RAND_FOR,
    mean_test_accuracy_NB, mean_test_accuracy_SVM,
]

precision = [
    mean_test_precision_DUMMY, mean_test_precision_LOG_REG,
    mean_test_precision_LIN_DISC, mean_test_precision_KNN,
    mean_test_precision_DEC_TREE, mean_test_precision_RAND_FOR,
    mean_test_precision_NB, mean_test_precision_SVM,
]

f1 = [
    mean_test_f1_DUMMY, mean_test_f1_LOG_REG,
    mean_test_f1_LIN_DISC, mean_test_f1_KNN,
    mean_test_f1_DEC_TREE, mean_test_f1_RAND_FOR,
    mean_test_f1_NB, mean_test_f1_SVM,
]

recall = [
    mean_test_recall_DUMMY, mean_test_recall_LOG_REG,
    mean_test_recall_LIN_DISC, mean_test_recall_KNN,
    mean_test_recall_DEC_TREE, mean_test_recall_RAND_FOR,
    mean_test_recall_NB, mean_test_recall_SVM,
]

roc_auc = [
    mean_test_roc_auc_DUMMY, mean_test_roc_auc_LOG_REG,
    mean_test_roc_auc_LIN_DISC, mean_test_roc_auc_KNN,
    mean_test_roc_auc_DEC_TREE, mean_test_roc_auc_RAND_FOR,
    mean_test_roc_auc_NB, mean_test_roc_auc_SVM,
]

time_duration = [
    time_duration_DUMMY, time_duration_LOG_REG,
    time_duration_LDA, time_duration_KNN,
    time_duration_DEC_TREE, time_duration_RAND_FOR,
    time_duration_NB, time_duration_SVM,
]

# Build the summary table (one row per model), index it by model name, order
# it by recall and then accuracy (highest first) and round to two decimals.
df_result = (
    pd.DataFrame({
        'models_names': models_names,
        'Recall': recall,
        'Precision': precision,
        'F1': f1,
        'Accuracy': accuracy,
        'AUC': roc_auc,
        'time_duration': time_duration,
    })
    .set_index('models_names')
    .sort_values(by=['Recall', 'Accuracy'], ascending=False)
    .round(decimals=2)
)

print(df_result)

# Save the table, keeping the model names as the index column.
df_result.to_csv(r'../Models/models_metrics_cropping_ordered.csv', index=True)

# Models persistence

In [None]:
# Save every model through utils.pickle_dump, one file per model.
# The dummy baseline is stored as the bare classifier; all the others are
# stored as their GridSearchCV object.
for model, file_name in [
    (clf_DUMMY, "dummy.pkl"),
    (grid_clf_LOG_REG, "log_reg.pkl"),
    (grid_clf_LIN_DISC, "linear_discriminant_analysis.pkl"),
    (grid_clf_KNN, "knn.pkl"),
    (grid_clf_DEC_TREE, "decision_tree.pkl"),
    (grid_clf_RAND_FOR, "random_forest.pkl"),
    (grid_clf_NB, "naive_bayes.pkl"),
    (grid_clf_SVM, "support_vector_machine.pkl"),
]:
    utils.pickle_dump(model, file_name)