**420-A52-SF - Algorithmes d'apprentissage supervisé - Automne 2022 - Spécialisation technique en Intelligence Artificielle**<br/>
MIT License - Copyright (c) 2022 Mikaël Swawola
<br/>
![Travaux Pratiques - Optimisation des hyperparamètres 101](static/16-banner.png)
<br/>
**Objectif:** cette séance de travaux pratiques a pour objectif la recherche des meilleurs hyperparamètres appliqués à l'ensemble des algorithmes vus en cours jusqu'à maintenant. Le jeu de données utilisé sera **Titanic**

In [None]:
# Notebook setup: (re)load the autoreload extension in mode 2 and render
# matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Exercice 1 - Chargement et exploration sommaire des données

In [None]:
import pandas as pd

In [None]:
# Load the Titanic training set; PassengerId becomes the DataFrame index.
titanic = pd.read_csv('../../data/titanic_train.csv', index_col='PassengerId')

In [None]:
# Quick look at the first rows of the raw data.
titanic.head()

In [None]:
import seaborn as sns

# Visualisation setup.
# Everything is configured in ONE call: each sns.set()/sns.set_theme() call
# re-applies the default context and font scale, so a separate
# sns.set_context(...) followed by another sns.set(rc=...) silently loses the
# font scale and line width that were just configured.
sns.set_theme(
    context="notebook",
    style="darkgrid",
    font_scale=1.5,
    rc={"lines.linewidth": 2.5, "figure.figsize": (11.7, 8.27)},
)

In [None]:
# Survival against age, coloured by sex and sized by passenger class.
# The result is assigned to `_` so the Axes repr is not echoed under the cell.
_ = sns.scatterplot(x='Age', y='Survived', hue='Sex', size='Pclass', sizes=(20, 200), data=titanic)

#### Conversion des variables `Embarked` et `Sex`

In [None]:
# One-hot encode the embarkation port; drop_first=True drops the first level,
# leaving the emb_Q / emb_S indicator columns used further down.
# NOTE(review): this cell overwrites `titanic`, so it cannot be re-run without
# reloading the CSV first ('Embarked' no longer exists after the first run).
titanic = pd.get_dummies(titanic, prefix=['emb'], columns=['Embarked'], drop_first=True)
# Binary-encode sex: 1 for 'female', 0 otherwise.
titanic['Sex'] = titanic['Sex'].eq('female').astype(int)
titanic.columns

In [None]:
# Feature matrix (eight predictors) and target vector (Survived as a NumPy array).
X = titanic.loc[:, ['Age', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Fare', 'emb_Q', 'emb_S']]
y = titanic['Survived'].to_numpy()

#### Vérification de la proportion des classes positives (Survived) et négatives (Died)

In [None]:
# Share of positive labels (Survived == 1) in the whole dataset.
y.sum()/len(y)

#### Imputation des valeurs manquantes

Pour simplifier l'exercice, les valeurs manquantes sont imputées par la moyenne de chaque variable.

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later in the notebook, so test-set statistics
# leak into the training data; consider fitting on the training split only.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(X)
X = imp.transform(X)

#### Préparation du jeu de test

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature with a scaler fitted on X.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so the test rows influence the scaling statistics.
scaler = StandardScaler().fit(X)
X_scale = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as a test set, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.3, stratify=y, random_state=2023)

## Exercice 2 - Recherche sur grille

### 2-1 - Régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from helpers import plot_roc_curve

[class sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# Search grid: 5 values of C x 3 values of l1_ratio = 15 candidates.
parameters = dict(
    C=[0.01, 0.1, 1, 10, 100],
    l1_ratio=[0, 0.5, 1],
)

# Base estimator: elastic-net logistic regression fitted with the saga solver.
clf_logreg = LogisticRegression(
    solver='saga',
    penalty='elasticnet',
    max_iter=10000,
    random_state=2023,
    n_jobs=-1,
)

# Grid search with 5-fold cross-validation, scored by ROC AUC.
clf_logreg_grid = GridSearchCV(
    clf_logreg,
    parameters,
    scoring="roc_auc",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the training split.
clf_logreg_grid.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters and their mean cross-validated ROC AUC.
print(f'Meilleurs paramètres: {clf_logreg_grid.best_params_}')
print(f'Meilleur score (mean CV): {clf_logreg_grid.best_score_}')

#### Ré-entraînement du meilleur modèle

In [None]:
# No explicit retraining needed: the search was created with refit=True, so
# best_estimator_ is the best model already refitted by the search.
clf_logreg_final = clf_logreg_grid.best_estimator_
clf_logreg_final

#### Aire sous la courbe

In [None]:
# Positive-class probabilities and ROC AUC, computed on the training split
# (the same data the model was fitted on).
y_train_pred_proba_logreg = clf_logreg_final.predict_proba(X_train)[:,1]
print(f'AUC = {roc_auc_score(y_train, y_train_pred_proba_logreg)}')

#### Courbe ROC

In [None]:
# `results` maps a model label to its training-set probabilities; later cells
# add more entries to it.
# plot_roc_curve comes from the local helpers module (not shown here);
# presumably it draws one ROC curve per entry against y_train.
results = {}
results['Logistic Regression'] = y_train_pred_proba_logreg
plot_roc_curve(results, y_train)

### 2-2 - K plus proches voisins

In [None]:
from sklearn.neighbors import KNeighborsClassifier

[class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [None]:
# Search grid: 6 neighbourhood sizes x 2 weighting schemes = 12 candidates.
parameters = dict(
    n_neighbors=[5, 10, 20, 30, 40, 50],
    weights=['uniform', 'distance'],
)

# Base estimator: k-nearest neighbours with brute-force neighbour search.
clf_knn = KNeighborsClassifier(algorithm="brute")

# Grid search with 5-fold cross-validation, scored by ROC AUC, run right away.
clf_knn_grid = GridSearchCV(
    clf_knn,
    parameters,
    scoring="roc_auc",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
clf_knn_grid.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters and their mean cross-validated ROC AUC.
print(f'Meilleurs paramètres: {clf_knn_grid.best_params_}')
print(f'Meilleur score (mean CV): {clf_knn_grid.best_score_}')

#### Ré-entraînement du meilleur modèle

In [None]:
# No explicit retraining needed: the search was created with refit=True, so
# best_estimator_ is the best model already refitted by the search.
clf_knn_final = clf_knn_grid.best_estimator_
clf_knn_final

#### Aire sous la courbe

In [None]:
# Positive-class probabilities and ROC AUC, computed on the training split
# (the same data the model was fitted on).
y_train_pred_proba_knn = clf_knn_final.predict_proba(X_train)[:,1]
print(f'AUC = {roc_auc_score(y_train, y_train_pred_proba_knn)}')

#### Courbe ROC

In [None]:
# Add the KNN probabilities to the comparison dict and redraw the ROC plot.
results['KNN'] = y_train_pred_proba_knn
plot_roc_curve(results, y_train)

## Exercice 3 - Recherche aléatoire

### 3-1 - Arbres de décision

In [None]:
# loguniform is imported from SciPy, its public home. sklearn.utils.fixes is a
# private compatibility module, not part of scikit-learn's public API, and
# should not be relied on to keep exposing loguniform.
from scipy.stats import loguniform, randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

[class sklearn.tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort='deprecated', ccp_alpha=0.0)](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
# Sampling distributions for the random search.
# NOTE(review): the ccp_alpha upper bound of 1e3 looks very large for a pruning
# strength; presumably many draws prune the tree down to a single node. Confirm
# the intended range.
distributions = {
    'criterion': ['gini', 'entropy'],
    'ccp_alpha': loguniform(1e-3, 1e3),
    'max_depth': randint(2, 128),
}

# Base estimator.
clf_tree = DecisionTreeClassifier(random_state=2023)

# Random search: 1000 sampled candidates, 5-fold cross-validation, ROC AUC.
clf_tree_rnd = RandomizedSearchCV(
    clf_tree,
    distributions,
    n_iter=1000,
    scoring="roc_auc",
    cv=5,
    refit=True,
    random_state=2023,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the random search on the training split.
clf_tree_rnd.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters and their mean cross-validated ROC AUC.
print(f'Meilleurs paramètres: {clf_tree_rnd.best_params_}')
print(f'Meilleur score (mean CV): {clf_tree_rnd.best_score_}')

#### Ré-entraînement du meilleur modèle

In [None]:
# No explicit retraining needed: the search was created with refit=True, so
# best_estimator_ is the best model already refitted by the search.
clf_tree_final = clf_tree_rnd.best_estimator_
clf_tree_final

#### Aire sous la courbe

In [None]:
# Positive-class probabilities and ROC AUC, computed on the training split
# (the same data the model was fitted on).
y_train_pred_proba_tree = clf_tree_final.predict_proba(X_train)[:,1]
print(f'AUC = {roc_auc_score(y_train, y_train_pred_proba_tree)}')

#### Courbe ROC

In [None]:
# Add the decision-tree probabilities to the comparison dict and redraw the ROC plot.
results['Decision Tree'] = y_train_pred_proba_tree
plot_roc_curve(results, y_train)

### 3-2 - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

[class sklearn.ensemble.BaggingClassifier(estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)

In [None]:
# Sampling distribution: only the number of bagged estimators is tuned.
distributions = {'n_estimators': randint(2, 500)}

# Base estimator: bagging built on the tuned decision tree (clf_tree_final).
clf_bag = BaggingClassifier(estimator=clf_tree_final, random_state=2023)

# Random search: 100 sampled candidates, 5-fold cross-validation, ROC AUC.
clf_bag_rnd = RandomizedSearchCV(
    clf_bag,
    distributions,
    n_iter=100,
    scoring="roc_auc",
    cv=5,
    refit=True,
    random_state=2023,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the random search on the training split.
clf_bag_rnd.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters and their mean cross-validated ROC AUC.
print(f'Meilleurs paramètres: {clf_bag_rnd.best_params_}')
print(f'Meilleur score (mean CV): {clf_bag_rnd.best_score_}')

#### Ré-entraînement du meilleur modèle

In [None]:
# No explicit retraining needed: the search was created with refit=True, so
# best_estimator_ is the best model already refitted by the search.
clf_bag_final = clf_bag_rnd.best_estimator_
clf_bag_final

#### Aire sous la courbe

In [None]:
# Positive-class probabilities and ROC AUC, computed on the training split
# (the same data the model was fitted on).
y_train_pred_proba_bag = clf_bag_final.predict_proba(X_train)[:,1]
print(f'AUC = {roc_auc_score(y_train, y_train_pred_proba_bag)}')

#### Courbe ROC

In [None]:
# Add the bagging probabilities to the comparison dict and redraw the ROC plot.
results['Bagging (Tree)'] = y_train_pred_proba_bag
plot_roc_curve(results, y_train)

## Exercice 4 - Hyperopt avec Forêts aléatoires et gradient boosting

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from hyperopt import hp, fmin, tpe, space_eval

### 4-1 - Gradient boosting

#### Définition de l'hyperespace

In [None]:
# Hyperopt search space for gradient boosting. The dict keys are the names the
# objective function reads; the string labels identify each node for hyperopt.
hyperspace = {
    # "1 +" shifts the integer draw up by one (hp.randint presumably starts at 0).
    'n_estimators': 1 + hp.randint('n_estimators', 500),
    # Log-uniform draw between exp(-8) and exp(1); used as the learning rate.
    'lr_rate': hp.loguniform('lr_rate', -8.0, 1.0),
    'max_depth': 1 + hp.randint('max_depth', 100),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'loss': hp.choice('loss', ['log_loss', 'exponential']),
    # Log-uniform draw between exp(-6) and exp(2).
    'ccp_alpha': hp.loguniform('ccp_alpha', -6, 2),
}

#### Fonction objective

In [None]:
def objective(hyperspace):
    """Hyperopt objective for the gradient boosting search.

    Parameters
    ----------
    hyperspace : dict
        One sampled point of the search space, with the keys 'lr_rate',
        'max_depth', 'n_estimators', 'loss', 'max_features' and 'ccp_alpha'.

    Returns
    -------
    float
        The negated mean ROC AUC of a 5-fold cross-validation on
        (X_train, y_train). It is negated because fmin minimises the
        objective while a higher AUC is better.
    """
    clf_gb = GradientBoostingClassifier(
        loss=hyperspace['loss'],
        max_features=hyperspace['max_features'],
        n_estimators=hyperspace['n_estimators'],
        learning_rate=hyperspace['lr_rate'],
        max_depth=hyperspace['max_depth'],
        ccp_alpha=hyperspace['ccp_alpha'],
        random_state=2023,
    )
    # cross_val_score clones and fits the estimator on each fold itself, so
    # fitting clf_gb on the full training set beforehand would only add one
    # wasted training run per evaluation.
    cv_score = cross_val_score(clf_gb, X_train, y_train, cv=5, scoring="roc_auc", verbose=0, n_jobs=-1)

    return -cv_score.mean()

#### Lancement de l'optimisation

In [None]:
# Minimise the objective with the TPE algorithm for 1000 evaluations; every
# evaluation runs a 5-fold cross-validation, so this cell is slow.
# NOTE(review): no random state is passed to fmin, so the search is presumably
# not reproducible from one run to the next; confirm whether that is intended.
best = fmin(objective, hyperspace, algo=tpe.suggest, max_evals=1000)

#### Meilleurs paramètres

In [None]:
# fmin returns the raw value of every labelled node: for hp.choice that is the
# index of the selected option, and for "1 + hp.randint(...)" it is the draw
# before the offset. space_eval maps the result back onto the search space so
# the displayed values are the hyperparameters actually passed to the model.
space_eval(hyperspace, best)

#### Réentraînement du gradient boosting avec les meilleurs hyperparamètres

In [None]:
# Final gradient boosting model.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from a
# previous optimisation run; they are not read from `best`, so re-running the
# search does not update them.
clf_gb_final = GradientBoostingClassifier(loss='exponential', max_features='log2',
                                                n_estimators=301, learning_rate=0.26973180768518795, max_depth=36,
                                                ccp_alpha=0.0036803454953677865,
                                                random_state=2023)
# Fit on the whole training split; this fitted model is reused by later cells.
clf_gb_final.fit(X_train, y_train)
# Mean 5-fold cross-validated ROC AUC for the same configuration.
cv_score = cross_val_score(clf_gb_final, X_train, y_train, cv=5, scoring="roc_auc", verbose=0, n_jobs=-1)
cv_score.mean()

#### Aire sous la courbe

In [None]:
# Positive-class probabilities and ROC AUC, computed on the training split
# (the same data the model was fitted on).
y_train_pred_proba_gb = clf_gb_final.predict_proba(X_train)[:,1]
print(f'AUC = {roc_auc_score(y_train, y_train_pred_proba_gb)}')

#### Courbe ROC

In [None]:
# Add the gradient boosting probabilities to the comparison dict and redraw the ROC plot.
results['Gradient Boosting'] = y_train_pred_proba_gb
plot_roc_curve(results, y_train)

### 4-2 - Gradient boosting et forêts aléatoires

In [None]:
# Search space definition.
# The top-level hp.choice picks one of two model families; each branch carries
# a 'type' tag that the objective function uses to decide which classifier to
# build. Every hyperopt label string is distinct across the two branches.

hyperspace = hp.choice('classifier',[
    {
        # Branch 1: gradient boosting.
        'type': 'gradient-boosting',
        'n_estimators': 1 + hp.randint('n_estimators_1', 100),
        'lr_rate': hp.loguniform('lr_rate', -8, 1),
        'max_depth': 1 + hp.randint('max_depth', 100),
        'max_features': hp.choice('max_features1', ['sqrt', 'log2', None]),
        'loss': hp.choice('loss', ['log_loss', 'exponential']),
        'ccp_alpha': hp.loguniform('ccp_alpha1', -6, 2)
    },
    {
        # Branch 2: random forest.
        'type': 'random-forests',
        'criterion': hp.choice('criterion', ['gini', 'entropy']),
        'n_estimators': 1 + hp.randint('n_estimators_2', 500),
        'max_features': hp.choice('max_features2', ['sqrt', 'log2', None]),
        'ccp_alpha': hp.loguniform('ccp_alpha2', -6, 2)
    }
])

# Fonction objective

def objective(hyperspace):
    """Hyperopt objective covering both gradient boosting and random forests.

    Parameters
    ----------
    hyperspace : dict
        One sampled point of the search space. Its 'type' entry selects the
        model family ('gradient-boosting' or 'random-forests'); the remaining
        keys are the hyperparameters of that family.

    Returns
    -------
    float
        The negated mean ROC AUC of a 5-fold cross-validation on
        (X_train, y_train). It is negated because fmin minimises the
        objective while a higher AUC is better.

    Raises
    ------
    ValueError
        If 'type' is not one of the two supported model families. Failing
        loudly replaces the previous print-and-return-None, which handed
        hyperopt a non-numeric loss.
    """
    kind = hyperspace['type']

    if kind == 'gradient-boosting':
        clf = GradientBoostingClassifier(
            loss=hyperspace['loss'],
            max_features=hyperspace['max_features'],
            n_estimators=hyperspace['n_estimators'],
            learning_rate=hyperspace['lr_rate'],
            max_depth=hyperspace['max_depth'],
            ccp_alpha=hyperspace['ccp_alpha'],
            random_state=2023,
        )
    elif kind == 'random-forests':
        clf = RandomForestClassifier(
            criterion=hyperspace['criterion'],
            n_estimators=hyperspace['n_estimators'],
            max_features=hyperspace['max_features'],
            ccp_alpha=hyperspace['ccp_alpha'],
            random_state=2023,
            n_jobs=-1,
        )
    else:
        raise ValueError(f"Unknown classifier type: {kind!r}")

    # cross_val_score clones and fits the estimator on each fold itself, so no
    # prior fit on the full training set is needed.
    cv_score = cross_val_score(clf, X_train, y_train, cv=5, scoring="roc_auc", verbose=0, n_jobs=-1)

    return -cv_score.mean()

# Run the optimisation (1000 evaluations with the TPE algorithm).

best = fmin(objective, hyperspace, algo=tpe.suggest, max_evals=1000)
# fmin returns raw node values (option indices for hp.choice, un-offset draws
# for "1 + hp.randint"); space_eval resolves them into the selected model type
# and its actual hyperparameter values.
space_eval(hyperspace, best)

#### Réentraînement du meilleur algorithme avec les meilleurs hyperparamètres

In [None]:
# Final random forest model.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from a
# previous optimisation run; they are not read from `best`, so re-running the
# search does not update them.
clf_rf_final = RandomForestClassifier(max_features='log2',
                                                n_estimators=83,
                                                ccp_alpha=0.0024827110880081497,
                                                criterion='gini',
                                                random_state=2023)
# Fit on the whole training split; this fitted model is reused by later cells.
clf_rf_final.fit(X_train, y_train)
# Mean 5-fold cross-validated ROC AUC for the same configuration.
cv_score = cross_val_score(clf_rf_final, X_train, y_train, cv=5, scoring="roc_auc", verbose=0, n_jobs=-1)
cv_score.mean()

#### Aire sous la courbe

In [None]:
# Positive-class probabilities and ROC AUC, computed on the training split
# (the same data the model was fitted on).
y_train_pred_proba_rf = clf_rf_final.predict_proba(X_train)[:,1]
print(f'AUC = {roc_auc_score(y_train, y_train_pred_proba_rf)}')

#### Courbe ROC

In [None]:
# Add the random forest probabilities to the comparison dict and redraw the ROC plot.
results['Random Forests'] = y_train_pred_proba_rf
plot_roc_curve(results, y_train)

## Exercice 5 - Sélection du modèle et performances sur le jeu de test

| Modèle | AUC (CV) | AUC (refit)
| ------ | ------ | ------
| Régression logistique | 0.8615   | 0.8669  
| KNN |0.8616 | 0.8952
| Arbres de classification | 0.8648 | 0.8956
| Bagging (Arbres) | 0.8697 | 0.9121
| Gradient Boosting | <span style="color:red">0.8817</span>| <span style="color:red">0.9795</span>
| Forêts aléatoires | 0.8789  | 0.9730

#### Aire sous la courbe

In [None]:
# Evaluate the retained model (the gradient boosting classifier) on the
# held-out test split: positive-class probabilities and ROC AUC.
y_test_pred_proba_best = clf_gb_final.predict_proba(X_test)[:,1]
print(f'AUC = {roc_auc_score(y_test, y_test_pred_proba_best)}')

#### Courbe ROC

In [None]:
# ROC plot for the retained model on the test split, kept in its own dict so
# it is not mixed with the training-set curves.
results_test = {}
results_test['BEST'] = y_test_pred_proba_best
plot_roc_curve(results_test, y_test)