In [110]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

# import ensemble methods
from sklearn.ensemble import (
    BaggingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier,
)
from xgboost import XGBClassifier

# import base estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
)
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff

import warnings

warnings.filterwarnings(
    "ignore", category=DeprecationWarning
)  # to avoid deprecation warnings


In [83]:
# Read the raw CSV into a DataFrame, announcing progress before and after
print("Loading dataset...")
dataset = pd.read_csv(filepath_or_buffer="src/Data.csv")
print("...Done.", end="\n\n")

Loading dataset...
...Done.



In [None]:
# Dataset overview: dimensions, a preview, summary statistics and missing-value rates
n_rows, n_cols = dataset.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")
print()

# Preview of the first rows
print(dataset.head())
print()

# Summary statistics over every column (numeric and non-numeric alike)
data_desc = dataset.describe(include="all")
print(data_desc)
print()

# Share of missing values per column, expressed in percent
missing_values = dataset.isnull().sum().div(dataset.shape[0]).mul(100)
print(missing_values)

Number of rows: 10
Number of columns: 4

   Country   Age  Salary Purchased
0   France  44.0   72000        No
1    Spain  27.0   48000       Yes
2  Germany  30.0   54000        No
3    Spain  38.0   61000        No
4  Germany  40.0   69000       Yes

       Country        Age        Salary Purchased
count       10   9.000000     10.000000        10
unique       3        NaN           NaN         2
top     France        NaN           NaN        No
freq         4        NaN           NaN         5
mean       NaN  38.777778  64300.000000       NaN
std        NaN   7.693793  11681.419244       NaN
min        NaN  27.000000  48000.000000       NaN
25%        NaN  35.000000  55000.000000       NaN
50%        NaN  38.000000  64000.000000       NaN
75%        NaN  44.000000  71250.000000       NaN
max        NaN  50.000000  83000.000000       NaN

Country       0.0
Age          10.0
Salary        0.0
Purchased     0.0
dtype: float64


# Preprocessing

In [85]:
# Split the dataset into the label column (Y) and the remaining feature columns (X)
print("Separating labels from features...")
target_variable = "Purchased"

Y = dataset[target_variable]
X = dataset.drop(columns=[target_variable])  # `columns=` already implies the column axis

print("...Done.")
print()

# Show the first rows of each part, labels first
for heading, part in (("Y (Target variable):", Y), ("\nX (Features):", X)):
    print(heading)
    print(part.head())

Separating labels from features...
...Done.

Y (Target variable):
0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object

X (Features):
   Country   Age  Salary
0   France  44.0   72000
1    Spain  27.0   48000
2  Germany  30.0   54000
3    Spain  38.0   61000
4  Germany  40.0   69000


In [86]:
# Partition the feature columns by dtype so that each group gets its own preprocessing.
# "number" matches every numeric dtype (int32, float32, ...), not only int64/float64,
# so a numeric column stored with another width is not mistaken for a categorical one.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()
print("numeric_features:")
print(numeric_features)
print("\ncategorical_features:")
print(categorical_features)

numeric_features:
['Age', 'Salary']

categorical_features:
['Country']


In [87]:
# Hold out 20% of the rows for testing; the split is stratified on Y and uses a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)

In [88]:
# Numeric columns: fill missing values with the column mean, then standardize
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [89]:
# Categorical columns: one-hot encode, dropping the first level of each feature.
# handle_unknown="ignore" keeps transform() from raising when the test split (or new
# data) contains a category that was absent from the training split; such a value is
# encoded as all zeros, i.e. like the dropped reference level.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [90]:
# Route each column group to its dedicated transformer
num_step = ("num", numeric_transformer, numeric_features)
cat_step = ("cat", categorical_transformer, categorical_features)
preprocessor = ColumnTransformer(transformers=[num_step, cat_step])

In [91]:
# --- Training features: fit the preprocessor on them and transform them ---
print("Performing preprocessings on train set...")
X_train = preprocessor.fit_transform(X_train)
print("X_train done", X_train[:5], "", sep="\n")

# --- Training labels: fit the label encoder and encode ---
print("Label Encoding on train set:")
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
print("y_train done", y_train[:5], "", sep="\n")

# --- Test features: reuse the preprocessor fitted on the training set (no refit) ---
print("Performing preprocessings on test set...")
X_test = preprocessor.transform(X_test)
print(X_test[:5, :], "", sep="\n")

# --- Test labels: show the raw labels, then encode them with the fitted encoder ---
print("Encoding labels...")
print(y_test[:5])
Y_test = encoder.transform(y_test)
print("...Done")
print(Y_test[:5])

Performing preprocessings on train set...
X_train done
[[ 1.61706195e+00  1.46885753e+00  0.00000000e+00  0.00000000e+00]
 [ 8.22715727e-01  1.09777773e+00  1.00000000e+00  0.00000000e+00]
 [-1.41104234e-15 -1.00500778e+00  0.00000000e+00  1.00000000e+00]
 [ 2.26956063e-01  8.50391200e-01  0.00000000e+00  0.00000000e+00]
 [ 4.25542617e-01  1.08231607e-01  0.00000000e+00  1.00000000e+00]]

Label Encoding on train set:
y_train done
[0 1 0 1 0]

Performing preprocessings on test set...
[[2.80858127 2.82948345 1.         0.        ]
 [2.41140816 2.33471038 0.         0.        ]]

Encoding labels...
8     No
7    Yes
Name: Purchased, dtype: object
...Done
[0 1]


# Bagging

Bagging with logistic regression as base estimator

In [92]:
# Base learner: logistic regression, with max_iter raised to handle convergence warnings
logistic_regression = LogisticRegression(max_iter=1000)

# Bagging ensemble built on the logistic regression.
# random_state is fixed (same seed as the train/test split) so that the bootstrap
# resampling -- and therefore the grid-search results below -- are reproducible.
model = BaggingClassifier(estimator=logistic_regression, random_state=0)

# Hyperparameter grid: the "estimator__" prefix targets the wrapped base learner
params = {
    "estimator__C": [0.01, 0.05, 0.1, 0.5],  # C is a hyperparameter of LogisticRegression
    "n_estimators": [5, 10, 20, 30]  # n_estimators is a hyperparameter of BaggingClassifier
}

# Grid search with 3-fold cross-validation
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    n_jobs=-1,  # Utilize all available cores for parallel processing
    verbose=2   # Increase verbosity for detailed output
)

# Run the search
print("Performing grid search...")
gridsearch.fit(X_train, y_train)
print("Grid search completed.")

# Report the selected hyperparameters and the accuracy on each split
print(f"Best hyperparameters: {gridsearch.best_params_}")
print(f"Best cross-validation accuracy: {gridsearch.best_score_:.4f}")
print(f"Training accuracy: {gridsearch.score(X_train, y_train):.4f}")
print(f"Test accuracy: {gridsearch.score(X_test, Y_test):.4f}")

Performing grid search...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END ..................estimator__C=0.01, n_estimators=5; total time=   0.0s
[CV] END ..................estimator__C=0.01, n_estimators=5; total time=   0.0s
[CV] END .................estimator__C=0.01, n_estimators=10; total time=   0.0s
[CV] END ..................estimator__C=0.01, n_estimators=5; total time=   0.0s
[CV] END .................estimator__C=0.01, n_estimators=10; total time=   0.0s
[CV] END ..................estimator__C=0.05, n_estimators=5; total time=   0.0s
[CV] END .................estimator__C=0.01, n_estimators=10; total time=   0.0s
[CV] END ..................estimator__C=0.05, n_estimators=5; total time=   0.0s
[CV] END ..................estimator__C=0.05, n_estimators=5; total time=   0.0s
[CV] END .................estimator__C=0.01, n_estimators=20; total time=   0.1s
[CV] END .................estimator__C=0.01, n_estimators=30; total time=   0.0s
[CV] END .............

Bagging with decision tree as base estimator

In [93]:
# Base learner: a decision tree
decision_tree = DecisionTreeClassifier()

# Bagging ensemble built on the decision tree.
# random_state is fixed (same seed as the train/test split) so that the bootstrap
# resampling and the seeds handed to the individual trees are reproducible.
model = BaggingClassifier(estimator=decision_tree, random_state=0)

# Hyperparameter grid: the "estimator__" prefix targets the wrapped decision tree
params = {
    "estimator__max_depth": [1, 2, 3],
    "estimator__min_samples_leaf": [1, 2, 3],
    "estimator__min_samples_split": [2, 3, 4],
    "n_estimators": [2, 4, 6, 8, 10],
}

# Grid search with 3-fold cross-validation
print("Running grid search...")
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3, verbose=1,
    n_jobs=-1  # Parallel processing for efficiency
)
gridsearch.fit(X_train, y_train)

# Report the best hyperparameters and the cross-validated score
print("Grid search complete.")
print(f"Best hyperparameters: {gridsearch.best_params_}")
print(f"Best cross-validation accuracy: {gridsearch.best_score_:.4f}")

# Accuracy of the refitted best model on the training and test sets
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, Y_test)
print(f"Training set accuracy: {train_accuracy:.4f}")
print(f"Test set accuracy: {test_accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 135 candidates, totalling 405 fits
Grid search complete.
Best hyperparameters: {'estimator__max_depth': 1, 'estimator__min_samples_leaf': 2, 'estimator__min_samples_split': 2, 'n_estimators': 4}
Best cross-validation accuracy: 0.6111
Training set accuracy: 0.6250
Test set accuracy: 0.5000


  _data = np.array(data, dtype=dtype, copy=copy,


In [94]:
# Sanity check: count NaN and infinite entries in the preprocessed feature matrices
print("Checking for NaN or infinite values in X_train and X_test...")
for kind, detector in (("NaN", np.isnan), ("Infinite", np.isinf)):
    for split_name, matrix in (("X_train", X_train), ("X_test", X_test)):
        print(f"{kind} values in {split_name}:", detector(matrix).sum())

Checking for NaN or infinite values in X_train and X_test...
NaN values in X_train: 0
NaN values in X_test: 0
Infinite values in X_train: 0
Infinite values in X_test: 0


In [95]:
# Report the dtype of each preprocessed feature matrix
for split_name, matrix in (("X_train", X_train), ("X_test", X_test)):
    print(f"Data type of {split_name}:", matrix.dtype)

Data type of X_train: float64
Data type of X_test: float64


# Boosting
Adaboost

Adaboost with logistic regression as base estimator

In [97]:
# Weak learner for boosting: logistic regression (max_iter raised to avoid convergence warnings)
logistic_regression = LogisticRegression(max_iter=1000)
# SAMME is requested explicitly to avoid the warning about the default algorithm
model = AdaBoostClassifier(algorithm="SAMME", estimator=logistic_regression)

# Search space: "estimator__" reaches into the wrapped LogisticRegression,
# while n_estimators belongs to AdaBoost itself
params = dict(
    estimator__C=[0.01, 0.05, 0.1, 0.5],
    n_estimators=[5, 10, 20, 30],
)

# 3-fold cross-validated grid search, progress shown, all CPU cores used
print("Running grid search...")
gridsearch = GridSearchCV(model, params, cv=3, verbose=1, n_jobs=-1)
gridsearch.fit(X_train, y_train)

# Best configuration found and its cross-validated accuracy
print("Grid search complete.")
print(f"Best hyperparameters: {gridsearch.best_params_}")
print(f"Best cross-validation accuracy: {gridsearch.best_score_:.4f}")

# Accuracy of the refitted best model on both splits
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, Y_test)
for split_label, accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{split_label} set accuracy: {accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Grid search complete.
Best hyperparameters: {'estimator__C': 0.5, 'n_estimators': 5}
Best cross-validation accuracy: 0.6111
Training set accuracy: 0.8750
Test set accuracy: 0.5000


Adaboost with decision tree as base estimator

In [98]:
# Weak learner: a decision tree. SAMME is requested explicitly to avoid the deprecation warning.
decision_tree = DecisionTreeClassifier()
# random_state is fixed (same seed as the train/test split) so that the seed handed to
# the tree at each boosting iteration -- and hence the fitted ensemble -- is reproducible.
model = AdaBoostClassifier(estimator=decision_tree, algorithm='SAMME', random_state=0)

# Hyperparameter grid for the search
params = {
    "estimator__max_depth": [1, 2, 3],           # Depth of the decision tree
    "estimator__min_samples_leaf": [1, 2, 3],    # Min. number of samples per leaf
    "estimator__min_samples_split": [2, 3, 4],   # Min. number of samples to split a node
    "n_estimators": [2, 4, 6, 8, 10],            # Number of estimators in AdaBoost
}

# Grid search with 3-fold cross-validation
print("Running grid search...")
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    verbose=1,  # Show search progress
    n_jobs=-1   # Use all cores to speed up the search
)
gridsearch.fit(X_train, y_train)

# Report the best hyperparameters and the cross-validated score
print("Grid search complete.")
print(f"Best hyperparameters: {gridsearch.best_params_}")
print(f"Best cross-validation accuracy: {gridsearch.best_score_:.4f}")

# Accuracy of the refitted best model on the training and test sets
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, Y_test)
print(f"Training set accuracy: {train_accuracy:.4f}")
print(f"Test set accuracy: {test_accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 135 candidates, totalling 405 fits
Grid search complete.
Best hyperparameters: {'estimator__max_depth': 1, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'n_estimators': 2}
Best cross-validation accuracy: 0.3889
Training set accuracy: 0.6250
Test set accuracy: 0.5000


# Scikit-learn's GradientBoosting
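Boosting with logistic regression as base estimator: not possible ⛔️ — scikit-learn's GradientBoostingClassifier always uses decision trees as base learners and does not accept a custom base estimator.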

In [101]:
# Gradient boosting model.
# random_state is fixed (same seed as the train/test split) so that the randomness
# used while growing the trees does not change the results from one run to the next.
model = GradientBoostingClassifier(random_state=0)

# Hyperparameter grid for the search
params = {
    "max_depth": [1, 2, 3],                 # Maximum depth of the trees
    "min_samples_leaf": [1, 2, 3],          # Min. number of samples in a leaf
    "min_samples_split": [2, 3, 4],         # Min. number of samples to split a node
    "n_estimators": [2, 4, 6, 8, 10],       # Number of trees in the ensemble
}

# Grid search with 3-fold cross-validation
print("Running grid search...")
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    verbose=1,  # Show search progress
    n_jobs=-1   # Use all cores to speed up the search
)
gridsearch.fit(X_train, y_train)

# Report the best hyperparameters and the cross-validated score
print("Grid search complete.")
print(f"Best hyperparameters: {gridsearch.best_params_}")
print(f"Best cross-validation accuracy: {gridsearch.best_score_:.4f}")

# Accuracy of the refitted best model on the training and test sets
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, Y_test)
print(f"Training set accuracy: {train_accuracy:.4f}")
print(f"Test set accuracy: {test_accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 135 candidates, totalling 405 fits
Grid search complete.
Best hyperparameters: {'max_depth': 1, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 2}
Best cross-validation accuracy: 0.3889
Training set accuracy: 0.7500
Test set accuracy: 0.5000


# XGBoost
XGBoost is a separate library, but it provides a scikit-learn-compatible API that lets you train a model as if it were a scikit-learn estimator.

In [103]:
# XGBoost classifier; eval_metric is given explicitly to avoid a warning
xgboost = XGBClassifier(eval_metric="logloss")

# Search space: tree depth, minimum child weight (a constraint comparable to
# scikit-learn's min_samples_leaf) and number of trees
params = dict(
    max_depth=[2, 4, 6],
    min_child_weight=[1, 2, 3],
    n_estimators=[2, 4, 6, 8],
)

# 3-fold cross-validated grid search, progress shown, all CPU cores used
print("Running grid search...")
gridsearch = GridSearchCV(xgboost, params, cv=3, verbose=1, n_jobs=-1)
gridsearch.fit(X_train, y_train)

# Best configuration found and its cross-validated accuracy
print("Grid search complete.")
print(f"Best hyperparameters: {gridsearch.best_params_}")
print(f"Best cross-validation accuracy: {gridsearch.best_score_:.4f}")

# Accuracy of the refitted best model on both splits
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, Y_test)
for split_label, accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{split_label} set accuracy: {accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Grid search complete.
Best hyperparameters: {'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 2}
Best cross-validation accuracy: 0.3889
Training set accuracy: 0.5000
Test set accuracy: 0.5000


# Voting
Contrary to bagging and boosting, the voting classifier is meant to mix different base estimators. Let's see an example with three base estimators:

logistic regression

decision tree

SVM with rbf kernel


In [104]:
# Logistic regression, with max_iter raised to avoid convergence warnings
logreg = LogisticRegression(max_iter=1000)

# Regularization strengths to try
params = dict(C=[0.1, 1.0, 10.0])

# 3-fold cross-validated grid search, progress shown, all CPU cores used
print("Running grid search...")
logreg_opt = GridSearchCV(logreg, params, cv=3, verbose=1, n_jobs=-1)
logreg_opt.fit(X_train, y_train)

# Best configuration found and its cross-validated accuracy
print("Grid search complete.")
print(f"Best hyperparameters: {logreg_opt.best_params_}")
print(f"Best cross-validation accuracy: {logreg_opt.best_score_:.4f}")

# Accuracy of the refitted best model on both splits
train_accuracy = logreg_opt.score(X_train, y_train)
test_accuracy = logreg_opt.score(X_test, Y_test)
for split_label, accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{split_label} set accuracy: {accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Grid search complete.
Best hyperparameters: {'C': 10.0}
Best cross-validation accuracy: 0.6111
Training set accuracy: 0.7500
Test set accuracy: 0.5000


Voting with decision tree

In [105]:
# Decision tree model.
# random_state is fixed (same seed as the train/test split) so that ties between
# equally good splits are broken the same way on every run.
dt = DecisionTreeClassifier(random_state=0)

# Hyperparameter grid for the search
params = {
    "max_depth": [1, 2, 3],               # Maximum depth of the tree
    "min_samples_leaf": [1, 2, 3],        # Min. number of samples per leaf
    "min_samples_split": [2, 3, 4],       # Min. number of samples to split a node
}

# Grid search with 3-fold cross-validation
print("Running grid search...")
dt_opt = GridSearchCV(
    estimator=dt,
    param_grid=params,
    cv=3,
    verbose=1,  # Show search progress
    n_jobs=-1   # Use all cores to speed up the search
)
dt_opt.fit(X_train, y_train)

# Report the best hyperparameters and the cross-validated score
print("Grid search complete.")
print(f"Best hyperparameters: {dt_opt.best_params_}")
print(f"Best cross-validation accuracy: {dt_opt.best_score_:.4f}")

# Accuracy of the refitted best model on the training and test sets
train_accuracy = dt_opt.score(X_train, y_train)
test_accuracy = dt_opt.score(X_test, Y_test)
print(f"Training set accuracy: {train_accuracy:.4f}")
print(f"Test set accuracy: {test_accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Grid search complete.
Best hyperparameters: {'max_depth': 1, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best cross-validation accuracy: 0.5000
Training set accuracy: 0.6250
Test set accuracy: 0.5000


Voting with SVM with rbf kernel

In [106]:
# SVM with an RBF kernel and probability estimates enabled.
# random_state is fixed (same seed as the train/test split) because the probability
# estimates rely on an internal, randomly shuffled cross-validation; without a seed
# predict_proba output can differ between runs.
svm = SVC(kernel="rbf", probability=True, random_state=0)

# Hyperparameter grid for the search
params = {
    "C": [0.1, 1.0, 10.0],         # Regularization parameter
    "gamma": [0.1, 1.0, 10.0],     # RBF kernel coefficient
}

# Grid search with 3-fold cross-validation
print("Running grid search...")
svm_opt = GridSearchCV(
    estimator=svm,
    param_grid=params,
    cv=3,
    verbose=1,  # Show search progress
    n_jobs=-1   # Use all cores to speed up the search
)
svm_opt.fit(X_train, y_train)

# Report the best hyperparameters and the cross-validated score
print("Grid search complete.")
print(f"Best hyperparameters: {svm_opt.best_params_}")
print(f"Best cross-validation accuracy: {svm_opt.best_score_:.4f}")

# Accuracy of the refitted best model on the training and test sets
train_accuracy = svm_opt.score(X_train, y_train)
test_accuracy = svm_opt.score(X_test, Y_test)
print(f"Training set accuracy: {train_accuracy:.4f}")
print(f"Test set accuracy: {test_accuracy:.4f}")

Running grid search...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Grid search complete.
Best hyperparameters: {'C': 0.1, 'gamma': 10.0}
Best cross-validation accuracy: 0.5556
Training set accuracy: 1.0000
Test set accuracy: 0.5000


VOTING CLASSIFIER

In [107]:
# Soft-voting ensemble over the three tuned models (votes are based on predicted probabilities)
base_models = [
    ("logistic", logreg_opt.best_estimator_),  # Best logistic regression
    ("tree", dt_opt.best_estimator_),          # Best decision tree
    ("svm", svm_opt.best_estimator_),          # Best SVM
]
voting = VotingClassifier(estimators=base_models, voting="soft")

# Fit the ensemble on the training set
voting.fit(X_train, y_train)

# Accuracy on both splits
train_accuracy = voting.score(X_train, y_train)
test_accuracy = voting.score(X_test, Y_test)

for split_label, accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{split_label} set accuracy: {accuracy:.4f}")

Training set accuracy: 0.7500
Test set accuracy: 0.5000


# Stacking

Like voting, stacking combines different base estimators, but this time they are used to produce probability estimates (ideally independent of one another) that are fed as inputs to a final estimator. The default final estimator is LogisticRegression, but it can be changed using the final_estimator parameter.

In the examples below, the logistic regression, decision tree and SVM models (logreg_opt, dt_opt and svm_opt) have already been optimized through a grid search.

In [109]:
# Stacking ensemble over the three tuned models, with logistic regression on top
print("Training stacking classifier...")
base_models = [
    ("logistic", logreg_opt.best_estimator_),  # Best logistic regression
    ("tree", dt_opt.best_estimator_),          # Best decision tree
    ("svm", svm_opt.best_estimator_),          # Best SVM
]
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),  # Same as the default final estimator
    cv=3,  # 3-fold cross-validation for the first level
)

# Fit the ensemble and collect the base estimators' outputs, one column per estimator
preds = stacking.fit_transform(X_train, y_train)
predictions = pd.DataFrame(preds, columns=stacking.named_estimators_.keys())
print("...Done.")
display(predictions)

# Accuracy on both splits
train_accuracy = stacking.score(X_train, y_train)
test_accuracy = stacking.score(X_test, Y_test)

for split_label, accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{split_label} set accuracy: {accuracy:.4f}")

Training stacking classifier...
...Done.


Unnamed: 0,logistic,tree,svm
0,0.427365,0.4,0.543419
1,0.436559,0.4,0.456637
2,0.132388,0.4,0.543419
3,0.801265,0.4,0.456637
4,0.278854,0.4,0.543419
5,0.669254,0.666667,0.456637
6,0.65324,0.666667,0.543419
7,0.601132,0.666667,0.456637


Training set accuracy: 0.2500
Test set accuracy: 0.5000


TIP

Check for correlations

Since the predictions used in stacking are supposed to be independent, it is good practice to check the correlation matrix of the outputs from the different estimators. If some predictions are strongly correlated, it is better to retrain the stacking model after dropping one of the correlated estimators.

In [111]:
# Correlation between the base models' outputs, rounded to two decimals
corr_matrix = predictions.corr().round(2)

# Annotated heatmap: both axes are labelled with the base model names
column_labels = corr_matrix.columns.tolist()
row_labels = corr_matrix.index.tolist()
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=column_labels,
    y=row_labels,
    colorscale="Viridis",
)

# Titles and margins, then render the figure
heatmap_layout = {
    "title": "Correlation Matrix of Base Model Predictions",
    "xaxis_title": "Models",
    "yaxis_title": "Models",
    "margin": {"l": 100, "r": 100, "t": 50, "b": 50},
}
fig.update_layout(**heatmap_layout)
fig.show()


In [113]:
# Retrain the stacking ensemble with the decision tree removed
print("Training stacking classifier without the tree estimator due to correlation with SVM...")
base_models = [
    ("logistic", logreg_opt.best_estimator_),  # Best logistic regression
    ("svm", svm_opt.best_estimator_),          # Best SVM
]
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),  # Same as the default final estimator
    cv=3,                                  # 3-fold cross-validation for the first level
)

# Fit the ensemble and collect the base estimators' outputs, one column per estimator
preds = stacking.fit_transform(X_train, y_train)
predictions = pd.DataFrame(preds, columns=stacking.named_estimators_.keys())
print("...Done.")
display(predictions)

# Accuracy on both splits
train_accuracy = stacking.score(X_train, y_train)
test_accuracy = stacking.score(X_test, Y_test)

for split_label, accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{split_label} set accuracy: {accuracy:.4f}")

Training stacking classifier without the tree estimator due to correlation with SVM...
...Done.


Unnamed: 0,logistic,svm
0,0.427365,0.544746
1,0.436559,0.459113
2,0.132388,0.544746
3,0.801265,0.459113
4,0.278854,0.544746
5,0.669254,0.459113
6,0.65324,0.544746
7,0.601132,0.459113


Training set accuracy: 0.1250
Test set accuracy: 0.5000


In [114]:
# Stacking ensemble (logistic regression + SVM) with a decision tree as the final estimator
print("Training stacking classifier with DecisionTree as the final estimator...")
stacking = StackingClassifier(
    estimators=[
        ("logistic", logreg_opt.best_estimator_),  # Best logistic regression
        ("svm", svm_opt.best_estimator_)           # Best SVM
    ],
    # random_state is fixed (same seed as the train/test split) so that the final tree
    # breaks ties between equally good splits the same way on every run
    final_estimator=DecisionTreeClassifier(random_state=0),
    cv=3                                           # 3-fold cross-validation for the first level
)

# Fit the ensemble and collect the base estimators' outputs, one column per estimator
preds = stacking.fit_transform(X_train, y_train)
predictions = pd.DataFrame(preds, columns=stacking.named_estimators_.keys())
print("...Done.")
display(predictions)

# Accuracy on the training and test sets
train_accuracy = stacking.score(X_train, y_train)
test_accuracy = stacking.score(X_test, Y_test)

print(f"Training set accuracy: {train_accuracy:.4f}")
print(f"Test set accuracy: {test_accuracy:.4f}")

Training stacking classifier with DecisionTree as the final estimator...
...Done.


Unnamed: 0,logistic,svm
0,0.427365,0.544746
1,0.436559,0.459113
2,0.132388,0.544746
3,0.801265,0.459113
4,0.278854,0.544746
5,0.669254,0.459113
6,0.65324,0.544746
7,0.601132,0.459113


Training set accuracy: 0.6250
Test set accuracy: 0.5000
