In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

# import ensemble methods
from sklearn.ensemble import (
    BaggingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier,
    RandomForestClassifier
)
from xgboost import XGBClassifier

# import base estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
)
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff

import warnings

warnings.filterwarnings(
    "ignore", category=DeprecationWarning
)  # to avoid deprecation warnings


# Feature Exploration, Engineering and Cleaning
Import the data using the following link : "https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Machine+Learning+Supervis%C3%A9/stacking/titanic.csv" , and display the first lines. Are there any missing values in the dataset?

In [93]:
print('I\'m fine')

I'm fine


In [94]:
url = "https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Machine+Learning+Supervis%C3%A9/stacking/titanic.csv"
dataset = pd.read_csv(url)
print(dataset[0:5])

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [95]:
print(f'shape: {dataset.shape}')
print()

print(f'Number of rows: {dataset.shape[0]}')
print(f'Number of columns: {dataset.shape[1]}')
print()

data_desc = dataset.describe(include="all")
print(f'Desc of shape: \n{data_desc}')
print()

shape: (891, 12)

Number of rows: 891
Number of columns: 12

Desc of shape: 
        PassengerId    Survived      Pclass                     Name   Sex  \
count    891.000000  891.000000  891.000000                      891   891   
unique          NaN         NaN         NaN                      891     2   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
freq            NaN         NaN         NaN                        1   577   
mean     446.000000    0.383838    2.308642                      NaN   NaN   
std      257.353842    0.486592    0.836071                      NaN   NaN   
min        1.000000    0.000000    1.000000                      NaN   NaN   
25%      223.500000    0.000000    2.000000                      NaN   NaN   
50%      446.000000    0.000000    3.000000                      NaN   NaN   
75%      668.500000    1.000000    3.000000                      NaN   NaN   
max      891.000000    1.000000    3.000000                      

In [96]:
# Percentage of missing values for each column
missing_values = (dataset.isnull().sum() / dataset.shape[0]) * 100
print(f'Missed values: \n{missing_values}')

Missed values: 
PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64


The following columns have missing values, so we'll have to choose an imputation technique to handle NAs: Age, Cabin, Embarked

What types of variables are present in this dataset? What kind of preprocessing could you run on these variables?

In [97]:
print(f'dataset_type : \n{dataset.dtypes}')

dataset_type : 
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


The dataset is a mix of numeric and categorical variables. The quantitative variables have to be normalized, whereas the categorical have to be encoded.

Here are some guidelines you can follow to clean the dataset as well as create new variables (feature engineering).

Create a Name_length variable that measures the number of characters in the variable Name for each observations.

In [98]:
dataset['Name_length'] = dataset['Name'].apply(len)
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24


Create a variable Has_Cabin that indicates whether the passenger has a cabin or not.

In [99]:
dataset['Has_Cabin'] = dataset['Cabin'].notna()

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Has_Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,True
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,False


Create a variable FamilySize that gives the size of each passenger's family.

In [100]:
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Has_Cabin,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,False,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,True,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,False,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,True,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,False,1


Create a variable IsAlone that indicates whether the passenger is traveling on their own.

In [101]:
dataset['IsAlone'] = dataset['FamilySize'] == 1

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,False,2,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,True,2,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,False,1,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,True,2,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,False,1,True


Extract the title from each passenger in order to create a variable Title

In [102]:
def get_title(name):
    try:
        # Extraction du titre en supposant que le format est "Nom, Titre. Prénom"
        return name.split(", ")[1].split(".")[0]
    except IndexError:
        # Retourne "Unknown" si le format ne correspond pas
        return "Unknown"

dataset['Title'] = dataset['Name'].apply(get_title)

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,False,2,False,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,True,2,False,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,False,1,True,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,True,2,False,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,False,1,True,Mr


In [103]:
print(dataset['Title'].unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer']


In [104]:
# Remplacement des valeurs dans la colonne 'Title' avec un dictionnaire
dataset['Title'] = dataset['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Lady': 'Miss', 'Sir': 'Mr', 'Master':'Mr' })

print(dataset['Title'].unique())

['Mr' 'Mrs' 'Miss' 'Don' 'Rev' 'Dr' 'Major' 'Col' 'Capt' 'the Countess'
 'Jonkheer']


Are any of the remaining titles underrepresented among the observations? If it is the case, group them in a unique modality "Rare"

In [105]:
value_counts = dataset['Title'].value_counts()

value_counts

Title
Mr              558
Miss            186
Mrs             126
Dr                7
Rev               6
Major             2
Col               2
Don               1
Capt              1
the Countess      1
Jonkheer          1
Name: count, dtype: int64

In [106]:
rare_names = ['Don', 'Rev', 'Dr', 'Major', 'Col', 'Capt', 'the Countess', 'Jonkheer']
dataset['Title'] = dataset['Title'].replace(rare_names, 'Rare')

print(dataset['Title'].unique())

['Mr' 'Mrs' 'Miss' 'Rare']


Drop the columns 'PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp' du dataset. Why don't we need these columns for what's next?

In [107]:
dataset_cleaned = dataset.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'])

dataset_cleaned.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,male,22.0,0,7.25,S,23,False,2,False,Mr
1,1,1,female,38.0,0,71.2833,C,51,True,2,False,Mrs
2,1,3,female,26.0,0,7.925,S,22,False,1,True,Miss
3,1,1,female,35.0,0,53.1,S,44,True,2,False,Mrs
4,0,3,male,35.0,0,8.05,S,24,False,1,True,Mr


Separate the features from the target and split the data between train and test (with random_state = 0)

In [108]:
# Separate target variable Y from features X
print("Separating labels from features...")
target_variable = "Survived"

X = dataset_cleaned.drop(columns=[target_variable], axis= 1)
Y = dataset_cleaned[target_variable]

print("...Done.")
print()

print("Y (Target variable):")
print(Y.head())
print("\nX (Features):")
print(X.head())

Separating labels from features...
...Done.

Y (Target variable):
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

X (Features):
   Pclass     Sex   Age  Parch     Fare Embarked  Name_length  Has_Cabin  \
0       3    male  22.0      0   7.2500        S           23      False   
1       1  female  38.0      0  71.2833        C           51       True   
2       3  female  26.0      0   7.9250        S           22      False   
3       1  female  35.0      0  53.1000        S           44       True   
4       3    male  35.0      0   8.0500        S           24      False   

   FamilySize  IsAlone Title  
0           2    False    Mr  
1           2    False   Mrs  
2           1     True  Miss  
3           2    False   Mrs  
4           1     True    Mr  


In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

Using the Pipeline and ColumnTransformer, make all the preprocessings at once. Use the KNN imputer to handle the missing values in the numeric variables, and the SimpleImputer for categorical data.

In [110]:
numeric_features = X.select_dtypes(include=["int64", "float64", "bool"]).columns.tolist()
categorical_features = X.select_dtypes(exclude=["int64", "float64", "bool"]).columns.tolist()
print("numeric_features:")
print(numeric_features)
print("\ncategorical_features:")
print(categorical_features)


numeric_features:
['Pclass', 'Age', 'Parch', 'Fare', 'Name_length', 'Has_Cabin', 'FamilySize', 'IsAlone']

categorical_features:
['Sex', 'Embarked', 'Title']


In [111]:
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())
])

In [112]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop="first"))
])

In [113]:
preprocessor = ColumnTransformer(transformers=[
   ('num', numeric_transformer, numeric_features),
   ('cat', categorical_transformer, categorical_features)
])

In [114]:
# Preprocessing on train set
print('performing preprocessing on train set')
print(X_train.head())

X_train = preprocessor.fit_transform(X_train)
print('preprocessor on train set done')
X_train[0:5]
print()

# Preprocessing on test set
print('performing preprocessing on test set')
print(X_test.head())

X_test = preprocessor.transform(X_test)
print('preprocessor on test set done')
X_test[0:5]


performing preprocessing on train set
     Pclass     Sex   Age  Parch     Fare Embarked  Name_length  Has_Cabin  \
502       3  female   NaN      0   7.6292        Q           30      False   
464       3    male   NaN      0   8.0500        S           18      False   
198       3  female   NaN      0   7.7500        Q           32      False   
765       1  female  51.0      0  77.9583        S           36       True   
421       3    male  21.0      0   7.7333        Q           19      False   

     FamilySize  IsAlone Title  
502           1     True  Miss  
464           1     True    Mr  
198           1     True  Miss  
765           2    False   Mrs  
421           1     True    Mr  
preprocessor on train set done

performing preprocessing on test set
     Pclass     Sex   Age  Parch     Fare Embarked  Name_length  Has_Cabin  \
153       3    male  40.5      2  14.5000        S           31      False   
752       3    male  33.0      0   9.5000        S           32      F

array([[ 0.8342464 ,  0.82047558,  2.1035633 , -0.35183846,  0.42980276,
        -0.5383819 ,  0.7186833 , -1.23846995,  1.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  0.        ],
       [ 0.8342464 ,  0.25328956, -0.46765914, -0.44660331,  0.53706514,
        -0.5383819 , -0.56377913,  0.80744793,  1.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  0.        ],
       [ 0.8342464 ,  0.70703837,  5.96039696, -0.03390238,  3.2186246 ,
        -0.5383819 ,  3.28360818, -1.23846995,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ],
       [ 0.8342464 , -0.12483446, -0.46765914, -0.44660331,  0.32254038,
        -0.5383819 , -0.56377913,  0.80744793,  1.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  0.        ],
       [-1.56084809, -0.42733367, -0.46765914,  0.94944027,  0.32254038,
         1.85741756, -0.56377913,  0.80744793,  0.        ,  0.        ,
         0.        ,  0.  

# Pearson Correlation Heatmap

Produce a figure that contains the correlation table for all the explanatory variables of X_train, what do you think?

In [115]:
corr_matrix = pd.DataFrame(X_train).corr().round(2)


In [116]:
# Création de la heatmap annotée
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    colorscale="Viridis",  # Ajout d'une palette de couleurs pour un meilleur contraste
    showscale=True         # Ajout d'une barre de couleur pour indiquer les valeurs
)

# Mise à jour du layout pour améliorer l'affichage
fig.update_layout(
    title="Correlation Matrix Heatmap",
    xaxis_title="Features",
    yaxis_title="Features",
    margin=dict(l=100, r=100, t=50, b=50)  # Ajustement des marges pour éviter la coupure des étiquettes
)

# Affichage de la heatmap
fig.show()

Correlations between the variables are not very high, we can hope that they will each bring complementary information in order to predict the target variable.

# Ensembling & Stacking models

Now that we have finished our preprocessing and made sure our data was fit for prediction, let's move on to creating our ensemble models. We'll train different models with different ensembling strategies and store their train and test scores for comparison.

## Random Forest

Train a Random Forest by tuning the hyperparameters with a grid search. Which ensemble method is related to random forests?

Evaluate the best model's accuracy on train and test sets. Save the scores into a pandas DataFrame.

In [117]:
scores_df = pd.DataFrame(columns = ['model', 'accuracy', 'set'])
scores_df

Unnamed: 0,model,accuracy,set


In [118]:
# initialisation du modèle de random forest:
random_forest = RandomForestClassifier()

# grille d'hyperparamètres pour la recherche par grille:
params = {
    'max_depth': [2, 4, 6, 8, 10],          # Profondeur maximale des arbres
    'min_samples_leaf': [1, 2, 5],         # Nombre minimal d'échantillons dans une feuille
    'min_samples_split': [2, 4, 8],        # Nombre minimal d'échantillons pour diviser un nœud
    'n_estimators': [10, 20, 40, 60, 80, 100]  # Nombre d'arbres dans la forêt
}

print(params)

# initialisation la GridSearch avec validation croisée:
gridsearch = GridSearchCV(
    estimator= random_forest,
    param_grid= params,
    cv=3,
    verbose=1,
)
gridsearch.fit(X_train, y_train)

# Affichage des meilleurs paramètres et performances
print(f'Meilleurs hyperparamètres: {gridsearch.best_params_}')
print(f'Meilleure cross-validation accuracy: {gridsearch.best_score_:.4f}')

# Evaluation des meilleures performances sur les ensembles d'entrainement et de test
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, y_test)
print(f'Meilleur accuracy score sur le train set: {train_accuracy:.4f}')
print(f'Meilleur accuracy score sur le test set: {test_accuracy:.4f}')


{'max_depth': [2, 4, 6, 8, 10], 'min_samples_leaf': [1, 2, 5], 'min_samples_split': [2, 4, 8], 'n_estimators': [10, 20, 40, 60, 80, 100]}
Fitting 3 folds for each of 270 candidates, totalling 810 fits
Meilleurs hyperparamètres: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 20}
Meilleure cross-validation accuracy: 0.8314
Meilleur accuracy score sur le train set: 0.9382
Meilleur accuracy score sur le test set: 0.7933



invalid value encountered in cast



In [119]:
# Ajout des nouvelles lignes au DataFrame
new_rows = pd.DataFrame([
    {'model': 'random_forest', 'accuracy': train_accuracy, 'set': 'train'},
    {'model': 'random_forest', 'accuracy': test_accuracy, 'set': 'test'}
])

# Fusion avec le DataFrame existant
scores_df = pd.concat([scores_df, new_rows], ignore_index=True)

# Affichage du DataFrame mis à jour
scores_df


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Unnamed: 0,model,accuracy,set
0,random_forest,0.938202,train
1,random_forest,0.793296,test


Create your own Bagging of decision tree (with the same hyperparameters as the optimal ones for Random Forest) and check you get compatible performances.

In [120]:
# Initialisation du modèle de base et de l'algorithme d'ensemble
decision_tree = DecisionTreeClassifier(
    max_depth= 8,
    min_samples_leaf= 1,
    min_samples_split= 4
)

# Initialize BaggingClassifier with decision_tree as the base estimator
bagging = BaggingClassifier(
    estimator=decision_tree,
    n_estimators=10)
bagging.fit(X_train, y_train)

# Evaluation des meilleures performances sur les ensembles d'entrainement et de test
train_accuracy = bagging.score(X_train, y_train)
test_accuracy = bagging.score(X_test, y_test)
print(f'Meilleur accuracy score sur le train set: {train_accuracy:.4f}')
print(f'Meilleur accuracy score sur le test set: {test_accuracy:.4f}')

# Ajout des nouvelles lignes au DataFrame
new_rows = pd.DataFrame([
    {'model': 'bagging_dt', 'accuracy': train_accuracy, 'set': 'train'},
    {'model': 'bagging_dt', 'accuracy': test_accuracy, 'set': 'test'}
])

# Fusion avec le DataFrame existant
scores_df = pd.concat([scores_df, new_rows], ignore_index=True)

# Affichage du DataFrame mis à jour
scores_df

Meilleur accuracy score sur le train set: 0.9213
Meilleur accuracy score sur le test set: 0.8156


Unnamed: 0,model,accuracy,set
0,random_forest,0.938202,train
1,random_forest,0.793296,test
2,bagging_dt,0.921348,train
3,bagging_dt,0.815642,test


The performances are not exactly the same ! However, let's remember that this dataset is quite small, so actually the difference is not significant 😌 (if you're curious: use a cross-validation to estimate the uncertainty on the test scores)

Train an AdaBoost model by tuning the hyperparameters:

With a logistic regression as base estimator

With a decision tree as base estimator

For each model, evaluate the performances on the test set.

In [121]:
# Initialisation du modèle de base et de l'algorithme d'ensemble
log_reg = LogisticRegression(max_iter=1000)

# Initialize BaggingClassifier with decision_tree as the base estimator
boosting = AdaBoostClassifier(
    estimator=log_reg
)

# Define parameter grid for hyperparameter tuning
params = {
    'estimator__C': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0],  # C is a hyperparameter of LogisticRegression
    'n_estimators': [5, 10, 20, 40, 60, 80, 100]  # n_estimators is a hyperparameter of BaggingClassifier
}

print(params)

# initialisation la GridSearch avec validation croisée:
gridsearch = GridSearchCV(
    estimator= boosting,
    param_grid= params,
    cv=3,
    verbose=1,
)
gridsearch.fit(X_train, y_train)

# Affichage des meilleurs paramètres et performances
print(f'Meilleurs hyperparamètres: {gridsearch.best_params_}')
print(f'Meilleure cross-validation accuracy: {gridsearch.best_score_:.4f}')

# Evaluation des meilleures performances sur les ensembles d'entrainement et de test
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, y_test)
print(f'Meilleur accuracy score sur le train set: {train_accuracy:.4f}')
print(f'Meilleur accuracy score sur le test set: {test_accuracy:.4f}')


# Ajout des nouvelles lignes au DataFrame
new_rows = pd.DataFrame([
    {'model': 'adaboost_logreg', 'accuracy': train_accuracy, 'set': 'train'},
    {'model': 'adaboost_logreg', 'accuracy': test_accuracy, 'set': 'test'}
])

# Fusion avec le DataFrame existant
scores_df = pd.concat([scores_df, new_rows], ignore_index=True)

# Affichage du DataFrame mis à jour
scores_df





















{'estimator__C': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0], 'n_estimators': [5, 10, 20, 40, 60, 80, 100]}
Fitting 3 folds for each of 56 candidates, totalling 168 fits




































































































































































































































































































































Meilleurs hyperparamètres: {'estimator__C': 1.0, 'n_estimators': 100}
Meilleure cross-validation accuracy: 0.8076
Meilleur accuracy score sur le train set: 0.8146
Meilleur accuracy score sur le test set: 0.7765


Unnamed: 0,model,accuracy,set
0,random_forest,0.938202,train
1,random_forest,0.793296,test
2,bagging_dt,0.921348,train
3,bagging_dt,0.815642,test
4,adaboost_logreg,0.814607,train
5,adaboost_logreg,0.776536,test


In [122]:
# Initialisation du modèle de base et de l'algorithme d'ensemble
decision_tree = DecisionTreeClassifier()

# Initialize BaggingClassifier with decision_tree as the base estimator
boosting = AdaBoostClassifier(
    estimator=decision_tree
)

# Define parameter grid for hyperparameter tuning
params = {
    'estimator__max_depth': [8, 10, 12],
    'estimator__min_samples_leaf': [1, 2, 3],
    'estimator__min_samples_split': [6, 8, 10],
    'n_estimators': [2, 4, 6, 8, 10, 12]
}

print(params)

# initialisation la GridSearch avec validation croisée:
gridsearch = GridSearchCV(
    estimator= boosting,
    param_grid= params,
    cv=3,
    verbose=1,
)
gridsearch.fit(X_train, y_train)

# Affichage des meilleurs paramètres et performances
print(f'Meilleurs hyperparamètres: {gridsearch.best_params_}')
print(f'Meilleure cross-validation accuracy: {gridsearch.best_score_:.4f}')

# Evaluation des meilleures performances sur les ensembles d'entrainement et de test
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, y_test)
print(f'Meilleur accuracy score sur le train set: {train_accuracy:.4f}')
print(f'Meilleur accuracy score sur le test set: {test_accuracy:.4f}')


# Ajout des nouvelles lignes au DataFrame
new_rows = pd.DataFrame([
    {'model': 'adaboost_dt', 'accuracy': train_accuracy, 'set': 'train'},
    {'model': 'adaboost_dt', 'accuracy': test_accuracy, 'set': 'test'}
])

# Fusion avec le DataFrame existant
scores_df = pd.concat([scores_df, new_rows], ignore_index=True)

# Affichage du DataFrame mis à jour
scores_df

{'estimator__max_depth': [8, 10, 12], 'estimator__min_samples_leaf': [1, 2, 3], 'estimator__min_samples_split': [6, 8, 10], 'n_estimators': [2, 4, 6, 8, 10, 12]}
Fitting 3 folds for each of 162 candidates, totalling 486 fits






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































Meilleurs hyperparamètres: {'estimator__max_depth': 10, 'estimator__min_samples_leaf': 3, 'estimator__min_samples_split': 10, 'n_estimators': 10}
Meilleure cross-validation accuracy: 0.8019
Meilleur accuracy score sur le train set: 1.0000
Meilleur accuracy score sur le test set: 0.7989













invalid value encountered in cast





Unnamed: 0,model,accuracy,set
0,random_forest,0.938202,train
1,random_forest,0.793296,test
2,bagging_dt,0.921348,train
3,bagging_dt,0.815642,test
4,adaboost_logreg,0.814607,train
5,adaboost_logreg,0.776536,test
6,adaboost_dt,1.0,train
7,adaboost_dt,0.798883,test


Train scikit-learn's GradientBoosting model (by tuning hyperparameters) and evaluate the performances.