# [2-BENCHMARK] - Phase de sélection des modèles

## Import des modules
> cf. [pyproject.toml](pyproject.toml) pour connaître les librairies à installer

In [1]:
import os
import io
import json
import boto3
import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Récupération des données dans le stockage objet AWS S3

In [2]:
# Configutations S3 access and data
load_dotenv()
aws_access_key_id = os.getenv('aws_access_key_id')
aws_secret_access_key = os.getenv('aws_secret_access_key')

In [3]:
# Specific bucket data
bucket_name = "hotel-resa-prediction"
prefix = "datasets/"
filename = "processed_hotel_bookings.csv"

In [4]:
# S3 connection
s3 = boto3.client(
    service_name = "s3",
    region_name = "eu-west-3",
    aws_access_key_id = aws_access_key_id,
    aws_secret_access_key = aws_secret_access_key,
)

In [5]:
# get datas 
result = s3.list_objects(Bucket=bucket_name)
for obj in result.get('Contents'):
    if (obj["Key"].startswith(prefix)) and (obj["Key"].endswith(filename)):
        data = s3.get_object(Bucket=bucket_name, Key=obj.get('Key'))
        contents = data['Body'].read().decode("utf-8")
        data = pd.read_csv(io.StringIO(contents), low_memory=False)

In [6]:
data.head().T

Unnamed: 0,0,1,2,3,4
lead_time,342,737,7,13,14
stays_in_weekend_nights,0,0,0,0,0
stays_in_week_nights,0,0,1,1,2
adults,2,2,1,1,2
children,0.0,0.0,0.0,0.0,0.0
babies,0,0,0,0,0
previous_cancellations,0,0,0,0,0
previous_bookings_not_canceled,0,0,0,0,0
booking_changes,3,4,0,0,0
adr,0.0,0.0,75.0,75.0,98.0


## Découpe des données en train et test

**Info :** Nous allons séparer les données en variable à expliquer (target) y et en variable expliquer (features).

In [7]:
X = data.drop("is_canceled",  axis=1)
y = data["is_canceled"]
print("Features : ", X.shape)
print("Target : ", y.shape) # s'assurer de n'avoir qu'une colonne ici

Features :  (119210, 30)
Target :  (119210,)


**Info** : Pour l'apprentissage, nous avons besoin de séparer en deux partie, un jeu d'entraînement et un jeu de test. Nous faisons le choix de découper en 70/30 soit 70 en jeu d'entraînement et 30 en jeu de test. Ici, nous faisons le choix de ne par garder de dépendance temporelle c'est la raison pour laquelle on a mélanger le jeux de données avec la méthode shuffle.

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
print("Features train : ", X_train.shape)
print("Target train : ", y_train.shape)
print("Features test : ", X_test.shape)
print("Target test : ", y_test.shape)

Features train :  (83447, 30)
Target train :  (83447,)
Features test :  (35763, 30)
Target test :  (35763,)


In [9]:
categorical_data = selector(dtype_include=object)
numerical_data = selector(dtype_exclude=object)

In [10]:
n_categorical = categorical_data(X)
n_numerical = numerical_data(X)

In [11]:
print(f"Categorial : {len(n_categorical)}")
print(f"Numerical : {len(n_numerical)}")

Categorial : 15
Numerical : 15


In [12]:
numeric_preprocessor = StandardScaler()
categoric_preprocessor = OneHotEncoder(
    handle_unknown='ignore'
)

In [13]:
preprocessor = ColumnTransformer(
    [
        ("numerical", numeric_preprocessor, n_numerical),
        ("Categorical", categoric_preprocessor, n_categorical)
    ],
    remainder = "passthrough",
)
preprocessor

0,1,2
,transformers,"[('numerical', ...), ('Categorical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [14]:
model_linear = LogisticRegression() 

In [15]:
model_tree = DecisionTreeClassifier() 

In [16]:
model_ensemble = RandomForestClassifier() 

In [17]:
linear_pipeline = make_pipeline(
    preprocessor,
    model_linear
)
linear_pipeline

0,1,2
,steps,"[('columntransformer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical', ...), ('Categorical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [18]:
tree_pipeline = make_pipeline(
    preprocessor,
    model_tree
)
tree_pipeline

0,1,2
,steps,"[('columntransformer', ...), ('decisiontreeclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical', ...), ('Categorical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [19]:
ensemble_pipeline = make_pipeline(
    preprocessor,
    model_ensemble
)
ensemble_pipeline

0,1,2
,steps,"[('columntransformer', ...), ('randomforestclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical', ...), ('Categorical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
cv_linear = cross_validate(
    linear_pipeline,
    X_train,
    y_train,
    cv=10,
    error_score='raise',
    scoring='accuracy',
    n_jobs=-1
)
cv_linear = pd.DataFrame(cv_linear)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

In [21]:
cv_tree = cross_validate(
    tree_pipeline,
    X_train,
    y_train,
    cv=10,
    error_score='raise',
    scoring='accuracy',
    n_jobs=-1
)
cv_tree = pd.DataFrame(cv_tree)

In [None]:
cv_ensemble = cross_validate(
    ensemble_pipeline,
    X_train,
    y_train,
    cv=10,
    error_score='raise',
    scoring='accuracy',
    n_jobs=-1
)
cv_ensemble = pd.DataFrame(cv_ensemble)

In [None]:
linear_scores = -cv_linear["test_score"] * 100
tree_scores = -cv_tree["test_score"] * 100
ensemble_scores = -cv_ensemble["test_score"] * 100

In [None]:
indices = np.arange(len(cv_linear)) 

In [None]:
plt.scatter(
    indices,
    -linear_scores,
    color="#944E63",
    label="Logistic Regression model"
)

plt.scatter(
    indices,
    -tree_scores,
    color="#B47B84",
    label="Decision Tree model"
)

plt.scatter(
    indices,
    -ensemble_scores,
    color="#0C2D57",
    label="Random Forest model"
)

plt.ylim((0,100))
plt.legend()
plt.xlabel("Folds")
plt.ylabel("Scores")
_ = plt.title("Comparing the 3 models")

In [None]:
model_scores = {
    "Logistic Regression": -linear_scores.mean(),
    "Decision Tree Model": -tree_scores.mean(),
    "Random Forest": -ensemble_scores.mean()
}

In [None]:
highest_score = max(model_scores.values())

In [None]:
best_models = [model for model, score in model_scores.items() if score == highest_score] 

In [None]:
if len(best_models) == 1:
    print(f"The model with the highest accuracy score ({highest_score:.2f}%) is: {best_models[0]}")
else:
    print(f"Models with the highest accuracy score ({highest_score:.2f}%) are:")
    for model in best_models:
        print(f"\t- {model}")

In [None]:
linear_pipeline.fit(X_train, y_train)
tree_pipeline.fit(X_train, y_train)
ensemble_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

y_pred = linear_pipeline.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))
print("ROC-AUC:", roc_auc_score(y_test, linear_pipeline.predict_proba(X_test)[:,1]))

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

y_pred = tree_pipeline.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))
print("ROC-AUC:", roc_auc_score(y_test, tree_pipeline.predict_proba(X_test)[:,1]))

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

y_pred = ensemble_pipeline.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))
print("ROC-AUC:", roc_auc_score(y_test, ensemble_pipeline.predict_proba(X_test)[:,1]))