In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score 
from sklearn.metrics import roc_auc_score
import lime
from lime import lime_tabular
import shap
import matplotlib.pyplot as plt

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Charger les données
data = pd.read_excel("dataset_new.xlsx")


In [3]:
data

Unnamed: 0,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,0,0,0,1,0,0,0,0,0,0,...,0,1.533034,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
1,0,0,0,0,0,0,0,1,0,0,...,0,1.628993,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
2,0,0,0,0,0,0,0,1,0,0,...,0,-0.290186,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
3,0,0,0,0,0,0,0,0,0,0,...,0,-0.002309,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
4,0,0,0,0,0,0,0,1,0,0,...,0,1.533034,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,1,0,0,0,0,1,0,0,0,0,...,0,3.164336,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41184,0,1,0,0,0,0,0,0,0,0,...,0,0.573445,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41185,0,0,0,0,0,1,0,0,0,0,...,0,1.533034,-0.204909,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41186,1,0,0,0,0,0,0,0,0,1,...,0,0.381527,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697


In [4]:
# Séparer les caractéristiques (features) et la variable cible
X = data.drop('y', axis=1)
y = data['y']

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Créer les modèles
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(probability=True)
}


In [None]:
# Entraîner et évaluer les modèles
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} - Accuracy: {accuracy:.4f}')
    
    # Matrice de Confusion
    confusion_mat = confusion_matrix(y_test, y_pred)
    print(f'{model_name} - Confusion Matrix:\n{confusion_mat}')
    
    # Précision, Recall, F1-Score
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f'{model_name} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')
    
    # AUC-ROC
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_prob)
    print(f'{model_name} - AUC-ROC: {auc_roc:.4f}')
    
    print('\n')  # Ajout d'une ligne vide entre les résultats de différents modèles


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression - Accuracy: 0.8968
Logistic Regression - Confusion Matrix:
[[7191  112]
 [ 738  197]]
Logistic Regression - Precision: 0.6375, Recall: 0.2107, F1-Score: 0.3167
Logistic Regression - AUC-ROC: 0.7802


Random Forest - Accuracy: 0.8921
Random Forest - Confusion Matrix:
[[7073  230]
 [ 659  276]]
Random Forest - Precision: 0.5455, Recall: 0.2952, F1-Score: 0.3831
Random Forest - AUC-ROC: 0.7639




In [None]:
explainer = lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=['no', 'yes'], discretize_continuous=True)
observation_to_explain = X_test.iloc[4]
exp = explainer.explain_instance(observation_to_explain.values, models['Random Forest'].predict_proba)


In [None]:
# Afficher les explications LIME pour cette observation
exp.show_in_notebook(show_table=True)


In [None]:
# Utiliser SHAP pour expliquer les prédictions pour un échantillon spécifique (par exemple, l'observation n°20)
explainer_shap = shap.TreeExplainer(models['Random Forest'])
shap_values = explainer_shap.shap_values(X_test.iloc[20])
shap.summary_plot(shap_values, X_test.iloc[20])


In [None]:
# Afficher les prédictions du modèle
probabilities = models['Random Forest'].predict_proba(X_test.iloc[20].values.reshape(1, -1))
print(f'Probabilité de souscrire à un dépôt à terme (classe "yes"): {probabilities[0, 1]:.4f}')


In [None]:
# Afficher le diagramme de force de SHAP pour cette observation
shap.force_plot(explainer_shap.expected_value[1], shap_values[1], X_test.iloc[20])
