In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score


In [23]:
# Charger les données
data = pd.read_excel("dataset_new.xlsx")
data

Unnamed: 0,y,education_basic.4ans,education_basic.6ans,education_basic.9ans,education_cours_professionnel,education_diplôme_universitaire,education_illettré,education_inconnue,education_lycée,contact_cellulaire,...,jour_de_semaine_mer,jour_de_semaine_ven,resultat_campagne_precendente_echec,resultat_campagne_precendente_existe_pas,resultat_campagne_precendente_reussite,age,campagne,nombre_de_jour_ecoule,nombre_contact_precedent,duree_appel
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1.533034,-0.565922,0.195414,-0.349494,0.010471
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1.628993,-0.565922,0.195414,-0.349494,-0.421501
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,-0.290186,-0.565922,0.195414,-0.349494,-0.124520
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,-0.002309,-0.565922,0.195414,-0.349494,-0.413787
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1.533034,-0.565922,0.195414,-0.349494,0.187888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,1,0,0,0,1,0,0,0,0,1,...,0,1,0,1,0,3.164336,-0.565922,0.195414,-0.349494,0.292025
41184,0,0,0,0,1,0,0,0,0,1,...,0,1,0,1,0,0.573445,-0.565922,0.195414,-0.349494,0.481012
41185,0,0,0,0,0,1,0,0,0,1,...,0,1,0,1,0,1.533034,-0.204909,0.195414,-0.349494,-0.267225
41186,1,0,0,0,1,0,0,0,0,1,...,0,1,0,1,0,0.381527,-0.565922,0.195414,-0.349494,0.708569


In [24]:
# data.columns.tolist()

In [25]:
# Séparer les caractéristiques (features) et la variable cible
X = data.drop('y', axis=1)
y = data['y']

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
# Créer les modèles
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': GaussianNB(),
    'Support Vector Machine': SVC(probability=True)
}

In [27]:
import joblib

# Entraîner et évaluer les modèles
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} - Accuracy: {accuracy:.4f}')
    
    # Matrice de Confusion
    confusion_mat = confusion_matrix(y_test, y_pred)
    print(f'{model_name} - Confusion Matrix:\n{confusion_mat}')
    
    # Précision, Recall, F1-Score
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f'{model_name} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')
    
    # AUC-ROC
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_prob)
    print(f'{model_name} - AUC-ROC: {auc_roc:.4f}')
    
    filename = f'{model_name.lower().replace(" ", "_")}_model.pkl'
    joblib.dump(model, filename)

    print('\n')  # Ajout d'une ligne vide entre les résultats de différents modèles


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Accuracy: 0.9068
Logistic Regression - Confusion Matrix:
[[7131  172]
 [ 596  339]]
Logistic Regression - Precision: 0.6634, Recall: 0.3626, F1-Score: 0.4689
Logistic Regression - AUC-ROC: 0.9126


Random Forest - Accuracy: 0.9069
Random Forest - Confusion Matrix:
[[7096  207]
 [ 560  375]]
Random Forest - Precision: 0.6443, Recall: 0.4011, F1-Score: 0.4944
Random Forest - AUC-ROC: 0.9216


Gradient Boosting - Accuracy: 0.9101
Gradient Boosting - Confusion Matrix:
[[7117  186]
 [ 555  380]]
Gradient Boosting - Precision: 0.6714, Recall: 0.4064, F1-Score: 0.5063
Gradient Boosting - AUC-ROC: 0.9277


Naive Bayes - Accuracy: 0.8250
Naive Bayes - Confusion Matrix:
[[6211 1092]
 [ 350  585]]
Naive Bayes - Precision: 0.3488, Recall: 0.6257, F1-Score: 0.4479
Naive Bayes - AUC-ROC: 0.8160


Support Vector Machine - Accuracy: 0.9077
Support Vector Machine - Confusion Matrix:
[[7149  154]
 [ 606  329]]
Support Vector Machine - Precision: 0.6812, Recall: 0.3519, F1-Score: 0.

In [28]:

# # Enregistrer chaque modèle dans un fichier
# for model_name, model in models.items():
#     filename = f'{model_name.lower().replace(" ", "_")}_model.pkl'
#     joblib.dump(model, filename)

# Afficher les noms de fichiers pour vérification
print("Modèles enregistrés avec succès:")
for model_name in models.keys():
    filename = f'{model_name.lower().replace(" ", "_")}_model.pkl'
    print(f"- {filename}")

Modèles enregistrés avec succès:
- logistic_regression_model.pkl
- random_forest_model.pkl
- gradient_boosting_model.pkl
- naive_bayes_model.pkl
- support_vector_machine_model.pkl


In [29]:
# Enregistrer le DataFrame encodé dans un fichier
joblib.dump(X, 'encoder.pkl')
joblib.dump(X_test, 'x_test.pkl')

['x_test.pkl']

In [30]:
X

Unnamed: 0,education_basic.4ans,education_basic.6ans,education_basic.9ans,education_cours_professionnel,education_diplôme_universitaire,education_illettré,education_inconnue,education_lycée,contact_cellulaire,contact_téléphone,...,jour_de_semaine_mer,jour_de_semaine_ven,resultat_campagne_precendente_echec,resultat_campagne_precendente_existe_pas,resultat_campagne_precendente_reussite,age,campagne,nombre_de_jour_ecoule,nombre_contact_precedent,duree_appel
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,1.533034,-0.565922,0.195414,-0.349494,0.010471
1,0,0,0,0,0,0,0,1,0,1,...,0,0,0,1,0,1.628993,-0.565922,0.195414,-0.349494,-0.421501
2,0,0,0,0,0,0,0,1,0,1,...,0,0,0,1,0,-0.290186,-0.565922,0.195414,-0.349494,-0.124520
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,-0.002309,-0.565922,0.195414,-0.349494,-0.413787
4,0,0,0,0,0,0,0,1,0,1,...,0,0,0,1,0,1.533034,-0.565922,0.195414,-0.349494,0.187888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,0,0,0,1,0,0,0,0,1,0,...,0,1,0,1,0,3.164336,-0.565922,0.195414,-0.349494,0.292025
41184,0,0,0,1,0,0,0,0,1,0,...,0,1,0,1,0,0.573445,-0.565922,0.195414,-0.349494,0.481012
41185,0,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,1.533034,-0.204909,0.195414,-0.349494,-0.267225
41186,0,0,0,1,0,0,0,0,1,0,...,0,1,0,1,0,0.381527,-0.565922,0.195414,-0.349494,0.708569
