# I. Modélisation statistique

In [2]:
!pip install s3fs
!pip install pandas
!pip install scikit-learn==1.2.2
!pip install imbalanced-learn



In [3]:
!pip install matplotlib



## I.1. Chargement et préparation des données

### A) Importation des données

In [5]:
import pandas as pd

# Importation des données avec code_insee en index
donnees = pd.read_csv("data/bdd_finale.csv", sep=',', encoding='utf-8', index_col=0)
donnees.head()

Unnamed: 0_level_0,beneficiaire_trans_eco,moyenne_conso_agri_hab,moyenne_conso_indus_hab,moyenne_conso_tertiaire_hab,moyenne_conso_residentiel_hab,moyenne_conso_totale_hab,emissions_ges,nb_actes_france_renov,friche,ecoquartiers,...,CSP_maire,com_variation_encours_dette_ha_pct,part_inactifs,part_actifs,dependance_eco,abstention_municipales,taux_creation_ent,total_entreprises,part_licencies_sportifs,part_jeunes_sans_diplome
code_insee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22001,0,2.994071,0.000503,0.631181,3.688905,7.378017,14871,88,2,0,...,12,18.9,12.751678,47.147651,107,17.4,15.0,20,23.825503,9.1
22002,0,1.283886,0.03212,0.173206,3.248865,4.775226,8621,38,0,0,...,54,-39.3,11.062718,50.522648,96,53.8,12.1,33,30.836237,4.0
22003,0,0.0,0.0,0.524098,2.908468,3.47274,3028,49,0,0,...,56,99.8,14.23913,48.804348,113,53.7,36.7,30,35.76087,0.0
22004,0,0.200325,0.675332,1.267791,3.034877,5.20315,29130,46,6,0,...,33,-34.1,17.234927,40.540541,126,44.8,12.6,238,24.282744,12.4
22005,0,0.121358,0.082905,1.151104,3.532942,4.926877,4136,10,3,0,...,77,-14.5,14.188533,42.371234,140,22.9,18.8,64,27.891156,2.6


### B) Formatage des données

Nous allons pouvoir observer le type des données que nous avons. Cela va nous permettre de déterminer si on a besoin de modifier certaines variables pour qu'elles soient bien en variables catégorielles. 

In [6]:
donnees.dtypes

beneficiaire_trans_eco                  int64
moyenne_conso_agri_hab                float64
moyenne_conso_indus_hab               float64
moyenne_conso_tertiaire_hab           float64
moyenne_conso_residentiel_hab         float64
moyenne_conso_totale_hab              float64
emissions_ges                           int64
nb_actes_france_renov                   int64
friche                                  int64
ecoquartiers                            int64
part_residences_secondaires           float64
beneficiaire_prog                       int64
climat                                 object
gridens7                                int64
superf_choro                          float64
departement                             int64
gare_tgv                                int64
part_trajets_voiture                  float64
med_disp                                int64
an                                      int64
p_pop                                   int64
CSP_maire                         

In [7]:
# Conversion des variables en variables catégorielles
donnees["beneficiaire_trans_eco"] = donnees["beneficiaire_trans_eco"].astype('category')
donnees["ecoquartiers"] = donnees["ecoquartiers"].astype('category')
donnees["beneficiaire_prog"] = donnees["beneficiaire_prog"].astype('category')
donnees["gridens7"] = donnees["gridens7"].astype('category')
donnees["departement"] = donnees["departement"].astype('category')
donnees["gare_tgv"] = donnees["gare_tgv"].astype('category')
donnees["CSP_maire"] = donnees["CSP_maire"].astype('category')


# Cas de la variable climat 
donnees = pd.get_dummies(donnees, columns = ["climat"]) # On crée des variables binaires pour la variable climat car on a des chaînes de caractères

In [8]:
donnees.dtypes

beneficiaire_trans_eco                category
moyenne_conso_agri_hab                 float64
moyenne_conso_indus_hab                float64
moyenne_conso_tertiaire_hab            float64
moyenne_conso_residentiel_hab          float64
moyenne_conso_totale_hab               float64
emissions_ges                            int64
nb_actes_france_renov                    int64
friche                                   int64
ecoquartiers                          category
part_residences_secondaires            float64
beneficiaire_prog                     category
gridens7                              category
superf_choro                           float64
departement                           category
gare_tgv                              category
part_trajets_voiture                   float64
med_disp                                 int64
an                                       int64
p_pop                                    int64
CSP_maire                             category
com_variation

Nous allons observer si on a des données manquantes dans notre jeu de données.

In [9]:
donnees.isna().sum()

beneficiaire_trans_eco                0
moyenne_conso_agri_hab                0
moyenne_conso_indus_hab               0
moyenne_conso_tertiaire_hab           0
moyenne_conso_residentiel_hab         0
moyenne_conso_totale_hab              0
emissions_ges                         0
nb_actes_france_renov                 0
friche                                0
ecoquartiers                          0
part_residences_secondaires           0
beneficiaire_prog                     0
gridens7                              0
superf_choro                          0
departement                           0
gare_tgv                              0
part_trajets_voiture                  0
med_disp                              0
an                                    0
p_pop                                 0
CSP_maire                             0
com_variation_encours_dette_ha_pct    0
part_inactifs                         0
part_actifs                           0
dependance_eco                        0


### C) Séparation de la variable à expliquer et des variables explicatives

In [10]:
# Définir X et y pour avoir les variables explicatives et la variable à expliquer beneficiaire_trans_eco
X = donnees.drop(columns=['beneficiaire_trans_eco'])

# Variable à expliquer : beneficiaire_trans_eco
y = donnees["beneficiaire_trans_eco"]

In [11]:
import numpy as np

# Convertir les listes en tableaux numpy
X = np.array(X)
y = np.array(y)

In [12]:
print("Dimensions de X:", X.shape)
print("Dimensions de y:", y.shape)

Dimensions de X: (1207, 32)
Dimensions de y: (1207,)


In [13]:
np.unique(y)

array([0, 1], dtype=int64)

## I.2. Modélisation

### A) Comparaison de modèles à l'aide de PyCaret

In [14]:
!pip install pycaret

Collecting scikit-learn>1.4.0 (from pycaret)
  Obtaining dependency information for scikit-learn>1.4.0 from https://files.pythonhosted.org/packages/5d/55/0403bf2031250ac982c8053397889fbc5a3a2b3798b913dae4f51c3af6a4/scikit_learn-1.5.1-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.5.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
  Obtaining dependency information for scikit-learn>1.4.0 from https://files.pythonhosted.org/packages/79/3d/02d5d3ed359498fec3abdf65407d3c07e3b8765af17464969055aaec5171/scikit_learn-1.4.2-cp311-cp311-win_amd64.whl.metadata
  Using cached scikit_learn-1.4.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Using cached scikit_learn-1.4.2-cp311-cp311-win_amd64.whl (10.6 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2


ERROR: Could not install packages due to an OSError: [WinError 5] Accès refusé: 'C:\\Users\\ASUS\\anaconda3\\Lib\\site-packages\\sklearn\\~libs\\msvcp140.dll'
Consider using the `--user` option or check the permissions.



In [15]:
# On place la variable à expliquer en dernier
# Liste des noms de colonnes dans l'ordre souhaité
noms_colonnes = list(donnees.columns)
noms_colonnes.remove("beneficiaire_trans_eco")
noms_colonnes.append("beneficiaire_trans_eco")

# Réindexer le DataFrame avec les nouvelles colonnes
donnees = donnees.reindex(columns=noms_colonnes)
donnees.head()


Unnamed: 0_level_0,moyenne_conso_agri_hab,moyenne_conso_indus_hab,moyenne_conso_tertiaire_hab,moyenne_conso_residentiel_hab,moyenne_conso_totale_hab,emissions_ges,nb_actes_france_renov,friche,ecoquartiers,part_residences_secondaires,...,dependance_eco,abstention_municipales,taux_creation_ent,total_entreprises,part_licencies_sportifs,part_jeunes_sans_diplome,climat_Autre,climat_Estuaire,climat_Mer,beneficiaire_trans_eco
code_insee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22001,2.994071,0.000503,0.631181,3.688905,7.378017,14871,88,2,0,12.0,...,107,17.4,15.0,20,23.825503,9.1,True,False,False,0
22002,1.283886,0.03212,0.173206,3.248865,4.775226,8621,38,0,0,2.0,...,96,53.8,12.1,33,30.836237,4.0,True,False,False,0
22003,0.0,0.0,0.524098,2.908468,3.47274,3028,49,0,0,3.4,...,113,53.7,36.7,30,35.76087,0.0,True,False,False,0
22004,0.200325,0.675332,1.267791,3.034877,5.20315,29130,46,6,0,5.5,...,126,44.8,12.6,238,24.282744,12.4,True,False,False,0
22005,0.121358,0.082905,1.151104,3.532942,4.926877,4136,10,3,0,16.8,...,140,22.9,18.8,64,27.891156,2.6,True,False,False,0


In [16]:
!pip install pycaret scikit-learn==0.23.2

Collecting scikit-learn==0.23.2
  Using cached scikit-learn-0.23.2.tar.gz (7.2 MB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [126 lines of output]
      Ignoring numpy: markers 'python_version == "3.6" and platform_system != "AIX" and platform_python_implementation == "CPython"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.6" and platform_system != "AIX" and platform_python_implementation != "CPython"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.7" and platform_system != "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.6" and platform_system == "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.7" and platform_system == "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version >= "3.8" and platform_system == "AIX"' don't match your environment
      Collecting setuptools
        Obtaining dependency in

In [17]:
from pycaret.classification import *
import mlflow

# Conversion des variables catégorielles en string
for col in donnees.select_dtypes(['category']).columns:
    donnees[col] = donnees[col].astype(str)


# Initialisation de l'environnement PyCaret
clf = setup(data = donnees, 
            target = "beneficiaire_trans_eco", 
            session_id=123,   
            fix_imbalance=True, 
            fix_imbalance_method='smote') # fix_imbalance_method='smote' pour la sur-échantillonnage des données


Unnamed: 0,Description,Value
0,Session id,123
1,Target,beneficiaire_trans_eco
2,Target type,Binary
3,Target mapping,"0: 0, 1: 1"
4,Original data shape,"(1207, 33)"
5,Transformed data shape,"(1767, 42)"
6,Transformed train set shape,"(1404, 42)"
7,Transformed test set shape,"(363, 42)"
8,Numeric features,23
9,Categorical features,6


In [18]:
best_model = compare_models()# Comparaison des modèles

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.8318,0.5,0.8318,0.6919,0.7554,0.0,0.0,2.709
et,Extra Trees Classifier,0.8081,0.6588,0.8081,0.7385,0.7646,0.0633,0.0661,0.476
qda,Quadratic Discriminant Analysis,0.8046,0.5439,0.8046,0.7103,0.7458,0.0146,0.0161,0.253
rf,Random Forest Classifier,0.7914,0.6817,0.7914,0.7433,0.7615,0.0719,0.0773,0.859
gbc,Gradient Boosting Classifier,0.7856,0.6464,0.7856,0.7555,0.7667,0.1153,0.1207,1.318
lightgbm,Light Gradient Boosting Machine,0.7844,0.6386,0.7844,0.7411,0.7521,0.0364,0.0494,145.727
ada,Ada Boost Classifier,0.75,0.615,0.75,0.7461,0.747,0.0896,0.0913,0.663
ridge,Ridge Classifier,0.7194,0.6204,0.7194,0.7588,0.7345,0.1285,0.1333,0.783
lda,Linear Discriminant Analysis,0.7086,0.6147,0.7086,0.7528,0.7255,0.105,0.1107,0.251
dt,Decision Tree Classifier,0.7038,0.5269,0.7038,0.734,0.7171,0.0481,0.0488,0.165


### B) Création du modèle

In [19]:
# Création du modèle afin de prédire une probabilité de succès

model_rf = create_model('rf') # Création du modèle RandomForest

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8235,0.7309,0.8235,0.7986,0.8073,0.2513,0.2595
1,0.8235,0.7686,0.8235,0.7759,0.787,0.1356,0.1586
2,0.7882,0.6495,0.7882,0.7367,0.7555,0.0783,0.0858
3,0.7529,0.52,0.7529,0.6959,0.7212,-0.0408,-0.0435
4,0.8095,0.7454,0.8095,0.7607,0.7766,0.1111,0.124
5,0.7738,0.6561,0.7738,0.6857,0.7271,-0.0962,-0.1125
6,0.7976,0.6291,0.7976,0.7667,0.7791,0.15,0.1549
7,0.7381,0.6765,0.7381,0.7222,0.7299,0.0,0.0
8,0.8095,0.7571,0.8095,0.7396,0.763,0.04,0.05
9,0.7976,0.6837,0.7976,0.7511,0.7687,0.0893,0.0963


In [20]:
# Création modèle gradient boosting

model_gbc = create_model('gbc') # Création du modèle Gradient Boosting

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8471,0.7797,0.8471,0.8348,0.8395,0.3919,0.3958
1,0.7765,0.7414,0.7765,0.74,0.756,0.0517,0.0534
2,0.7765,0.5781,0.7765,0.7583,0.7664,0.1654,0.1668
3,0.7176,0.5543,0.7176,0.6838,0.6998,-0.0851,-0.0865
4,0.7857,0.6071,0.7857,0.7595,0.7708,0.129,0.1315
5,0.7976,0.5949,0.7976,0.7295,0.7558,0.0192,0.0225
6,0.7738,0.5102,0.7738,0.7532,0.7626,0.1094,0.1105
7,0.7738,0.6837,0.7738,0.7672,0.7704,0.1618,0.1619
8,0.8333,0.799,0.8333,0.7917,0.7926,0.16,0.2
9,0.7738,0.6153,0.7738,0.737,0.7531,0.05,0.0516


In [21]:
# Création modèle ada boosting

model_ada = create_model('ada') # Création du modèle Ada Boosting

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.6821,0.8,0.7823,0.7901,0.2047,0.2068
1,0.7059,0.6841,0.7059,0.7292,0.7169,0.0157,0.0159
2,0.7765,0.5886,0.7765,0.7447,0.758,0.1151,0.1183
3,0.7059,0.5076,0.7059,0.7134,0.7096,0.0139,0.0139
4,0.7857,0.6796,0.7857,0.7857,0.7857,0.2286,0.2286
5,0.7262,0.5582,0.7262,0.7181,0.7221,-0.0147,-0.0147
6,0.75,0.5388,0.75,0.7427,0.7463,0.0735,0.0736
7,0.6905,0.6418,0.6905,0.7222,0.7053,0.0,0.0
8,0.8452,0.6765,0.8452,0.8259,0.8311,0.35,0.3615
9,0.7143,0.5929,0.7143,0.6968,0.7053,-0.0909,-0.0913


In [22]:
# Création du modèle d'arbre

model_dt = create_model('dt') # Création du modèle Decision Tree

### C) Optimisation des hyperparamètres

In [23]:
# Optimisation du modèle RandomForest pour une meilleure performance

tuned_rf = tune_model(model_rf) # Optimisation du modèle RandomForest

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8353,0.5,0.8353,0.6977,0.7603,0.0,0.0
1,0.8353,0.5,0.8353,0.6977,0.7603,0.0,0.0
2,0.8235,0.5,0.8235,0.6782,0.7438,0.0,0.0
3,0.8235,0.5,0.8235,0.6782,0.7438,0.0,0.0
4,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
5,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
6,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
7,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
8,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
9,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [24]:
# Optimisation du modèle Gradient Boosting

tuned_gbc = tune_model(model_gbc) # Optimisation du modèle Gradient Boosting

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.7515,0.8,0.7823,0.7901,0.2047,0.2068
1,0.7412,0.7334,0.7412,0.707,0.7232,-0.0625,-0.0637
2,0.7294,0.5838,0.7294,0.6631,0.6947,-0.1399,-0.1492
3,0.7647,0.5229,0.7647,0.7219,0.7401,0.0395,0.0413
4,0.8571,0.6796,0.8571,0.8432,0.8472,0.4194,0.4274
5,0.7976,0.649,0.7976,0.7295,0.7558,0.0192,0.0225
6,0.7976,0.5408,0.7976,0.7511,0.7687,0.0893,0.0963
7,0.7738,0.6816,0.7738,0.7532,0.7626,0.1094,0.1105
8,0.8095,0.8163,0.8095,0.7607,0.7766,0.1111,0.124
9,0.7976,0.65,0.7976,0.7511,0.7687,0.0893,0.0963


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [25]:
# Optimisation du modèle Ada Boosting

tuned_ada = tune_model(model_ada) # Optimisation du modèle Ada Boosting

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7765,0.7082,0.7765,0.7952,0.7849,0.252,0.2537
1,0.7529,0.6992,0.7529,0.7599,0.7563,0.1271,0.1272
2,0.7176,0.5762,0.7176,0.7318,0.7244,0.0769,0.0772
3,0.6588,0.5652,0.6588,0.7118,0.6821,0.008,0.0083
4,0.8214,0.7531,0.8214,0.7963,0.8051,0.25,0.2582
5,0.7619,0.602,0.7619,0.7116,0.7343,-0.0345,-0.0363
6,0.7381,0.5235,0.7381,0.7381,0.7381,0.0571,0.0571
7,0.7143,0.6755,0.7143,0.7441,0.7279,0.0769,0.0778
8,0.7857,0.7276,0.7857,0.7595,0.7708,0.129,0.1315
9,0.7857,0.6776,0.7857,0.7857,0.7857,0.2286,0.2286


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [26]:
# Optimisation du modèle Decision Tree

tuned_dt = tune_model(model_dt) # Optimisation du modèle Decision Tree

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8353,0.5,0.8353,0.6977,0.7603,0.0,0.0
1,0.8353,0.5,0.8353,0.6977,0.7603,0.0,0.0
2,0.8235,0.5,0.8235,0.6782,0.7438,0.0,0.0
3,0.8235,0.5,0.8235,0.6782,0.7438,0.0,0.0
4,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
5,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
6,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
7,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
8,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0
9,0.8333,0.5,0.8333,0.6944,0.7576,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


### D) Comparaison des modèles optimisés

In [27]:
# Comparaison des modèles optimisés
 
compare_models() # Comparaison des modèles optimisés

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.8318,0.5,0.8318,0.6919,0.7554,0.0,0.0,0.183
et,Extra Trees Classifier,0.8081,0.6588,0.8081,0.7385,0.7646,0.0633,0.0661,0.543
qda,Quadratic Discriminant Analysis,0.8046,0.5439,0.8046,0.7103,0.7458,0.0146,0.0161,0.247
rf,Random Forest Classifier,0.7914,0.6817,0.7914,0.7433,0.7615,0.0719,0.0773,0.675
gbc,Gradient Boosting Classifier,0.7856,0.6464,0.7856,0.7555,0.7667,0.1153,0.1207,1.309
lightgbm,Light Gradient Boosting Machine,0.7844,0.6386,0.7844,0.7411,0.7521,0.0364,0.0494,36.739
ada,Ada Boost Classifier,0.75,0.615,0.75,0.7461,0.747,0.0896,0.0913,0.54
ridge,Ridge Classifier,0.7194,0.6204,0.7194,0.7588,0.7345,0.1285,0.1333,0.218
lda,Linear Discriminant Analysis,0.7086,0.6147,0.7086,0.7528,0.7255,0.105,0.1107,0.234
dt,Decision Tree Classifier,0.7038,0.5269,0.7038,0.734,0.7171,0.0481,0.0488,0.249


In [31]:
# Paramètres du modèle de rf

print(tuned_rf)
print(tuned_gbc)
print(tuned_ada)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=2, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.1, min_samples_leaf=3,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=123, verbose=0,
                       warm_start=False)
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.15, loss='log_loss', max_depth=7,
                           max_features=1.0, max_leaf_nodes=None,
                           min_impurity_decrease=0.02, min_samples_leaf=5,
                           min_samples_split=5, min_weight_fraction_leaf=0.0,
                           n_estimators=230, n_iter_no_change=None,
               

In [28]:
# Prédiction des données de test pour obtenir une probabilité de succès avec le modèle RandomForest

predictions = predict_model(tuned_rf, data=donnees) # Prédiction des données de test pour obtenir une probabilité de succès avec le modèle RandomFores

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8318,0.5,0.8318,0.6919,0.7554,0.0,0.0


ValueError: y contains previously unseen labels: 0