In [None]:
#*****************************************************************************************************************************
# 
# Le but de ce projet est d'effectuer des prédictions sur la variable "RainTomorrow", pour savoir s'il va ou non pleuvoir demain
# C'est donc une problématique de classification que nous rencontrons
#
# On travaille sur un fichier extrait du fichier général et qui regoupe toutes les données relatives à 
# la Région Western Australia.
# Nous allons procéder de la même façon au nettoyage et standardisation des données, avant d'entraîner et d'ajuster les modèles
# de calcul
#
#*****************************************************************************************************************************

In [33]:
import pandas as pd
import numpy as np

In [2]:
df_wa = pd.read_csv('D:\\Données Météo Australie\Weather
                    AUSRegionWEA.csv', sep = ';')
df_wa.info()
df_wa.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23203 entries, 0 to 23202
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       23203 non-null  object 
 1   MinTemp        23203 non-null  float64
 2   MaxTemp        23203 non-null  float64
 3   Rainfall       23203 non-null  float64
 4   Evaporation    23203 non-null  float64
 5   Sunshine       23203 non-null  float64
 6   WindGustDir    23203 non-null  object 
 7   WindGustSpeed  23203 non-null  float64
 8   Humidity9am    23203 non-null  float64
 9   Humidity3pm    23203 non-null  float64
 10  Pressure9am    23203 non-null  float64
 11  Pressure3pm    23203 non-null  float64
 12  RainToday      23203 non-null  object 
 13  RainTomorrow   23203 non-null  object 
 14  year           23203 non-null  int64  
 15  month          23203 non-null  int64  
 16  day            23203 non-null  int64  
 17  week           23203 non-null  int64  
 18  Radar 

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,RainToday,RainTomorrow,year,month,day,week,Radar,Region
0,Dartmoor,8.2,17.6,0.8,3.0,0.0,W,26.0,86.0,68.0,1018.2,1016.7,No,No,2009,3,6,10,MountGambier,Western Australia
1,Dartmoor,10.1,19.9,0.8,1.2,5.8,SSW,41.0,98.0,52.0,1017.8,1018.1,No,No,2009,3,7,10,MountGambier,Western Australia
2,Dartmoor,8.8,20.5,0.2,3.6,7.4,SE,41.0,68.0,54.0,1021.9,1020.9,No,No,2009,3,8,10,MountGambier,Western Australia
3,Dartmoor,8.0,23.9,0.0,4.0,11.2,SSE,39.0,82.0,47.0,1021.7,1019.1,No,No,2009,3,9,11,MountGambier,Western Australia
4,Dartmoor,12.9,27.6,0.0,4.2,4.9,ESE,44.0,86.0,44.0,1020.4,1017.8,No,No,2009,3,10,11,MountGambier,Western Australia


In [3]:
df_wa.isna().sum().sum()

0

In [None]:
# Avant de commencer la modélisation, il faut apporter encore quelques modifications au fichier de base:
# 1) Remplacer les Yes, No des variables RainToday et RainTomorrow par 1 et 0 respectivement
# 2) remplacer la variable WindGustDir par des variables indicatrices et supprimer la colonne WindGustDirStr
# 3 appliquer une standardisation des variables numériques

In [4]:
df_wa['RainToday'] = df_wa['RainToday'].replace (to_replace = ['Yes','No'], value = [1,0])
df_wa['RainTomorrow'] = df_wa['RainTomorrow'].replace (to_replace = ['Yes','No'], value = [1,0])
df_wa.head(20)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,RainToday,RainTomorrow,year,month,day,week,Radar,Region
0,Dartmoor,8.2,17.6,0.8,3.0,0.0,W,26.0,86.0,68.0,1018.2,1016.7,0,0,2009,3,6,10,MountGambier,Western Australia
1,Dartmoor,10.1,19.9,0.8,1.2,5.8,SSW,41.0,98.0,52.0,1017.8,1018.1,0,0,2009,3,7,10,MountGambier,Western Australia
2,Dartmoor,8.8,20.5,0.2,3.6,7.4,SE,41.0,68.0,54.0,1021.9,1020.9,0,0,2009,3,8,10,MountGambier,Western Australia
3,Dartmoor,8.0,23.9,0.0,4.0,11.2,SSE,39.0,82.0,47.0,1021.7,1019.1,0,0,2009,3,9,11,MountGambier,Western Australia
4,Dartmoor,12.9,27.6,0.0,4.2,4.9,ESE,44.0,86.0,44.0,1020.4,1017.8,0,0,2009,3,10,11,MountGambier,Western Australia
5,Dartmoor,14.1,29.7,0.0,3.0,4.6,NW,44.0,91.0,42.0,1019.7,1016.2,0,1,2009,3,11,11,MountGambier,Western Australia
6,Dartmoor,15.8,28.1,19.0,5.0,10.5,SW,35.0,90.0,51.0,1015.6,1014.1,1,0,2009,3,12,11,MountGambier,Western Australia
7,Dartmoor,14.3,29.4,0.0,4.0,8.7,SW,31.0,91.0,43.0,1015.9,1013.6,0,1,2009,3,13,11,MountGambier,Western Australia
8,Dartmoor,14.5,20.1,14.2,6.4,8.1,W,72.0,93.0,81.0,1008.8,1009.5,1,1,2009,3,14,11,MountGambier,Western Australia
9,Dartmoor,9.3,17.8,9.6,4.6,7.8,SW,54.0,91.0,57.0,1012.3,1012.4,1,1,2009,3,15,11,MountGambier,Western Australia


In [6]:
# Remplacement de la variable "WindGustDir" par des variables indicatrices
dummies = pd.get_dummies(df_wa['WindGustDir'])
df_wa = pd.concat([df_wa,dummies], axis = 1)
df_wa.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,Humidity9am,Humidity3pm,...,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
0,Dartmoor,8.2,17.6,0.8,3.0,0.0,W,26.0,86.0,68.0,...,0,0,0,0,0,0,0,1,0,0
1,Dartmoor,10.1,19.9,0.8,1.2,5.8,SSW,41.0,98.0,52.0,...,0,0,0,0,0,1,0,0,0,0
2,Dartmoor,8.8,20.5,0.2,3.6,7.4,SE,41.0,68.0,54.0,...,0,0,0,1,0,0,0,0,0,0
3,Dartmoor,8.0,23.9,0.0,4.0,11.2,SSE,39.0,82.0,47.0,...,0,0,0,0,1,0,0,0,0,0
4,Dartmoor,12.9,27.6,0.0,4.2,4.9,ESE,44.0,86.0,44.0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Remplacement de la variable "Location" par des variables indicatrices
dummies2 = pd.get_dummies(df_wa['Location'], prefix = 'Location')
df_wa = pd.concat([df_wa,dummies2], axis = 1)

In [8]:
df_wa = df_wa.drop('WindGustDir', axis = 1)
df_wa = df_wa.drop('Location', axis = 1)

In [10]:
df_wa = df_wa.drop('Radar', axis = 1)
df_wa = df_wa.drop('Region', axis = 1)

In [11]:
df_wa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23203 entries, 0 to 23202
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   MinTemp                23203 non-null  float64
 1   MaxTemp                23203 non-null  float64
 2   Rainfall               23203 non-null  float64
 3   Evaporation            23203 non-null  float64
 4   Sunshine               23203 non-null  float64
 5   WindGustSpeed          23203 non-null  float64
 6   Humidity9am            23203 non-null  float64
 7   Humidity3pm            23203 non-null  float64
 8   Pressure9am            23203 non-null  float64
 9   Pressure3pm            23203 non-null  float64
 10  RainToday              23203 non-null  int64  
 11  RainTomorrow           23203 non-null  int64  
 12  year                   23203 non-null  int64  
 13  month                  23203 non-null  int64  
 14  day                    23203 non-null  int64  
 15  we

In [12]:
# Standardisation des variables numériques:
df_wa_std = df_wa
num_var = df_wa_std.select_dtypes(include = 'float64')
num = num_var.columns
# on ne prend pas les 'int64', car ces variables correspondent à year, month, day
#Standardisation:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_wa_std[num] = scaler.fit_transform(df_wa_std[num])
df_wa_std.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WNW,WSW,Location_Albany,Location_Dartmoor,Location_PearceRAAF,Location_Perth,Location_PerthAirport,Location_SalmonGums,Location_Walpole,Location_Witchcliffe
0,-0.67436,-0.835147,-0.225098,-0.640618,-2.245894,-1.110209,0.891791,0.78722,0.035124,0.121964,...,0,0,0,1,0,0,0,0,0,0
1,-0.28059,-0.474173,-0.225098,-1.16601,-0.671104,0.140188,1.515464,-0.019478,-0.02321,0.333725,...,0,0,0,1,0,0,0,0,0,0
2,-0.550012,-0.380006,-0.332106,-0.465487,-0.236679,0.140188,-0.043719,0.081359,0.574718,0.757247,...,0,0,0,1,0,0,0,0,0,0
3,-0.71581,0.153607,-0.367775,-0.348733,0.795079,-0.026532,0.6839,-0.271571,0.545551,0.484983,...,0,0,0,1,0,0,0,0,0,0
4,0.299704,0.734303,-0.367775,-0.290356,-0.915468,0.390267,0.891791,-0.422827,0.355964,0.288348,...,0,0,0,1,0,0,0,0,0,0


In [13]:
target = df_wa_std['RainTomorrow']
data = df_wa_std.drop('RainTomorrow', axis = 1)

In [14]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 66)

In [15]:
print(X_train.isnull().sum().sum())
print(y_train.isna().sum().sum())

0
0


In [None]:
#-------------------------------------------------------------------------------------------------------------------------------
#              Application d'un modèle de régression logistqiue simple pour prédire la variable cible RainTomorrow
#-------------------------------------------------------------------------------------------------------------------------------

In [16]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C = 0.1, max_iter = 200)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
pd.crosstab(y_test, y_pred, rownames = ['Classe Réelle'], colnames = ['Classe Prédite'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classe Prédite,0,1
Classe Réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3340,209
1,419,673


In [17]:
from sklearn.metrics import classification_report
print('Classification Report:')
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      3549
           1       0.76      0.62      0.68      1092

    accuracy                           0.86      4641
   macro avg       0.83      0.78      0.80      4641
weighted avg       0.86      0.86      0.86      4641



In [None]:
# Le modèle performe plutôt bien quand il s'agit de prédire qu'il ne pleut pas, mais beaucoup moins bien quand il s'agit de 
# prédire qu'il va pleuvoir le lendemain.
# Il est vrai que le jeu de données est déséquilibré. Dans le dataset initial,il y a 22% de "1" (positif) pour la variable cible
# "RainTomorrow".
# Essayons de voir la moyenne géométrique de ce classifieur naïf, qui peut s'avérer utile, dans les problèmes de classification
# déséquilibrée.

In [18]:
from imblearn.metrics import classification_report_imbalanced
print('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred))

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.94      0.62      0.91      0.76      0.60      3549
          1       0.76      0.62      0.94      0.68      0.76      0.56      1092

avg / total       0.86      0.86      0.69      0.86      0.76      0.59      4641



In [None]:
# Essayer de faire un OverSampling pour rééquilibrer les données, puis d'appliquer à nouveau Logistic Regression et pourquoi pas
# KNNeighbors

In [19]:
from imblearn.over_sampling import RandomOverSampler
rOs = RandomOverSampler()
X_ro, y_ro = rOs.fit_resample(X_train, y_train)

print('Classes échantillon oversampled :', dict(pd.Series(y_ro).value_counts()))

Classes échantillon oversampled : {0: 13984, 1: 13984}


In [23]:
#****************************************************************************************************************************
#                                Choix des Hyperparamètres de la Logistic Regression
#**************************************************************************************************************************** 

lr = LogisticRegression(max_iter = 200) 

parameters = {'C':[0.1,1, 10, 13, 20],
              'solver':['sag','lbfgs','liblinear']}

grid_lr = model_selection.GridSearchCV(estimator=lr, param_grid=parameters)

grille = grid_lr.fit(X_train, y_train)
print(pd.DataFrame.from_dict(grille.cv_results_).loc[:,['params', 'mean_test_score']])

print ('Best params:', grid_lr.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                               params  mean_test_score
0         {'C': 0.1, 'solver': 'sag'}         0.835578
1       {'C': 0.1, 'solver': 'lbfgs'}         0.864346
2   {'C': 0.1, 'solver': 'liblinear'}         0.866393
3           {'C': 1, 'solver': 'sag'}         0.835794
4         {'C': 1, 'solver': 'lbfgs'}         0.865316
5     {'C': 1, 'solver': 'liblinear'}         0.866016
6          {'C': 10, 'solver': 'sag'}         0.835848
7        {'C': 10, 'solver': 'lbfgs'}         0.865208
8    {'C': 10, 'solver': 'liblinear'}         0.865909
9          {'C': 13, 'solver': 'sag'}         0.835794
10       {'C': 13, 'solver': 'lbfgs'}         0.865154
11   {'C': 13, 'solver': 'liblinear'}         0.865909
12         {'C': 20, 'solver': 'sag'}         0.835901
13       {'C': 20, 'solver': 'lbfgs'}         0.865370
14   {'C': 20, 'solver': 'liblinear'}         0.865909
Best params: {'C': 0.1, 'solver': 'liblinear'}


In [25]:
#****************************************************************************************************************************
#        Logistic Regression sur dataset Western Australia, avec les ++Best params++ sélectionnés (données non OverSamplées)
#****************************************************************************************************************************

lr2 = LogisticRegression(C = 13, max_iter = 200, solver = 'liblinear')

lr2.fit(X_train, y_train)

y_pred = lr2.predict(X_test)

pd.crosstab(y_test, y_pred, rownames = ['Classe Réelle'], colnames = ['Classe Prédite'])

Classe Prédite,0,1
Classe Réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3345,204
1,420,672


In [21]:
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      3549
           1       0.77      0.62      0.68      1092

    accuracy                           0.87      4641
   macro avg       0.83      0.78      0.80      4641
weighted avg       0.86      0.87      0.86      4641

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.94      0.62      0.91      0.76      0.60      3549
          1       0.77      0.62      0.94      0.68      0.76      0.56      1092

avg / total       0.86      0.87      0.69      0.86      0.76      0.59      4641



In [26]:
#*****************************************************************************************************************************
# Logistic Regression sur dataset Western Australia, avec Best params sélectionnés et données échantillonnées avec OVERSAMPLER
#*****************************************************************************************************************************
lr2 = LogisticRegression(C = 13, max_iter = 200, solver = 'liblinear')

lr2.fit(X_ro, y_ro)

y_pred = lr2.predict(X_test)

print(pd.crosstab(y_test, y_pred, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))
print(classification_report_imbalanced(y_test,y_pred))

Classe Prédite     0    1
Classe Réelle            
0               2968  581
1                222  870
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.84      0.80      0.88      0.82      0.67      3549
          1       0.60      0.80      0.84      0.68      0.82      0.66      1092

avg / total       0.85      0.83      0.81      0.83      0.82      0.67      4641



In [27]:
#******************************************************************************************************************************
#                                   Choix des Hyperparamètres de Knn Neighbors
#*******************************************************************************************************************************
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier()
param_knn ={'n_neighbors': [ 3, 5, 6, 8],
            'metric':['euclidean', 'manhattan']}

grid_knn = model_selection.GridSearchCV(estimator=knn, param_grid=param_knn)
grille_knn = grid_knn.fit(X_train, y_train)

print(pd.DataFrame.from_dict(grille_knn.cv_results_).loc[:,['param_metric', 'param_n_neighbors','mean_test_score']])
df_grille_knn = pd.DataFrame.from_dict(grille_knn.cv_results_).loc[:,['param_metric', 'param_n_neighbors','mean_test_score']]

print ('****************************************************************************')
print()
print ('Best params:', grid_knn.best_params_)
print ()
print ('****************************************************************************')



  param_metric param_n_neighbors  mean_test_score
0    euclidean                 3         0.847268
1    euclidean                 5         0.848130
2    euclidean                 6         0.841827
3    euclidean                 8         0.843875
4    manhattan                 3         0.849100
5    manhattan                 5         0.851201
6    manhattan                 6         0.847160
7    manhattan                 8         0.848561
****************************************************************************

Best params: {'metric': 'manhattan', 'n_neighbors': 5}

****************************************************************************


In [28]:
#****************************************************************************************************************************
#  KNN - sur dataset Western Australia (uniquement) avec Best Params sélectionnés & échantillonnage Classique (non Oversamplé)
#*****************************************************************************************************************************

knn = neighbors.KNeighborsClassifier(n_neighbors = 5, metric ='manhattan')
knn.fit(X_train,y_train)

y_pred_knn = knn.predict(X_test)

# Matrice de Confusion
print(pd.crosstab(y_test, y_pred_knn, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))


Classe Prédite     0    1
Classe Réelle            
0               3365  184
1                432  660


In [29]:
# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_knn))

print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_knn))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      3549
           1       0.78      0.60      0.68      1092

    accuracy                           0.87      4641
   macro avg       0.83      0.78      0.80      4641
weighted avg       0.86      0.87      0.86      4641

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.95      0.60      0.92      0.76      0.59      3549
          1       0.78      0.60      0.95      0.68      0.76      0.55      1092

avg / total       0.86      0.87      0.69      0.86      0.76      0.58      4641



In [30]:
#****************************************************************************************************************************
#    KNN - sur dataset Région Western Australia avec  Best Params ET données échantillonnées avec OVERSAMPLER
#*****************************************************************************************************************************

knn.fit(X_ro,y_ro)

y_pred_knn_ro = knn.predict(X_test)

# Matrice de Confusion
print(pd.crosstab(y_test, y_pred_knn_ro, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))

# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_knn_ro))
print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_knn_ro))


Classe Prédite     0    1
Classe Réelle            
0               2937  612
1                232  860
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.87      3549
           1       0.58      0.79      0.67      1092

    accuracy                           0.82      4641
   macro avg       0.76      0.81      0.77      4641
weighted avg       0.85      0.82      0.83      4641

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.83      0.79      0.87      0.81      0.65      3549
          1       0.58      0.79      0.83      0.67      0.81      0.65      1092

avg / total       0.85      0.82      0.80      0.83      0.81      0.65      4641



In [32]:
#******************************************************************************************************************************
#       Random Forest Classifier sur dataset Région Western Australia - Echantillonnage Classique SANS Oversampling
#
#******************************************************************************************************************************
from sklearn import ensemble
rf = ensemble.RandomForestClassifier(n_jobs=-1,
                                     random_state=66, 
                                     criterion = 'gini', 
                                     max_depth = 8, 
                                     max_features = 'log2',
                                     n_estimators = 500)
                                               
rf.fit(X_train,y_train)           #Entraînement du modèle sur le jeu d'entraînement

y_pred_rf = rf.predict(X_test)    # Prédictions sur le jeu de test

print(pd.crosstab(y_test, y_pred_rf, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))

# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_rf))
print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_rf))

Classe Prédite     0    1
Classe Réelle            
0               3399  150
1                493  599
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      3549
           1       0.80      0.55      0.65      1092

    accuracy                           0.86      4641
   macro avg       0.84      0.75      0.78      4641
weighted avg       0.86      0.86      0.85      4641

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.96      0.55      0.91      0.72      0.55      3549
          1       0.80      0.55      0.96      0.65      0.72      0.50      1092

avg / total       0.86      0.86      0.64      0.85      0.72      0.54      4641



In [34]:
#******************************************************************************************************************************
#       Random Forest Classifier sur dataset Région Western Australia - Sur-Echantillonnage AVEC Oversampling
#
#******************************************************************************************************************************
from sklearn import ensemble
from sklearn.metrics import classification_report
from imblearn.metrics import classification_report_imbalanced
rf = ensemble.RandomForestClassifier(n_jobs=-1,
                                     random_state=66, 
                                     criterion = 'gini', 
                                     max_depth = 8, 
                                     max_features = 'log2',
                                     n_estimators = 500)
                                               
rf.fit(X_ro,y_ro)           #Entraînement du modèle sur le jeu d'entraînement

y_pred_rf_ro = rf.predict(X_test)    # Prédictions sur le jeu de test

print(pd.crosstab(y_test, y_pred_rf_ro, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))

# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_rf_ro))
print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_rf_ro))

Classe Prédite     0    1
Classe Réelle            
0               2997  552
1                229  863
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88      3549
           1       0.61      0.79      0.69      1092

    accuracy                           0.83      4641
   macro avg       0.77      0.82      0.79      4641
weighted avg       0.85      0.83      0.84      4641

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.84      0.79      0.88      0.82      0.67      3549
          1       0.61      0.79      0.84      0.69      0.82      0.66      1092

avg / total       0.85      0.83      0.80      0.84      0.82      0.67      4641

