In [None]:
#*****************************************************************************************************************************
# 
# Le but de ce projet est d'effectuer des prédictions sur la variable "RainTomorrow", pour savoir s'il va ou non pleuvoir demain
# C'est donc une problématique de classification que nous rencontrons
#
# On travaille sur un fichier extrait du fichier général et qui regoupe toutes les données relatives à 
# la Région South Australia.
# Nous allons procéder de la même façon au nettoyage et standardisation des données, avant d'entraîner et d'ajuster les modèles
# de calcul
#
#*****************************************************************************************************************************

In [31]:
import pandas as pd
import numpy as np

In [3]:
df_sa = pd.read_csv('D:\\Données Météo Australie\WeatherAUSRegionSOA.csv', sep = ';')
df_sa.info()
df_sa.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8853 entries, 0 to 8852
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       8853 non-null   object 
 1   MinTemp        8853 non-null   float64
 2   MaxTemp        8853 non-null   float64
 3   Rainfall       8853 non-null   float64
 4   Evaporation    8853 non-null   float64
 5   Sunshine       8853 non-null   float64
 6   WindGustDir    8853 non-null   object 
 7   WindGustSpeed  8853 non-null   float64
 8   Humidity9am    8853 non-null   float64
 9   Humidity3pm    8853 non-null   float64
 10  Pressure9am    8853 non-null   float64
 11  Pressure3pm    8853 non-null   float64
 12  RainToday      8853 non-null   object 
 13  RainTomorrow   8853 non-null   object 
 14  year           8853 non-null   int64  
 15  month          8853 non-null   int64  
 16  day            8853 non-null   int64  
 17  week           8853 non-null   int64  
 18  Radar   

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,RainToday,RainTomorrow,year,month,day,week,Radar,Region
0,Adelaide,16.9,22.9,0.0,8.0,10.9,SW,50.0,58.0,40.0,1011.3,1012.4,No,No,2009,1,1,1,Nuriootpa,South Australia
1,Adelaide,13.9,24.1,0.0,14.2,12.6,SE,37.0,39.0,25.0,1020.9,1018.5,No,No,2009,1,2,1,Nuriootpa,South Australia
2,Adelaide,13.8,29.1,0.0,8.0,12.7,SW,30.0,26.0,15.0,1017.6,1014.7,No,No,2009,1,3,1,Nuriootpa,South Australia
3,Adelaide,15.1,33.3,0.0,8.0,12.0,W,31.0,25.0,7.0,1013.8,1012.2,No,No,2009,1,4,1,Nuriootpa,South Australia
4,Adelaide,14.6,30.5,0.0,24.6,13.3,SW,26.0,28.0,22.0,1015.0,1013.2,No,No,2009,1,5,2,Nuriootpa,South Australia


In [4]:
df_sa.isna().sum().sum()

0

In [None]:
# Avant de commencer la modélisation, il faut apporter encore quelques modifications au fichier de base:
# 1) Remplacer les Yes, No des variables RainToday et RainTomorrow par 1 et 0 respectivement
# 2) remplacer la variable WindGustDir par des variables indicatrices et supprimer la colonne WindGustDirStr
# 3 appliquer une standardisation des variables numériques

In [5]:
df_sa['RainToday'] = df_sa['RainToday'].replace (to_replace = ['Yes','No'], value = [1,0])
df_sa['RainTomorrow'] = df_sa['RainTomorrow'].replace (to_replace = ['Yes','No'], value = [1,0])
df_sa.head(20)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,RainToday,RainTomorrow,year,month,day,week,Radar,Region
0,Adelaide,16.9,22.9,0.0,8.0,10.9,SW,50.0,58.0,40.0,1011.3,1012.4,0,0,2009,1,1,1,Nuriootpa,South Australia
1,Adelaide,13.9,24.1,0.0,14.2,12.6,SE,37.0,39.0,25.0,1020.9,1018.5,0,0,2009,1,2,1,Nuriootpa,South Australia
2,Adelaide,13.8,29.1,0.0,8.0,12.7,SW,30.0,26.0,15.0,1017.6,1014.7,0,0,2009,1,3,1,Nuriootpa,South Australia
3,Adelaide,15.1,33.3,0.0,8.0,12.0,W,31.0,25.0,7.0,1013.8,1012.2,0,0,2009,1,4,1,Nuriootpa,South Australia
4,Adelaide,14.6,30.5,0.0,24.6,13.3,SW,26.0,28.0,22.0,1015.0,1013.2,0,0,2009,1,5,2,Nuriootpa,South Australia
5,Adelaide,15.2,31.0,0.0,8.2,13.3,WSW,31.0,25.0,14.0,1012.0,1008.9,0,0,2009,1,6,2,Nuriootpa,South Australia
6,Adelaide,16.0,25.1,0.2,8.2,9.1,SW,48.0,75.0,43.0,1013.3,1013.5,0,0,2009,1,7,2,Nuriootpa,South Australia
7,Adelaide,14.3,23.8,0.0,7.4,12.7,SE,37.0,45.0,30.0,1019.9,1019.2,0,0,2009,1,8,2,Nuriootpa,South Australia
8,Adelaide,13.8,26.0,0.0,8.0,9.5,ESE,54.0,36.0,27.0,1018.6,1014.8,0,0,2009,1,9,2,Nuriootpa,South Australia
9,Adelaide,13.3,28.7,0.0,7.6,12.6,SW,35.0,46.0,29.0,1012.4,1009.6,0,0,2009,1,10,2,Nuriootpa,South Australia


In [6]:
# Remplacement de la variable "WindGustDir" par des variables indicatrices
dummies = pd.get_dummies(df_sa['WindGustDir'])
df_sa = pd.concat([df_sa,dummies], axis = 1)
df_sa.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,Humidity9am,Humidity3pm,...,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
0,Adelaide,16.9,22.9,0.0,8.0,10.9,SW,50.0,58.0,40.0,...,0,0,0,0,0,0,1,0,0,0
1,Adelaide,13.9,24.1,0.0,14.2,12.6,SE,37.0,39.0,25.0,...,0,0,0,1,0,0,0,0,0,0
2,Adelaide,13.8,29.1,0.0,8.0,12.7,SW,30.0,26.0,15.0,...,0,0,0,0,0,0,1,0,0,0
3,Adelaide,15.1,33.3,0.0,8.0,12.0,W,31.0,25.0,7.0,...,0,0,0,0,0,0,0,1,0,0
4,Adelaide,14.6,30.5,0.0,24.6,13.3,SW,26.0,28.0,22.0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# Remplacement de la variable "Location" par des variables indicatrices
dummies2 = pd.get_dummies(df_sa['Location'], prefix = 'Location')
df_sa = pd.concat([df_sa,dummies2], axis = 1)

In [8]:
df_sa = df_sa.drop('WindGustDir', axis = 1)
df_sa = df_sa.drop('Location', axis = 1)

In [9]:
df_sa = df_sa.drop('Radar', axis = 1)
df_sa = df_sa.drop('Region', axis = 1)

In [10]:
df_sa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8853 entries, 0 to 8852
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   MinTemp                8853 non-null   float64
 1   MaxTemp                8853 non-null   float64
 2   Rainfall               8853 non-null   float64
 3   Evaporation            8853 non-null   float64
 4   Sunshine               8853 non-null   float64
 5   WindGustSpeed          8853 non-null   float64
 6   Humidity9am            8853 non-null   float64
 7   Humidity3pm            8853 non-null   float64
 8   Pressure9am            8853 non-null   float64
 9   Pressure3pm            8853 non-null   float64
 10  RainToday              8853 non-null   int64  
 11  RainTomorrow           8853 non-null   int64  
 12  year                   8853 non-null   int64  
 13  month                  8853 non-null   int64  
 14  day                    8853 non-null   int64  
 15  week

In [11]:
# Standardisation des variables numériques:
df_sa_std = df_sa
num_var = df_sa_std.select_dtypes(include = 'float64')
num = num_var.columns
# on ne prend pas les 'int64', car ces variables correspondent à year, month, day
#Standardisation:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_sa_std[num] = scaler.fit_transform(df_sa_std[num])
df_sa_std.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,SE,SSE,SSW,SW,W,WNW,WSW,Location_Adelaide,Location_MountGambier,Location_Nuriootpa
0,1.31851,0.193727,-0.357597,0.879484,0.930506,0.779581,-0.457941,-0.435577,-0.971546,-0.590491,...,0,0,0,1,0,0,0,1,0,0
1,0.722778,0.361254,-0.357597,2.446332,1.359123,-0.233909,-1.38155,-1.142126,0.312481,0.250676,...,1,0,0,0,0,0,0,1,0,0
2,0.70292,1.059286,-0.357597,0.879484,1.384336,-0.779634,-2.013493,-1.61316,-0.128903,-0.27333,...,0,0,0,1,0,0,0,1,0,0
3,0.961071,1.645632,-0.357597,0.879484,1.207847,-0.701673,-2.062104,-1.989986,-0.637164,-0.61807,...,0,0,0,0,1,0,0,1,0,0
4,0.861782,1.254734,-0.357597,5.074593,1.535613,-1.091477,-1.916271,-1.283436,-0.476661,-0.480174,...,0,0,0,1,0,0,0,1,0,0


In [12]:
target = df_sa_std['RainTomorrow']
data = df_sa_std.drop('RainTomorrow', axis = 1)

In [13]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 66)

In [14]:
print(X_train.isnull().sum().sum())
print(y_train.isna().sum().sum())

0
0


In [None]:
#-------------------------------------------------------------------------------------------------------------------------------
#              Application d'un modèle de régression logistqiue simple pour prédire la variable cible RainTomorrow
#-------------------------------------------------------------------------------------------------------------------------------

In [15]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C = 0.1, max_iter = 200)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
pd.crosstab(y_test, y_pred, rownames = ['Classe Réelle'], colnames = ['Classe Prédite'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classe Prédite,0,1
Classe Réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1288,86
1,163,234


In [16]:
from sklearn.metrics import classification_report
print('Classification Report:')
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      1374
           1       0.73      0.59      0.65       397

    accuracy                           0.86      1771
   macro avg       0.81      0.76      0.78      1771
weighted avg       0.85      0.86      0.85      1771



In [17]:
from imblearn.metrics import classification_report_imbalanced
print('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred))

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.94      0.59      0.91      0.74      0.57      1374
          1       0.73      0.59      0.94      0.65      0.74      0.53       397

avg / total       0.85      0.86      0.67      0.85      0.74      0.56      1771



In [None]:
# Le modèle performe plutôt bien quand il s'agit de prédire qu'il ne pleut pas, mais beaucoup moins bien quand il s'agit de 
# prédire qu'il va pleuvoir le lendemain.
# Il est vrai que le jeu de données est déséquilibré. Dans le dataset initial,il y a 22% de "1" (positif) pour la variable cible
# "RainTomorrow".
# Essayons de voir la moyenne géométrique de ce classifieur naïf, qui peut s'avérer utile, dans les problèmes de classification
# déséquilibrée.

In [None]:
# Essayer de faire un OverSampling pour rééquilibrer les données, puis d'appliquer à nouveau Logistic Regression et pourquoi pas
# KNNeighbors

In [18]:
from imblearn.over_sampling import RandomOverSampler
rOs = RandomOverSampler()
X_ro, y_ro = rOs.fit_resample(X_train, y_train)

print('Classes échantillon oversampled :', dict(pd.Series(y_ro).value_counts()))

Classes échantillon oversampled : {0: 5355, 1: 5355}


In [19]:
#****************************************************************************************************************************
#                                Choix des Hyperparamètres de la Logistic Regression
#**************************************************************************************************************************** 

lr = LogisticRegression(max_iter = 200) 

parameters = {'C':[0.1,1, 10, 13, 20],
              'solver':['sag','lbfgs','liblinear']}

grid_lr = model_selection.GridSearchCV(estimator=lr, param_grid=parameters)

grille = grid_lr.fit(X_train, y_train)
print(pd.DataFrame.from_dict(grille.cv_results_).loc[:,['params', 'mean_test_score']])

print ('Best params:', grid_lr.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                               params  mean_test_score
0         {'C': 0.1, 'solver': 'sag'}         0.790455
1       {'C': 0.1, 'solver': 'lbfgs'}         0.865152
2   {'C': 0.1, 'solver': 'liblinear'}         0.866281
3           {'C': 1, 'solver': 'sag'}         0.790455
4         {'C': 1, 'solver': 'lbfgs'}         0.864163
5     {'C': 1, 'solver': 'liblinear'}         0.867410
6          {'C': 10, 'solver': 'sag'}         0.790313
7        {'C': 10, 'solver': 'lbfgs'}         0.863457
8    {'C': 10, 'solver': 'liblinear'}         0.868116
9          {'C': 13, 'solver': 'sag'}         0.790737
10       {'C': 13, 'solver': 'lbfgs'}         0.865434
11   {'C': 13, 'solver': 'liblinear'}         0.866564
12         {'C': 20, 'solver': 'sag'}         0.790455
13       {'C': 20, 'solver': 'lbfgs'}         0.865151
14   {'C': 20, 'solver': 'liblinear'}         0.866705
Best params: {'C': 10, 'solver': 'liblinear'}


In [20]:
#****************************************************************************************************************************
#        Logistic Regression sur les données South Australia, avec les ++Best params++ sélectionnés (données non OverSamplées)
#****************************************************************************************************************************

lr2 = LogisticRegression(C = 10, max_iter = 200, solver = 'liblinear')

lr2.fit(X_train, y_train)

y_pred = lr2.predict(X_test)

pd.crosstab(y_test, y_pred, rownames = ['Classe Réelle'], colnames = ['Classe Prédite'])


Classe Prédite,0,1
Classe Réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1279,95
1,155,242


In [21]:
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      1374
           1       0.72      0.61      0.66       397

    accuracy                           0.86      1771
   macro avg       0.81      0.77      0.79      1771
weighted avg       0.85      0.86      0.85      1771

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.93      0.61      0.91      0.75      0.59      1374
          1       0.72      0.61      0.93      0.66      0.75      0.55       397

avg / total       0.85      0.86      0.68      0.85      0.75      0.58      1771



In [23]:
# Test Logistic Regression - données South Australia (NON Oversamplées) - Paramètre C=13 
#-----------------------------------------------------------------------------------------------------------------------------
lr3 = LogisticRegression(C = 13, max_iter = 200, solver = 'liblinear')
lr3.fit(X_train, y_train)
y_pred = lr3.predict(X_test)
print(pd.crosstab(y_test, y_pred, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred))

Classe Prédite     0    1
Classe Réelle            
0               1279   95
1                155  242
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      1374
           1       0.72      0.61      0.66       397

    accuracy                           0.86      1771
   macro avg       0.81      0.77      0.79      1771
weighted avg       0.85      0.86      0.85      1771

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.93      0.61      0.91      0.75      0.59      1374
          1       0.72      0.61      0.93      0.66      0.75      0.55       397

avg / total       0.85      0.86      0.68      0.85      0.75      0.58      1771



In [24]:
#*****************************************************************************************************************************
# Logistic Regression sur South Australia, avec les Best params sélectionnés et données échantillonnées avec OVERSAMPLER
#*****************************************************************************************************************************
lr2 = LogisticRegression(C = 10, max_iter = 200, solver = 'liblinear')

lr2.fit(X_ro, y_ro)

y_pred = lr2.predict(X_test)

print(pd.crosstab(y_test, y_pred, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))
print(classification_report_imbalanced(y_test,y_pred))

Classe Prédite     0    1
Classe Réelle            
0               1168  206
1                 76  321
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.85      0.81      0.89      0.83      0.69      1374
          1       0.61      0.81      0.85      0.69      0.83      0.68       397

avg / total       0.86      0.84      0.82      0.85      0.83      0.69      1771



In [25]:
#******************************************************************************************************************************
#                                   Choix des Hyperparamètres de Knn Neighbors
#*******************************************************************************************************************************
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier()
param_knn ={'n_neighbors': [ 3, 5, 6, 8],
            'metric':['euclidean', 'manhattan']}

grid_knn = model_selection.GridSearchCV(estimator=knn, param_grid=param_knn)
grille_knn = grid_knn.fit(X_train, y_train)

print(pd.DataFrame.from_dict(grille_knn.cv_results_).loc[:,['param_metric', 'param_n_neighbors','mean_test_score']])
df_grille_knn = pd.DataFrame.from_dict(grille_knn.cv_results_).loc[:,['param_metric', 'param_n_neighbors','mean_test_score']]

print ('****************************************************************************')
print()
print ('Best params:', grid_knn.best_params_)
print ()
print ('****************************************************************************')

  param_metric param_n_neighbors  mean_test_score
0    euclidean                 3         0.844253
1    euclidean                 5         0.840298
2    euclidean                 6         0.835780
3    euclidean                 8         0.837475
4    manhattan                 3         0.844817
5    manhattan                 5         0.843971
6    manhattan                 6         0.839170
7    manhattan                 8         0.839734
****************************************************************************

Best params: {'metric': 'manhattan', 'n_neighbors': 3}

****************************************************************************


In [26]:
#****************************************************************************************************************************
#  KNN - sur le dataset de la région South Australia avec Best Params sélectionnés & échantillonnage Classique (non Oversamplé)
#*****************************************************************************************************************************

knn = neighbors.KNeighborsClassifier(n_neighbors = 3, metric ='manhattan')
knn.fit(X_train,y_train)

y_pred_knn = knn.predict(X_test)

# Matrice de Confusion
print(pd.crosstab(y_test, y_pred_knn, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))


Classe Prédite     0    1
Classe Réelle            
0               1273  101
1                185  212


In [27]:
# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_knn))

print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_knn))


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      1374
           1       0.68      0.53      0.60       397

    accuracy                           0.84      1771
   macro avg       0.78      0.73      0.75      1771
weighted avg       0.83      0.84      0.83      1771

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.93      0.53      0.90      0.70      0.51      1374
          1       0.68      0.53      0.93      0.60      0.70      0.48       397

avg / total       0.83      0.84      0.62      0.83      0.70      0.51      1771



In [28]:
#****************************************************************************************************************************
#    KNN - sur dataset Région South Australia avec  Best Params ET données échantillonnées avec OVERSAMPLER
#*****************************************************************************************************************************

knn.fit(X_ro,y_ro)

y_pred_knn_ro = knn.predict(X_test)

# Matrice de Confusion
print(pd.crosstab(y_test, y_pred_knn_ro, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))

# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_knn_ro))
print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_knn_ro))


Classe Prédite     0    1
Classe Réelle            
0               1141  233
1                104  293
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.83      0.87      1374
           1       0.56      0.74      0.63       397

    accuracy                           0.81      1771
   macro avg       0.74      0.78      0.75      1771
weighted avg       0.84      0.81      0.82      1771

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.83      0.74      0.87      0.78      0.62      1374
          1       0.56      0.74      0.83      0.63      0.78      0.61       397

avg / total       0.84      0.81      0.76      0.82      0.78      0.62      1771



In [30]:
#******************************************************************************************************************************
#       Random Forest Classifier sur dataset Région South Australia - Echantillonnage Classique SANS Oversampling
#
#******************************************************************************************************************************
from sklearn import ensemble
rf = ensemble.RandomForestClassifier(n_jobs=-1,
                                     random_state=66, 
                                     criterion = 'gini', 
                                     max_depth = 8, 
                                     max_features = 'log2',
                                     n_estimators = 500)
                                               
rf.fit(X_train,y_train)           #Entraînement du modèle sur le jeu d'entraînement

y_pred_rf = rf.predict(X_test)    # Prédictions sur le jeu de test

print(pd.crosstab(y_test, y_pred_rf, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))

# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_rf))
print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_rf))

Classe Prédite     0    1
Classe Réelle            
0               1305   69
1                185  212
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      1374
           1       0.75      0.53      0.63       397

    accuracy                           0.86      1771
   macro avg       0.82      0.74      0.77      1771
weighted avg       0.85      0.86      0.85      1771

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.95      0.53      0.91      0.71      0.53      1374
          1       0.75      0.53      0.95      0.63      0.71      0.49       397

avg / total       0.85      0.86      0.63      0.85      0.71      0.52      1771



In [32]:
#******************************************************************************************************************************
#       Random Forest Classifier sur dataset Région South Australia - Sur-Echantillonnage AVEC Oversampling
#
#******************************************************************************************************************************
from sklearn import ensemble
from sklearn.metrics import classification_report
from imblearn.metrics import classification_report_imbalanced
rf = ensemble.RandomForestClassifier(n_jobs=-1,
                                     random_state=66, 
                                     criterion = 'gini', 
                                     max_depth = 8, 
                                     max_features = 'log2',
                                     n_estimators = 500)
                                               
rf.fit(X_ro,y_ro)           #Entraînement du modèle sur le jeu d'entraînement

y_pred_rf_ro = rf.predict(X_test)    # Prédictions sur le jeu de test

print(pd.crosstab(y_test, y_pred_rf_ro, rownames = ['Classe Réelle'], colnames = ['Classe Prédite']))

# Reports
print ('Classification Report:')
print(classification_report(y_test, y_pred_rf_ro))
print ('Classification Report Imbalanced:')
print(classification_report_imbalanced(y_test, y_pred_rf_ro))

Classe Prédite     0    1
Classe Réelle            
0               1183  191
1                 74  323
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      1374
           1       0.63      0.81      0.71       397

    accuracy                           0.85      1771
   macro avg       0.78      0.84      0.80      1771
weighted avg       0.87      0.85      0.86      1771

Classification Report Imbalanced:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.86      0.81      0.90      0.84      0.70      1374
          1       0.63      0.81      0.86      0.71      0.84      0.70       397

avg / total       0.87      0.85      0.82      0.86      0.84      0.70      1771

