## Imports des bibliothèques

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, log_loss, accuracy_score, roc_auc_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import LinearSVC,NuSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
from joblib import dump, load
import warnings
warnings.filterwarnings('ignore')
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

In [2]:
df = pd.read_parquet('diabete.parquet')

Creation de la liste des modèles

In [3]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    NuSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    HistGradientBoostingClassifier(),
    LinearSVC(),
    MLPClassifier(),]

In [5]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,0.0,33.6,0.627,50,1
1,1,85.0,66.0,102.5,26.6,0.351,31,0
2,8,183.0,64.0,0.0,23.3,0.672,32,1
3,1,89.0,66.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,102.5,36.8,0.340,27,0
765,5,121.0,72.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,0.0,30.1,0.349,47,1


Nous générons des données additionnelles aléatoire pour rééquilibrer les classes

In [6]:
y = df.Outcome
X = df.drop(columns='Outcome')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [7]:
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

Nous mettons tout à l'échelle en utilisant le scaler qui était le plus performant sur l'ensemble des modèles de machine learning que nous conservons dans notre modèle final

In [8]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_ros)
X_test_scaled = scaler.transform(X_test)

Notre modèle de machine learning va être un Voting Classifier afin de réduire les possibilités de prédictions erronnées
Nous allons ici évaluer les performances de nos modèles retenus qfin de leur donner des poids différents lors du vote final :

In [14]:
skf = StratifiedKFold(n_splits=5)
results = {
    'models' : [
        ('mlpc', MLPClassifier(activation='tanh', alpha=0.05, hidden_layer_sizes= (10, 30, 10), learning_rate='adaptive', solver='adam')),
        ('rfc', RandomForestClassifier(bootstrap=False, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=1000)),
        ('dtc', DecisionTreeClassifier(criterion='gini', max_depth=12, min_samples_leaf=11, min_samples_split=2)),
        ('etc', ExtraTreesClassifier(bootstrap=False, criterion='entropy', max_depth=50, max_features='log2', min_samples_leaf=1, min_samples_split=2, n_estimators=100, warm_start=True)),
        ('xgb', XGBClassifier(booster='dart', colsample_bytree=0.8, gamma=0.4, max_depth=9, min_child_weight=1, reg_alpha=0, subsample=0.6)),
        ('gbc', GradientBoostingClassifier(warm_start=True,subsample=0.8,n_estimators=300,min_samples_leaf=2,min_samples_split=2,max_features=0.4,max_depth=5,loss='log_loss',learning_rate=0.2,criterion='squared_error')),
        ],
    'mean_score' :[],
    'std_dev_score' :[]
}
for _, model in results.get('models') :
    scores = cross_val_score(model,
                            X_test_scaled,
                            y_test,
                            scoring = 'roc_auc',
                            cv = skf,
                            n_jobs = -1)
    results['mean_score'].append(scores.mean())
    results['std_dev_score'].append(scores.std())

    name = type(model).__name__
    print(f'{name} - Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

MLPClassifier - Roc AUC score: 0.8705 ± 0.0459
RandomForestClassifier - Roc AUC score: 0.8935 ± 0.0442
DecisionTreeClassifier - Roc AUC score: 0.9059 ± 0.0400
ExtraTreesClassifier - Roc AUC score: 0.8958 ± 0.0450
XGBClassifier - Roc AUC score: 0.8722 ± 0.0459
GradientBoostingClassifier - Roc AUC score: 0.8730 ± 0.0448


Nous instancions le modèle de voting

In [15]:
voting = VotingClassifier(
    estimators=results.get('models'),
    voting='soft',
    weights=results.get('mean_score'),
    verbose=True,
    n_jobs=-1

)

Nous le testons

In [16]:
scores = cross_val_score(voting,
                        X_test_scaled,
                        y_test,
                        scoring = 'roc_auc',
                        cv = skf,
                        n_jobs = -1)

print(f'Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

Roc AUC score: 0.9057 ± 0.0452


In [17]:
voting.fit(X_train_scaled,y_train_ros)

In [18]:
predictions = voting.predict(X_test_scaled)
voting.predict_proba(X_test_scaled)

array([[6.36742061e-01, 3.63257941e-01],
       [9.89893863e-01, 1.01061391e-02],
       [9.98059764e-01, 1.94023781e-03],
       [9.44860212e-01, 5.51397908e-02],
       [9.08605809e-01, 9.13941908e-02],
       [4.09643168e-01, 5.90356832e-01],
       [9.97732548e-01, 2.26745062e-03],
       [8.08902690e-01, 1.91097309e-01],
       [2.86085870e-01, 7.13914130e-01],
       [9.13369199e-01, 8.66308036e-02],
       [6.75818601e-02, 9.32418140e-01],
       [7.10621241e-01, 2.89378761e-01],
       [7.09693057e-01, 2.90306948e-01],
       [7.88386884e-01, 2.11613112e-01],
       [5.37920419e-01, 4.62079581e-01],
       [6.85955399e-01, 3.14044606e-01],
       [9.90020325e-01, 9.97967766e-03],
       [9.97705639e-01, 2.29435726e-03],
       [1.50888814e-02, 9.84911119e-01],
       [1.75438213e-01, 8.24561787e-01],
       [9.53674459e-01, 4.63255398e-02],
       [9.80162213e-01, 1.98377866e-02],
       [9.29187613e-01, 7.08123868e-02],
       [9.94951999e-01, 5.04799781e-03],
       [2.704698

In [19]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print("\n")
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       123
           1       0.80      0.81      0.81        69

    accuracy                           0.86       192
   macro avg       0.85      0.85      0.85       192
weighted avg       0.86      0.86      0.86       192



[[109  14]
 [ 13  56]]


On exporte l'algorithme et le scaler après entrainement

In [None]:
#dump(voting, 'voting_diabete.joblib')
#dump(scaler, 'scaler_diabete.joblib')