## Imports des bibliothèques

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, log_loss, accuracy_score, roc_auc_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import LinearSVC,NuSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
from joblib import dump, load
import warnings
warnings.filterwarnings('ignore')

In [21]:
df = pd.read_parquet('ckd_total.parquet')

Creation de la liste des modèles

In [22]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    NuSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    HistGradientBoostingClassifier(),
    LinearSVC(),
    MLPClassifier(),]

Nous décidons de remplacer les valeurs manquantes par le mode.

In [25]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [26]:
for column in df.iloc[:,:12].columns:
    df[column] = imputer.fit_transform(df[column].to_numpy().reshape(-1,1))

Nous générons des données additionnelles aléatoire pour rééquilibrer les classes

In [27]:
y = df.classification
X = df.drop(columns='classification')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [28]:
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

Nous mettons tout à l'échelle en utilisant le scaler qui était le plus performant sur l'ensemble des modèles de machine learning que nous conservons dans notre modèle final

In [29]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ros)
X_test_scaled = scaler.transform(X_test)

Notre modèle de machine learning va être un Voting Classifier afin de réduire les possibilités de prédictions erronnées
Nous allons ici évaluer les performances de nos modèles retenus qfin de leur donner des poids différents lors du vote final :

In [30]:
skf = StratifiedKFold(n_splits=5)
results = {
    'models' : [
        ('mlpc', MLPClassifier()),
        ('rfc', RandomForestClassifier()),
        ('dtc', DecisionTreeClassifier()),
        ('etc', ExtraTreesClassifier()),
        ('lsvc', SVC(kernel='linear',probability=True)),
        ('hgbc', HistGradientBoostingClassifier()),
        ],
    'mean_score' :[],
    'std_dev_score' :[],
    'roc_auc_scores': [],
    'predicted_probabilities': []
}
for _, model in results.get('models') :
    scores = cross_val_score(model,
                            X_test_scaled,
                            y_test,
                            scoring = 'roc_auc',
                            cv = skf,
                            n_jobs = -1)
    results['mean_score'].append(scores.mean())
    results['std_dev_score'].append(scores.std())

    name = type(model).__name__
    print(f'{name} - Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

MLPClassifier - Roc AUC score: 1.0000 ± 0.0000
RandomForestClassifier - Roc AUC score: 1.0000 ± 0.0000
DecisionTreeClassifier - Roc AUC score: 0.9429 ± 0.0700
ExtraTreesClassifier - Roc AUC score: 1.0000 ± 0.0000
SVC - Roc AUC score: 1.0000 ± 0.0000
HistGradientBoostingClassifier - Roc AUC score: 0.9959 ± 0.0082


Nous instancions le modèle de voting

In [31]:
voting = VotingClassifier(
    estimators=results.get('models'),
    voting='soft',
    weights=results.get('mean_score'),
    verbose=True,
    n_jobs=-1

)

Nous le testons

In [32]:
scores = cross_val_score(voting,
                        X_test_scaled,
                        y_test,
                        scoring = 'roc_auc',
                        cv = skf,
                        n_jobs = -1)

print(f'Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

Roc AUC score: 1.0000 ± 0.0000


In [33]:
voting.fit(X_train_scaled,y_train_ros)

In [None]:
predictions = voting.predict(X_test_scaled)
voting.predict_proba(X_test_scaled)

In [35]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print("\n")
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        35
         1.0       1.00      1.00      1.00        34

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



[[35  0]
 [ 0 34]]


On exporte l'algorithme et le scaler après entrainement

In [36]:
#dump(voting, 'voting_ckd.joblib')
#dump(scaler, 'scaler_ckd.joblib')