## Import des fichiers et bibliothèques

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, log_loss, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score, train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC,NuSVC,SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from joblib import dump
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_parquet('df_breast_cancer.parquet')

Création de la liste des modèles

In [3]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    NuSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    HistGradientBoostingClassifier(),
    LinearSVC(),
    MLPClassifier(),
    XGBClassifier(),
    LGBMClassifier(verbose=-1)]

In [5]:
X = df.drop('diagnosis', axis = 1)
y = df['diagnosis']

Création de la liste des scalers

In [6]:
scalers = [
    ("Unscaled data", X),
    ("Data after standard scaling", StandardScaler().fit_transform(X)),
    ("Data after min-max scaling", MinMaxScaler().fit_transform(X)),
    ("Data after max-abs scaling", MaxAbsScaler().fit_transform(X)),
    (
        "Data after robust scaling",
        RobustScaler(quantile_range=(25, 75)).fit_transform(X),
    ),
    (
        "Data after power transformation (Yeo-Johnson)",
        PowerTransformer(method="yeo-johnson").fit_transform(X),
    ),
    #(
    #    "Data after power transformation (Box-Cox)",
    #    PowerTransformer(method="box-cox").fit_transform(X),
    #),
    (
        "Data after quantile transformation (uniform pdf)",
        QuantileTransformer(
            output_distribution="uniform", random_state=42
        ).fit_transform(X),
    ),
    (
        "Data after quantile transformation (gaussian pdf)",
        QuantileTransformer(
            output_distribution="normal", random_state=42
        ).fit_transform(X),
    ),
    ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)),
]



## Comparaison des différents scalers avec les différents modèles

In [7]:
# Logging for Visual Comparison
log_cols=["Scaler","Classifier", "Accuracy", "Recall", "Log Loss"]
log = pd.DataFrame(columns=log_cols)
for X in scalers :
    for clf in classifiers:
        scaler = X[0]
        X_train, X_test, y_train, y_test = train_test_split(X[1], y, random_state=42)
        clf.fit(X_train, y_train)
        name = clf.__class__.__name__

        #print("="*30)
        #print(name)

        #print('****Results****')
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        rec = recall_score(y_test, train_predictions)
        ll = log_loss(y_test, train_predictions)
        #print("Accuracy: {:.4%}".format(acc))
        try:
            train_predictions = clf.predict_proba(X_test)
            
            #print("Log Loss: {}".format(ll))
        except:
            pass
        log_entry = pd.DataFrame([[scaler, name, acc*100, rec*100, ll]], columns=log_cols)
        log = pd.concat([log,log_entry], ignore_index=True)

  log = pd.concat([log,log_entry], ignore_index=True)


In [8]:
log.sort_values(['Accuracy','Recall','Log Loss'],ascending=[False, False, True]).head(50)

Unnamed: 0,Scaler,Classifier,Accuracy,Recall,Log Loss
87,Data after power transformation (Yeo-Johnson),MLPClassifier,96.402878,96.969697,1.296534
109,Data after quantile transformation (gaussian pdf),RandomForestClassifier,96.402878,93.939394,1.296534
27,Data after standard scaling,MLPClassifier,95.683453,95.454545,1.555841
10,Unscaled data,HistGradientBoostingClassifier,95.683453,92.424242,1.555841
13,Unscaled data,XGBClassifier,95.683453,92.424242,1.555841
25,Data after standard scaling,HistGradientBoostingClassifier,95.683453,92.424242,1.555841
28,Data after standard scaling,XGBClassifier,95.683453,92.424242,1.555841
40,Data after min-max scaling,HistGradientBoostingClassifier,95.683453,92.424242,1.555841
43,Data after min-max scaling,XGBClassifier,95.683453,92.424242,1.555841
55,Data after max-abs scaling,HistGradientBoostingClassifier,95.683453,92.424242,1.555841


## Création du jeu de test et du jeu d'entrainement ainsi que suréchantillonage et mise à l'échelle

In [10]:
X = df.drop('diagnosis', axis = 1)
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ros)
X_test_scaled = scaler.transform(X_test)

## Comparaison des scores de nos algorithmes selectionnés et création d'un voting classifier

In [16]:
skf = StratifiedKFold(n_splits=5)
results = {
    'models' : [
        ('hgbc', HistGradientBoostingClassifier()),
        ('nsvc', NuSVC(probability=True)),
        ('mlpc', MLPClassifier()),
        ('lsvc', SVC(kernel='linear',probability=True)),
        ('rfc', RandomForestClassifier()),
        ('xgbc', XGBClassifier()),
        ],
    'mean_score' :[],
    'std_dev_score' :[]
}
for _, model in results.get('models') :
    scores = cross_val_score(model,
                            X_test_scaled,
                            y_test,
                            scoring = 'roc_auc',
                            cv = skf,
                            n_jobs = -1)
    results['mean_score'].append(scores.mean())
    results['std_dev_score'].append(scores.std())

    name = type(model).__name__
    print(f'{name} - Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

HistGradientBoostingClassifier - Roc AUC score: 0.9690 ± 0.0219
NuSVC - Roc AUC score: 0.9698 ± 0.0248
MLPClassifier - Roc AUC score: 0.9804 ± 0.0166
SVC - Roc AUC score: 0.9804 ± 0.0153
RandomForestClassifier - Roc AUC score: 0.9699 ± 0.0245
XGBClassifier - Roc AUC score: 0.9732 ± 0.0141


In [17]:
voting = VotingClassifier(
    estimators=results.get('models'),
    voting='soft',
    weights=results.get('mean_score'),
    verbose=True,
    n_jobs=-1

)

In [18]:
voting.fit(X_train_scaled,y_train_ros)

In [19]:
predictions = voting.predict(X_test_scaled)

Scoring puis export

In [20]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print("\n")
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97        73
           1       0.98      0.94      0.96        66

    accuracy                           0.96       139
   macro avg       0.97      0.96      0.96       139
weighted avg       0.96      0.96      0.96       139



[[72  1]
 [ 4 62]]


In [22]:
dump(scaler, 'scaler_cancer.joblib')
dump(voting, 'voting_cancer.joblib')

['voting_cancer.joblib']