In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, log_loss, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score, train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC,NuSVC,SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from joblib import dump


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/MaskiVal/DataSets/main/heartDisease.csv')

In [3]:
df = df.drop_duplicates()

In [49]:
df.to_parquet('df_hd.parquet')

In [4]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    NuSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    HistGradientBoostingClassifier(),
    LinearSVC(),
    MLPClassifier(),
    XGBClassifier(),
    LGBMClassifier(verbose=-1)]

In [5]:
X = df.drop('target', axis = 1)
y = df['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [32]:
scalers = [
    ("Unscaled data", X),
    ("Data after standard scaling", StandardScaler().fit_transform(X)),
    ("Data after min-max scaling", MinMaxScaler().fit_transform(X)),
    ("Data after max-abs scaling", MaxAbsScaler().fit_transform(X)),
    (
        "Data after robust scaling",
        RobustScaler(quantile_range=(25, 75)).fit_transform(X),
    ),
    (
        "Data after power transformation (Yeo-Johnson)",
        PowerTransformer(method="yeo-johnson").fit_transform(X),
    ),
    #(
    #    "Data after power transformation (Box-Cox)",
    #    PowerTransformer(method="box-cox").fit_transform(X),
    #),
    (
        "Data after quantile transformation (uniform pdf)",
        QuantileTransformer(
            output_distribution="uniform", random_state=42
        ).fit_transform(X),
    ),
    (
        "Data after quantile transformation (gaussian pdf)",
        QuantileTransformer(
            output_distribution="normal", random_state=42
        ).fit_transform(X),
    ),
    ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)),
]



In [33]:
# Logging for Visual Comparison
log_cols=["Scaler","Classifier", "Accuracy", "Recall", "Log Loss"]
log = pd.DataFrame(columns=log_cols)
for X in scalers :
    for clf in classifiers:
        scaler = X[0]
        X_train, X_test, y_train, y_test = train_test_split(X[1], y, random_state=42)
        clf.fit(X_train, y_train)
        name = clf.__class__.__name__

        #print("="*30)
        #print(name)

        #print('****Results****')
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        rec = recall_score(y_test, train_predictions)
        ll = log_loss(y_test, train_predictions)
        #print("Accuracy: {:.4%}".format(acc))
        try:
            train_predictions = clf.predict_proba(X_test)
            
            #print("Log Loss: {}".format(ll))
        except:
            pass
        log_entry = pd.DataFrame([[scaler, name, acc*100, rec*100, ll]], columns=log_cols)
        log = pd.concat([log,log_entry], ignore_index=True)

  log = pd.concat([log,log_entry], ignore_index=True)


In [34]:
log.sort_values(['Accuracy','Recall','Log Loss'],ascending=[False, False, True]).head(50)

Unnamed: 0,Scaler,Classifier,Accuracy,Recall,Log Loss
7,Unscaled data,GaussianNB,89.473684,87.804878,3.794069
22,Data after standard scaling,GaussianNB,89.473684,87.804878,3.794069
37,Data after min-max scaling,GaussianNB,89.473684,87.804878,3.794069
52,Data after max-abs scaling,GaussianNB,89.473684,87.804878,3.794069
67,Data after robust scaling,GaussianNB,89.473684,87.804878,3.794069
17,Data after standard scaling,NuSVC,88.157895,90.243902,4.268327
15,Data after standard scaling,KNeighborsClassifier,88.157895,87.804878,4.268327
82,Data after power transformation (Yeo-Johnson),GaussianNB,88.157895,87.804878,4.268327
97,Data after quantile transformation (uniform pdf),GaussianNB,88.157895,87.804878,4.268327
127,Data after sample-wise L2 normalizing,GaussianNB,88.157895,85.365854,4.268327


In [7]:
skf = StratifiedKFold(n_splits=5)

In [36]:
X = df.drop('target', axis = 1)
y = df['target']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ros)
X_test_scaled = scaler.transform(X_test)

In [42]:
results = {
    'models' : [
        ('gnb', GaussianNB()),
        ('nsvc', NuSVC(probability=True)),
        ('mlpc', MLPClassifier()),
        ('knc', KNeighborsClassifier()),
        ('lsvc', SVC(kernel='linear',probability=True)),
        ('rfc', RandomForestClassifier()),
        ('lda', LinearDiscriminantAnalysis()),
        ],
    'mean_score' :[],
    'std_dev_score' :[]
}
for _, model in results.get('models') :
    scores = cross_val_score(model,
                            X_test_scaled,
                            y_test,
                            scoring = 'roc_auc',
                            cv = skf,
                            n_jobs = -1)
    results['mean_score'].append(scores.mean())
    results['std_dev_score'].append(scores.std())

    name = type(model).__name__
    print(f'{name} - Roc AUC score: {scores.mean():.4f} ± {scores.std():.4f}')

GaussianNB - Roc AUC score: 0.8413 ± 0.0954
NuSVC - Roc AUC score: 0.8933 ± 0.0593
MLPClassifier - Roc AUC score: 0.8663 ± 0.0440
KNeighborsClassifier - Roc AUC score: 0.8565 ± 0.0967
SVC - Roc AUC score: 0.9329 ± 0.0295
RandomForestClassifier - Roc AUC score: 0.8740 ± 0.0789
LinearDiscriminantAnalysis - Roc AUC score: 0.8722 ± 0.0719


In [43]:
voting = VotingClassifier(
    estimators=results.get('models'),
    voting='soft',
    weights=results.get('mean_score'),
    verbose=True,
    n_jobs=-1

)

In [44]:
voting.fit(X_train_scaled,y_train_ros)

In [45]:
predictions = voting.predict(X_test_scaled)

In [46]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print("\n")
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.84      0.91      0.88        35
           1       0.92      0.85      0.89        41

    accuracy                           0.88        76
   macro avg       0.88      0.88      0.88        76
weighted avg       0.88      0.88      0.88        76



[[32  3]
 [ 6 35]]


In [62]:
scores = cross_val_score(voting, X_train_scaled, y_train_ros, scoring='accuracy', cv=5)
print(scores.mean())

0.8415510204081633


In [63]:
scores = cross_val_score(voting, X_train_scaled, y_train_ros, scoring='recall', cv=5)
print(scores.mean())

0.8466666666666667


In [48]:
dump(scaler, 'scaler_hd.joblib')
dump(voting, 'voting_hd.joblib')

['voting_hd.joblib']