In [53]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.pipeline import make_pipeline

# Load the Framingham CSV (path is relative to the notebook's working directory)
# and discard every row that has a missing value in any column.
heart = (
    pd.read_csv('framingham.csv')
      .dropna()
)
# Last expression of the cell: preview the first rows of the cleaned frame.
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


# Defining X and Y

In [50]:
# Feature matrix: the five predictor columns, selected by name.
X = heart[[
    'age',
    'currentSmoker',
    'totChol',
    'BMI',
    'heartRate',
]]

# Target vector: the TenYearCHD column.
Y = heart['TenYearCHD']

# Defining the CV Strategy

In [51]:
# Stratified 10-fold splitter, run for a single repetition, seeded so the
# fold assignment is reproducible across the cells that pass `cv = skf`.
skf = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=1,
    random_state=42,
)

# Logistic Regression

In [52]:
# Baseline: a default LogisticRegression fitted on the features as-is (no
# scaling step), scored with ROC-AUC on each fold produced by `skf`.
logit_cv = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,  # run the folds in parallel on all available cores
)
# Report the average of the per-fold scores.
print(f"The 10-folds oof ROC-AUC score of the LR model is {np.mean(logit_cv)}")

The 10-folds oof ROC-AUC score of the LR model is 0.6995536881950913


# MinMaxScaler + Logistic Regression

In [54]:
# Same evaluation as the baseline, but the estimator is a pipeline that applies
# MinMaxScaler before LogisticRegression. Because the scaler sits inside the
# pipeline, cross_val_score refits it on the training part of every fold.
logit_cv = cross_val_score(
    estimator=make_pipeline(MinMaxScaler(), LogisticRegression()),
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,  # run the folds in parallel on all available cores
)

# Report the average of the per-fold scores.
print(f"The 10-folds oof ROC-AUC score of the LR model is {np.mean(logit_cv)}")

The 10-folds oof ROC-AUC score of the LR model is 0.699628719411917


# StandardScaler + Logistic Regression

In [55]:
# Same evaluation as the baseline, but the estimator is a pipeline that applies
# StandardScaler before LogisticRegression. Because the scaler sits inside the
# pipeline, cross_val_score refits it on the training part of every fold.
logit_cv = cross_val_score(
    estimator=make_pipeline(StandardScaler(), LogisticRegression()),
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,  # run the folds in parallel on all available cores
)

# Report the average of the per-fold scores.
print(f"The 10-folds oof ROC-AUC score of the LR model is {np.mean(logit_cv)}")

The 10-folds oof ROC-AUC score of the LR model is 0.6995876894532549


# RobustScaler + Logistic Regression

In [56]:
# Same evaluation as the baseline, but the estimator is a pipeline that applies
# RobustScaler before LogisticRegression. Because the scaler sits inside the
# pipeline, cross_val_score refits it on the training part of every fold.
logit_cv = cross_val_score(
    estimator=make_pipeline(RobustScaler(), LogisticRegression()),
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,  # run the folds in parallel on all available cores
)

# Report the average of the per-fold scores.
print(f"The 10-folds oof ROC-AUC score of the LR model is {np.mean(logit_cv)}")

The 10-folds oof ROC-AUC score of the LR model is 0.699494988360632


# Linear Discriminant Analysis

In [57]:
# Cross-validate a default LinearDiscriminantAnalysis classifier on the same
# features, target and folds, scored with ROC-AUC.
lda_cv = cross_val_score(
    estimator=LinearDiscriminantAnalysis(),
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,  # run the folds in parallel on all available cores
)

# Report the average of the per-fold scores.
print(f"The 10-folds oof ROC-AUC score of the LDA model is {np.mean(lda_cv)}")

The 10-folds oof ROC-AUC score of the LDA model is 0.6992450283561308


# Quadratic Discriminant Analysis

In [58]:
# Cross-validate a default QuadraticDiscriminantAnalysis classifier on the same
# features, target and folds, scored with ROC-AUC.
qda_cv = cross_val_score(
    estimator=QuadraticDiscriminantAnalysis(),
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,  # run the folds in parallel on all available cores
)

# Report the average of the per-fold scores.
print(f"The 10-folds oof ROC-AUC score of the QDA model is {np.mean(qda_cv)}")

The 10-folds oof ROC-AUC score of the QDA model is 0.6929175855463046
