# Loading Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Reading the Data

In [2]:
# Load the dataset from the CSV file that sits next to the notebook and
# preview its first five rows as a quick sanity check on the columns.
heart = pd.read_csv(filepath_or_buffer = 'framingham.csv')
heart.head(n = 5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


# Defining the Features (X) and Target (Y)

In [3]:
# Feature matrix: the five predictor columns used by every model below.
X = heart.loc[:, ['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
# Target vector: the TenYearCHD column.
Y = heart.loc[:, 'TenYearCHD']

# Shared cross-validation splitter: 10 stratified folds, a single repeat,
# and a fixed seed so every model is scored on the same splits.
skf = RepeatedStratifiedKFold(
    n_splits = 10,
    n_repeats = 1,
    random_state = 42,
)

In [4]:
# Summary statistics of the selected features. A `count` below the number of
# rows marks a column with missing values, which the imputer in the model
# pipelines has to fill in.
X.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,4238.0,4238.0,4188.0,4219.0,4237.0
mean,49.584946,0.494101,236.721585,25.802008,75.878924
std,8.57216,0.500024,44.590334,4.080111,12.026596
min,32.0,0.0,107.0,15.54,44.0
25%,42.0,0.0,206.0,23.07,68.0
50%,49.0,0.0,234.0,25.4,75.0
75%,56.0,1.0,263.0,28.04,83.0
max,70.0,1.0,696.0,56.8,143.0


# Random Forest

In [5]:
# Random-forest pipeline: KNN imputation followed by a shallow forest.
# The imputer lives inside the pipeline so cross_val_score re-fits it on the
# training part of each fold, and add_indicator appends missing-value flags
# as extra features.
# NOTE(review): KNNImputer searches neighbours on the raw feature scales, so
# the wide-ranged totChol column probably dominates the distance; MinMaxScaler
# is imported but unused -- consider scaling before imputing (confirm first,
# and apply the same change to the GB pipeline to keep the comparison fair).
md1 = Pipeline([('imputer', KNNImputer(n_neighbors = 5, weights = 'distance',
                                       add_indicator = True)),
                ('RF', RandomForestClassifier(n_estimators = 100,
                                              max_depth = 3,
                                              # Seed the forest (same seed as skf) so the
                                              # reported score is reproducible on re-run.
                                              random_state = 42))])

# One ROC-AUC value per fold of the shared stratified splitter.
RF_cv = cross_val_score(md1, X, Y, cv = skf, scoring = 'roc_auc', n_jobs = -1)

print(f"The average 10-folds cross-validation ROC-AUC score of the RF model is {RF_cv.mean()}")

The average 10-folds cross-validation ROC-AUC score of the RF model is 0.6822786002636716


# Gradient Boosting

In [6]:
# Gradient-boosting pipeline: KNN imputation followed by boosted shallow trees.
# The imputer lives inside the pipeline so cross_val_score re-fits it on the
# training part of each fold, and add_indicator appends missing-value flags
# as extra features.
# NOTE(review): KNNImputer searches neighbours on the raw feature scales, so
# the wide-ranged totChol column probably dominates the distance; MinMaxScaler
# is imported but unused -- consider scaling before imputing (confirm first,
# and apply the same change to the RF pipeline to keep the comparison fair).
md2 = Pipeline([('imputer', KNNImputer(n_neighbors = 5, weights = 'distance',
                                       add_indicator = True)),
                ('GB', GradientBoostingClassifier(n_estimators = 100,
                                                  max_depth = 3,
                                                  learning_rate = 0.1,
                                                  # Seed the booster (same seed as skf) so the
                                                  # reported score is reproducible on re-run.
                                                  random_state = 42))])

# One ROC-AUC value per fold of the shared stratified splitter.
GB_cv = cross_val_score(md2, X, Y, cv = skf, scoring = 'roc_auc', n_jobs = -1)

print(f"The average 10-folds cross-validation ROC-AUC score of the GB model is {GB_cv.mean()}")

The average 10-folds cross-validation ROC-AUC score of the GB model is 0.6738533952373937


In [7]:
# Based on the mean 10-fold cross-validated ROC-AUC scores above, RF performs
# slightly better than GB, so RF is the better of the two models considered.