In [27]:
import pandas as pd 
import numpy as np

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import recall_score

# Read framingham.csv and, in the same expression, discard every row that
# has at least one missing value.
heart = pd.read_csv('framingham.csv').dropna()

# Show the first rows of the cleaned frame as the cell output.
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [28]:
# Separate the predictors from the target column.  `columns=` already names
# the axis, so the redundant `axis = 1` is not passed.
X = heart.drop(columns = 'TenYearCHD')
Y = heart['TenYearCHD']

# Hold out 20% of the rows as a test set, stratified on the target.  The
# fixed random_state makes the split identical on every Restart & Run All,
# so the metrics reported further down can be reproduced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.2, stratify = Y, random_state = 42
)

In [29]:
# Estimate random-forest feature importances by refitting the model on
# repeated stratified 80% subsamples of the training data and averaging the
# per-feature importances over all refits.
N_RF_REPEATS = 100

RF_imp = list()

for i in tqdm(range(0, N_RF_REPEATS)):

    # Seeding with the iteration index gives a different subsample each round
    # while keeping the whole loop repeatable across kernel restarts.
    # X_val / Y_val are not used by this cell.
    X_train_1, X_val, Y_train_1, Y_val = train_test_split(
        X_train, Y_train, test_size = 0.2, stratify = Y_train, random_state = i
    )

    RF_md = RandomForestClassifier(n_estimators = 500,
                                   max_depth = 5,
                                   random_state = i).fit(X_train_1, Y_train_1)

    RF_imp.append(RF_md.feature_importances_)

# One row per refit, one column per feature; the labels come from the frame
# the models were fitted on.
RF_imp = pd.DataFrame(RF_imp, columns = X_train.columns.tolist())

# Mean importance per feature, largest first.
RF_imp = RF_imp.mean(axis = 0).sort_values(ascending = False)
RF_imp

  0%|          | 0/100 [00:00<?, ?it/s]

age                0.203740
sysBP              0.173065
diaBP              0.122886
glucose            0.112369
totChol            0.071935
BMI                0.067584
prevalentHyp       0.059435
heartRate          0.046682
cigsPerDay         0.040330
male               0.037010
education          0.024734
diabetes           0.019152
BPMeds             0.009733
currentSmoker      0.007596
prevalentStroke    0.003749
dtype: float64

In [34]:
# Features kept for the reduced random-forest model.
to_select = ['age', 'sysBP', 'diaBP', 'glucose', 'totChol']

# Predicted probabilities at or above this cut-off are labelled 1.
RF_THRESHOLD = 0.1

# Fixed random_state so the fitted forest (and the recall printed below) is
# the same on every run.
RF_md = RandomForestClassifier(n_estimators = 500,
                               max_depth = 5,
                               random_state = 42).fit(X_train[to_select], Y_train)

# Second column of predict_proba, i.e. the probability of the second class
# in RF_md.classes_.
RF_pred = RF_md.predict_proba(X_test[to_select])[:, 1]
RF_label = np.where(RF_pred < RF_THRESHOLD, 0, 1)

# NOTE(review): only recall is reported; a low cut-off raises recall by
# itself, so consider reporting precision alongside it.
print(f"The recall of the RF model is {recall_score(Y_test, RF_label)}")

The recall of the RF model is 0.7857142857142857


In [35]:
# Estimate extra-trees feature importances by refitting the model on
# repeated stratified 80% subsamples of the training data and averaging the
# per-feature importances over all refits.
N_ET_REPEATS = 100

ET_imp = list()

for i in tqdm(range(0, N_ET_REPEATS)):

    # Seeding with the iteration index gives a different subsample each round
    # while keeping the whole loop repeatable across kernel restarts.
    # X_val / Y_val are not used by this cell.
    X_train_1, X_val, Y_train_1, Y_val = train_test_split(
        X_train, Y_train, test_size = 0.2, stratify = Y_train, random_state = i
    )

    ET_md = ExtraTreesClassifier(n_estimators = 500,
                                 max_depth = 5,
                                 random_state = i).fit(X_train_1, Y_train_1)

    ET_imp.append(ET_md.feature_importances_)

# One row per refit, one column per feature; the labels come from the frame
# the models were fitted on.
ET_imp = pd.DataFrame(ET_imp, columns = X_train.columns.tolist())

# Mean importance per feature, largest first.
ET_imp = ET_imp.mean(axis = 0).sort_values(ascending = False)
ET_imp

  0%|          | 0/100 [00:00<?, ?it/s]

prevalentHyp       0.209605
age                0.206049
sysBP              0.115029
male               0.084529
diaBP              0.069541
glucose            0.068079
diabetes           0.065538
BPMeds             0.035270
education          0.031879
totChol            0.027103
cigsPerDay         0.024431
BMI                0.023636
heartRate          0.016794
currentSmoker      0.014276
prevalentStroke    0.008241
dtype: float64

In [37]:
# Features kept for the reduced extra-trees model.
# NOTE(review): this is not the top five of the extra-trees importance
# ranking shown in the saved output of the importance cell ('male' ranks 4th
# there and is left out, 'glucose' ranks 6th and is included) -- confirm the
# choice is intentional.
to_select = ['prevalentHyp', 'age', 'sysBP', 'diaBP', 'glucose']

# Predicted probabilities at or above this cut-off are labelled 1.
ET_THRESHOLD = 0.1

# Fixed random_state so the fitted ensemble (and the recall printed below)
# is the same on every run.
ET_md = ExtraTreesClassifier(n_estimators = 500,
                             max_depth = 5,
                             random_state = 42).fit(X_train[to_select], Y_train)

# Second column of predict_proba, i.e. the probability of the second class
# in ET_md.classes_.
ET_pred = ET_md.predict_proba(X_test[to_select])[:, 1]
ET_label = np.where(ET_pred < ET_THRESHOLD, 0, 1)

# NOTE(review): only recall is reported; a low cut-off raises recall by
# itself, so consider reporting precision alongside it.
print(f"The recall of the ET model is {recall_score(Y_test, ET_label)}")

The recall of the ET model is 0.8482142857142857
