In [1]:
import pandas as pd 
import numpy as np

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import recall_score

heart = pd.read_csv('framingham.csv')
heart = heart.dropna()
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
X = heart.drop(columns = 'TenYearCHD', axis = 1)
Y = heart['TenYearCHD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

# Random Forest

In [3]:
RF_imp = list()

for i in tqdm(range(0, 100)):

    X_train_1, X_val, Y_train_1, Y_val = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train)

    RF_md = RandomForestClassifier(n_estimators = 500,
                                   max_depth = 5).fit(X_train_1, Y_train_1)

    RF_imp.append(RF_md.feature_importances_)

RF_imp = pd.DataFrame(RF_imp, columns = X.columns.tolist())
RF_imp = RF_imp.apply(np.mean, axis = 0).sort_values(ascending = False)
RF_imp

  0%|          | 0/100 [00:00<?, ?it/s]

sysBP              0.185250
age                0.183328
diaBP              0.132468
glucose            0.114284
BMI                0.078963
totChol            0.066077
prevalentHyp       0.065683
heartRate          0.045348
cigsPerDay         0.041303
male               0.026444
education          0.020449
diabetes           0.016007
BPMeds             0.011925
currentSmoker      0.007582
prevalentStroke    0.004890
dtype: float64

In [4]:
to_select = ['sysBP', 'age', 'diaBP', 'glucose', 'BMI']

RF_md = RandomForestClassifier(n_estimators = 500,
                                   max_depth = 5).fit(X_train[to_select], Y_train)

RF_pred = RF_md.predict_proba(X_test[to_select])[:, 1]
RF_label = np.where(RF_pred < 0.1, 0, 1)

print(f"The recall of the RF model is {recall_score(Y_test, RF_label)}")

The recall of the RF model is 0.8125


# Extra Trees

In [5]:
ET_imp = list()

for i in tqdm(range(0, 100)):

    X_train_1, X_val, Y_train_1, Y_val = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train)

    ET_md = ExtraTreesClassifier(n_estimators = 500,
                                 max_depth = 5).fit(X_train_1, Y_train_1)

    ET_imp.append(ET_md.feature_importances_)

ET_imp = pd.DataFrame(ET_imp, columns = X.columns.tolist())
ET_imp = ET_imp.apply(np.mean, axis = 0).sort_values(ascending = False)
ET_imp

  0%|          | 0/100 [00:00<?, ?it/s]

prevalentHyp       0.233188
age                0.180893
sysBP              0.126073
diaBP              0.085486
glucose            0.072904
male               0.060777
diabetes           0.057644
BPMeds             0.039570
BMI                0.028601
cigsPerDay         0.024245
totChol            0.023216
education          0.023079
heartRate          0.016313
currentSmoker      0.014363
prevalentStroke    0.013647
dtype: float64

In [6]:
to_select = ['prevalentHyp', 'age', 'sysBP', 'diaBP', 'glucose']

ET_md = ExtraTreesClassifier(n_estimators = 500,
                             max_depth = 5).fit(X_train[to_select], Y_train)

ET_pred = ET_md.predict_proba(X_test[to_select])[:, 1]
ET_label = np.where(ET_pred < 0.1, 0, 1)

print(f"The recall of the ET model is {recall_score(Y_test, ET_label)}")

The recall of the ET model is 0.8392857142857143


In [7]:
# From the above results, I would use Extra Trees to predict TenYearCHD