In [4]:
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.svm import SVC

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'Fall_2021/In_Class_Assignments/framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
heart = pd.read_csv(file_content_stream)
heart = heart.dropna()
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [10]:
## Defining the input variables and target variable
X = heart.drop(columns = ['TenYearCHD'], axis = 1)
Y = heart['TenYearCHD']

## Defining the list to store results 
feature_importances = list()

for i in range(0, 100):
    print(i)
    ## Splitting the data 
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Fitting the random forest model 
    RF = RandomForestClassifier(n_estimators = 500).fit(X_train, Y_train)
    
    ## Extracting feature importances
    feature_importances.append(RF.feature_importances_)
    
## Converting the list to data-frame
feature_importances = pd.DataFrame(feature_importances)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [11]:
feature_importances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.021965,0.129419,0.037805,0.012377,0.051286,0.006827,0.003566,0.017236,0.006623,0.121143,0.137098,0.119123,0.124212,0.096910,0.114410
1,0.023061,0.124330,0.037600,0.012477,0.048133,0.006210,0.003647,0.017773,0.006298,0.121032,0.133435,0.123154,0.125696,0.097154,0.119999
2,0.021089,0.126809,0.037174,0.011308,0.049787,0.005596,0.003795,0.017770,0.007570,0.123410,0.135531,0.117816,0.124961,0.095170,0.122214
3,0.021668,0.122515,0.036090,0.012891,0.049097,0.008335,0.003019,0.017081,0.006547,0.122402,0.135808,0.117993,0.129205,0.095320,0.122030
4,0.019856,0.120135,0.036860,0.012304,0.051807,0.007475,0.003304,0.018592,0.005863,0.123779,0.137021,0.119315,0.128293,0.094702,0.120694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.020796,0.125116,0.037168,0.012560,0.051268,0.007387,0.003961,0.019367,0.006073,0.124382,0.134355,0.115991,0.129829,0.093034,0.118712
96,0.021939,0.123077,0.036485,0.012516,0.054676,0.005876,0.003609,0.016809,0.006368,0.120154,0.133613,0.122490,0.126292,0.096904,0.119191
97,0.020766,0.126323,0.036463,0.012907,0.051276,0.007495,0.002642,0.017872,0.005864,0.121658,0.132855,0.124204,0.126571,0.096748,0.116357
98,0.020517,0.124222,0.035879,0.012522,0.049923,0.007623,0.004310,0.016743,0.006534,0.126444,0.134379,0.119542,0.128477,0.095323,0.117562


In [14]:
## Computing average importance
avg_importance = feature_importances.apply(np.mean, axis = 0)
avg_importance

0     0.021256
1     0.124332
2     0.036904
3     0.012565
4     0.050361
5     0.007059
6     0.003289
7     0.018287
8     0.006538
9     0.122080
10    0.134914
11    0.118988
12    0.127210
13    0.096316
14    0.119902
dtype: float64

In [21]:
X = heart[['sysBP', 'BMI', 'age', 'totChol', 'glucose']]
Y = heart['TenYearCHD']

## Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Building first random forest model 
RF1 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

## Predicting on the test dataset
pred1 = RF1.predict_proba(X_test)[:, 1]
pred1 = np.where(pred1 < 0.1, 0, 1)

## Computing the recall
recall_score(Y_test, pred1)

0.8392857142857143

In [22]:
## Building second random forest model 
RF2 = RandomForestClassifier(n_estimators = 500, max_depth = 5).fit(X_train, Y_train)

## Predicting on the test dataset
pred2 = RF2.predict_proba(X_test)[:, 1]
pred2 = np.where(pred2 < 0.1, 0, 1)

## Computing the recall
recall_score(Y_test, pred2)

0.8482142857142857

In [23]:
## Building third random forest model 
RF3 = RandomForestClassifier(n_estimators = 500, max_depth = 7).fit(X_train, Y_train)

## Predicting on the test dataset
pred3 = RF3.predict_proba(X_test)[:, 1]
pred3 = np.where(pred3 < 0.1, 0, 1)

## Computing the recall
recall_score(Y_test, pred3)

0.8660714285714286