In [13]:
import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'Fall_2021/In_Class_Assignments/framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
heart = pd.read_csv(file_content_stream)
heart = heart.dropna()
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [14]:
## Defining the input and target variables
X = heart[['age', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']] 
Y = heart['TenYearCHD']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Standardizing the input data 
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [18]:
## Adaboost model 
md1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)

## Predicting on the test dataset
pred1 = md1.predict_proba(X_test)[:, 1]
pred1_label = np.where(pred1 < 0.25, 0, 1)

## Computing the accuracy and recall-score
print(accuracy_score(Y_test, pred1_label))
print(recall_score(Y_test, pred1_label))

0.21448087431693988
0.9464285714285714


In [19]:
## Gradient Boosting 
md2 = GradientBoostingClassifier(n_estimators = 500, max_depth= 3, learning_rate = 0.01).fit(X_train, Y_train)

## Predicting on the test dataset
pred2 = md2.predict_proba(X_test)[:, 1]
pred2_label = np.where(pred2 < 0.25, 0, 1)

## Computing the accuracy and recall-score
print(accuracy_score(Y_test, pred2_label))
print(recall_score(Y_test, pred2_label))

0.7049180327868853
0.4375


In [20]:
## Support Vector Machine
md3 = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)

## Predicting on the test dataset
pred3 = md3.predict_proba(X_test)[:, 1]
pred3_label = np.where(pred3 < 0.25, 0, 1)

## Computing the accuracy and recall-score
print(accuracy_score(Y_test, pred3_label))
print(recall_score(Y_test, pred3_label))

0.8128415300546448
0.10714285714285714


In [28]:
## Random forest ensemble 
X_rf = pd.concat([pd.DataFrame(pred1), pd.DataFrame(pred2), pd.DataFrame(pred3)], axis = 1)

## Building Random Forest 
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_rf, Y_test)

## Extracting combined probabilities
RF_pred = RF_md.predict_proba(X_rf)[:, 1]
RF_pred

array([0.14876191, 0.15996987, 0.06384891, 0.16975539, 0.18954438,
       0.262183  , 0.06280962, 0.13105215, 0.16066279, 0.13809629,
       0.05952404, 0.16613232, 0.10468398, 0.31707027, 0.28216562,
       0.13702047, 0.07238004, 0.09285096, 0.06009758, 0.19441544,
       0.06377003, 0.09376963, 0.09521537, 0.13672957, 0.25180946,
       0.30793684, 0.09782144, 0.07861745, 0.29565074, 0.17705654,
       0.10237905, 0.15627339, 0.1036318 , 0.29680833, 0.06223222,
       0.06208364, 0.11039724, 0.34472868, 0.06068828, 0.17089723,
       0.18299896, 0.24991076, 0.21071229, 0.06227967, 0.19145846,
       0.10597705, 0.10949385, 0.07750038, 0.08158269, 0.05522795,
       0.0757251 , 0.14661382, 0.10656968, 0.1659336 , 0.13696802,
       0.09114904, 0.20770923, 0.08949124, 0.29291287, 0.24896171,
       0.07690406, 0.29943385, 0.0586118 , 0.21054593, 0.07729876,
       0.14893967, 0.09725784, 0.06395956, 0.27681577, 0.06193766,
       0.07627927, 0.13481533, 0.06387048, 0.078154  , 0.16209

In [21]:
pred1

array([0.41278831, 0.44394091, 0.24546297, 0.46057473, 0.50190452,
       0.48327266, 0.25701771, 0.39279574, 0.39017774, 0.23776013,
       0.31366198, 0.45048356, 0.39641903, 0.48369714, 0.47110771,
       0.41733205, 0.36509955, 0.39977251, 0.24288334, 0.40468553,
       0.37475926, 0.39576554, 0.39208786, 0.38557818, 0.44508606,
       0.4613545 , 0.37916049, 0.35486476, 0.4568616 , 0.3077148 ,
       0.38564073, 0.40185723, 0.40473494, 0.47466137, 0.36588064,
       0.23681548, 0.4193415 , 0.47398866, 0.36207542, 0.43152951,
       0.30730729, 0.43026371, 0.44642141, 0.36112829, 0.41119934,
       0.36458318, 0.32464413, 0.39209731, 0.36723454, 0.28147523,
       0.37048301, 0.42593633, 0.40999477, 0.42285412, 0.15988662,
       0.391111  , 0.43085408, 0.20056286, 0.4867927 , 0.04265314,
       0.38953173, 0.49247293, 0.2924437 , 0.45455727, 0.38512369,
       0.13118217, 0.38897058, 0.37087504, 0.47649391, 0.33719691,
       0.26278447, 0.41924623, 0.36946928, 0.36672157, 0.37614