In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'Fall_2021/In_Class_Assignments/framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
## Dropping missing values
heart = heart.dropna()

In [3]:
heart.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [4]:
heart.shape

(3656, 16)

In [18]:
X1 = heart[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
X2 = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']

In [21]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X1)
scaler.transform(X1)

array([[0.18421053, 0.        , 0.16837782, ..., 0.27702375, 0.36363636,
        0.10451977],
       [0.36842105, 0.        , 0.28131417, ..., 0.31968008, 0.51515152,
        0.10169492],
       [0.42105263, 1.        , 0.27104723, ..., 0.23751818, 0.31313131,
        0.08474576],
       ...,
       [0.47368421, 1.        , 0.41067762, ..., 0.2527872 , 0.22222222,
        0.1299435 ],
       [0.5       , 1.        , 0.19301848, ..., 0.10106641, 0.21212121,
        0.07909605],
       [0.52631579, 0.        , 0.32032854, ..., 0.14372273, 0.36363636,
        0.18926554]])

In [23]:
a = pd.DataFrame(X1)
a.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.184211,0.0,0.168378,0.106383,0.232804,0.277024,0.363636,0.10452
1,0.368421,0.0,0.281314,0.177305,0.349206,0.31968,0.515152,0.101695
2,0.421053,1.0,0.271047,0.208038,0.338624,0.237518,0.313131,0.084746
3,0.763158,1.0,0.229979,0.314421,0.497354,0.316045,0.212121,0.177966
4,0.368421,1.0,0.353183,0.219858,0.380952,0.183228,0.414141,0.127119


In [41]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

## Defining the input and target variables
X1 = heart[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
X2 = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']

## Standardizing the input data
scaler1 = MinMaxScaler().fit(X1)
X1 = scaler1.transform(X1)
X1 = pd.DataFrame(X1)

scaler2 = MinMaxScaler().fit(X2)
X2 = scaler2.transform(X2)
X2 = pd.DataFrame(X2)

## Defining list to store f1-scores 
md1_f1 = list()
md2_f1 = list()

kfold = KFold(n_splits = 5, shuffle = True)

for train_ix, test_ix in kfold.split(X1):
    
    ## Splitting the data 
    X1_train, X1_test = X1.iloc[train_ix], X1.iloc[test_ix]
    X2_train, X2_test = X2.iloc[train_ix], X2.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
    ## Fitting the models
    md1 = LogisticRegression().fit(X1_train, Y_train)
    md2 = LogisticRegression().fit(X2_train, Y_train)
    
    ## Predicting on test
    pred1 = md1.predict_proba(X1_test)[:, 1]
    pred2 = md2.predict_proba(X2_test)[:, 1]
    
    ## Changing likelihoods to labels (using 25% threshold)
    pred1 = np.where(pred1 < 0.25, 0, 1)
    pred2 = np.where(pred2 < 0.25, 0, 1)
    
    ## Computing recall and precision
    md1_f1.append(f1_score(Y_test, pred1))
    md2_f1.append(f1_score(Y_test, pred2))

In [44]:
print('The average f1-score of the first model is ', np.mean(md1_f1))
print('The average f1-score of the second model is ', np.mean(md2_f1))

The average f1-score of the first model is  0.3369692169191952
The average f1-score of the second model is  0.31411279695190836
