In [3]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the data-file
heart = pd.read_csv(file_content_stream)

## Dropping NAs
heart = heart.dropna()
heart.head()



Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
## Defining input and target
X = heart.drop(columns = ['TenYearCHD'], axis = 1)
Y = heart['TenYearCHD']

## Defining list to store results
md1_results = list()
md2_results = list()

kf = KFold(n_splits = 5, shuffle = True)

for train_idx, val_idx in kf.split(X):
    
    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[val_idx]
    
    #############
    ## Model 1 ##
    #############
    X1 = X_train[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
    X1_val = X_val[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
    
    ## Transform the input data 
    scaler = MinMaxScaler()
    X1 = scaler.fit_transform(X1)
    X1_val = scaler.fit_transform(X1_val)
    
    ## Building the logistic model
    md1 = LogisticRegression().fit(X1, Y_train)
    
    ## Predicting 
    md1_pred = md1.predict_proba(X1_val)[:, 1]
    
    ## Changing likelihood to labels
    md1_labels = np.where(md1_pred < 0.25, 0, 1)
    
    ## Storing the f1-score
    md1_results.append(f1_score(Y_val, md1_labels))
    
    #############
    ## Model 2 ##
    #############
    X2 = X_train[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'glucose']]
    X2_val = X_val[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'glucose']]
    
    ## Transform the input data
    X2 = scaler.fit_transform(X2)
    X2_val = scaler.fit_transform(X2_val)
    
    ## Building the logistic model
    md2 = LogisticRegression().fit(X2, Y_train)
    
    ## Predicting 
    md2_pred = md2.predict_proba(X2_val)[:, 1]
    
    ## Changing likelihood to labels
    md2_labels = np.where(md2_pred < 0.25, 0, 1)
    
    ## Storing the f1-score
    md2_results.append(f1_score(Y_val, md2_labels))

In [7]:
print('The average F1-score of model 1 is', np.mean(md1_results))
print('The average F1-score of model 2 is', np.mean(md2_results))

The average F1-score of model 1 is 0.360390703533585
The average F1-score of model 2 is 0.34477792251625516
