In [5]:
import boto3
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

## Connecting to the S3 bucket that holds the csv file
s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

## Key (path inside the bucket) of the csv file
file_key = 'Fall_2021/In_Class_Assignments/framingham.csv'

## Fetching the object and pulling the 'Body' entry out of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file and dropping every row that has a missing value
heart = pd.read_csv(file_content_stream).dropna()

## Showing the first few rows
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [6]:
## Defining the input and target variables
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y = heart['TenYearCHD']

## Splitting the data (80% train, 20% test)
## random_state pins the shuffle so that re-running the cell gives the same
## split, and therefore the same fitted model and predictions
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

## Building the logistic model
logit_md = LogisticRegression().fit(X_train, Y_train)

## Predicting on the test dataset
## predict_proba returns one column per class (ordered as in logit_md.classes_);
## column 1 is the second class, i.e. TenYearCHD = 1 for a 0/1 target
preds = logit_md.predict_proba(X_test)[:, 1]
preds

array([0.07856579, 0.07629017, 0.06285889, 0.09288359, 0.13989003,
       0.16696641, 0.42048076, 0.06656117, 0.12221158, 0.14004995,
       0.18338885, 0.35448437, 0.20075217, 0.33468313, 0.09244445,
       0.08347319, 0.11376601, 0.17077108, 0.1060346 , 0.34165066,
       0.12610613, 0.10345725, 0.114075  , 0.32992442, 0.11235596,
       0.13521754, 0.08822108, 0.2425687 , 0.05176396, 0.25551162,
       0.30835466, 0.03884981, 0.08618911, 0.0852898 , 0.08998117,
       0.10047607, 0.17225775, 0.11393393, 0.03724134, 0.10665079,
       0.17142845, 0.28766372, 0.06028629, 0.05412309, 0.03785483,
       0.11114134, 0.25897422, 0.04766664, 0.10489586, 0.15901934,
       0.09460759, 0.2904739 , 0.37383975, 0.04918768, 0.13331911,
       0.08333288, 0.15658288, 0.07636977, 0.26276407, 0.10896387,
       0.27361406, 0.04078856, 0.28730351, 0.0781823 , 0.33215076,
       0.40403568, 0.11845596, 0.03828069, 0.18022366, 0.35511332,
       0.16662868, 0.33375579, 0.32056736, 0.17978139, 0.11766