In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

s3 = boto3.resource('s3')
bucket_name = 'omar-vargas-bucket'
bucket = s3.Bucket(bucket_name)

file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#Reading the data-file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
#removing observations with missing values
heart = heart.dropna()

In [3]:
#Defining the input and target variables
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y = heart['TenYearCHD']

#Splitting th data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [5]:
#Building the logistic model
logit_md = LogisticRegression().fit(X_train, Y_train)

#Predicting on the test dataset
logit_pred = logit_md.predict_proba(X_test)[:,1]
logit_pred

array([0.09948579, 0.14973379, 0.12879039, 0.07690512, 0.10579774,
       0.42424163, 0.0341777 , 0.17523405, 0.08933132, 0.21635271,
       0.04492633, 0.06521332, 0.27537036, 0.13049932, 0.15058505,
       0.36523892, 0.1134401 , 0.27231612, 0.2522147 , 0.03877942,
       0.08365558, 0.13502713, 0.08483467, 0.20844731, 0.16139017,
       0.06552606, 0.16561817, 0.11784089, 0.06003131, 0.26745183,
       0.1122364 , 0.39474552, 0.09006004, 0.0410162 , 0.15030716,
       0.17816462, 0.13238947, 0.11192706, 0.31423441, 0.13737675,
       0.18659977, 0.22808767, 0.08042439, 0.18852637, 0.16129377,
       0.04289464, 0.11816255, 0.20797683, 0.40409758, 0.23449031,
       0.07524796, 0.44700319, 0.35606694, 0.16764853, 0.0572537 ,
       0.11254242, 0.03783769, 0.04504026, 0.13052702, 0.08617262,
       0.05060802, 0.11771163, 0.07587632, 0.23773656, 0.13061339,
       0.08775671, 0.11654966, 0.35643787, 0.10528982, 0.15151632,
       0.14055402, 0.16508221, 0.0699306 , 0.20161848, 0.39903

In [7]:
#Changing likelihoods to labels
logit_label = np.where(logit_pred < 0.25, 0, 1)
logit_label

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,

In [8]:
#Constructing the confusion matrix
confusion_matrix(Y_test, logit_label)

array([[535,  64],
       [ 98,  35]])

In [9]:
#Computing the accuracy
accuracy_score(Y_test, logit_label)

0.7786885245901639