In [5]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

s3= boto3.resource('s3')
bucket_name= 'morgangant-bata-445-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'framingham.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [6]:
#Removing missing values
heart= heart.dropna()

In [7]:
#Define the input and target variable
x= heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
y= heart['TenYearCHD']

#Splitting the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [9]:
#Building logistic model
logit_md= LogisticRegression().fit(x_train, y_train)

#Predicting on test data set
logit_pred= logit_md.predict_proba(x_test)[:,1]
logit_pred

array([0.06900946, 0.04961664, 0.11570118, 0.07765386, 0.10070222,
       0.05050721, 0.09235541, 0.11165137, 0.28732242, 0.20704847,
       0.16585025, 0.22754751, 0.08125699, 0.08598569, 0.13496763,
       0.24966178, 0.0389308 , 0.07690623, 0.13888952, 0.05172604,
       0.23499554, 0.08917153, 0.15354875, 0.10688645, 0.24768252,
       0.07353065, 0.18501745, 0.17624971, 0.16220805, 0.10915225,
       0.10324848, 0.44188506, 0.08168519, 0.16567712, 0.09228805,
       0.04902277, 0.18687009, 0.06639079, 0.20183144, 0.31753223,
       0.14155647, 0.1528514 , 0.14389414, 0.24735957, 0.12132988,
       0.30838008, 0.1416317 , 0.05354178, 0.04504482, 0.41313206,
       0.22575875, 0.08329725, 0.09447   , 0.06636165, 0.24855554,
       0.32437634, 0.23038839, 0.06680714, 0.0910318 , 0.09972588,
       0.08879577, 0.11286381, 0.07642539, 0.11356036, 0.35705189,
       0.06342025, 0.09631459, 0.04572134, 0.07799119, 0.24668765,
       0.17443319, 0.12603204, 0.11493493, 0.13712885, 0.04796

In [11]:
#Changing likleyhood to labels
logit_label= np.where(logit_pred < .25,0,1)
logit_label

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,

In [12]:
#Constructing confusion matrix
confusion_matrix(y_test, logit_label)

array([[545,  81],
       [ 75,  31]])