In [7]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestClassifier



s3= boto3.resource('s3')
bucket_name= 'morgangant-bata-445-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'framingham.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [8]:
#Dropping na values
heart = heart.dropna()

In [24]:
#Define the input and target variable
x= heart[['male','age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'sysBP', 'glucose']]
y= heart['TenYearCHD']

#Splitting the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [25]:
#Scaling
scaler=MinMaxScaler()
x_train= pd.DataFrame(scaler.fit_transform(x_train), columns= x_train.columns)
x_test= pd.DataFrame(scaler.fit_transform(x_test), columns= x_test.columns)

In [26]:
#Identifying important variables with lasso
lasso_cv= LassoCV().fit(x_train, y_train)

#Extracting best lambda
cv_lambda= lasso_cv.alpha_
print('Estimated lambda for the lasso model is:', cv_lambda)

#Building lasso
lasso_md= Lasso(alpha= cv_lambda).fit(x_train, y_train)
lasso_md.coef_

Estimated lambda for the lasso model is: 0.0005424190917061774


array([ 0.07098261,  0.27758435,  0.04300827,  0.        ,  0.        ,
       -0.        ,  0.52336634,  0.40739983])

In [33]:
#Define the input and target variable for logistic 
x_train_logit= x_train[['age', 'BMI', 'totChol', 'sysBP', 'glucose']]
x_test_logit= x_test[['age', 'BMI', 'totChol', 'sysBP', 'glucose']]

#Buiding model
logit_md= LogisticRegression().fit(x_train_logit, y_train)

#Predictingon the model
logit_pred= logit_md.predict_proba(x_test_logit)[:,1]
logit_pred

#Changling likleyhoods to labels
logit_lable= np.where(logit_pred < .1, 0, 1)

#Computing recall
recall_score(y_test, logit_lable)

0.9067796610169492