In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Reading the csv file
heart = pd.read_csv('framingham.csv')
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Removing observations with missing values
heart = heart.dropna()

In [4]:
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y = heart['TenYearCHD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [7]:
# Building the logistic regression model 
logit_md = LogisticRegression().fit(X_train, Y_train)

# Predicting on the test
logit_pred = logit_md.predict_proba(X_test)[:, 1]
print(logit_pred)

# Changing likelihood to labels 
logit_label = np.where(logit_pred < 0.25, 0, 1)
print(logit_label)

[0.07870111 0.05865092 0.04537785 0.1432189  0.17850358 0.23277407
 0.15208859 0.07870099 0.12912307 0.11849013 0.12258131 0.06627603
 0.11504016 0.3316904  0.04154307 0.3358242  0.20623319 0.24276644
 0.27991594 0.3234164  0.35301946 0.2493315  0.06839103 0.20507449
 0.18623391 0.1573308  0.06134879 0.02790908 0.31339649 0.0915627
 0.25448726 0.23714534 0.12001971 0.06560698 0.02613957 0.21014994
 0.04946705 0.29196851 0.19533212 0.25826601 0.1574394  0.12392212
 0.45501137 0.13714151 0.06938113 0.08297202 0.10521891 0.17450837
 0.17908938 0.04174471 0.20055989 0.13993548 0.10302689 0.11937674
 0.07321237 0.09105007 0.03076304 0.24121157 0.27967348 0.12250203
 0.06964222 0.17871255 0.12908628 0.2010435  0.2528688  0.13103347
 0.11686148 0.04716275 0.14202014 0.04378772 0.42212026 0.08925228
 0.1445787  0.2642345  0.17697624 0.24801175 0.11372254 0.368292
 0.16908231 0.12284291 0.19827219 0.06382416 0.08937317 0.10370327
 0.1134857  0.11839688 0.08087238 0.07810292 0.11836529 0.2392811

In [8]:
# Confusion matrix
confusion_matrix(Y_test, logit_label)

array([[535,  88],
       [ 80,  29]])

In [10]:
# Classification report
print(classification_report(Y_test, logit_label))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       623
           1       0.25      0.27      0.26       109

    accuracy                           0.77       732
   macro avg       0.56      0.56      0.56       732
weighted avg       0.78      0.77      0.77       732

