In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

## Reading the csv 
heart = pd.read_csv('framingham.csv')

## Removing NAs
heart = heart.dropna()
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y = heart['TenYearCHD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [8]:
# Building the logistic regression model
logit_md = make_pipeline(StandardScaler(), LogisticRegression()).fit(X_train, Y_train)

# Predicting on test
logit_pred = logit_md.predict_proba(X_test)[:, 1]
logit_label = np.where(logit_pred < 0.25, 0, 1)

# Printing the classification report 
print(classification_report(Y_test, logit_label))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       607
           1       0.36      0.30      0.32       125

    accuracy                           0.79       732
   macro avg       0.61      0.59      0.60       732
weighted avg       0.77      0.79      0.78       732



In [9]:
# Building the perceptron 
per_md = make_pipeline(StandardScaler(), 
                       Perceptron(penalty = 'l2', alpha = 0.01)).fit(X_train, Y_train)

# Predicting on the test
per_pred = per_md.predict(X_test)

## Printing the classification report 
print(classification_report(Y_test, per_pred))

              precision    recall  f1-score   support

           0       0.90      0.69      0.78       607
           1       0.29      0.62      0.40       125

    accuracy                           0.68       732
   macro avg       0.60      0.65      0.59       732
weighted avg       0.79      0.68      0.72       732



In [10]:
# Based on my results, I would use the logistic model to predict TenYearCHD.