In [1]:

import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from lightgbm import LGBMClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, f1_score,recall_score,roc_auc_score

In [2]:
#Import the dataset
df = pd.read_csv("cleaned_heart_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Height(cm),Weight(kg),Blood Pressure(mmHg),Cholesterol(mg/dL),Glucose(mg/dL),Exercise(hours/week),Heart Attack,systolic,diastolic,gender_cat,smoker_cat
0,0,45,175,80,120/80,200,90,3,0,120,80,1,0
1,1,35,160,65,110/70,180,80,2,0,110,70,0,0
2,2,55,180,85,130/85,220,95,4,1,130,85,1,1
3,3,40,165,70,115/75,190,85,3,0,115,75,0,0
4,4,50,170,75,125/80,210,92,2,1,125,80,1,1


In [3]:
df.columns

Index(['Unnamed: 0', 'Age', 'Height(cm)', 'Weight(kg)', 'Blood Pressure(mmHg)',
       'Cholesterol(mg/dL)', 'Glucose(mg/dL)', 'Exercise(hours/week)',
       'Heart Attack', 'systolic', 'diastolic', 'gender_cat', 'smoker_cat'],
      dtype='object')

In [4]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,Age,Height(cm),Weight(kg),Blood Pressure(mmHg),Cholesterol(mg/dL),Glucose(mg/dL),Exercise(hours/week),Heart Attack,systolic,diastolic,gender_cat,smoker_cat
0,45,175,80,120/80,200,90,3,0,120,80,1,0
1,35,160,65,110/70,180,80,2,0,110,70,0,0
2,55,180,85,130/85,220,95,4,1,130,85,1,1
3,40,165,70,115/75,190,85,3,0,115,75,0,0
4,50,170,75,125/80,210,92,2,1,125,80,1,1


Train Test Split

In [6]:
X = df.drop(['Heart Attack', 'Blood Pressure(mmHg)'], axis=1)
y = df['Heart Attack']

In [7]:
X.head()

Unnamed: 0,Age,Height(cm),Weight(kg),Cholesterol(mg/dL),Glucose(mg/dL),Exercise(hours/week),systolic,diastolic,gender_cat,smoker_cat
0,45,175,80,200,90,3,120,80,1,0
1,35,160,65,180,80,2,110,70,0,0
2,55,180,85,220,95,4,130,85,1,1
3,40,165,70,190,85,3,115,75,0,0
4,50,170,75,210,92,2,125,80,1,1


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(543, 10) (181, 10) (543,) (181,)


### Model Building

### LogisticRegression

In [10]:
lr = LogisticRegression(penalty='l2')
lr.fit(X_train, y_train)
print(lr.coef_)
print(lr.classes_)
print(lr.intercept_)

[[ 0.4197515  -1.48744701  1.75303308  0.31693414  0.72648442  2.39284269
  -0.05188268 -0.32794183  0.74359771  0.15018779]]
[0 1]
[-0.17120067]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Training Metrics
y_train_pred = lr.predict(X_train)
print(roc_auc_score(y_train, y_train_pred))
print(recall_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)
print(classification_report(y_train, y_train_pred))

0.9900423429781228
0.9938461538461538
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       218
           1       0.99      0.99      0.99       325

    accuracy                           0.99       543
   macro avg       0.99      0.99      0.99       543
weighted avg       0.99      0.99      0.99       543



In [12]:
# Test Metrics
y_test_pred = lr.predict(X_test)
print(roc_auc_score(y_test, y_test_pred))
print(recall_score(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

0.9630479102956168
0.981651376146789
0.9668508287292817
[[ 68   4]
 [  2 107]]
              precision    recall  f1-score   support

           0       0.97      0.94      0.96        72
           1       0.96      0.98      0.97       109

    accuracy                           0.97       181
   macro avg       0.97      0.96      0.97       181
weighted avg       0.97      0.97      0.97       181



In [13]:
model_pkl_file = "logistic_regression_model.pkl"
with open(model_pkl_file,'wb') as file:
    pickle.dump(lr, file)

In [14]:
import joblib

filename = 'joblib_logistic_reg.sav'
joblib.dump(lr,filename)

['joblib_logistic_reg.sav']

## Decision Tree 

In [15]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [16]:
# Training Metrics
y_train_pred = dt.predict(X_train)
print(roc_auc_score(y_train, y_train_pred))
print(recall_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)
print(classification_report(y_train, y_train_pred))

1.0
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       218
           1       1.00      1.00      1.00       325

    accuracy                           1.00       543
   macro avg       1.00      1.00      1.00       543
weighted avg       1.00      1.00      1.00       543



In [18]:
# Test Metric
y_test_pred = dt.predict(X_test)
print(roc_auc_score(y_train, y_train_pred))
print(recall_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

1.0
1.0
[[218   0]
 [  0 325]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       218
           1       1.00      1.00      1.00       325

    accuracy                           1.00       543
   macro avg       1.00      1.00      1.00       543
weighted avg       1.00      1.00      1.00       543



In [None]:
filename = "decision_tree_model.pkl"
with open(filename, "wb") as file:
    pickle.dump(dt, file)