## Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## Read in CSV files

In [2]:
# Load the two interim heart-disease CSVs and discard the 'Unnamed: 0'
# column that each file carries.
all_hd = (
    pd.read_csv('../data/interim/heart_disease_data_all_pt2.csv')
    .drop(columns='Unnamed: 0')
)
cleveland_hd = (
    pd.read_csv('../data/interim/heart_disease_data_cleveland_pt2.csv')
    .drop(columns='Unnamed: 0')
)

## View data

In [3]:
# Preview the first five rows of the combined frame.
all_hd.head(n=5)

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,num,Male,chest_pain,high_fbs,abnormal_restecg,exercise_induced_angina
0,1,63,145.0,233.0,150.0,2.3,0,1,1,1,1,0
1,2,67,160.0,286.0,108.0,1.5,1,1,0,0,1,1
2,3,67,120.0,229.0,129.0,2.6,1,1,0,0,1,1
3,4,37,130.0,250.0,187.0,3.5,0,1,1,0,0,0
4,5,41,130.0,204.0,172.0,1.4,0,0,1,0,1,0


In [4]:
# Preview the first five rows of the Cleveland frame.
cleveland_hd.head(n=5)

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num,Male,chest_pain,high_fbs,abnormal_restecg,exercise_induced_angina,upsloping,defect
0,1,63,145.0,233.0,150.0,2.3,0.0,0,1,1,1,1,0,0,1
1,2,67,160.0,286.0,108.0,1.5,3.0,1,1,0,0,1,1,0,0
2,3,67,120.0,229.0,129.0,2.6,2.0,1,1,0,0,1,1,0,1
3,4,37,130.0,250.0,187.0,3.5,0.0,0,1,1,0,0,0,0,0
4,5,41,130.0,204.0,172.0,1.4,0.0,0,0,1,0,1,0,1,0


## Split data into training and testing sets

In [5]:
# 70/30 train/test split of `all_hd`. The 'num' column is passed as the target;
# 'num' and 'id' are both removed from the feature matrix. A fixed
# random_state keeps the split repeatable between runs.
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(
    all_hd.drop(['num', 'id'], axis=1),
    all_hd['num'],
    test_size=0.3,
    random_state=47,
)

In [6]:
# 70/30 train/test split of `cleveland_hd`, using the same test_size and
# random_state as the `all_hd` split. 'num' is passed as the target; 'num' and
# 'id' are both removed from the feature matrix.
X_cleveland_train, X_cleveland_test, y_cleveland_train, y_cleveland_test = train_test_split(
    cleveland_hd.drop(['num', 'id'], axis=1),
    cleveland_hd['num'],
    test_size=0.3,
    random_state=47,
)

## Set up pipelines

In [7]:
# Pipeline for the `all_hd` split: fill missing values with each column's
# median, standardise the features, then fit a default LogisticRegression.
pipe_all = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    LogisticRegression(),
)

# fit() returns the fitted pipeline, so the in-sample predictions can be
# taken directly from the same expression.
y_all_tr_pred = pipe_all.fit(X_all_train, y_all_train).predict(X_all_train)

In [8]:
# Precision/recall/F1 on the training rows (y_all_tr_pred was predicted from X_all_train).
print(classification_report(y_true=y_all_train, y_pred=y_all_tr_pred))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80       279
           1       0.83      0.83      0.83       326

    accuracy                           0.81       605
   macro avg       0.81      0.81      0.81       605
weighted avg       0.81      0.81      0.81       605



In [9]:
# Pipeline for the `cleveland_hd` split: fill missing values with each
# column's median, standardise the features, then fit a default
# LogisticRegression.
pipe_cleveland = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    LogisticRegression()
)

# Fit exactly once. Pipeline.fit refits every step from scratch, so the
# repeated call that used to follow produced the same model at twice the cost.
pipe_cleveland.fit(X_cleveland_train, y_cleveland_train)

# Predictions on the training rows themselves, used for the in-sample report.
y_cleveland_tr_pred = pipe_cleveland.predict(X_cleveland_train)

In [10]:
# Precision/recall/F1 on the training rows (y_cleveland_tr_pred was predicted from X_cleveland_train).
print(classification_report(y_true=y_cleveland_train, y_pred=y_cleveland_tr_pred))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       114
           1       0.89      0.87      0.88        98

    accuracy                           0.89       212
   macro avg       0.89      0.89      0.89       212
weighted avg       0.89      0.89      0.89       212

