In [34]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

from sklearn.metrics import f1_score, roc_auc_score

In [35]:
df = pd.read_csv('heart_cleveland_upload.csv')

In [36]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [37]:
X = df[[col for col in df.columns if col != 'condition']]
y = df['condition']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [39]:
X_train.shape, X_test.shape

((198, 13), (99, 13))

In [40]:
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [41]:
num_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [42]:
num_pipeline = Pipeline([
    ("impute", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ("impute", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [43]:
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [44]:
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [45]:
lr = LogisticRegression()
lr.fit(X_train_prep, y_train)

LogisticRegression()

In [46]:
predicted = lr.predict(X_test_prep)

In [47]:
f1_score(y_test, predicted)

0.8172043010752689

In [48]:
roc_auc_score(y_test, predicted)

0.834717607973422