In [2]:
import pandas as pd
import numpy as np

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [21]:
# Load the dataset; the relative path is resolved against the current
# working directory.
data = pd.read_csv(filepath_or_buffer="heart_cleveland_upload.csv")

# Preview the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [22]:
# Column groups used to route features through the preprocessing steps.

# Columns handled by the numerical branch of the preprocessing.
numerical_features = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# Columns handled by the categorical branch of the preprocessing.
categorical_features = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

# Name of the label column, kept as a one-element list.
target_feature = [
    'condition',
]

In [23]:
# Numerical branch: fill missing values with the column mean (the default
# SimpleImputer strategy), then standardise each column.
numerical_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the constant 0, then
# one-hot encode. handle_unknown='ignore' makes categories that were not seen
# during fit encode as all zeros at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [24]:
# Apply each preprocessing branch to its own group of columns and concatenate
# the results into a single feature matrix.
transformer = ColumnTransformer(
    transformers=[
        ('num_transforms', numerical_transformer, numerical_features),
        ('cat_transforms', categorical_transformer, categorical_features),
    ],
)

In [25]:
# End-to-end estimator: column preprocessing followed by a logistic-regression
# classifier with scikit-learn's default hyper-parameters.
model = Pipeline(
    steps=[
        ('transformer', transformer),
        ('logistic', LogisticRegression()),
    ],
)

In [33]:
# Feature matrix: the numerical columns followed by the categorical ones.
X = data[[*numerical_features, *categorical_features]]

# Target vector: the single column named in `target_feature`.
y = data[target_feature[0]]

In [34]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Fit the preprocessing steps and the classifier on the training split only.
model.fit(X=X_train, y=y_train)

In [36]:
# Predict labels for the held-out rows with the baseline pipeline.
y_pred = model.predict(X=X_test)

In [38]:
# Report the share of held-out rows that were classified correctly.
print(f'accuracy score: {accuracy_score(y_test, y_pred)}')

accuracy score: 0.8


In [49]:
# Hyper-parameter search space for the 'logistic' step of `model`.
# C is the inverse regularisation strength (smaller values = stronger penalty).
# The candidates span several orders of magnitude on both sides of
# scikit-learn's default C=1.0, so the search can reproduce the untuned
# baseline configuration and the optimum is not forced onto the edge of the
# grid.
param_grid = {
    'logistic__C': [0.001, 0.01, 0.1, 1, 10, 100, 200, 500, 1000],
    'logistic__penalty': ['l2'],
}

# Exhaustive cross-validated search over the whole pipeline, ranked by
# accuracy. No `cv` is passed, so scikit-learn's default splitting strategy is
# used; the best configuration is refit on all data passed to `fit`.
gvc_model = GridSearchCV(model, param_grid, scoring='accuracy')

In [50]:
# Run the grid search on the training split; the test split stays untouched.
gvc_model.fit(X=X_train, y=y_train)

In [51]:
# Predict labels for the held-out rows with the refit best estimator
# (this overwrites the baseline predictions stored in `y_pred`).
y_pred = gvc_model.predict(X=X_test)

In [52]:
# Report held-out accuracy for the tuned model.
print(f'accuracy score: {accuracy_score(y_test, y_pred)}')

accuracy score: 0.7666666666666667


In [53]:
# Show the parameter combination that achieved the best mean cross-validated
# score during the grid search.
gvc_model.best_params_

{'logistic__C': 0.1, 'logistic__penalty': 'l2'}