In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_csv("heart_cleveland_upload.csv")
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [4]:
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
target_feature = ['condition']

In [5]:
numerical_transformer = Pipeline([
    ('num_imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

In [6]:
transformer = ColumnTransformer([
    ('num_transforms', numerical_transformer, numerical_features),
    ('cat_transforms', categorical_transformer, categorical_features)
])

In [7]:
model = Pipeline([
    ('transformer', transformer),
    ('logistic', LogisticRegression())
])

In [8]:
y = data[target_feature[0]]
X = data[numerical_features + categorical_features]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

In [12]:
print('accuracy score: {}'.format(accuracy_score(y_test, y_pred)))

accuracy score: 0.8


In [13]:
param_grid = {
    'logistic__C': [0.01, 0.1, 10, 100, 200, 500, 1000],
    'logistic__penalty': ['l2'],
}

gvc_model = GridSearchCV(model, param_grid, scoring='accuracy')

In [14]:
gvc_model.fit(X_train, y_train)

In [15]:
y_pred = gvc_model.predict(X_test)

In [16]:
print('accuracy score: {}'.format(accuracy_score(y_test, y_pred)))

accuracy score: 0.7666666666666667


In [17]:
gvc_model.best_params_

{'logistic__C': 0.1, 'logistic__penalty': 'l2'}

In [20]:
data.drop(columns=target_feature[0])

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0


In [None]:
data_train, *_ = train_test_split(data, )