In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import (
    train_test_split, GridSearchCV, StratifiedShuffleSplit
)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv('../data/raw/heart_cleveland_upload.csv')
df.shape

(297, 14)

In [3]:
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [4]:
transformer = ColumnTransformer(
    [
        ('numerical_preprocess', StandardScaler(), num_cols),
        ('categorical_preprocess', OneHotEncoder(drop='first'), cat_cols)
    ]
)

In [5]:
X = df.drop('condition', axis=1)
y = df.condition

In [6]:
print(X.shape, y.shape)

(297, 13) (297,)


In [7]:
X = transformer.fit_transform(X)
X.shape

(297, 20)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

In [9]:
cv = StratifiedShuffleSplit(n_splits=5, random_state=42)

In [10]:
param_grid = {
    'n_estimators': [100, 150, 300],
    'criterion': ['gini', 'entropy'], 
    'max_depth': [3, 5, 10]
}
model = RandomForestClassifier(random_state=100)

grid_rf = GridSearchCV(model, param_grid, cv=cv, n_jobs=-1, 
                       scoring='roc_auc', verbose=1)

In [11]:
%%time 
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: total: 234 ms
Wall time: 4.63 s


In [12]:
print('Accuracy RF:', accuracy_score(y_test, grid_rf.predict(X_test)))
print('Roc-auc RF:', roc_auc_score(y_test, grid_rf.predict(X_test)))

Accuracy RF: 0.8080808080808081
Roc-auc RF: 0.8020918785890074


In [13]:
param_grid = {
    'penalty': ['l1','l2'],
    'Cs': [0.001,0.01,0.1,1,10,100,1000]
}

model = LogisticRegressionCV(random_state=42)

grid_lr = GridSearchCV(model, param_grid, cv=cv, n_jobs=-1, 
                       scoring='roc_auc', verbose=1)

In [14]:
%%time 
grid_lr.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
CPU times: total: 156 ms
Wall time: 7.01 s


In [15]:
print('Accuracy LR:', accuracy_score(y_test, grid_lr.predict(X_test)))
print('Roc-auc LR:', roc_auc_score(y_test, grid_lr.predict(X_test)))

Accuracy LR: 0.8484848484848485
Roc-auc LR: 0.8441345365053322


In [16]:
print(grid_rf.best_params_)
print(grid_lr.best_params_)

{'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 100}
{'Cs': 10, 'penalty': 'l2'}
