In [21]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [22]:
category_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numeric_features =['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [23]:
df = pd.read_csv('../data/raw/heart.csv')

In [31]:
train_data, test_data = train_test_split(
        df, test_size = 0.2, 
        random_state = 42
    )

In [32]:
train_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
132,42,1,1,120,295,0,1,162,0,0.0,2,0,2,1
202,58,1,0,150,270,0,0,111,1,0.8,2,0,3,0
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2,0
75,55,0,1,135,250,0,0,161,0,1.4,1,0,2,1
176,60,1,0,117,230,1,1,160,1,1.4,2,2,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,50,1,2,140,233,0,1,163,0,0.6,1,1,3,0
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3,1
106,69,1,3,160,234,1,0,131,0,0.1,1,1,2,1
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0


In [33]:
y_train = train_data['target']
y_test = test_data['target']

In [34]:
categorical_pipeline = Pipeline(
    [
        ("ohe", OneHotEncoder(sparse=False)),
    ]
)


numerical_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
    ]
)


In [35]:
pipeline = Pipeline(steps=[
            ('feature_processing', ColumnTransformer(
                [
                    ('numeric_processing', numerical_pipeline, numeric_features),
                    ('category_processing', categorical_pipeline, category_features),
                ]
            ))
        ])

X_train = pipeline.fit_transform(train_data)

In [36]:
model = RandomForestClassifier(
            n_estimators=100, 
            min_samples_leaf=5
        )

In [37]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
from sklearn.metrics import roc_auc_score
roc_auc_score(model.predict(pipeline.transform(test_data)), y_test)

0.8521505376344087