### Packages

In [9]:
import pandas as pd
import joblib 

from sklearn.model_selection import train_test_split 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

### Data and train/test split

In [2]:
df = pd.read_csv('data/heart.csv')

X = df.drop(columns='HeartDisease')
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Preprocessing and column transformer

In [3]:
numeric_scaling = Pipeline([('scaler', StandardScaler())])
categoric_encoding = Pipeline([('encoder',OneHotEncoder())])

In [4]:
num_cols = df.select_dtypes(include=['int64','float64']).columns.drop('HeartDisease')

cat_cols = df.select_dtypes(include=['object']).columns

In [5]:
preprocessing = ColumnTransformer([
    ('numeric_features',numeric_scaling, num_cols),
    ('categoric_features',categoric_encoding,cat_cols)],remainder='passthrough')

### Random Forest Classifier 

In [7]:
base_forest = Pipeline(steps=[('processing', preprocessing),
                           ('classifier',RandomForestClassifier())])

In [10]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'classifier__n_estimators':[20,60,100,140],
             'classifier__min_samples_split':[2,4,6],
              'classifier__min_samples_leaf':[1,2,3],
              'classifier__bootstrap': [True,False]
             }
base_forest_grid = GridSearchCV(base_forest, param_grid=param_grid, cv=5)
base_forest_grid.fit(X_train, y_train)

CPU times: user 1min 9s, sys: 1.32 s, total: 1min 11s
Wall time: 1min 49s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numeric_features',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                                        ('categoric_features',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                   

In [13]:
best_forest_base = base_forest_grid.best_estimator_
joblib.dump(best_forest_base,'RandForest_clf_gridsearch.joblib')

['RandForest_clf_gridsearch.joblib']

In [14]:
y_pred = best_base.predict(X_test)
accuracy_score(y_pred,y_test)

0.8840579710144928