### Packages

In [33]:
import pandas as pd
import joblib 

from sklearn.model_selection import train_test_split 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

### Data and train/test split

In [2]:
df = pd.read_csv('data/heart.csv')

X = df.drop(columns='HeartDisease')
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Preprocessing and column transformer

In [3]:
numeric_scaling = Pipeline([('scaler', StandardScaler())])
categoric_encoding = Pipeline([('encoder',OneHotEncoder())])

In [4]:
num_cols = df.select_dtypes(include=['int64','float64']).columns.drop('HeartDisease')

cat_cols = df.select_dtypes(include=['object']).columns

In [5]:
preprocessing = ColumnTransformer([
    ('numeric_features',numeric_scaling, num_cols),
    ('categoric_features',categoric_encoding,cat_cols)],remainder='passthrough')

### Random Forest Classifier 

In [7]:
base_forest = Pipeline(steps=[('processing', preprocessing),
                           ('classifier',RandomForestClassifier())])

In [10]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'classifier__n_estimators':[20,60,100,140],
             'classifier__min_samples_split':[2,4,6],
              'classifier__min_samples_leaf':[1,2,3],
              'classifier__bootstrap': [True,False]
             }
base_forest_grid = GridSearchCV(base_forest, param_grid=param_grid, cv=5)
base_forest_grid.fit(X_train, y_train)

CPU times: user 1min 9s, sys: 1.32 s, total: 1min 11s
Wall time: 1min 49s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numeric_features',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                                        ('categoric_features',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                   

In [13]:
best_forest_base = base_forest_grid.best_estimator_
joblib.dump(best_forest_base,'RandForest_clf_gridsearch.joblib')

['RandForest_clf_gridsearch.joblib']

In [14]:
y_pred = best_base.predict(X_test)
accuracy_score(y_pred,y_test)

0.8840579710144928

### Random Forest Classifier with PCA added

In [7]:
forest_pca = Pipeline(steps=[('processing', preprocessing),
                             ('pca',PCA()),
                             ('classifier',RandomForestClassifier())])

In [8]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'pca__n_components':[2,3,4,5,6],
              'classifier__n_estimators':[20,60,100,140],
             'classifier__min_samples_split':[2,4,6],
              'classifier__min_samples_leaf':[1,2,3],
              'classifier__bootstrap': [True,False]
             }
forest_pca_grid = GridSearchCV(forest_pca, param_grid=param_grid, cv=5)
forest_pca_grid.fit(X_train, y_train)

CPU times: user 7min 41s, sys: 27.5 s, total: 8min 8s
Wall time: 6min 7s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numeric_features',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                                        ('categoric_features',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                   

In [9]:
best_forest_pca = forest_pca_grid.best_estimator_
joblib.dump(best_forest_pca,'RForest-pca_gridsearch.joblib')

['RForest-pca_gridsearch.joblib']

In [10]:
y_pred = best_forest_pca.predict(X_test)
accuracy_score(y_pred,y_test)

0.8152173913043478

### SVC 

In [25]:
svc = Pipeline(steps=[('processing', preprocessing),
                             ('svc',SVC())])

In [26]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'svc__C':[0.01,0.1,0.5,1]}
base_svc_grid = GridSearchCV(svc, param_grid=param_grid, cv=5)
base_svc_grid.fit(X_train, y_train)

CPU times: user 741 ms, sys: 8.93 ms, total: 749 ms
Wall time: 851 ms


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numeric_features',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                                        ('categoric_features',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                   

In [27]:
best_svc = base_svc_grid.best_estimator_
joblib.dump(best_svc,'base-svc_gridsearch.joblib')

['base-svc_gridsearch.joblib']

In [28]:
y_pred = best_svc.predict(X_test)
accuracy_score(y_pred,y_test)

0.8586956521739131

#### SVM but kernel gridsearch

In [30]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'svc__C':[0.01,0.1,0.5,1],
             'svc__kernel':['linear','poly', 'rbf']
             }
svc_grid = GridSearchCV(svc, param_grid=param_grid, cv=5)
svc_grid.fit(X_train, y_train)

CPU times: user 2.35 s, sys: 33.3 ms, total: 2.38 s
Wall time: 2.62 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numeric_features',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                                        ('categoric_features',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                   

In [31]:
test_svc = svc_grid.best_estimator_
y_pred = test_svc.predict(X_test)
accuracy_score(y_pred,y_test)

0.8405797101449275

### Adaboost

In [34]:
ada = Pipeline(steps=[('processing', preprocessing),
                             ('adaboost',AdaBoostClassifier())])

In [None]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'adaboost__n_estimators':[25,50,75,100],
             'adaboost__learning_rate':[0.1,0.3,0.5,0.7,1]
             }
base_ada_grid = GridSearchCV(ada, param_grid=param_grid, cv=5)
base_ada_grid.fit(X_train, y_train)