## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import warnings

## Read in csv files

In [2]:
all_hd = pd.read_csv('../data/interim/heart_disease_data_all_pt2.csv').drop('Unnamed: 0', axis=1)
cleveland_hd = pd.read_csv('../data/interim/heart_disease_data_cleveland_pt2.csv').drop('Unnamed: 0', axis=1)

## View data

In [3]:
all_hd.head()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,num,Male,chest_pain,high_fbs,abnormal_restecg,exercise_induced_angina
0,1,63,145.0,233.0,150.0,2.3,0,1,1,1,1,0
1,2,67,160.0,286.0,108.0,1.5,1,1,0,0,1,1
2,3,67,120.0,229.0,129.0,2.6,1,1,0,0,1,1
3,4,37,130.0,250.0,187.0,3.5,0,1,1,0,0,0
4,5,41,130.0,204.0,172.0,1.4,0,0,1,0,1,0


In [4]:
cleveland_hd.head()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num,Male,chest_pain,high_fbs,abnormal_restecg,exercise_induced_angina,upsloping,defect
0,1,63,145.0,233.0,150.0,2.3,0.0,0,1,1,1,1,0,0,1
1,2,67,160.0,286.0,108.0,1.5,3.0,1,1,0,0,1,1,0,0
2,3,67,120.0,229.0,129.0,2.6,2.0,1,1,0,0,1,1,0,1
3,4,37,130.0,250.0,187.0,3.5,0.0,0,1,1,0,0,0,0,0
4,5,41,130.0,204.0,172.0,1.4,0.0,0,0,1,0,1,0,1,0


## Split data into training and testing sets

In [5]:
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(all_hd.drop(columns='num'), 
                                                    all_hd.num, test_size=0.3, 
                                                    random_state=47)

In [6]:
X_cleveland_train, X_cleveland_test, y_cleveland_train, y_cleveland_test = train_test_split(cleveland_hd.drop(columns='num'), 
                                                    cleveland_hd.num, test_size=0.3, 
                                                    random_state=47)

## Logistic Regression

### All datasets

In [7]:
pipe_log_reg_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    LogisticRegression()
)

In [8]:
grid_params = {
        'simpleimputer__strategy': ['mean', 'median'],
        'standardscaler': [StandardScaler(), MinMaxScaler(), None],
        'simpleimputer__strategy': ['mean', 'median'],
        'logisticregression__C': [0.01, 0.1, 1, 10, 100],
        'logisticregression__fit_intercept': [True, False],
        'logisticregression__l1_ratio': [0, .1, .25, .5, .75, 1],
        'logisticregression__penalty': ['elasticnet', None],
        'logisticregression__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [9]:
warnings.filterwarnings("ignore")
log_reg_all_rand_cv = RandomizedSearchCV(pipe_log_reg_all, param_distributions=grid_params, cv=5, n_jobs=-1, n_iter = 60,
                                         scoring='f1')
log_reg_all_rand_cv.fit(X_all_train, y_all_train)
print(log_reg_all_rand_cv.best_params_)
print(f'Best Score = {log_reg_all_rand_cv.best_score_}')

{'standardscaler': MinMaxScaler(), 'simpleimputer__strategy': 'median', 'logisticregression__solver': 'saga', 'logisticregression__penalty': None, 'logisticregression__l1_ratio': 0.25, 'logisticregression__fit_intercept': False, 'logisticregression__C': 100}
Best Score = 0.8404747731624299


### Cleveland dataset

In [10]:
pipe_log_reg_cleveland = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(),
    LogisticRegression()
)

In [11]:
log_reg_cleveland_rand_cv = RandomizedSearchCV(pipe_log_reg_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 60, scoring='f1')
log_reg_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(log_reg_cleveland_rand_cv.best_params_)
print(f'Best Score = {log_reg_cleveland_rand_cv.best_score_}')

{'standardscaler': None, 'simpleimputer__strategy': 'mean', 'logisticregression__solver': 'newton-cholesky', 'logisticregression__penalty': None, 'logisticregression__l1_ratio': 0, 'logisticregression__fit_intercept': False, 'logisticregression__C': 0.01}
Best Score = 0.851592662080467


## AdaBoost

### All datasets - Decision Tree Estimator

In [12]:
decision_tree = DecisionTreeClassifier()

pipe_adaboost_decision_tree_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [13]:
grid_params = {
    'adaboostclassifier__estimator': [decision_tree],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__ccp_alpha': [.001, .01, .1, .25],
    'adaboostclassifier__estimator__criterion': ['gini', 'entropy', 'log_loss'],
    'adaboostclassifier__estimator__max_depth': [1, 2, 3, 4],
    'adaboostclassifier__estimator__max_features': [None, 'auto', 'sqrt', 'log2'], 
    'adaboostclassifier__estimator__min_impurity_decrease': [.0001, .001, .01, .1],
    'adaboostclassifier__estimator__min_samples_leaf': [1, 2, 3, 4, 5],
    'adaboostclassifier__estimator__min_samples_split': [2, 3, 4, 5],
    'adaboostclassifier__estimator__splitter': ['best', 'random']
}

In [14]:
adaboost_decision_tree_all_grid_cv = RandomizedSearchCV(pipe_adaboost_decision_tree_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
adaboost_decision_tree_all_grid_cv.fit(X_all_train, y_all_train)
print(adaboost_decision_tree_all_grid_cv.best_params_)
print(f'Best Score = {adaboost_decision_tree_all_grid_cv.best_score_}')

{'standardscaler': None, 'simpleimputer__strategy': 'median', 'adaboostclassifier__n_estimators': 300, 'adaboostclassifier__learning_rate': 0.75, 'adaboostclassifier__estimator__splitter': 'best', 'adaboostclassifier__estimator__min_samples_split': 4, 'adaboostclassifier__estimator__min_samples_leaf': 5, 'adaboostclassifier__estimator__min_impurity_decrease': 0.001, 'adaboostclassifier__estimator__max_features': 'log2', 'adaboostclassifier__estimator__max_depth': 2, 'adaboostclassifier__estimator__criterion': 'log_loss', 'adaboostclassifier__estimator__ccp_alpha': 0.01, 'adaboostclassifier__estimator': DecisionTreeClassifier(ccp_alpha=0.01, criterion='log_loss', max_depth=2,
                       max_features='log2', min_impurity_decrease=0.001,
                       min_samples_leaf=5, min_samples_split=4), 'adaboostclassifier__algorithm': 'SAMME'}
Best Score = 0.8880168577682017


### All datasets - SVM Estimator

In [15]:
svm = SVC(probability=True)

pipe_adaboost_svm_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [16]:
grid_params = {
    'adaboostclassifier__estimator': [svm],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [25, 50, 75, 100],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1],
    'adaboostclassifier__estimator__decision_function_shape': ['ovo', 'ovr'],
    'adaboostclassifier__estimator__gamma': ['scale', 'auto'],
    'adaboostclassifier__estimator__kernel': ['linear'],
    'adaboostclassifier__estimator__shrinking': [True, False]
}

In [None]:
adaboost_svm_all_rand_cv = RandomizedSearchCV(pipe_adaboost_svm_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 60, scoring='f1')
adaboost_svm_all_rand_cv.fit(X_all_train, y_all_train)
print(adaboost_svm_all_rand_cv.best_params_)
print(f'Best Score = {adaboost_svm_all_rand_cv.best_score_}')

### All datasets - Logistic Regression Estimator

In [None]:
logistic_regression = LogisticRegression()

pipe_adaboost_log_reg_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [None]:
grid_params = {
    'adaboostclassifier__estimator': [logistic_regression],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1, 10, 100, 100],
    'adaboostclassifier__estimator__fit_intercept': [True, False],
    'adaboostclassifier__estimator__l1_ratio': [0, .1, .25, .5, .75, 1],
    'adaboostclassifier__estimator__penalty': ['elasticnet', None],
    'adaboostclassifier__estimator__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [None]:
adaboost_log_reg_all_rand_cv = RandomizedSearchCV(pipe_adaboost_log_reg_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 60, scoring='f1')
adaboost_log_reg_all_rand_cv.fit(X_all_train, y_all_train)
print(adaboost_log_reg_all_rand_cv.best_params_)
print(f'Best Score = {adaboost_log_reg_all_rand_cv.best_score_}')

### Cleveland dataset - Decision Tree Estimator

In [None]:
decision_tree = DecisionTreeClassifier()

pipe_adaboost_decision_tree_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [None]:
grid_params = {
    'adaboostclassifier__estimator': [decision_tree],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__ccp_alpha': [.001, .01, .1, .25],
    'adaboostclassifier__estimator__criterion': ['gini', 'entropy', 'log_loss'],
    'adaboostclassifier__estimator__max_depth': [1, 2, 3, 4],
    'adaboostclassifier__estimator__max_features': [None, 'auto', 'sqrt', 'log2'], 
    'adaboostclassifier__estimator__min_impurity_decrease': [.0001, .001, .01, .1],
    'adaboostclassifier__estimator__min_samples_leaf': [1, 2, 3, 4, 5],
    'adaboostclassifier__estimator__min_samples_split': [2, 3, 4, 5],
    'adaboostclassifier__estimator__splitter': ['best', 'random']
}

In [None]:
adaboost_decision_tree_cleveland_rand_cv = RandomizedSearchCV(pipe_adaboost_decision_tree_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 60, scoring='f1')
adaboost_decision_tree_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(adaboost_decision_tree_cleveland_rand_cv.best_params_)
print(f'Best Score = {adaboost_decision_tree_cleveland_rand_cv.best_score_}')

### Cleveland dataset - SVM Estimator

In [None]:
svm = SVC(probability=True)

pipe_adaboost_svm_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [None]:
grid_params = {
    'adaboostclassifier__estimator': [svm],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [25, 50, 75, 100],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1],
    'adaboostclassifier__estimator__decision_function_shape': ['ovo', 'ovr'],
    'adaboostclassifier__estimator__gamma': ['scale', 'auto'],
    'adaboostclassifier__estimator__kernel': ['linear'],
    'adaboostclassifier__estimator__shrinking': [True, False]
}

In [None]:
adaboost_svm_cleveland_rand_cv = RandomizedSearchCV(pipe_adaboost_svm_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 60, scoring='f1')
adaboost_svm_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(adaboost_svm_cleveland_rand_cv.best_params_)
print(f'Best Score = {adaboost_svm_cleveland_rand_cv.best_score_}')

### Cleveland dataset - Logistic Regression Estimator

In [None]:
logistic_regression = LogisticRegression()

pipe_adaboost_log_reg_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [None]:
grid_params = {
    'adaboostclassifier__estimator': [logistic_regression],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1, 10, 100, 100],
    'adaboostclassifier__estimator__fit_intercept': [True, False],
    'adaboostclassifier__estimator__l1_ratio': [0, .1, .25, .5, .75, 1],
    'adaboostclassifier__estimator__penalty': ['elasticnet', None],
    'adaboostclassifier__estimator__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [None]:
adaboost_log_reg_cleveland_rand_cv = RandomizedSearchCV(pipe_adaboost_log_reg_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 60, scoring='f1')
adaboost_log_reg_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(adaboost_log_reg_cleveland_rand_cv.best_params_)
print(f'Best Score = {adaboost_log_reg_cleveland_rand_cv.best_score_}')

## Random Forest

### All datasets

In [None]:
pipe_random_forest_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    RandomForestClassifier()
)

In [None]:
grid_params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'randomforestclassifier__bootstrap': [True, False],
    'randomforestclassifier__ccp_alpha': [.0001, .001, .01, .1],
    'randomforestclassifier__criterion': ["gini", "entropy", "log_loss"],
    'randomforestclassifier__max_depth': [3, 5, 10, 20, None],
    'randomforestclassifier__max_features': ["sqrt", "log2", None],
    'randomforestclassifier__max_leaf_nodes': [10, 20, 50, 100, 200, None],
    'randomforestclassifier__min_impurity_decrease': [0, .0001, .001, .01, .1],
    'randomforestclassifier__min_samples_leaf': [1, 2, 3, 4, 5],
    'randomforestclassifier__n_estimators': [100, 200, 300, 400, 500],
    'randomforestclassifier__n_jobs': [-1]
}

In [None]:
random_forest_all_grid_cv = RandomizedSearchCV(pipe_random_forest_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
random_forest_all_grid_cv.fit(X_all_train, y_all_train)
print(random_forest_all_grid_cv.best_params_)
print(f'Best Score = {random_forest_all_grid_cv.best_score_}')

### Cleveland

In [None]:
pipe_random_forest_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    RandomForestClassifier()
)

In [None]:
grid_params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'randomforestclassifier__bootstrap': [True, False],
    'randomforestclassifier__ccp_alpha': [.0001, .001, .01, .1],
    'randomforestclassifier__criterion': ["gini", "entropy", "log_loss"],
    'randomforestclassifier__max_depth': [3, 5, 10, 20, None],
    'randomforestclassifier__max_features': ["sqrt", "log2", None],
    'randomforestclassifier__max_leaf_nodes': [10, 20, 50, 100, 200, None],
    'randomforestclassifier__min_impurity_decrease': [0, .0001, .001, .01, .1],
    'randomforestclassifier__min_samples_leaf': [1, 2, 3, 4, 5],
    'randomforestclassifier__n_estimators': [100, 200, 300, 400, 500],
    'randomforestclassifier__n_jobs': [-1]
}

In [None]:
random_forest_cleveland_rand_cv = RandomizedSearchCV(pipe_random_forest_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
random_forest_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(random_forest_cleveland_rand_cv.best_params_)
print(f'Best Score = {random_forest_cleveland_rand_cv.best_score_}')

In [None]:
warnings.resetwarnings()

## Results

The additional features in the Cleveland only models did not improve performance, so I decided to only choose the best All Datasets model to use going forward. Using the F1 score as the evaluation metric, the best performing model was the AdaBoost model using a Decision Tree estimator. This was slightly better than the best Random Forest model, and it will be the model of choice moving forward.