## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn import __version__ as sklearn_version
import datetime
import pickle
import warnings

## Read in csv files

In [3]:
all_hd = pd.read_csv('../data/interim/heart_disease_data_all_pt2.csv').drop('Unnamed: 0', axis=1)
cleveland_hd = pd.read_csv('../data/interim/heart_disease_data_cleveland_pt2.csv').drop('Unnamed: 0', axis=1)

## View data

In [4]:
all_hd.head()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,num,Male,chest_pain,high_fbs,abnormal_restecg,exercise_induced_angina
0,1,63,145.0,233.0,150.0,2.3,0,1,1,1,1,0
1,2,67,160.0,286.0,108.0,1.5,1,1,0,0,1,1
2,3,67,120.0,229.0,129.0,2.6,1,1,0,0,1,1
3,4,37,130.0,250.0,187.0,3.5,0,1,1,0,0,0
4,5,41,130.0,204.0,172.0,1.4,0,0,1,0,1,0


In [5]:
cleveland_hd.head()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num,Male,chest_pain,high_fbs,abnormal_restecg,exercise_induced_angina,upsloping,defect
0,1,63,145.0,233.0,150.0,2.3,0.0,0,1,1,1,1,0,0,1
1,2,67,160.0,286.0,108.0,1.5,3.0,1,1,0,0,1,1,0,0
2,3,67,120.0,229.0,129.0,2.6,2.0,1,1,0,0,1,1,0,1
3,4,37,130.0,250.0,187.0,3.5,0.0,0,1,1,0,0,0,0,0
4,5,41,130.0,204.0,172.0,1.4,0.0,0,0,1,0,1,0,1,0


## Split data into training and testing sets

In [6]:
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(all_hd.drop(columns=['num', 'id']), 
                                                    all_hd.num, test_size=0.3, 
                                                    random_state=47)

In [7]:
X_cleveland_train, X_cleveland_test, y_cleveland_train, y_cleveland_test = train_test_split(cleveland_hd.drop(columns=['num', 'id']), 
                                                    cleveland_hd.num, test_size=0.3, 
                                                    random_state=47)

## Logistic Regression

### All datasets

In [8]:
pipe_log_reg_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    LogisticRegression()
)

In [9]:
grid_params = {
        'simpleimputer__strategy': ['mean', 'median'],
        'standardscaler': [StandardScaler(), MinMaxScaler(), None],
        'simpleimputer__strategy': ['mean', 'median'],
        'logisticregression__C': [0.01, 0.1, 1, 10, 100],
        'logisticregression__fit_intercept': [True, False],
        'logisticregression__l1_ratio': [0, .1, .25, .5, .75, 1],
        'logisticregression__penalty': ['elasticnet', None],
        'logisticregression__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [10]:
warnings.filterwarnings("ignore")
log_reg_all_grid_cv = GridSearchCV(pipe_log_reg_all, param_grid=grid_params, cv=5, n_jobs=-1, scoring='f1')
log_reg_all_grid_cv.fit(X_all_train, y_all_train)
print(log_reg_all_grid_cv.best_params_)
print(f'Best Score = {log_reg_all_grid_cv.best_score_}')

{'logisticregression__C': 0.01, 'logisticregression__fit_intercept': False, 'logisticregression__l1_ratio': 0, 'logisticregression__penalty': None, 'logisticregression__solver': 'lbfgs', 'simpleimputer__strategy': 'mean', 'standardscaler': MinMaxScaler()}
Best Score = 0.8321308241401686


### Cleveland dataset

In [11]:
pipe_log_reg_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    LogisticRegression()
)

In [12]:
log_reg_cleveland_grid_cv = GridSearchCV(pipe_log_reg_cleveland, param_grid=grid_params, cv=5, n_jobs=-1, scoring='f1')
log_reg_cleveland_grid_cv.fit(X_cleveland_train, y_cleveland_train)
print(log_reg_cleveland_grid_cv.best_params_)
print(f'Best Score = {log_reg_cleveland_grid_cv.best_score_}')

{'logisticregression__C': 0.1, 'logisticregression__fit_intercept': False, 'logisticregression__l1_ratio': 0.75, 'logisticregression__penalty': 'elasticnet', 'logisticregression__solver': 'saga', 'simpleimputer__strategy': 'median', 'standardscaler': StandardScaler()}
Best Score = 0.8685332744730163


## AdaBoost

### All datasets - Decision Tree Estimator

In [13]:
decision_tree = DecisionTreeClassifier()

pipe_adaboost_decision_tree_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [14]:
grid_params = {
    'adaboostclassifier__estimator': [decision_tree],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__ccp_alpha': [.001, .01, .1, .25],
    'adaboostclassifier__estimator__criterion': ['gini', 'entropy', 'log_loss'],
    'adaboostclassifier__estimator__max_depth': [1, 2, 3, 4],
    'adaboostclassifier__estimator__max_features': [None, 'auto', 'sqrt', 'log2'], 
    'adaboostclassifier__estimator__min_impurity_decrease': [.0001, .001, .01, .1],
    'adaboostclassifier__estimator__min_samples_leaf': [1, 2, 3, 4, 5],
    'adaboostclassifier__estimator__min_samples_split': [2, 3, 4, 5],
    'adaboostclassifier__estimator__splitter': ['best', 'random']
}

In [15]:
adaboost_decision_tree_all_grid_cv = RandomizedSearchCV(pipe_adaboost_decision_tree_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
adaboost_decision_tree_all_grid_cv.fit(X_all_train, y_all_train)
print(adaboost_decision_tree_all_grid_cv.best_params_)
print(f'Best Score = {adaboost_decision_tree_all_grid_cv.best_score_}')

{'standardscaler': None, 'simpleimputer__strategy': 'median', 'adaboostclassifier__n_estimators': 300, 'adaboostclassifier__learning_rate': 0.01, 'adaboostclassifier__estimator__splitter': 'best', 'adaboostclassifier__estimator__min_samples_split': 4, 'adaboostclassifier__estimator__min_samples_leaf': 5, 'adaboostclassifier__estimator__min_impurity_decrease': 0.0001, 'adaboostclassifier__estimator__max_features': 'log2', 'adaboostclassifier__estimator__max_depth': 4, 'adaboostclassifier__estimator__criterion': 'gini', 'adaboostclassifier__estimator__ccp_alpha': 0.01, 'adaboostclassifier__estimator': DecisionTreeClassifier(ccp_alpha=0.01, max_depth=4, max_features='log2',
                       min_impurity_decrease=0.0001, min_samples_leaf=5,
                       min_samples_split=4), 'adaboostclassifier__algorithm': 'SAMME.R'}
Best Score = 0.83280594576378


### All datasets - SVM Estimator

In [16]:
svm = SVC(probability=True)

pipe_adaboost_svm_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [17]:
grid_params = {
    'adaboostclassifier__estimator': [svm],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [25, 50, 75, 100],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1],
    'adaboostclassifier__estimator__decision_function_shape': ['ovo', 'ovr'],
    'adaboostclassifier__estimator__gamma': ['scale', 'auto'],
    'adaboostclassifier__estimator__kernel': ['linear'],
    'adaboostclassifier__estimator__shrinking': [True, False]
}

In [18]:
adaboost_svm_all_rand_cv = RandomizedSearchCV(pipe_adaboost_svm_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
adaboost_svm_all_rand_cv.fit(X_all_train, y_all_train)
print(adaboost_svm_all_rand_cv.best_params_)
print(f'Best Score = {adaboost_svm_all_rand_cv.best_score_}')

{'standardscaler': None, 'simpleimputer__strategy': 'mean', 'adaboostclassifier__n_estimators': 25, 'adaboostclassifier__learning_rate': 0.25, 'adaboostclassifier__estimator__shrinking': True, 'adaboostclassifier__estimator__kernel': 'linear', 'adaboostclassifier__estimator__gamma': 'auto', 'adaboostclassifier__estimator__decision_function_shape': 'ovr', 'adaboostclassifier__estimator__C': 0.1, 'adaboostclassifier__estimator': SVC(C=0.1, gamma='auto', kernel='linear', probability=True), 'adaboostclassifier__algorithm': 'SAMME.R'}
Best Score = 0.8373209580748935


### All datasets - Logistic Regression Estimator

In [19]:
logistic_regression = LogisticRegression()

pipe_adaboost_log_reg_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [20]:
grid_params = {
    'adaboostclassifier__estimator': [logistic_regression],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1, 10, 100, 100],
    'adaboostclassifier__estimator__fit_intercept': [True, False],
    'adaboostclassifier__estimator__l1_ratio': [0, .1, .25, .5, .75, 1],
    'adaboostclassifier__estimator__penalty': ['elasticnet', None],
    'adaboostclassifier__estimator__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [21]:
adaboost_log_reg_all_rand_cv = RandomizedSearchCV(pipe_adaboost_log_reg_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
adaboost_log_reg_all_rand_cv.fit(X_all_train, y_all_train)
print(adaboost_log_reg_all_rand_cv.best_params_)
print(f'Best Score = {adaboost_log_reg_all_rand_cv.best_score_}')

{'standardscaler': MinMaxScaler(), 'simpleimputer__strategy': 'mean', 'adaboostclassifier__n_estimators': 300, 'adaboostclassifier__learning_rate': 0.5, 'adaboostclassifier__estimator__solver': 'newton-cholesky', 'adaboostclassifier__estimator__penalty': None, 'adaboostclassifier__estimator__l1_ratio': 0.1, 'adaboostclassifier__estimator__fit_intercept': False, 'adaboostclassifier__estimator__C': 0.01, 'adaboostclassifier__estimator': LogisticRegression(C=0.01, fit_intercept=False, l1_ratio=0.1, penalty=None,
                   solver='newton-cholesky'), 'adaboostclassifier__algorithm': 'SAMME'}
Best Score = 0.833079850534669


### Cleveland dataset - Decision Tree Estimator

In [22]:
decision_tree = DecisionTreeClassifier()

pipe_adaboost_decision_tree_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [23]:
grid_params = {
    'adaboostclassifier__estimator': [decision_tree],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__ccp_alpha': [.001, .01, .1, .25],
    'adaboostclassifier__estimator__criterion': ['gini', 'entropy', 'log_loss'],
    'adaboostclassifier__estimator__max_depth': [1, 2, 3, 4],
    'adaboostclassifier__estimator__max_features': [None, 'auto', 'sqrt', 'log2'], 
    'adaboostclassifier__estimator__min_impurity_decrease': [.0001, .001, .01, .1],
    'adaboostclassifier__estimator__min_samples_leaf': [1, 2, 3, 4, 5],
    'adaboostclassifier__estimator__min_samples_split': [2, 3, 4, 5],
    'adaboostclassifier__estimator__splitter': ['best', 'random']
}

In [24]:
adaboost_decision_tree_cleveland_rand_cv = RandomizedSearchCV(pipe_adaboost_decision_tree_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
adaboost_decision_tree_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(adaboost_decision_tree_cleveland_rand_cv.best_params_)
print(f'Best Score = {adaboost_decision_tree_cleveland_rand_cv.best_score_}')

{'standardscaler': None, 'simpleimputer__strategy': 'mean', 'adaboostclassifier__n_estimators': 500, 'adaboostclassifier__learning_rate': 0.5, 'adaboostclassifier__estimator__splitter': 'best', 'adaboostclassifier__estimator__min_samples_split': 5, 'adaboostclassifier__estimator__min_samples_leaf': 3, 'adaboostclassifier__estimator__min_impurity_decrease': 0.001, 'adaboostclassifier__estimator__max_features': 'log2', 'adaboostclassifier__estimator__max_depth': 2, 'adaboostclassifier__estimator__criterion': 'log_loss', 'adaboostclassifier__estimator__ccp_alpha': 0.1, 'adaboostclassifier__estimator': DecisionTreeClassifier(ccp_alpha=0.1, criterion='log_loss', max_depth=2,
                       max_features='log2', min_impurity_decrease=0.001,
                       min_samples_leaf=3, min_samples_split=5), 'adaboostclassifier__algorithm': 'SAMME.R'}
Best Score = 0.8683822278174012


### Cleveland dataset - SVM Estimator

In [25]:
svm = SVC(probability=True)

pipe_adaboost_svm_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [26]:
grid_params = {
    'adaboostclassifier__estimator': [svm],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [25, 50, 75, 100],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1],
    'adaboostclassifier__estimator__decision_function_shape': ['ovo', 'ovr'],
    'adaboostclassifier__estimator__gamma': ['scale', 'auto'],
    'adaboostclassifier__estimator__kernel': ['linear'],
    'adaboostclassifier__estimator__shrinking': [True, False]
}

In [27]:
adaboost_svm_cleveland_rand_cv = RandomizedSearchCV(pipe_adaboost_svm_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
adaboost_svm_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(adaboost_svm_cleveland_rand_cv.best_params_)
print(f'Best Score = {adaboost_svm_cleveland_rand_cv.best_score_}')

{'standardscaler': StandardScaler(), 'simpleimputer__strategy': 'median', 'adaboostclassifier__n_estimators': 100, 'adaboostclassifier__learning_rate': 0.01, 'adaboostclassifier__estimator__shrinking': True, 'adaboostclassifier__estimator__kernel': 'linear', 'adaboostclassifier__estimator__gamma': 'scale', 'adaboostclassifier__estimator__decision_function_shape': 'ovr', 'adaboostclassifier__estimator__C': 1, 'adaboostclassifier__estimator': SVC(C=1, kernel='linear', probability=True), 'adaboostclassifier__algorithm': 'SAMME.R'}
Best Score = 0.8572585309427415


### Cleveland dataset - Logistic Regression Estimator

In [28]:
logistic_regression = LogisticRegression()

pipe_adaboost_log_reg_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    AdaBoostClassifier()
)

In [29]:
grid_params = {
    'adaboostclassifier__estimator': [logistic_regression],
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__learning_rate': [.01, .25, .5, .75, 1],
    'adaboostclassifier__n_estimators': [100, 200, 300, 400, 500],
    'adaboostclassifier__estimator__C': [0.01, 0.1, 1, 10, 100, 100],
    'adaboostclassifier__estimator__fit_intercept': [True, False],
    'adaboostclassifier__estimator__l1_ratio': [0, .1, .25, .5, .75, 1],
    'adaboostclassifier__estimator__penalty': ['elasticnet', None],
    'adaboostclassifier__estimator__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

In [30]:
adaboost_log_reg_cleveland_rand_cv = RandomizedSearchCV(pipe_adaboost_log_reg_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
adaboost_log_reg_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(adaboost_log_reg_cleveland_rand_cv.best_params_)
print(f'Best Score = {adaboost_log_reg_cleveland_rand_cv.best_score_}')

{'standardscaler': MinMaxScaler(), 'simpleimputer__strategy': 'median', 'adaboostclassifier__n_estimators': 200, 'adaboostclassifier__learning_rate': 0.01, 'adaboostclassifier__estimator__solver': 'lbfgs', 'adaboostclassifier__estimator__penalty': None, 'adaboostclassifier__estimator__l1_ratio': 0.1, 'adaboostclassifier__estimator__fit_intercept': False, 'adaboostclassifier__estimator__C': 100, 'adaboostclassifier__estimator': LogisticRegression(C=100, fit_intercept=False, l1_ratio=0.1, penalty=None), 'adaboostclassifier__algorithm': 'SAMME'}
Best Score = 0.8674725274725276


## Random Forest

### All datasets

In [31]:
pipe_random_forest_all = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    RandomForestClassifier()
)

In [32]:
grid_params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'randomforestclassifier__bootstrap': [True, False],
    'randomforestclassifier__ccp_alpha': [.0001, .001, .01, .1],
    'randomforestclassifier__criterion': ["gini", "entropy", "log_loss"],
    'randomforestclassifier__max_depth': [3, 5, 10, 20, None],
    'randomforestclassifier__max_features': ["sqrt", "log2", None],
    'randomforestclassifier__max_leaf_nodes': [10, 20, 50, 100, 200, None],
    'randomforestclassifier__min_impurity_decrease': [0, .0001, .001, .01, .1],
    'randomforestclassifier__min_samples_leaf': [1, 2, 3, 4, 5],
    'randomforestclassifier__n_estimators': [100, 200, 300, 400, 500],
    'randomforestclassifier__n_jobs': [-1]
}

In [33]:
random_forest_all_grid_cv = RandomizedSearchCV(pipe_random_forest_all, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
random_forest_all_grid_cv.fit(X_all_train, y_all_train)
print(random_forest_all_grid_cv.best_params_)
print(f'Best Score = {random_forest_all_grid_cv.best_score_}')

{'standardscaler': StandardScaler(), 'simpleimputer__strategy': 'mean', 'randomforestclassifier__n_jobs': -1, 'randomforestclassifier__n_estimators': 400, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__min_impurity_decrease': 0, 'randomforestclassifier__max_leaf_nodes': None, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__ccp_alpha': 0.01, 'randomforestclassifier__bootstrap': False}
Best Score = 0.8364529433820772


### Cleveland

In [34]:
pipe_random_forest_cleveland = make_pipeline(
    SimpleImputer(), 
    StandardScaler(),
    RandomForestClassifier()
)

In [35]:
grid_params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'standardscaler': [StandardScaler(), MinMaxScaler(), None],
    'randomforestclassifier__bootstrap': [True, False],
    'randomforestclassifier__ccp_alpha': [.0001, .001, .01, .1],
    'randomforestclassifier__criterion': ["gini", "entropy", "log_loss"],
    'randomforestclassifier__max_depth': [3, 5, 10, 20, None],
    'randomforestclassifier__max_features': ["sqrt", "log2", None],
    'randomforestclassifier__max_leaf_nodes': [10, 20, 50, 100, 200, None],
    'randomforestclassifier__min_impurity_decrease': [0, .0001, .001, .01, .1],
    'randomforestclassifier__min_samples_leaf': [1, 2, 3, 4, 5],
    'randomforestclassifier__n_estimators': [100, 200, 300, 400, 500],
    'randomforestclassifier__n_jobs': [-1]
}

In [36]:
random_forest_cleveland_rand_cv = RandomizedSearchCV(pipe_random_forest_cleveland, param_distributions=grid_params, 
                                               cv=5, n_jobs=-1, n_iter = 180, scoring='f1')
random_forest_cleveland_rand_cv.fit(X_cleveland_train, y_cleveland_train)
print(random_forest_cleveland_rand_cv.best_params_)
print(f'Best Score = {random_forest_cleveland_rand_cv.best_score_}')

{'standardscaler': MinMaxScaler(), 'simpleimputer__strategy': 'mean', 'randomforestclassifier__n_jobs': -1, 'randomforestclassifier__n_estimators': 500, 'randomforestclassifier__min_samples_leaf': 4, 'randomforestclassifier__min_impurity_decrease': 0.0001, 'randomforestclassifier__max_leaf_nodes': 50, 'randomforestclassifier__max_features': 'sqrt', 'randomforestclassifier__max_depth': 5, 'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__ccp_alpha': 0.0001, 'randomforestclassifier__bootstrap': False}
Best Score = 0.858958668920158


In [37]:
warnings.resetwarnings()

## Final Model Selection

In [38]:
def f1_score_output(model, X_train, X_test, y_train, y_test):
    f1 = cross_validate(model.best_estimator_, X_train, y_train, scoring='f1', cv=5, n_jobs=-1)
    f1_mean = np.mean(f1['test_score'])
    f1_std = np.std(f1['test_score'])
    print('Cross-Validation (Training Data)')
    print(f1_mean, f1_std)
    print('Test Score')
    print(f1_score(y_test, model.best_estimator_.predict(X_test)))

## All datasets

### Logistic Regression

In [39]:
f1_score_output(log_reg_all_grid_cv, X_all_train, X_all_test, y_all_train, y_all_test)

Cross-Validation (Training Data)
0.8321308241401686 0.011462114460607565
Test Score
0.8222996515679442


### Adaboost - Decision Tree Estimator

In [40]:
f1_score_output(adaboost_decision_tree_all_grid_cv, X_all_train, X_all_test, y_all_train, y_all_test)

Cross-Validation (Training Data)
0.8277380868855643 0.011618885896932026
Test Score
0.8299319727891156


### Adaboost - SVM Estimator

In [41]:
f1_score_output(adaboost_svm_all_rand_cv, X_all_train, X_all_test, y_all_train, y_all_test)

Cross-Validation (Training Data)
0.835640978109242 0.017507508431032714
Test Score
0.8280254777070064


### Adaboost - Logistic Regression Estimator

In [42]:
f1_score_output(adaboost_log_reg_all_rand_cv, X_all_train, X_all_test, y_all_train, y_all_test)

Cross-Validation (Training Data)
0.833079850534669 0.011756320958699568
Test Score
0.8275862068965517


### Random Forest

In [43]:
f1_score_output(random_forest_all_grid_cv, X_all_train, X_all_test, y_all_train, y_all_test)

Cross-Validation (Training Data)
0.8367031418194208 0.011491575046433876
Test Score
0.7986348122866894


## Cleveland

### Logistic Regression

In [44]:
f1_score_output(log_reg_cleveland_grid_cv, X_cleveland_train, X_cleveland_test, y_cleveland_train, y_cleveland_test)

Cross-Validation (Training Data)
0.8685332744730163 0.04178642694731104
Test Score
0.7749999999999999


### Adaboost - Decision Tree Estimator

In [45]:
f1_score_output(adaboost_decision_tree_cleveland_rand_cv, X_cleveland_train, X_cleveland_test, 
                y_cleveland_train, y_cleveland_test)

Cross-Validation (Training Data)
0.8048675933294221 0.08288407654015287
Test Score
0.7297297297297297


### Adaboost - SVM Estimator

In [46]:
f1_score_output(adaboost_svm_cleveland_rand_cv, X_cleveland_train, X_cleveland_test, y_cleveland_train, y_cleveland_test)

Cross-Validation (Training Data)
0.8572585309427415 0.025281657987366916
Test Score
0.8051948051948052


### Adaboost - Logistic Regression Estimator

In [47]:
f1_score_output(adaboost_log_reg_cleveland_rand_cv, X_cleveland_train, X_cleveland_test, 
                y_cleveland_train, y_cleveland_test)

Cross-Validation (Training Data)
0.8674725274725276 0.02798510825895061
Test Score
0.7466666666666667


### Random Forest

In [48]:
f1_score_output(random_forest_cleveland_rand_cv, X_cleveland_train, X_cleveland_test, y_cleveland_train, y_cleveland_test)

Cross-Validation (Training Data)
0.8454593801577113 0.02567876641142049
Test Score
0.7792207792207793


## Results

Overall, the best performing model was the adaboost with decision tree estimator. This model had an F1 score of .830 on the test set. The models trained on the Cleveland only datasets were overfit so I will only be using this one model moving forward.

In [49]:
best_model = adaboost_decision_tree_all_grid_cv.best_estimator_
best_model.version = 1.0
best_model.pandas_version = pd.__version__
best_model.numpy_version = np.__version__
best_model.sklearn_version = sklearn_version
best_model.X_columns = [col for col in X_all_train.columns]
best_model.build_datetime = datetime.datetime.now()

In [50]:
with open('../models/best_model.pickle', 'wb') as handle:
    pickle.dump(best_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

data = {'X_train': X_all_train, 'y_train': y_all_train, 'X_test': X_all_test, 'y_test': y_all_test}

with open('../data/processed/test_data.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)