In [1]:
# reconstructing lost file

In [2]:
from hfpred.utils import get_project_root
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import SCORERS, make_scorer, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

import dataframe_image as dfi
pd.set_option("display.max_columns", None)

# Candidate Models [Parameter Tuning with Cross-Validation]
* idea here is to train candidate models and using more robust cross validation for analysing performance
* select best models (when applied to the validation set) to take part in a popular voting classifier

### Can use the training and validatinon set combined
* train models and optimise hyperparameters using cross validation using the train and val set combined

In [3]:
df = pd.read_csv(
    get_project_root() / "data/heart-processed.csv",
    index_col=0
)
print(f"dataset instances: {len(df)}")
df.tail()

dataset instances: 671


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,RestingECG_Normal,ExerciseAngina_N,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,HeartDisease,train/val
209,0.530612,0.305556,0.252896,0.0,0.533835,0.015873,1,1,0,0,1,1,0,1,0,1,val
704,0.44898,0.537037,0.305019,0.0,0.443609,0.428571,1,1,0,0,0,1,0,1,0,1,val
665,0.285714,0.407407,0.444015,0.0,0.421053,0.301587,1,1,0,0,1,0,0,1,0,1,val
447,1.0,0.296296,0.166023,0.0,0.308271,0.333333,1,1,0,0,0,0,0,0,1,1,val
907,0.326531,0.259259,0.162162,0.0,0.56391,0.460317,1,1,0,0,1,0,1,0,0,1,val


In [4]:
df.drop(
    columns="train/val",
    inplace=True
)

# split attributes and target:
cols = list(df.columns)
cols.pop(cols.index("HeartDisease"))

x_train = df[cols]
y_train = df["HeartDisease"]

## 10-Fold Cross-Validation for Default Model Parameters

In [5]:
def fit_model(model, x, y):
    model.fit(x, y)

def cv_model(model, x, y, name):
    df = pd.DataFrame(
        cross_validate(
            model,
            x,
            y,
            cv=10,
            scoring=[
                "accuracy",
                "recall",
                "precision",
                "f1",
                "roc_auc"
            ]
        )
    )
    
    df = df.mean().to_frame().transpose()
    df.rename(index={0:f"{name}"}, inplace=True)

    return df

In [6]:
models = {
    "kNN": KNeighborsClassifier(),
    "SVM Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "XGBoost": XGBClassifier(),
}

In [7]:
cv_models = pd.DataFrame()

for name, model in models.items():
    np.random.seed(13)
    fit_model(model, x_train, y_train)
    model_cv = cv_model(model, x_train, y_train, name)

    cv_models = pd.concat([cv_models, model_cv])

cv_models

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1,test_roc_auc
kNN,0.002037,0.008451,0.837555,0.853314,0.817177,0.833124,0.896764
SVM Classifier,0.007523,0.00687,0.853973,0.878314,0.828329,0.85144,0.918225
Random Forest Classifier,0.095831,0.017807,0.864399,0.87197,0.851809,0.859776,0.925344
Logistic Regression,0.007003,0.0039,0.862906,0.872159,0.846404,0.857681,0.924245
XGBoost,0.043735,0.00719,0.836084,0.847159,0.819992,0.832006,0.905598


In [8]:
cv_models.drop(
    columns=[
        "fit_time",
        "score_time"
    ],
    inplace=True
)

In [None]:
# dfi.export(
#     cv_models,
#     "../output/tables/cv_def_models.png"
# )

In [10]:
# need class weights for some parameter tuning:
train_class_weights = {
    0: y_train.value_counts()[0],
    1: y_train.value_counts()[1]
}
train_class_weights

{0: 350, 1: 321}

## Hyperparameter Tuning
* will use cross validation and recall as a scoring metric to tune select model parameters.

### SVM Classifier Tuning

In [11]:
svc_hparams = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": [100, 10, 1, 0.1, 0.01, 0.001],
    "degree": [2, 3, 4, 5], # ignored by all but poly kernel
    "class_weight": [None, "balanced", train_class_weights]
}

In [12]:
svc_search = pd.DataFrame()

gs_svc = GridSearchCV(
    SVC(),
    svc_hparams,
    cv=5,
    scoring="recall",
)
gs_svc.fit(x_train, y_train)
svc_search.at[gs_svc.best_estimator_, "recall"]= gs_svc.best_score_

In [13]:
svc_search

Unnamed: 0,recall
"SVC(C=0.001, class_weight='balanced', degree=4, kernel='poly')",0.925192


In [14]:
np.random.seed(13)
# svc_model = SVC(C=0.001, class_weight='balanced', degree=4, kernel='poly')
svc_model = gs_svc.best_estimator_
fit_model(svc_model, x_train, y_train)
opt_svc = cv_model(svc_model, x_train, y_train, str(svc_model))
# print(str(svc_model))
opt_svc

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1,test_roc_auc
"SVC(C=0.001, class_weight='balanced', degree=4, kernel='poly')",0.010365,0.005764,0.831585,0.925095,0.770966,0.840211,0.913109


In [15]:
# # save trained tuned model:
# svc_path = get_project_root() / "models/svc.joblib"
# joblib.dump(svc_model, svc_path)

### Random Forest Tuning

In [16]:
rf_hparams = {
    "max_features": ["sqrt", "log2"],
    "min_samples_leaf": [1, 2, 3, 5],
    "n_estimators": [10, 100, 1000],
    "bootstrap": [True, False],
}

In [17]:
np.random.seed(13)
rf_search = pd.DataFrame(columns=["recall"])

gs_rf = GridSearchCV(
    RandomForestClassifier(),
    rf_hparams,
    cv=5,
    scoring="recall",
)

gs_rf.fit(x_train, y_train)
rf_search.at[str(gs_rf.best_estimator_).replace("\n",""), "recall"]= gs_rf.best_score_

In [18]:
rf_search

Unnamed: 0,recall
"RandomForestClassifier(max_features='sqrt', min_samples_leaf=5)",0.881442


In [19]:
np.random.seed(13)
rf_model = gs_rf.best_estimator_
fit_model(rf_model, x_train, y_train)
opt_rf = cv_model(rf_model, x_train, y_train, str(rf_model))
opt_rf

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1,test_roc_auc
"RandomForestClassifier(max_features='sqrt', min_samples_leaf=5)",0.097798,0.019007,0.861348,0.881345,0.836879,0.857671,0.929957


In [20]:
# # save trained tuned model:
# rf_path = get_project_root() / "models/rf.joblib"
# joblib.dump(rf_model, rf_path)

### Logistic Regression Tuning

In [21]:
lr_hparams = {
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "penalty": ["l1", "l2", "elasticnet"],
    "C": [10, 1.0, 0.1, 0.01],
}

In [22]:
lr_search = pd.DataFrame()

gs_lr = GridSearchCV(
    LogisticRegression(),
    lr_hparams,
    cv=5,
    scoring="recall",
)
gs_lr.fit(x_train, y_train)

lr_search.at[str(gs_lr.best_estimator_).replace("\n",""), "recall"]= gs_lr.best_score_

160 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/hfpred/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/hfpred/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/envs/hfpred/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got

In [23]:
lr_search

Unnamed: 0,recall
"LogisticRegression(C=0.1, solver='liblinear')",0.88149


In [24]:
lr_model = gs_lr.best_estimator_
fit_model(lr_model, x_train, y_train)
opt_lr = cv_model(lr_model, x_train, y_train, str(lr_model))
opt_lr

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1,test_roc_auc
"LogisticRegression(C=0.1, solver='liblinear')",0.00216,0.004298,0.862862,0.881345,0.840203,0.858726,0.921737


In [25]:
# # save trained tuned model:
# lr_path = get_project_root() / "models/opt_lr.joblib"
# joblib.dump(lr_model, lr_path)

### XGBoost Tuning

In [26]:
xgb_hparams = {
    "objective": ["reg:squarederror", "binary:logistic"],
    "max_depth": [3, 5, 7, 9], 
    # "colsample_bylevel": [0.5],
    "learning_rate": [0.01, 0.05, 0.1, 0.5],
    "random_state": [13]
}

In [27]:
xgb_search = pd.DataFrame()

gs_xgb = GridSearchCV(
    XGBClassifier(),
    xgb_hparams,
    cv=5,
    scoring="recall",
)
gs_xgb.fit(x_train, y_train)

xgb_search.at[str(gs_xgb.best_estimator_).replace("\n",""), "recall"]= gs_xgb.best_score_

In [28]:
xgb_search

Unnamed: 0,recall
"XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.01, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, objective='reg:squarederror', predictor='auto', random_state=13, reg_alpha=0, ...)",0.88149


In [29]:
xgb_model = gs_xgb.best_estimator_
fit_model(xgb_model, x_train, y_train)
opt_xgb = cv_model(xgb_model, x_train, y_train, str(xgb_model).replace("\n",""))
opt_xgb

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1,test_roc_auc
"XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.01, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, objective='reg:squarederror', predictor='auto', random_state=13, reg_alpha=0, ...)",0.074152,0.008376,0.846532,0.875189,0.818602,0.844451,0.920928


In [30]:
# # save trained tuned model:
# xgb_path = get_project_root() / "models/xgb.joblib"
# joblib.dump(xgb_model, xgb_path)

In [31]:
opt_models = pd.concat(
    [
        opt_svc,
        opt_rf,
        opt_lr,
        opt_xgb
    ]
)
opt_models

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1,test_roc_auc
"SVC(C=0.001, class_weight='balanced', degree=4, kernel='poly')",0.010365,0.005764,0.831585,0.925095,0.770966,0.840211,0.913109
"RandomForestClassifier(max_features='sqrt', min_samples_leaf=5)",0.097798,0.019007,0.861348,0.881345,0.836879,0.857671,0.929957
"LogisticRegression(C=0.1, solver='liblinear')",0.00216,0.004298,0.862862,0.881345,0.840203,0.858726,0.921737
"XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.01, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, objective='reg:squarederror', predictor='auto', random_state=13, reg_alpha=0, ...)",0.074152,0.008376,0.846532,0.875189,0.818602,0.844451,0.920928


In [32]:
# dfi.export(
#     opt_models,
#     "../output/tables/opt_models.png"
# )

In [33]:
opt_models.rename(
    index={
        str(xgb_model).replace("\n",""):
        "XGBClassifier(objective='reg:squarederror', max_depth=3, learning_rate=0.01, random_state=13)"
    },
    inplace=True
)

opt_models.drop(
    columns=[
        "fit_time",
        "score_time"
    ],
    inplace=True
)



In [34]:
# dfi.export(
#     opt_models,
#     "../output/tables/opt_models.png"
# )

In [35]:
opt_mods = opt_models.head(3)

In [36]:
# dfi.export(
#     opt_mods,
#     "../output/tables/opt_3_models.png"
# )