### Model Training

In [159]:
import pandas as pd 
import numpy as np
from  matplotlib import pyplot as plt
import joblib

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, recall_score, precision_score, f1_score, make_scorer, classification_report
)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [9]:
df = pd.read_csv("../data/raw/heart_disease_data.csv")

In [11]:
NUMERIC_FEATURES = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
CATEGORICAL_FEATURES = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'ExerciseAngina']
BINARY_FEATURES = ['FastingBS']

In [12]:
y = df['HeartDisease'].values
X = df.drop('HeartDisease', axis=1)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=RANDOM_SEED, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=RANDOM_SEED, test_size=0.25)

In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), NUMERIC_FEATURES),
        ('onehot', OneHotEncoder(drop=None, handle_unknown="ignore"), CATEGORICAL_FEATURES),
        ],
    remainder="passthrough"
)

### Logistic Classifier

In [21]:
log_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=RANDOM_SEED, max_iter=1_000))
    ]
)

In [22]:
log_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [91]:
y_train_pred = log_pipeline.predict(X_train)
y_val_pred = log_pipeline.predict(X_val)
y_test_pred = log_pipeline.predict(X_test)

In [93]:
train_recall_baseline = recall_score(y_train, y_train_pred)
val_recall_baseline = recall_score(y_val, y_val_pred)
test_recall_baseline = recall_score(y_test, y_test_pred)
train_precision_baseline = precision_score(y_train, y_train_pred)
val_precision_baseline = precision_score(y_val, y_val_pred)
test_precision_baseline = precision_score(y_test, y_test_pred)

In [28]:
print(f"recall:\ntrain:{train_recall_baseline}\nvalidation:{val_recall_baseline}")

recall:
train:0.8847457627118644
validation:0.9245283018867925


In [29]:
print(f"precision:\ntrain:{train_precision_baseline}\nvalidation:{val_precision_baseline}")

precision:
train:0.8642384105960265
validation:0.8828828828828829


In [105]:
param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [ 0.1, 1, 10],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__class_weight': ['balanced', {0: 1, 1: 1.5}, {0: 1, 1: 2}, {0: 1, 1: 2.5}],
    'classifier__max_iter': [500, 1000]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

grid_search_model = GridSearchCV(
    estimator=log_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)

grid_search_model.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s

[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__sol



[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class



[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_we



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifie



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.3s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s




[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, cla



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifie



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END 

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__C': [0.1, 1, ...], 'classifier__class_weight': ['balanced', {0: 1, 1: 1.5}, ...], 'classifier__max_iter': [500, 1000], 'classifier__penalty': ['l1', 'l2'], ...}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{0: 1, 1: 2.5}"
,random_state,42
,solver,'liblinear'
,max_iter,500


In [107]:
for param, value in grid_search_model.best_params_.items():
    print(f"  {param}: {value}")

  classifier__C: 0.1
  classifier__class_weight: {0: 1, 1: 2.5}
  classifier__max_iter: 500
  classifier__penalty: l1
  classifier__solver: liblinear


In [108]:
best_log_pipeline = grid_search_model.best_estimator_

In [110]:
y_train_pred_tuned = best_log_pipeline.predict(X_train)
y_val_pred_tuned = best_log_pipeline.predict(X_val) 
y_test_pred_tuned = best_log_pipeline.predict(X_test)

In [111]:
train_recall_tuned = recall_score(y_train, y_train_pred_tuned)
val_recall_tuned = recall_score(y_val, y_val_pred_tuned) 
test_recall_tuned = recall_score(y_test, y_test_pred_tuned)

In [115]:
## Recall improved by 5% on the test dataset
test_recall_tuned, test_recall_baseline, test_recall_tuned > test_recall_baseline

(0.9065420560747663, 0.8411214953271028, True)

In [162]:
with open("../models/log_regression.joblib", "wb") as f:
    joblib.dump(best_log_pipeline, f)

### Random Forest Classifier

In [118]:
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=RANDOM_SEED))
    ]
)

In [119]:
rf_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [120]:
y_train_pred = rf_pipeline.predict(X_train)
y_val_pred = rf_pipeline.predict(X_val)
y_test_pred = rf_pipeline.predict(X_test)

In [121]:
train_recall_baseline = recall_score(y_train, y_train_pred)
val_recall_baseline = recall_score(y_val, y_val_pred)
test_recall_baseline = recall_score(y_test, y_test_pred)
train_precision_baseline = precision_score(y_train, y_train_pred)
val_precision_baseline = precision_score(y_val, y_val_pred)
test_precision_baseline = precision_score(y_test, y_test_pred)

In [122]:
print(f"recall:\ntrain:{train_recall_baseline}\nvalidation:{val_recall_baseline}")

recall:
train:1.0
validation:0.9622641509433962


In [123]:
print(f"precision:\ntrain:{train_precision_baseline}\nvalidation:{val_precision_baseline}")

precision:
train:1.0
validation:0.8869565217391304


In [153]:
param_grid = {
    
    'classifier__bootstrap': [True],
    'classifier__max_depth': [4, 6, 8] ,    
    'classifier__max_features': ["sqrt"],    
    'classifier__min_samples_leaf': [2, 4, 6],
    'classifier__min_samples_split': [10, 12],
    'classifier__n_estimators': [40, 50 , 60,], #80, 90, 100
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

grid_search_model = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)

grid_search_model.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.1s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.1s


[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.2s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.2s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.1s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=50; total time=   0.2s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__bootstrap': [True], 'classifier__max_depth': [4, 6, ...], 'classifier__max_features': ['sqrt'], 'classifier__min_samples_leaf': [2, 4, ...], ...}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,60
,criterion,'gini'
,max_depth,6
,min_samples_split,10
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [154]:
for param, value in grid_search_model.best_params_.items():
    print(f"  {param}: {value}")

  classifier__bootstrap: True
  classifier__max_depth: 6
  classifier__max_features: sqrt
  classifier__min_samples_leaf: 4
  classifier__min_samples_split: 10
  classifier__n_estimators: 60


In [155]:
best_rf_pipeline = grid_search_model.best_estimator_

In [156]:
y_train_pred_tuned = best_rf_pipeline.predict(X_train)
y_val_pred_tuned = best_rf_pipeline.predict(X_val) 
y_test_pred_tuned = best_rf_pipeline.predict(X_test)

In [157]:
train_recall_tuned = recall_score(y_train, y_train_pred_tuned)
val_recall_tuned = recall_score(y_val, y_val_pred_tuned) 
test_recall_tuned = recall_score(y_test, y_test_pred_tuned)

In [158]:
## Recall improved by 5% on the test dataset
test_recall_tuned, test_recall_baseline, test_recall_tuned > test_recall_baseline

(0.8785046728971962, 0.8691588785046729, True)

In [163]:
with open("../models/random_forest.joblib", "wb") as f:
    joblib.dump(best_rf_pipeline, f)