### Model Training

In [2]:
import pandas as pd 
import numpy as np
from  matplotlib import pyplot as plt
import joblib

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, recall_score, precision_score, f1_score, make_scorer, classification_report
)

from xgboost import XGBClassifier

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
df = pd.read_csv("../data/raw/heart_disease_data.csv")

In [4]:
NUMERIC_FEATURES = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
CATEGORICAL_FEATURES = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'ExerciseAngina']
BINARY_FEATURES = ['FastingBS']

In [5]:
y = df['HeartDisease'].values
X = df.drop('HeartDisease', axis=1)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=RANDOM_SEED, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=RANDOM_SEED, test_size=0.25)

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), NUMERIC_FEATURES),
        ('onehot', OneHotEncoder(drop=None, handle_unknown="ignore"), CATEGORICAL_FEATURES),
        ],
    remainder="passthrough"
)

### Logistic Classifier

In [7]:
log_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=RANDOM_SEED, max_iter=1_000))
    ]
)

In [8]:
log_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [9]:
y_train_pred = log_pipeline.predict(X_train)
y_val_pred = log_pipeline.predict(X_val)
y_test_pred = log_pipeline.predict(X_test)

In [10]:
train_recall_baseline = recall_score(y_train, y_train_pred)
val_recall_baseline = recall_score(y_val, y_val_pred)
test_recall_baseline = recall_score(y_test, y_test_pred)
train_precision_baseline = precision_score(y_train, y_train_pred)
val_precision_baseline = precision_score(y_val, y_val_pred)
test_precision_baseline = precision_score(y_test, y_test_pred)

In [11]:
print(f"recall:\ntrain:{train_recall_baseline}\nvalidation:{val_recall_baseline}")

recall:
train:0.8847457627118644
validation:0.9245283018867925


In [12]:
print(f"precision:\ntrain:{train_precision_baseline}\nvalidation:{val_precision_baseline}")

precision:
train:0.8642384105960265
validation:0.8828828828828829


In [13]:
param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [ 0.1, 1, 10],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__class_weight': ['balanced', {0: 1, 1: 1.5}, {0: 1, 1: 2}, {0: 1, 1: 2.5}],
    'classifier__max_iter': [500, 1000]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

grid_search_model = GridSearchCV(
    estimator=log_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)

grid_search_model.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=0.1, class



[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.1s




[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight=balanced, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight=balanc



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s




[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 1.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__c



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classi



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l2, classifier__solver=saga; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C



[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=liblinear; total time=   0.0s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=500, classifier__penalty=l1, classifier__solver=saga; total time=   0.2s
[CV] END classifier__C=10, classifier__class_weight={0: 1, 1: 2.5}, classifier__max_iter=1000, classifier__penalty=l1, classifier__solver=saga; total time=   0.1s
[CV] END 

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__C': [0.1, 1, ...], 'classifier__class_weight': ['balanced', {0: 1, 1: 1.5}, ...], 'classifier__max_iter': [500, 1000], 'classifier__penalty': ['l1', 'l2'], ...}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{0: 1, 1: 2.5}"
,random_state,42
,solver,'liblinear'
,max_iter,500


In [14]:
for param, value in grid_search_model.best_params_.items():
    print(f"  {param}: {value}")

  classifier__C: 0.1
  classifier__class_weight: {0: 1, 1: 2.5}
  classifier__max_iter: 500
  classifier__penalty: l1
  classifier__solver: liblinear


In [15]:
best_log_pipeline = grid_search_model.best_estimator_

In [16]:
y_train_pred_tuned = best_log_pipeline.predict(X_train)
y_val_pred_tuned = best_log_pipeline.predict(X_val) 
y_test_pred_tuned = best_log_pipeline.predict(X_test)

In [17]:
log_train_recall_tuned = recall_score(y_train, y_train_pred_tuned)
log_val_recall_tuned = recall_score(y_val, y_val_pred_tuned) 
log_test_recall_tuned = recall_score(y_test, y_test_pred_tuned)

In [18]:
## Recall improved by 5% on the test dataset
log_test_recall_tuned, test_recall_baseline, log_test_recall_tuned > test_recall_baseline

(0.9065420560747663, 0.8411214953271028, True)

In [19]:
with open("../models/log_regression.joblib", "wb") as f:
    joblib.dump(best_log_pipeline, f)

### Random Forest Classifier

In [20]:
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=RANDOM_SEED))
    ]
)

In [21]:
rf_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
y_train_pred = rf_pipeline.predict(X_train)
y_val_pred = rf_pipeline.predict(X_val)
y_test_pred = rf_pipeline.predict(X_test)

In [23]:
train_recall_baseline = recall_score(y_train, y_train_pred)
val_recall_baseline = recall_score(y_val, y_val_pred)
test_recall_baseline = recall_score(y_test, y_test_pred)
train_precision_baseline = precision_score(y_train, y_train_pred)
val_precision_baseline = precision_score(y_val, y_val_pred)
test_precision_baseline = precision_score(y_test, y_test_pred)

In [24]:
print(f"recall:\ntrain:{train_recall_baseline}\nvalidation:{val_recall_baseline}")

recall:
train:1.0
validation:0.9622641509433962


In [25]:
print(f"precision:\ntrain:{train_precision_baseline}\nvalidation:{val_precision_baseline}")

precision:
train:1.0
validation:0.8869565217391304


In [26]:
param_grid = {
    
    'classifier__bootstrap': [True],
    'classifier__max_depth': [4, 6, 8] ,    
    'classifier__max_features': ["sqrt"],    
    'classifier__min_samples_leaf': [2, 4, 6],
    'classifier__min_samples_split': [10, 12],
    'classifier__n_estimators': [40, 50 , 60,], #80, 90, 100
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

grid_search_model = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)

grid_search_model.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.2s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.2s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.1s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=10, classifier__n_estimators=40; total time=   0.1s
[CV] END classifier__bootstrap=True, classifier__max_depth=4, classifier__max_features=sqrt, classifier__min_s

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__bootstrap': [True], 'classifier__max_depth': [4, 6, ...], 'classifier__max_features': ['sqrt'], 'classifier__min_samples_leaf': [2, 4, ...], ...}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,60
,criterion,'gini'
,max_depth,6
,min_samples_split,10
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
for param, value in grid_search_model.best_params_.items():
    print(f"  {param}: {value}")

  classifier__bootstrap: True
  classifier__max_depth: 6
  classifier__max_features: sqrt
  classifier__min_samples_leaf: 4
  classifier__min_samples_split: 10
  classifier__n_estimators: 60


In [28]:
best_rf_pipeline = grid_search_model.best_estimator_

In [29]:
y_train_pred_tuned = best_rf_pipeline.predict(X_train)
y_val_pred_tuned = best_rf_pipeline.predict(X_val) 
y_test_pred_tuned = best_rf_pipeline.predict(X_test)

In [30]:
rf_train_recall_tuned = recall_score(y_train, y_train_pred_tuned)
rf_val_recall_tuned = recall_score(y_val, y_val_pred_tuned) 
rf_test_recall_tuned = recall_score(y_test, y_test_pred_tuned)

In [31]:
## Recall improved by 5% on the test dataset
rf_test_recall_tuned, test_recall_baseline, rf_test_recall_tuned > test_recall_baseline

(0.8785046728971962, 0.8691588785046729, True)

In [32]:
with open("../models/random_forest.joblib", "wb") as f:
    joblib.dump(best_rf_pipeline, f)

### XGBoost Classifier

In [33]:
xg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            eval_metric='logloss'
        ))
    ]
)

In [34]:
xg_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [35]:
y_train_pred = xg_pipeline.predict(X_train)
y_val_pred = xg_pipeline.predict(X_val)
y_test_pred = xg_pipeline.predict(X_test)

In [36]:
train_recall_baseline = recall_score(y_train, y_train_pred)
val_recall_baseline = recall_score(y_val, y_val_pred)
test_recall_baseline = recall_score(y_test, y_test_pred)
train_precision_baseline = precision_score(y_train, y_train_pred)
val_precision_baseline = precision_score(y_val, y_val_pred)
test_precision_baseline = precision_score(y_test, y_test_pred)

In [37]:
print(f"recall:\ntrain:{train_recall_baseline}\nvalidation:{val_recall_baseline}\ntest:{test_recall_baseline}")

recall:
train:1.0
validation:0.9433962264150944
test:0.8504672897196262


In [38]:
param_grid = {
    
    'classifier__n_estimators': [50, 70, 100],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [2, 3, 4],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

grid_search_model = GridSearchCV(
    estimator=xg_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)

grid_search_model.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=50; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=50; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=50; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=50; total time=   0.1s


[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=50; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=70; total time=   0.1s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=70; total time=   0.1s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=70; total time=   0.1s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=70; total time=   0.2s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=70; total time=   0.0s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=2, classifier__n_estima

0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'classifier__learning_rate': [0.01, 0.1, ...], 'classifier__max_depth': [2, 3, ...], 'classifier__n_estimators': [50, 70, ...]}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('scaler', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [39]:
for param, value in grid_search_model.best_params_.items():
    print(f"  {param}: {value}")

  classifier__learning_rate: 0.01
  classifier__max_depth: 3
  classifier__n_estimators: 50


In [40]:
best_xg_pipeline = grid_search_model.best_estimator_

In [41]:
y_train_pred_tuned = best_xg_pipeline.predict(X_train)
y_val_pred_tuned = best_xg_pipeline.predict(X_val) 
y_test_pred_tuned = best_xg_pipeline.predict(X_test)

In [42]:
xg_train_recall_tuned = recall_score(y_train, y_train_pred_tuned)
xg_val_recall_tuned = recall_score(y_val, y_val_pred_tuned) 
xg_test_recall_tuned = recall_score(y_test, y_test_pred_tuned)

In [43]:
## Recall improved by 5% on the test dataset
xg_test_recall_tuned, test_recall_baseline, xg_test_recall_tuned > test_recall_baseline

(0.8878504672897196, 0.8504672897196262, True)

In [44]:
with open("../models/xgbooost.joblib", "wb") as f:
    joblib.dump(best_xg_pipeline, f)

### Comparison of Models

In [45]:
print(f"""Training Recall Scores:\n
    Logistic Regression: {log_train_recall_tuned}\n
    Random Forest: {rf_train_recall_tuned}\n
    XGBoost: {xg_train_recall_tuned}\n  

    """)

Training Recall Scores:

    Logistic Regression: 0.9559322033898305

    Random Forest: 0.9355932203389831

    XGBoost: 0.9186440677966101
  

    


In [46]:
print(f"""Validation Recall Scores:\n
    Logistic Regression: {log_val_recall_tuned}\n
    Random Forest: {rf_val_recall_tuned}\n
    XGBoost: {xg_val_recall_tuned}\n  
    """)

Validation Recall Scores:

    Logistic Regression: 1.0

    Random Forest: 0.9622641509433962

    XGBoost: 0.9716981132075472
  
    


In [47]:
print(f"""Test Recall Scores:\n
    Logistic Regression: {log_test_recall_tuned}\n
    Random Forest: {rf_test_recall_tuned}\n
    XGBoost: {xg_test_recall_tuned}\n  

    """)

Test Recall Scores:

    Logistic Regression: 0.9065420560747663

    Random Forest: 0.8785046728971962

    XGBoost: 0.8878504672897196
  

    
