## load data

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, recall_score, roc_auc_score, f1_score,\
precision_score, roc_auc_score, classification_report, fbeta_score
from sklearn.pipeline import Pipeline
from scipy.stats import randint

import warnings
warnings.filterwarnings("ignore")

In [37]:
data_path = '../data/heart_disease.csv'

df = pd.read_csv(data_path)

NUMERIC_COLS = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
CATEGORICAL_COLS = [f for f in df.columns if f not in NUMERIC_COLS]

# SELECTED_FEATS = ['thal_7', 'thalach', 'ca', 'cp_4', 'oldpeak', 'exang']

# NUMERIC_COLS = list(set(NUMERIC_COLS) & set(SELECTED_FEATS))
# CATEGORICAL_COLS = list(set(CATEGORICAL_COLS) & set(SELECTED_FEATS))

df.head(2)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,target,cp_2,cp_3,cp_4,restecg_1,restecg_2,thal_6,thal_7
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,3.0,0.0,0,0,0,0,0,1,1,0
1,67.0,1.0,160.0,286.0,0.0,108.0,1.0,1.5,2.0,3.0,1,0,0,1,0,1,0,0


In [38]:
# split the data
X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Models Pipelines

- Best three models that showed **high accuracy**:
    - Random forest
    - Support vector machine
    - Logistic regression

In [39]:
CATEGORICAL_COLS.remove('target')

In [40]:
# preprocessor that scales only numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_COLS),
        ('cat', 'passthrough', CATEGORICAL_COLS)
    ]
)
metrics = {} # metrics for each model
f2_scorer = make_scorer(fbeta_score, beta=2)
scoring = f2_scorer

- The **F2-score** optimally balances the clinical reality that false negatives (missed heart disease) are **more costly** than false positives in most healthcare scenarios.

# Random Forest Hyperparameter Tuning

### 1. Random forest Basline

In [41]:
# Random Forest Pipeline
rf_pipeline = Pipeline([
#     ('preprocessor', preprocessor), # since binary trees doesn't need scaling
    ('cls', RandomForestClassifier(random_state=42))
])

# Baseline random forset score
print(f"Baseline CV score: {cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring=scoring).mean():.4f}")

Baseline CV score: 0.7782


### 2. Random forest grid search (phase 1):
- Using **RandomizedSearchCV** with variety of parameters

In [42]:
# Random Search for Random Forest Parameter Grid
print("=== PHASE 1: RANDOM FOREST RANDOM_GRIDSEARCH ===")
rf_param_dist = {
    'cls__n_estimators': randint(50, 300), # minimum number of samples at a leaf node
    'cls__max_depth': randint(3, 15), # maximum depth of each tree.
    'cls__min_samples_split': randint(2, 20), # minimum number of samples to split a node.
    'cls__min_samples_leaf': randint(1, 10), # minimum number of samples at a leaf node.
    'cls__max_features': ['sqrt', 'log2', 0.5, 0.7], # number of features considered at each split.
    'cls__bootstrap': [True, False]
}

rf_random_search = RandomizedSearchCV(
    rf_pipeline, rf_param_dist, n_iter=50, cv=5, 
    scoring=scoring, n_jobs=-1, random_state=42
)
rf_random_search.fit(X_train, y_train)

print("Best random forest params (Phase 1):")
for key, value in rf_random_search.best_params_.items():
    print(f"  {key}: {value}")
print(f"Best CV score:{rf_random_search.best_score_:.3f}")

=== PHASE 1: RANDOM FOREST RANDOM_GRIDSEARCH ===
Best random forest params (Phase 1):
  cls__bootstrap: False
  cls__max_depth: 8
  cls__max_features: log2
  cls__min_samples_leaf: 5
  cls__min_samples_split: 2
  cls__n_estimators: 253
Best CV score:0.787


### 3. Random forest grid search (phase 2):
- Using **GridSearchCV** with more specific parametrs

In [43]:
print("=== PHASE 2: RANDOM FOREST GRIDSEARCH ===")

best = rf_random_search.best_params_

rf_param_grid = {
    'cls__n_estimators': [200, 250, 300],      # Slightly increase trees
    'cls__max_depth': [max(3, best['cls__max_depth']-2), best['cls__max_depth'], min(20, best['cls__max_depth']+2)],
    'cls__min_samples_split': [max(2, best['cls__min_samples_split']-2), best['cls__min_samples_split'], best['cls__min_samples_split']+2],
    'cls__min_samples_leaf': [max(1, best['cls__min_samples_leaf']-1), best['cls__min_samples_leaf'], best['cls__min_samples_leaf']+1],
    'cls__max_features': [best['cls__max_features']],
    'cls__bootstrap': [best['cls__bootstrap']]
}

rf_grid_search = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

rf_grid_search.fit(X_train, y_train)

print("Best Random Forest Parameters:")
for key, value in rf_grid_search.best_params_.items():
    print(f"  {key}: {value}")
print(f"Best CV Score: {rf_grid_search.best_score_:.3f}")

=== PHASE 2: RANDOM FOREST GRIDSEARCH ===
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Random Forest Parameters:
  cls__bootstrap: False
  cls__max_depth: 10
  cls__max_features: log2
  cls__min_samples_leaf: 5
  cls__min_samples_split: 2
  cls__n_estimators: 200
Best CV Score: 0.793


### 4. Evaluation for random forest best model

In [44]:
# Evaluate best model
best_rf = rf_grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
y_pred_proba_rf = best_rf.predict_proba(X_test)[:, 1]

metrics['Random Forest'] = {
    'Best_Params': rf_grid_search.best_params_,
    'CV_Score': rf_grid_search.best_score_,
    'Test_Accuracy': accuracy_score(y_test, y_pred_rf),
    'Test_Precision': precision_score(y_test, y_pred_rf),
    'Test_Recall': recall_score(y_test, y_pred_rf),
    'Test_F1': f1_score(y_test, y_pred_rf),
    'Test_AUC': roc_auc_score(y_test, y_pred_proba_rf)
}

print("\n=== TEST SET PERFORMANCE ===")
for metric, value in list(metrics['Random Forest'].items())[1:]:  # Skip first items
    print(f"{metric}: {value:.3f}")


=== TEST SET PERFORMANCE ===
CV_Score: 0.793
Test_Accuracy: 0.918
Test_Precision: 0.897
Test_Recall: 0.929
Test_F1: 0.912
Test_AUC: 0.953


# Support vector machine hypertuning

### 1. Support vector machine Basline

In [45]:
# SVM Pipeline
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('cls', SVC(probability=True, random_state=42))  # probability=True for AUC
])

# Baseline support vector score
print(f"Baseline CV score: {cross_val_score(svm_pipeline, X_train, y_train, cv=5, scoring=scoring).mean():.4f}")

Baseline CV score: 0.7278


### 2. SVM grid search (phase 1):
- Using **RandomizedSearchCV** with variety of parameters

In [46]:
# Random Search for Support Vector Machine Parameter Grid
print("=== PHASE 1: SUPPORT VECTOR MACHINE RANDOM_GRIDSEARCH ===")

# linear and rbf kernel
svm_param_dist = {
    "cls__C": np.logspace(-3, 3, 20),        # 0.001 → 1000
    "cls__gamma": np.logspace(-4, 1, 20),    # 0.0001 → 10
    "cls__kernel": ["linear", "rbf"],
    "cls__class_weight": [None, "balanced"]
    
}

# poly kernel
# svm_param_dist = {
#     "cls__kernel": ["poly"],
#     "cls__degree": [2, 3],
#     "cls__C": [0.1, 1, 10],
#     "cls__gamma": ["scale", "auto", 0.01, 0.1],
#     "cls__coef0": [0, 1]
# }

svm_random_search = RandomizedSearchCV(
    svm_pipeline, svm_param_dist, n_iter=50, cv=5, 
    scoring=scoring, n_jobs=-1, random_state=42
)
svm_random_search.fit(X_train, y_train)

print("Best params (Phase 1):")
for key, value in svm_random_search.best_params_.items():
    print(f"  {key}: {value}")
print(f"Best CV score :{svm_random_search.best_score_:.3f}")

=== PHASE 1: SUPPORT VECTOR MACHINE RANDOM_GRIDSEARCH ===
Best params (Phase 1):
  cls__kernel: linear
  cls__gamma: 0.0069519279617756054
  cls__class_weight: None
  cls__C: 26.366508987303554
Best CV score :0.803


### 3. SVM grid search (phase 2):
- Using **GridSearchCV** with more specific parametrs

In [50]:
print("=== PHASE 2: SUPPORT VECTOR MACHINE GRIDSEARCH ===")

best = svm_random_search.best_params_

# for linear kernel
svm_param_grid = {
    "cls__C": [800, best['cls__C'], 1500],
    "cls__kernel": [best['cls__kernel']],
    "cls__class_weight": [best['cls__class_weight']]
}

svm_grid_search = GridSearchCV(
    svm_pipeline,
    svm_param_grid,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

svm_grid_search.fit(X_train, y_train)

print("Best model parameters:")
for key, value in svm_grid_search.best_params_.items():
    print(f"  {key}: {value}")
print(f"Best CV Score: {svm_grid_search.best_score_:.3f}")

=== PHASE 2: SUPPORT VECTOR MACHINE GRIDSEARCH ===
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best model parameters:
  cls__C: 26.366508987303554
  cls__class_weight: None
  cls__kernel: linear
Best CV Score: 0.803


### 4. Evaluation for Support Vector Machine best model

In [51]:
# Evaluate best model
best_svm = svm_grid_search.best_estimator_
y_pred_svm = best_svm.predict(X_test)
y_pred_proba_svm = best_svm.predict_proba(X_test)[:, 1]

metrics['SVM'] = {
    'Best_Params': svm_grid_search.best_params_,
    'CV_Score': svm_grid_search.best_score_,
    'Test_Accuracy': accuracy_score(y_test, y_pred_svm),
    'Test_Precision': precision_score(y_test, y_pred_svm),
    'Test_Recall': recall_score(y_test, y_pred_svm),
    'Test_F1': f1_score(y_test, y_pred_svm),
    'Test_AUC': roc_auc_score(y_test, y_pred_proba_svm)
}

print("\n=== TEST SET PERFORMANCE ===")
for metric, value in list(metrics['SVM'].items())[1:]:
    print(f"{metric}: {value:.3f}")


=== TEST SET PERFORMANCE ===
CV_Score: 0.803
Test_Accuracy: 0.869
Test_Precision: 0.812
Test_Recall: 0.929
Test_F1: 0.867
Test_AUC: 0.950


## Logistic regression hypertuning

### 1. Logistic regression Basline

In [52]:
# Logistic Regression Pipeline
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('cls', LogisticRegression(random_state=42, max_iter=1000))
])

# Baseline Logistic Regression
print(f"Baseline CV score: {cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring=scoring).mean():.4f}")

Baseline CV score: 0.7882


### 2. Logistic Regression grid search (phase 1):
- Using **RandomizedSearchCV** with variety of parameters

In [53]:
# Random Search for Logistic Regression Parameter Grid
print("=== PHASE 1: LOGISTIC REGRESSION RANDOM_GRIDSEARCH ===")

lr_param_dist = {
    "cls__penalty": ["l1", "l2", "elasticnet"],
    "cls__C": np.logspace(-3, 3, 50),    # 0.001 → 1000
    "cls__solver": ["liblinear", "saga"],  # supports l1 & elasticnet
    "cls__class_weight": [None, "balanced"],
    "cls__l1_ratio": np.linspace(0, 1, 5)  # only matters if elasticnet
}

lr_random_search = RandomizedSearchCV(
    lr_pipeline, lr_param_dist, n_iter=50, cv=5, 
    scoring=scoring, n_jobs=-1, random_state=42
)
lr_random_search.fit(X_train, y_train)

print("Best params (Phase 1):")
for key, value in lr_random_search.best_params_.items():
    print(f"  {key}: {value}")
print(f"Best CV score :{lr_random_search.best_score_:.3f}")

=== PHASE 1: LOGISTIC REGRESSION RANDOM_GRIDSEARCH ===
Best params (Phase 1):
  cls__solver: liblinear
  cls__penalty: l2
  cls__l1_ratio: 0.5
  cls__class_weight: balanced
  cls__C: 0.001
Best CV score :0.846


### 3. Logistic Regression grid search (phase 2):
- Using **GridSearchCV** with more specific parametrs

In [54]:
print("=== PHASE 2: LOGISTIC REGRESSION GRIDSEARCH ===")

best = lr_random_search.best_params_

lr_param_grid = {
    "cls__C": np.logspace(-4, -1, 10),   # narrower range
    "cls__penalty": [best["cls__penalty"]],
    "cls__solver": [best["cls__solver"]],
    "cls__class_weight": [best["cls__class_weight"]]
}

lr_grid_search = GridSearchCV(
    lr_pipeline,
    lr_param_grid,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

lr_grid_search.fit(X_train, y_train)

print("Best model parameters:")
for key, value in lr_grid_search.best_params_.items():
    print(f"  {key}: {value}")
print(f"Best CV Score: {lr_grid_search.best_score_:.3f}")

=== PHASE 2: LOGISTIC REGRESSION GRIDSEARCH ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best model parameters:
  cls__C: 0.0001
  cls__class_weight: balanced
  cls__penalty: l2
  cls__solver: liblinear
Best CV Score: 0.853


### 4. Evaluation for Logistic Regression best model

In [55]:
# Evaluate best model
best_lr = lr_grid_search.best_estimator_
y_pred_lr = best_lr.predict(X_test)
y_pred_proba_lr = best_lr.predict_proba(X_test)[:, 1]

metrics['Logistic Regression'] = {
    'Best_Params': lr_grid_search.best_params_,
    'CV_Score': lr_grid_search.best_score_,
    'Test_Accuracy': accuracy_score(y_test, y_pred_lr),
    'Test_Precision': precision_score(y_test, y_pred_lr),
    'Test_Recall': recall_score(y_test, y_pred_lr),
    'Test_F1': f1_score(y_test, y_pred_lr),
    'Test_AUC': roc_auc_score(y_test, y_pred_proba_lr)
}

print("\n=== TEST SET PERFORMANCE ===")
for metric, value in list(metrics['Logistic Regression'].items())[1:]:
    print(f"{metric}: {value:.3f}")


=== TEST SET PERFORMANCE ===
CV_Score: 0.853
Test_Accuracy: 0.672
Test_Precision: 0.583
Test_Recall: 1.000
Test_F1: 0.737
Test_AUC: 0.951


# Evaluation

In [56]:
# Display results
print(f"\n{'='*80}")
print("FINAL RESULTS COMPARISON")
print(f"{'='*80}")

models = {
    "Logistic Regression": best_lr,
    "Random Forest": best_rf,
    "SVM": best_svm,
}
all_grids = {
        'Random Forest': rf_grid_search,
        'SVM': svm_grid_search,
        'Logistic Regression': lr_grid_search
}
results = []
for name, model in models.items():
    results.append({
        "Model": name,
        **metrics[name],
    })

df_results = pd.DataFrame(results)


print(df_results.drop(columns='Best_Params'))

# Display best parameters for each model
print(f"\n{'='*80}")
print("BEST PARAMETERS FOR EACH MODEL")
print(f"{'='*80}")

for i in results:
        print(f"\n{i['Model']}:")
        for param, value in i['Best_Params'].items():
            print(f"  {param}: {value}")


FINAL RESULTS COMPARISON
                 Model  CV_Score  Test_Accuracy  Test_Precision  Test_Recall  \
0  Logistic Regression  0.853067       0.672131        0.583333     1.000000   
1        Random Forest  0.793257       0.918033        0.896552     0.928571   
2                  SVM  0.802667       0.868852        0.812500     0.928571   

    Test_F1  Test_AUC  
0  0.736842  0.951299  
1  0.912281  0.953463  
2  0.866667  0.950216  

BEST PARAMETERS FOR EACH MODEL

Logistic Regression:
  cls__C: 0.0001
  cls__class_weight: balanced
  cls__penalty: l2
  cls__solver: liblinear

Random Forest:
  cls__bootstrap: False
  cls__max_depth: 10
  cls__max_features: log2
  cls__min_samples_leaf: 5
  cls__min_samples_split: 2
  cls__n_estimators: 200

SVM:
  cls__C: 26.366508987303554
  cls__class_weight: None
  cls__kernel: linear


Since `Random Forest` was the most accurate in terms of precision-recall balance, we deploy it

In [57]:
import joblib
joblib.dump(models['Random Forest'], "../models/final_model.pkl")

['../models/final_model.pkl']