In [1]:
# %% [markdown]
# 06_hyperparameter_tuning.ipynb
# Hyperparameter tuning with GridSearchCV and RandomizedSearchCV.

# %%
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
from pathlib import Path

# Project layout: ROOT is the parent of the current working directory,
# and every other folder hangs off it.
ROOT = Path("..").resolve()

DATA_DIR = ROOT.joinpath("data")
RESULTS_DIR = ROOT.joinpath("results")
MODELS_DIR = ROOT.joinpath("models")

# Load the cleaned dataset.
CLEANED_CSV = DATA_DIR.joinpath("heart_disease_cleaned.csv")
df = pd.read_csv(CLEANED_CSV)

# Label column is "target"; every remaining column is used as a feature.
y = df["target"]
X = df.drop(columns=["target"])

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random Forest tuning: exhaustive grid search over 3 x 4 x 3 = 36 candidates,
# each scored by 5-fold cross-validated ROC AUC on the training split.
param_grid = dict(
    n_estimators=[100, 200, 400],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
)
rf = RandomForestClassifier(random_state=42)
gs = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,  # use every available core
)
gs.fit(X_train, y_train)

# Keep the estimator that search selected as best.
best_rf = gs.best_estimator_
print(f"Best RF: {gs.best_params_}")

# SVM tuning: randomized search drawing 10 of the 20 possible (C, gamma)
# combinations, each scored by 5-fold cross-validated ROC AUC.
param_dist = dict(
    C=np.logspace(-3, 2, num=10),  # 10 log-spaced values from 1e-3 to 1e2
    gamma=["scale", "auto"],
)
# probability=True so the fitted model exposes predict_proba.
svm = SVC(probability=True, random_state=42)
rs = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist,
    n_iter=10,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rs.fit(X_train, y_train)

# Keep the estimator that search selected as best.
best_svm = rs.best_estimator_
print(f"Best SVM: {rs.best_params_}")

def eval_model(model, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and either ``predict_proba``
        or ``decision_function``.
    X_test
        Feature matrix passed unchanged to the model's prediction methods.
    y_test
        True labels aligned with ``X_test``. Assumed binary: only column 1
        of ``predict_proba`` is used as the positive-class score.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"roc_auc"``. The first four are computed from hard predictions,
        the last from a continuous score.
    """
    y_pred = model.predict(X_test)

    # ROC AUC needs a ranking score, not hard labels. Prefer the positive-class
    # probability; fall back to the raw decision margin for estimators without
    # predict_proba (e.g. an SVC built with probability=False).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_score),
    }

# Score both tuned models on the held-out test split.
rf_metrics = eval_model(best_rf, X_test, y_test)
svm_metrics = eval_model(best_svm, X_test, y_test)
print("RF metrics:", rf_metrics)
print("SVM metrics:", svm_metrics)

# Keep whichever model has the higher test ROC AUC (ties go to the Random Forest).
# NOTE(review): choosing the winner on the test split makes its reported test
# metrics optimistic; consider comparing gs.best_score_ and rs.best_score_ instead.
final_model = best_rf if rf_metrics["roc_auc"] >= svm_metrics["roc_auc"] else best_svm

# Create the output folder first: joblib.dump does not create missing parent
# directories, so a fresh checkout would otherwise fail after the long searches.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, MODELS_DIR / "final_model.pkl")
print("Saved final model.")


Best RF: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
Best SVM: {'gamma': 'scale', 'C': np.float64(100.0)}
RF metrics: {'accuracy': 0.9016393442622951, 'precision': 0.8666666666666667, 'recall': 0.9285714285714286, 'f1': 0.896551724137931, 'roc_auc': 0.9577922077922079}
SVM metrics: {'accuracy': 0.8032786885245902, 'precision': 0.7666666666666667, 'recall': 0.8214285714285714, 'f1': 0.7931034482758621, 'roc_auc': 0.9047619047619048}
Saved final model.
