In [1]:
# 06_hyperparameter_tuning.ipynb

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

# --- Load preprocessed dataset ---
df_clean = pd.read_csv("heart_disease_clean.csv")

# --- Prepare features & target ---
# Every column except "num" is a feature; the target is binarised to
# 1 when num > 0 and 0 otherwise.
X = df_clean.drop("num", axis=1)
y = (df_clean["num"] > 0).astype(int)

# Columns that get standardized; all remaining columns are passed through as-is.
continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

# --- Train/Test Split ---
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance applied to the training data
# (data leakage). The split is stratified on y and seeded, so row membership
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Standardize continuous features ---
# Work on explicit copies so the column assignment below never writes into a
# view of X. Note that X itself is left unscaled.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn scaling statistics from the training rows only, then reuse those same
# statistics for the test rows.
scaler = StandardScaler()
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])
# NOTE(review): the scaler still sees all of X_train, so any cross-validation
# run on X_train shares scaling statistics across folds; putting the scaler
# and model in a single Pipeline would remove that as well.

# --- Baseline Random Forest ---
# Reference point for the tuned models: a forest with default hyperparameters
# (seeded for repeatability), fitted on the training split and scored by
# accuracy on the held-out split.
baseline_rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
baseline_acc = baseline_rf.score(X_test, y_test)
print("Baseline Random Forest Accuracy:", baseline_acc)

# --- GridSearchCV ---
# Exhaustive search: every combination in the grid below
# (3 * 4 * 3 * 3 = 108 candidates) is evaluated with 5-fold
# cross-validation on the training split, ranked by accuracy.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),  # same seed as the baseline forest
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,  # run candidate fits in parallel on all available cores
).fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best parameters (GridSearchCV):", grid_search.best_params_)
print("Best CV Accuracy (GridSearchCV):", grid_search.best_score_)

# --- RandomizedSearchCV ---
# Sampled search: instead of trying every combination, draw 20 candidates
# from the (larger) space below and score each with 5-fold cross-validation
# on the training split. The search itself is seeded so the same candidates
# are drawn on every run.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 6],
    max_features=["sqrt", "log2", None],
)

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),  # same seed as the baseline forest
    param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,  # run candidate fits in parallel on all available cores
    random_state=42,
).fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best parameters (RandomizedSearchCV):", random_search.best_params_)
print("Best CV Accuracy (RandomizedSearchCV):", random_search.best_score_)

# --- Compare on test set ---
# Score the refitted best model from each search on the held-out split so it
# can be lined up against the baseline accuracy computed earlier.
grid_acc = grid_search.best_estimator_.score(X_test, y_test)
random_acc = random_search.best_estimator_.score(X_test, y_test)

print("\nPerformance Comparison on Test Set:")
# One "label value" line per model, in the order baseline -> grid -> randomized.
for label, accuracy in (
    ("Baseline Random Forest:", baseline_acc),
    ("GridSearchCV Optimized:", grid_acc),
    ("RandomizedSearchCV Optimized:", random_acc),
):
    print(label, accuracy)


Baseline Random Forest Accuracy: 0.8852459016393442
Best parameters (GridSearchCV): {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Accuracy (GridSearchCV): 0.8179421768707483
Best parameters (RandomizedSearchCV): {'n_estimators': 400, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
Best CV Accuracy (RandomizedSearchCV): 0.8179421768707483

Performance Comparison on Test Set:
Baseline Random Forest: 0.8852459016393442
GridSearchCV Optimized: 0.9016393442622951
RandomizedSearchCV Optimized: 0.8852459016393442
