In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# -----------------------------
# 1. Load dataset (selected-features CSV) and build train/test matrices
# -----------------------------
df = pd.read_csv("../data/heart_selected_features.csv")

# Separate the label column from the predictor columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing; stratify on the label so both
# splits keep the same class proportions. Fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
# so no test-set statistics leak into the features.
# NOTE(review): the searches further down run 5-fold CV on this already
# scaled X_train, so the CV folds share scaling statistics. Wrapping the
# scaler and the model in a Pipeline would avoid that - confirm whether it
# matters for this dataset.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



# -----------------------------
# 2. Random Forest - RandomizedSearchCV
# -----------------------------
# Base estimator; the seed keeps the forest itself reproducible.
rf = RandomForestClassifier(random_state=42)

# Candidate values for each hyperparameter (4 * 4 * 3 * 3 * 2 = 288 combos).
rf_param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

# Sample 20 of the combinations, score each with 5-fold CV on F1, and use
# all available cores. fit() returns the search object itself.
rf_random = RandomizedSearchCV(
    rf,
    rf_param_grid,
    n_iter=20,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Keep the refit winner and report how it does on the held-out test split.
best_rf = rf_random.best_estimator_
print("\nBest RF Params (RandomizedSearchCV): ", rf_random.best_params_)
y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))



# -----------------------------
# 3. SVM - GridSearchCV
# -----------------------------
svm = SVC()

# Full grid: 3 * 3 * 2 = 18 candidates, every one of them is evaluated.
# NOTE(review): per the scikit-learn docs, gamma only applies to the
# rbf/poly/sigmoid kernels, so the linear-kernel candidates are fitted
# twice with identical results. Harmless, but a list of per-kernel grids
# would skip the redundant fits.
svm_param_grid = dict(
    C=[0.1, 1, 10],
    kernel=["linear", "rbf", "poly"],
    gamma=["scale", "auto"],
)

# Exhaustive search scored with 5-fold CV on F1, using all available cores.
# fit() returns the search object itself.
svm_grid = GridSearchCV(
    svm,
    svm_param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the refit winner and report how it does on the held-out test split.
best_svm = svm_grid.best_estimator_
print("\nBest SVM Params (GridSearchCV):", svm_grid.best_params_)
y_pred = best_svm.predict(X_test)
print(classification_report(y_test, y_pred))



# -----------------------------
# 4. Compare optimized models
# -----------------------------
# Tuned estimators to score, keyed by the label shown in the table.
tuned_models = {
    "Random Forest (tuned)": best_rf,
    "SVM (tuned)": best_svm,
}

# Metric functions, in the same order as the result columns after "Model".
metric_fns = (accuracy_score, precision_score, recall_score, f1_score)

# One row per model: its label followed by each metric on the test split.
results = []
for model_name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    results.append([model_name, *(fn(y_test, y_pred) for fn in metric_fns)])

results_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1"],
)
print("\nTuned Model Comparison")
print(results_df)


Best RF Params (RandomizedSearchCV):  {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5}
              precision    recall  f1-score   support

           0       0.80      0.88      0.84        32
           1       0.84      0.75      0.79        28

    accuracy                           0.82        60
   macro avg       0.82      0.81      0.81        60
weighted avg       0.82      0.82      0.82        60


Best SVM Params (GridSearchCV): {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.78      0.91      0.84        32
           1       0.87      0.71      0.78        28

    accuracy                           0.82        60
   macro avg       0.83      0.81      0.81        60
weighted avg       0.82      0.82      0.81        60


Tuned Model Comparison
                   Model  Accuracy  Precision    Recall        F1
0  Random Forest (tuned)  0.

📊 OUTPUT

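- Runs `RandomizedSearchCV` for Random Forest: it samples 20 of the 288 possible parameter combinations, so a wide search space is covered quickly.
- Runs `GridSearchCV` for SVM: the grid is smaller (18 combinations) and is searched exhaustively.
- Prints the best hyperparameters found for each model, followed by a classification report on the held-out test set.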

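Best performing model: Random Forest (RF). Both tuned models reach the same test accuracy (0.82), but RF has the higher recall on the positive class (0.75 vs. 0.71 for the SVM).
Since this is heart disease prediction, recall is usually the more important metric: it is better to detect more patients with disease, even if that means a few false alarms.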