# In [None]:
"""
ML baseline comparison for “Heart Prediction Quantum Dataset.csv”
-----------------------------------------------------------------
‣ Three algorithms: Logistic Regression, Random-Forest, Gradient-Boosting  
‣ Stratified train / validation / test split with seed = 40  
‣ Prints a metric table and indicates the best model (by validation ROC-AUC)

Note: metrics are rounded with Python's built-in round(x, 3); calling
.round() on a plain float raises AttributeError (np.round also works).
"""

# ─────────────────────────────────────
# 0. Imports & configuration
# ─────────────────────────────────────
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import StandardScaler
from sklearn.compose        import ColumnTransformer
from sklearn.pipeline       import Pipeline
from sklearn.metrics        import (accuracy_score, roc_auc_score, f1_score,
                                    precision_score, recall_score)

from sklearn.linear_model    import LogisticRegression
from sklearn.ensemble        import RandomForestClassifier, GradientBoostingClassifier

# Seed shared by every split and every estimator so runs are reproducible.
RANDOM_STATE = 40
# Dataset location, relative to the current working directory.
CSV_PATH = Path("Heart Prediction Quantum Dataset.csv")

# ─────────────────────────────────────
# 1. Load data
# ─────────────────────────────────────
df = pd.read_csv(CSV_PATH)

# Use the "HeartDisease" column as the label when it exists; otherwise fall
# back to the last column of the file.
if "HeartDisease" in df.columns:
    target = "HeartDisease"
else:
    target = df.columns[-1]

X = df.drop(columns=[target])
y = df[target]

# Every remaining column is handed to the scaler (assumes all are numeric).
numeric_features = list(X.columns)

# ─────────────────────────────────────
# 2. Split: train / val / test  (60 % / 20 % / 20 %)
# ─────────────────────────────────────
# First cut: hold out 20 % of the full dataset as the final test set,
# stratified on the label.
holdout_opts = {"test_size": 0.20, "stratify": y, "random_state": RANDOM_STATE}
X_temp, X_test, y_temp, y_test = train_test_split(X, y, **holdout_opts)

# Second cut: a quarter of the remaining 80 % becomes the validation set,
# i.e. 20 % of the whole dataset; the rest (60 %) is the training set.
validation_opts = {"test_size": 0.25, "stratify": y_temp, "random_state": RANDOM_STATE}
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, **validation_opts)

# ─────────────────────────────────────
# 3. Pre-processing pipeline (scaling)
# ─────────────────────────────────────
def _make_scaler():
    """Return a fresh, unfitted transformer that standardises `numeric_features`.

    A new instance is built on every call so that each pipeline owns its own
    scaler. Pipeline.fit fits the step objects it is given, so two pipelines
    sharing one transformer instance would overwrite each other's fitted state
    whenever either of them is refit.
    """
    return ColumnTransformer(
        transformers=[("scale", StandardScaler(), numeric_features)],
        remainder="passthrough",
    )


# Kept so that code referring to `scaler` directly keeps working; the
# pipelines below no longer share this instance.
scaler = _make_scaler()

# ─────────────────────────────────────
# 4. Define models
# ─────────────────────────────────────
models = {
    "LogisticRegression": Pipeline([
        ("scale", _make_scaler()),
        ("clf", LogisticRegression(max_iter=200, solver="liblinear",
                                   random_state=RANDOM_STATE))
    ]),

    "RandomForest": Pipeline([
        ("scale", "passthrough"),                 # scaling not required for trees
        ("clf", RandomForestClassifier(
            n_estimators=200,
            max_depth=None,
            random_state=RANDOM_STATE))
    ]),

    "GradientBoosting": Pipeline([
        # Scaling is kept here to preserve the original pipeline definition,
        # but with an independent scaler instance.
        ("scale", _make_scaler()),
        ("clf", GradientBoostingClassifier(random_state=RANDOM_STATE))
    ])
}

# ─────────────────────────────────────
# 5. Utility: compute metrics
# ─────────────────────────────────────
def safe_roc_auc(y_true, y_score):
    """Return the ROC-AUC of `y_score` against `y_true`, or NaN if it is undefined.

    roc_auc_score raising ValueError (e.g. when only one class is present in
    `y_true`) is translated into `np.nan` instead of propagating.
    """
    try:
        auc = roc_auc_score(y_true, y_score)
    except ValueError:
        auc = np.nan
    return auc

def get_metrics(model, X_part, y_part):
    """Compute rounded classification metrics for a fitted model on one data split.

    Parameters
    ----------
    model
        Fitted estimator exposing `predict` and, optionally, `predict_proba`
        or `decision_function`.
    X_part, y_part
        Features and true labels of the split to evaluate.

    Returns
    -------
    dict
        Keys "ACC", "F1", "PRE", "REC" and "ROC_AUC", each rounded to three
        decimals. "ROC_AUC" is NaN when the model exposes no continuous score
        or when the score is undefined for this split.
    """
    preds = model.predict(X_part)

    # Prefer the positive-class probability (column 1, i.e. a binary target is
    # assumed); fall back to the raw decision scores, which roc_auc_score
    # accepts as well.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_part)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_part)
    else:
        scores = None

    # zero_division=0 is passed to every ratio metric so that a split with no
    # predicted or no actual positives yields 0 consistently, without warnings.
    return {
        "ACC"     : round(accuracy_score (y_part, preds), 3),
        "F1"      : round(f1_score        (y_part, preds, zero_division=0), 3),
        "PRE"     : round(precision_score (y_part, preds, zero_division=0), 3),
        "REC"     : round(recall_score    (y_part, preds, zero_division=0), 3),
        "ROC_AUC" : round(safe_roc_auc(y_part, scores), 3) if scores is not None else np.nan
    }

# ─────────────────────────────────────
# 6. Train & evaluate
# ─────────────────────────────────────
# Fit every pipeline on the training split only.
for name, pipe in models.items():
    pipe.fit(X_train, y_train)

# Score each fitted pipeline on the two held-out splits, keyed by model name.
results_val = {
    name: get_metrics(pipe, X_val, y_val) for name, pipe in models.items()
}
results_test = {
    name: get_metrics(pipe, X_test, y_test) for name, pipe in models.items()
}

# ─────────────────────────────────────
# 7. Show comparison table
# ─────────────────────────────────────
# One row per model; validation and test metrics side by side, distinguished
# by the "val_" / "test_" column prefixes.
val_df  = pd.DataFrame(results_val ).T.add_prefix("val_")
test_df = pd.DataFrame(results_test).T.add_prefix("test_")
summary = pd.concat([val_df, test_df], axis=1)

print("\nMetric comparison (validation → test)")
print(summary)

# safe_roc_auc can yield NaN, so rank only the models with a defined
# validation ROC_AUC and handle the case where none has one instead of
# failing inside idxmax / .loc.
val_auc = summary["val_ROC_AUC"].dropna()
if val_auc.empty:
    best_model_name = None
    print("\n≈≈≈ No model can be recommended: "
          "validation ROC_AUC is undefined for every model")
else:
    best_model_name = val_auc.idxmax()
    print(f"\n≈≈≈ Recommended model: {best_model_name} "
          f"(highest validation ROC_AUC = {summary.loc[best_model_name, 'val_ROC_AUC']})")


"""
Why the winner is likely the best
---------------------------------
• Logistic Regression provides a strong linear baseline; good if relationships are mostly linear.
• Random-Forest captures non-linear feature interactions but can over-fit on tiny datasets.
• Gradient-Boosting often outperforms RF on small-to-medium tabular data thanks to stage-wise fitting and
  built-in regularisation.
Choose the algorithm with the highest validation ROC_AUC; use the test metrics only as a final check that this result generalises, not as a second selection criterion.
"""
