In [3]:
# ============================================================
# Feature Engineering, Model Training & Evaluation (Optimized)
# ============================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ------------------------------------------------------------
# Load cleaned data
# ------------------------------------------------------------
df = pd.read_csv("../data/processed/heart_disease_clean.csv")

X = df.drop(columns="target")
y = df["target"]

# ------------------------------------------------------------
# Feature groups
# ------------------------------------------------------------
numerical_features = [
    "age", "trestbps", "chol", "thalach", "oldpeak", "ca"
]

categorical_features = [
    "sex", "cp", "fbs", "restecg", "exang", "slope", "thal"
]

# ------------------------------------------------------------
# Preprocessing
# ------------------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", "passthrough", categorical_features)
    ]
)

# ------------------------------------------------------------
# Cross-validation strategy
# ------------------------------------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "roc_auc": "roc_auc"
}

# ============================================================
# Model 1: Logistic Regression (Optimized)
# ============================================================

logreg_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LogisticRegression(
        max_iter=1000,
        solver="liblinear"
    ))
])

logreg_param_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__penalty": ["l1", "l2"]
}

logreg_grid = GridSearchCV(
    logreg_pipeline,
    logreg_param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1
)

logreg_grid.fit(X, y)
best_logreg = logreg_grid.best_estimator_

logreg_scores = cross_validate(
    best_logreg, X, y, cv=cv, scoring=scoring
)

# ============================================================
# Model 2: Random Forest (Optimized)
# ============================================================

rf_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(
        random_state=42
    ))
])

rf_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 5, 10],
    "model__min_samples_split": [2, 5]
}

rf_grid = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1
)

rf_grid.fit(X, y)
best_rf = rf_grid.best_estimator_

rf_scores = cross_validate(
    best_rf, X, y, cv=cv, scoring=scoring
)

# ============================================================
# Results
# ============================================================

def print_scores(name, scores):
    print(f"\n{name} (Mean CV Scores)")
    for metric in scoring:
        print(f"{metric}: {scores['test_' + metric].mean():.3f}")

print_scores("Logistic Regression", logreg_scores)
print_scores("Random Forest", rf_scores)



Logistic Regression (Mean CV Scores)
accuracy: 0.835
precision: 0.833
recall: 0.806
roc_auc: 0.911

Random Forest (Mean CV Scores)
accuracy: 0.855
precision: 0.876
recall: 0.798
roc_auc: 0.917
