# Heart Disease Prediction - V11
## Simplified: 5-Fold CV with 9 Models (3 Seeds × 3 Algorithms)

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

## Load and Prepare Data

In [None]:
# Load the raw train/test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

target_column = 'Heart Disease'

# Features are every column except the label and the row identifier.
X_train = train_df.drop([target_column, 'id'], axis=1)
# Encode the string label as 0/1; any other label value becomes NaN here.
y_train = train_df[target_column].map({'Absence': 0, 'Presence': 1})
X_test = test_df.drop('id', axis=1)
test_ids = test_df['id'].values

# Fail early with a clear message instead of passing NaN labels to the models.
if y_train.isna().any():
    unexpected = sorted(train_df.loc[y_train.isna(), target_column].astype(str).unique())
    raise ValueError(
        f"Unexpected values in '{target_column}': {unexpected}; "
        "expected only 'Absence' or 'Presence'"
    )

# Impute with medians computed on the TRAINING features only and reuse the
# same values for the test features, so both tables go through an identical
# transformation (previously the test set was filled with its own medians).
train_medians = X_train.median()
X_train = X_train.fillna(train_medians).reset_index(drop=True)
X_test = X_test.fillna(train_medians).reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

print(f"Train: {X_train.shape} | Test: {X_test.shape}")
print(f"Target distribution:\n{y_train.value_counts()}")

## Train 9 Models with 5-Fold CV

In [None]:
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
SEEDS = [42, 123, 2024]

# One entry per (seed, algorithm) run, appended in the order
# seed-major / XGB, LGB, CAT.
all_oof = []     # out-of-fold predictions on the training rows
all_test = []    # fold-averaged predictions on the test rows
all_scores = []  # list of per-fold validation AUCs


def _build_xgb(seed):
    """Return an unfitted XGBoost classifier using ``seed`` as random_state."""
    return xgb.XGBClassifier(
        n_estimators=5000, max_depth=5, learning_rate=0.005,
        subsample=0.8, colsample_bytree=0.8, min_child_weight=5,
        reg_alpha=0.1, reg_lambda=2.0, gamma=0.1,
        random_state=seed, n_jobs=-1,
        eval_metric='auc', early_stopping_rounds=200,
    )


def _build_lgb(seed):
    """Return an unfitted LightGBM classifier using ``seed`` as random_state."""
    return lgb.LGBMClassifier(
        n_estimators=5000, max_depth=5, learning_rate=0.005,
        # LightGBM only applies row subsampling when subsample_freq > 0; with
        # the default of 0 the subsample=0.8 setting is silently ignored.
        subsample=0.8, subsample_freq=1,
        colsample_bytree=0.8, min_child_samples=30,
        reg_alpha=0.1, reg_lambda=2.0, num_leaves=31,
        random_state=seed, n_jobs=-1, verbose=-1,
    )


def _build_cat(seed):
    """Return an unfitted CatBoost classifier using ``seed`` as random_state."""
    return CatBoostClassifier(
        iterations=5000, depth=5, learning_rate=0.005,
        l2_leaf_reg=5, random_state=seed, verbose=False,
        thread_count=-1, early_stopping_rounds=200,
        eval_metric='AUC',
    )


def _fit_xgb(model, X_tr, y_tr, X_val, y_val):
    """Fit XGBoost with the validation fold supplied as the eval set."""
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)


def _fit_lgb(model, X_tr, y_tr, X_val, y_val):
    """Fit LightGBM with early stopping (200 rounds) on the validation fold."""
    # NOTE(review): no eval_metric is passed, so early stopping presumably
    # follows LightGBM's default metric for the objective rather than AUC as
    # configured for the other two libraries -- confirm this is intended.
    model.fit(X_tr, y_tr,
              eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(200, verbose=False),
                         lgb.log_evaluation(-1)])


def _fit_cat(model, X_tr, y_tr, X_val, y_val):
    """Fit CatBoost with the validation fold supplied as the eval set."""
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=False)


def _run_cv(tag, seed, build_model, fit_model, X, y, X_holdout, folds):
    """Cross-validate one (algorithm, seed) combination.

    Args:
        tag: Short algorithm label used in progress messages.
        seed: Value forwarded to ``build_model``.
        build_model: Callable ``seed -> unfitted classifier``.
        fit_model: Callable ``(model, X_tr, y_tr, X_val, y_val)`` that fits
            the model in place.
        X, y: Training features (DataFrame) and labels (Series), positionally
            aligned.
        X_holdout: Features to predict with every fold model.
        folds: Sequence of ``(train_idx, val_idx)`` positional index pairs.

    Returns:
        Tuple ``(oof, holdout_pred, fold_scores)``: out-of-fold positive-class
        probabilities for ``X``, the mean of the fold models' probabilities
        for ``X_holdout``, and the list of per-fold validation AUCs.
    """
    n_splits = len(folds)
    oof = np.zeros(len(X))
    holdout_pred = np.zeros(len(X_holdout))
    fold_scores = []

    for fold, (tr_idx, val_idx) in enumerate(folds):
        print(f"  {tag} seed={seed} | Fold {fold+1}/{n_splits}")
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        model = build_model(seed)
        fit_model(model, X.iloc[tr_idx], y.iloc[tr_idx], X_val, y_val)
        oof[val_idx] = model.predict_proba(X_val)[:, 1]
        # Each fold contributes 1/n_splits, so the total is the fold average.
        holdout_pred += model.predict_proba(X_holdout)[:, 1] / n_splits
        fold_scores.append(roc_auc_score(y_val, oof[val_idx]))

    print(f"   {tag} seed={seed} AUC: {np.mean(fold_scores):.6f}\n")
    return oof, holdout_pred, fold_scores


# (label, model factory, fit routine) for each algorithm, in reporting order.
ALGORITHMS = [
    ('XGB', _build_xgb, _fit_xgb),
    ('LGB', _build_lgb, _fit_lgb),
    ('CAT', _build_cat, _fit_cat),
]

# The splitter has a fixed random_state, so every split() call yields the
# same folds; compute them once and share them across all runs.
cv_folds = list(skf.split(X_train, y_train))

print(f"Training {len(SEEDS) * len(ALGORITHMS)} models × {n_folds} folds...\n")

for seed in SEEDS:
    for tag, build_model, fit_model in ALGORITHMS:
        oof, test_pred, fold_scores = _run_cv(
            tag, seed, build_model, fit_model,
            X_train, y_train, X_test, cv_folds,
        )
        all_oof.append(oof)
        all_test.append(test_pred)
        all_scores.append(fold_scores)

## Model Performance Summary

In [None]:
# Report the mean and standard deviation of the per-fold AUCs for every run.
# Labels are generated seed-major with XGB, LGB, CAT inside each seed, which
# is the order in which the training cell appended to all_scores.
separator = "=" * 50
print(separator)
algo_tags = ['XGB', 'LGB', 'CAT']
labels = [f'{m}-s{s}' for s in SEEDS for m in algo_tags]
for i, label in enumerate(labels):
    fold_scores = all_scores[i]
    mean_auc = np.mean(fold_scores)
    auc_std = np.std(fold_scores)
    print(f"{label:12s}: {mean_auc:.6f} (+/- {auc_std:.6f})")

## Ensemble Methods Evaluation

In [None]:
# Simple average (most robust)
# Stack the per-model prediction vectors into (n_models, n_rows) arrays and
# take the unweighted mean across models for each row.
oof_matrix = np.asarray(all_oof)
test_matrix = np.asarray(all_test)
simple_oof = oof_matrix.mean(axis=0)
simple_test = test_matrix.mean(axis=0)

# Score the blended out-of-fold predictions against the training labels.
simple_auc = roc_auc_score(y_train, simple_oof)
print(f"Simple Average OOF AUC:  {simple_auc:.6f}")

# Rank average
def rank_avg(preds_list):
    """Average the percentile ranks of several prediction vectors.

    Each vector in ``preds_list`` is converted to percentile ranks
    (``pandas.Series.rank(pct=True)``) independently; the result is the
    element-wise mean of those ranks as a NumPy array.

    Args:
        preds_list: Indexable, sized collection of equal-length 1-D
            prediction sequences.

    Returns:
        1-D ``numpy.ndarray`` with the same length as the first vector.
    """
    pct_ranks = [pd.Series(scores).rank(pct=True).to_numpy() for scores in preds_list]
    total = np.zeros(len(preds_list[0]))
    for ranks in pct_ranks:
        total += ranks
    return total / len(preds_list)

# Blend by percentile rank; OOF and test vectors are ranked separately,
# each within its own set of rows.
rank_oof, rank_test = rank_avg(all_oof), rank_avg(all_test)

# AUC of the rank-blended out-of-fold predictions against the training labels.
rank_auc = roc_auc_score(y_train, rank_oof)
print(f"Rank Average OOF AUC:    {rank_auc:.6f}")

# Meta-model (stacking): logistic regression on the base models' predictions.
# Column m{i} holds the predictions of the i-th run in all_oof / all_test.
meta_train_df = pd.DataFrame({f'm{i}': preds for i, preds in enumerate(all_oof)})
meta_test_df = pd.DataFrame({f'm{i}': preds for i, preds in enumerate(all_test)})

# Score the stacker out-of-fold: each row is predicted by a meta-model that
# was NOT fitted on that row. Fitting and predicting on the same rows gives an
# in-sample AUC, which is not comparable with the simple/rank OOF AUCs that
# the ensemble selection step ranks it against.
meta_oof = np.zeros(len(meta_train_df))
meta_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for meta_tr_idx, meta_val_idx in meta_skf.split(meta_train_df, y_train):
    fold_meta = LogisticRegression(C=0.01, max_iter=1000, random_state=42)
    fold_meta.fit(meta_train_df.iloc[meta_tr_idx], y_train.iloc[meta_tr_idx])
    meta_oof[meta_val_idx] = fold_meta.predict_proba(
        meta_train_df.iloc[meta_val_idx])[:, 1]
meta_auc = roc_auc_score(y_train, meta_oof)

# The model used for the test predictions is fitted on all training rows.
meta_model = LogisticRegression(C=0.01, max_iter=1000, random_state=42)
meta_model.fit(meta_train_df, y_train)
meta_test_preds = meta_model.predict_proba(meta_test_df)[:, 1]
print(f"Meta-model OOF AUC:      {meta_auc:.6f}")

## Final Submission

In [None]:
# Pick best
# Each candidate maps to (OOF AUC, test-set predictions); the candidate with
# the highest OOF AUC wins, ties going to the earliest entry.
scores_dict = {
    'simple': (simple_auc, simple_test),
    'rank': (rank_auc, rank_test),
    'meta': (meta_auc, meta_test_preds),
}
best_name, (best_auc, best_test) = max(
    scores_dict.items(), key=lambda item: item[1][0]
)
print(f"Best ensemble: {best_name} (OOF AUC: {best_auc:.6f})\n")

# Save
# Clamp predictions into [0, 1] before writing them next to the test ids.
clipped_preds = np.clip(best_test, 0, 1)
submission = pd.DataFrame({'id': test_ids, 'Heart Disease': clipped_preds})
submission.to_csv('submission.csv', index=False)

# Quick sanity output: first rows and the range of written predictions.
print("Submission saved!")
print(submission.head(10))
pred_column = submission['Heart Disease']
print(f"Range: [{pred_column.min():.6f}, {pred_column.max():.6f}]")
print("File: submission.csv")