# Heart Disease Prediction - V13
## Simpler Models with Target Encoding & Reduced Overfitting

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

## Load and Prepare Data

In [None]:
# Load the competition files; both are read from the current working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

target_column = 'Heart Disease'
X_train = train_df.drop([target_column, 'id'], axis=1)
y_train = train_df[target_column].map({'Absence': 0, 'Presence': 1})
X_test = test_df.drop('id', axis=1)
test_ids = test_df['id'].values

# Any label other than 'Absence'/'Presence' (or a missing label) maps to NaN;
# stop here with a clear message instead of failing later inside the CV loop.
if y_train.isna().any():
    raise ValueError(
        f"Column '{target_column}' contains values other than "
        "'Absence'/'Presence'"
    )

# Impute both frames with the TRAIN medians so train and test go through the
# same preprocessing (the test set must not define its own fill values).
# numeric_only=True keeps median() well-defined if a non-numeric column exists.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians).reset_index(drop=True)
X_test = X_test.fillna(train_medians).reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

print(f"Train: {X_train.shape} | Test: {X_test.shape}")

## Target Encoding (Inside CV Loop)

In [None]:
def target_encode(X_tr, y_tr, X_val, X_te, cols, smoothing=20):
    """Add smoothed target-mean columns computed from the training fold only.

    For every column in ``cols`` a new ``<col>_te`` column is appended to
    copies of the three frames. The encoding of a category is

        (sum of y in category + prior * smoothing) / (count + smoothing)

    where ``prior`` is the mean of ``y_tr``. Categories that do not occur in
    ``X_tr`` (and missing values) are encoded as ``prior``.

    Note that rows of ``X_tr`` are encoded with statistics that include their
    own label; only ``X_val`` and ``X_te`` are encoded out-of-sample.

    Args:
        X_tr: Training-fold features; the only frame statistics are fitted on.
        y_tr: Training-fold target, index-aligned with ``X_tr``.
        X_val: Validation-fold features.
        X_te: Test features.
        cols: Names of the columns to encode.
        smoothing: Weight of the prior, expressed as a pseudo-count.

    Returns:
        Tuple ``(X_tr_enc, X_val_enc, X_te_enc)`` of new frames; the inputs
        are left unmodified.
    """
    prior = y_tr.mean()
    sources = (X_tr, X_val, X_te)
    encoded = tuple(frame.copy() for frame in sources)

    for col in cols:
        per_category = y_tr.groupby(X_tr[col])
        numerator = per_category.sum() + prior * smoothing
        denominator = per_category.count() + smoothing
        mapping = numerator / denominator

        te_name = f'{col}_te'
        for source, target in zip(sources, encoded):
            # Unseen categories map to NaN and fall back to the prior.
            target[te_name] = source[col].map(mapping).fillna(prior)

    return encoded

# Columns treated as categorical: each one gets a `<col>_te` companion column
# when passed as `cols` to target_encode.
TE_COLS = [
    'Chest pain type',
    'Thallium',
    'Slope of ST',
    'Number of vessels fluro',
    'EKG results',
]

## 5-Fold Cross-Validation with 2 Seeds

In [None]:
n_folds = 5
SEEDS = [42, 123]
# The splitter's random_state is fixed and independent of SEEDS, so the seed
# only changes each model's own random_state, not the fold assignment.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# One entry per (seed, model) run, appended in the order LGB, CAT, XGB within
# each seed. Downstream cells rely on this ordering when labelling the runs.
all_oof = []
all_test = []
all_scores = []


def _fit_lgb(seed, X_tr, y_tr, X_val, y_val):
    """Fit LightGBM with early stopping monitored on the validation fold."""
    # NOTE(review): no eval_metric is passed, so early stopping presumably
    # tracks LightGBM's default binary metric rather than AUC like the other
    # two models - confirm this is intended.
    m = lgb.LGBMClassifier(
        n_estimators=10000,
        max_depth=-1,
        learning_rate=0.003,
        subsample=0.7,
        subsample_freq=1,
        colsample_bytree=0.7,
        min_child_samples=50,
        reg_alpha=0.1,
        reg_lambda=5.0,
        num_leaves=31,
        min_split_gain=0.01,
        random_state=seed,
        n_jobs=-1,
        verbose=-1
    )
    m.fit(X_tr, y_tr,
          eval_set=[(X_val, y_val)],
          callbacks=[lgb.early_stopping(300, verbose=False),
                     lgb.log_evaluation(-1)])
    return m


def _fit_cat(seed, X_tr, y_tr, X_val, y_val):
    """Fit CatBoost with AUC-based early stopping on the validation fold."""
    m = CatBoostClassifier(
        iterations=10000,
        depth=5,
        learning_rate=0.003,
        l2_leaf_reg=10,
        min_data_in_leaf=50,
        random_strength=1,
        bagging_temperature=0.5,
        random_state=seed,
        verbose=False,
        thread_count=-1,
        early_stopping_rounds=300,
        eval_metric='AUC'
    )
    m.fit(X_tr, y_tr,
          eval_set=(X_val, y_val),
          verbose=False)
    return m


def _fit_xgb(seed, X_tr, y_tr, X_val, y_val):
    """Fit XGBoost with AUC-based early stopping on the validation fold."""
    m = xgb.XGBClassifier(
        n_estimators=10000,
        max_depth=4,
        learning_rate=0.003,
        subsample=0.7,
        colsample_bytree=0.7,
        min_child_weight=10,
        reg_alpha=0.1,
        reg_lambda=5.0,
        gamma=0.2,
        random_state=seed,
        n_jobs=-1,
        eval_metric='auc',
        early_stopping_rounds=300
    )
    m.fit(X_tr, y_tr,
          eval_set=[(X_val, y_val)],
          verbose=False)
    return m


def _run_cv(name, fit_fn, seed):
    """Run one model through all folds.

    Returns a tuple ``(oof, test_pred, scores)``: out-of-fold probabilities for
    every training row, test probabilities averaged over the folds, and the
    list of per-fold validation AUCs.
    """
    oof = np.zeros(len(X_train))
    test_pred = np.zeros(len(X_test))
    scores = []

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        print(f"  {name} s={seed} fold={fold+1}/{n_folds}")

        y_tr = y_train.iloc[tr_idx]
        y_val = y_train.iloc[val_idx]
        # Encoding statistics are fitted on the training part of the fold only.
        X_tr_enc, X_val_enc, X_te_enc = target_encode(
            X_train.iloc[tr_idx], y_tr,
            X_train.iloc[val_idx], X_test, TE_COLS
        )

        m = fit_fn(seed, X_tr_enc, y_tr, X_val_enc, y_val)

        oof[val_idx] = m.predict_proba(X_val_enc)[:, 1]
        # Each fold contributes 1/n_folds of the final test prediction.
        test_pred += m.predict_proba(X_te_enc)[:, 1] / n_folds
        scores.append(roc_auc_score(y_val, oof[val_idx]))

    return oof, test_pred, scores


# (model label, prefix of the per-run summary line, fit function).
# The prefixes are kept verbatim so the printed log is unchanged.
_MODEL_SPECS = [
    ('LGB', '   ', _fit_lgb),
    ('CAT', '  ✅ ', _fit_cat),
    ('XGB', '   ', _fit_xgb),
]

# Derive the model count so the message stays correct when SEEDS or the model
# list changes.
print(f"Training {len(SEEDS) * len(_MODEL_SPECS)} models × {n_folds} folds...\n")

for seed in SEEDS:
    for name, summary_prefix, fit_fn in _MODEL_SPECS:
        oof, test_pred, scores = _run_cv(name, fit_fn, seed)

        all_oof.append(oof)
        all_test.append(test_pred)
        all_scores.append(scores)
        print(f"{summary_prefix}{name} s={seed}: {np.mean(scores):.6f}\n")

## Model Performance Summary

In [None]:
# Per-run summary: mean and standard deviation of the fold AUCs.
print("=" * 50)
# Seed is the outer loop and model the inner loop, so labels[i] lines up with
# all_scores[i] (runs were appended as LGB, CAT, XGB for each seed in turn).
labels = [
    f'{model}-s{seed_value}'
    for seed_value in SEEDS
    for model in ('LGB', 'CAT', 'XGB')
]
for i, label in enumerate(labels):
    fold_scores = all_scores[i]
    mean_auc = np.mean(fold_scores)
    std_auc = np.std(fold_scores)
    print(f"{label:12s}: {mean_auc:.6f} (+/- {std_auc:.6f})")

## Ensemble Methods Evaluation

In [None]:
# Simple average: unweighted mean over all runs, element-wise per row
simple_oof, simple_test = (
    np.stack(preds).mean(axis=0) for preds in (all_oof, all_test)
)
simple_auc = roc_auc_score(y_true=y_train, y_score=simple_oof)
print(f"Simple Average OOF AUC: {simple_auc:.6f}")

# Rank average
def rank_avg(preds_list):
    """Average the percentile ranks of several prediction vectors.

    Each vector in ``preds_list`` is converted to percentile ranks with
    ``Series.rank(pct=True)`` and the ranks are averaged element-wise. All
    vectors are expected to have the length of the first one.

    Returns:
        A NumPy array with one averaged percentile rank per position.
    """
    total = np.zeros(len(preds_list[0]))
    for preds in preds_list:
        pct_ranks = pd.Series(preds).rank(pct=True).to_numpy()
        np.add(total, pct_ranks, out=total)
    return total / len(preds_list)

# Rank-average the OOF and test predictions separately, then score the OOF part.
rank_oof, rank_test = (rank_avg(preds) for preds in (all_oof, all_test))
rank_auc = roc_auc_score(y_true=y_train, y_score=rank_oof)
print(f"Rank Average OOF AUC:   {rank_auc:.6f}")

# Meta-model: logistic regression stacked on the base models' predictions
from sklearn.model_selection import cross_val_predict

meta = LogisticRegression(C=0.01, max_iter=1000, random_state=42)
mtr = pd.DataFrame({f'm{i}': oof for i, oof in enumerate(all_oof)})
mte = pd.DataFrame({f'm{i}': pred for i, pred in enumerate(all_test)})

# Score the meta-model on predictions for rows it was NOT fitted on. Scoring
# meta.fit(mtr).predict_proba(mtr) would be an in-sample AUC, which is
# optimistic and not comparable with the simple/rank-average OOF AUCs.
meta_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
meta_oof = cross_val_predict(
    meta, mtr, y_train, cv=meta_cv, method='predict_proba'
)[:, 1]

# The test predictions come from a meta-model refitted on all OOF rows.
meta.fit(mtr, y_train)
meta_test = meta.predict_proba(mte)[:, 1]
meta_auc = roc_auc_score(y_train, meta_oof)
print(f"Meta-model OOF AUC:     {meta_auc:.6f}")

## Final Submission

In [None]:
# Pick best: each candidate is stored as (OOF AUC, test predictions)
scores_dict = dict(
    simple=(simple_auc, simple_test),
    rank=(rank_auc, rank_test),
    meta=(meta_auc, meta_test),
)
# Highest OOF AUC wins; on a tie the earliest entry is kept.
best_name, (best_auc, best_test) = max(
    scores_dict.items(), key=lambda item: item[1][0]
)
print(f"\n Best: {best_name} (OOF AUC: {best_auc:.6f})")

# Save: write one row per test id with the selected ensemble's prediction
submission = pd.DataFrame({'id': test_ids})
# Clip to [0, 1] before writing.
submission['Heart Disease'] = np.clip(best_test, 0, 1)
submission.to_csv('submission.csv', index=False)

print("\nSubmission saved!")
print(submission.head(10))
predicted = submission['Heart Disease']
print(f"Range: [{predicted.min():.6f}, {predicted.max():.6f}]")
print("File: submission.csv")