# Heart Disease Prediction - V12
## Two-Round Training with Pseudo-Labeling (Advanced)

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

## Load Data

In [None]:
# Load the competition files; both CSVs are read from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

target_column = 'Heart Disease'
X_train = train_df.drop([target_column, 'id'], axis=1)

# Encode the string target as 0/1. `.map` turns any label outside the mapping
# (typos, missing values) into NaN, so fail loudly here instead of passing NaN
# targets on to the cross-validation code.
y_train = train_df[target_column].map({'Absence': 0, 'Presence': 1})
if y_train.isna().any():
    unexpected = sorted(
        train_df.loc[y_train.isna(), target_column].astype(str).unique()
    )
    raise ValueError(f"Unexpected values in '{target_column}': {unexpected}")
y_train = y_train.astype(int)

X_test = test_df.drop('id', axis=1)
test_ids = test_df['id'].values

# Impute BOTH sets with medians computed on the training features only, so the
# train and test rows go through exactly the same preprocessing. This matters
# doubly here because pseudo-labelled test rows are later appended to the
# training set. `numeric_only=True` keeps this working if a non-numeric column
# is present (such a column is simply left un-imputed).
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians).reset_index(drop=True)
X_test = X_test.fillna(train_medians).reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

print(f"Train: {X_train.shape} | Test: {X_test.shape}")

## Helper Functions

In [None]:
def _fit_xgb(seed, X_fit, y_fit, X_val, y_val):
    """Fit one XGBoost classifier, early-stopped on the validation fold (AUC)."""
    model = xgb.XGBClassifier(
        n_estimators=5000, max_depth=5, learning_rate=0.005,
        subsample=0.8, colsample_bytree=0.8, min_child_weight=5,
        reg_alpha=0.1, reg_lambda=2.0, gamma=0.1,
        random_state=seed, n_jobs=-1,
        eval_metric='auc', early_stopping_rounds=200,
    )
    model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
    return model


def _fit_lgb(seed, X_fit, y_fit, X_val, y_val):
    """Fit one LightGBM classifier, early-stopped on the validation fold."""
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
    # with the library default this setting is likely a no-op. Confirm, and set
    # subsample_freq=1 if row bagging is actually intended.
    model = lgb.LGBMClassifier(
        n_estimators=5000, max_depth=5, learning_rate=0.005,
        subsample=0.8, colsample_bytree=0.8, min_child_samples=30,
        reg_alpha=0.1, reg_lambda=2.0, num_leaves=31,
        random_state=seed, n_jobs=-1, verbose=-1,
    )
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(200, verbose=False),
                   lgb.log_evaluation(-1)],
    )
    return model


def _fit_cat(seed, X_fit, y_fit, X_val, y_val):
    """Fit one CatBoost classifier, early-stopped on the validation fold (AUC)."""
    model = CatBoostClassifier(
        iterations=5000, depth=5, learning_rate=0.005,
        l2_leaf_reg=5, random_state=seed, verbose=False,
        thread_count=-1, early_stopping_rounds=200,
        eval_metric='AUC',
    )
    model.fit(X_fit, y_fit, eval_set=(X_val, y_val), verbose=False)
    return model


def _run_cv(fit_fn, name, seed, X_tr, y_tr, X_te, skf, n_folds, tag):
    """Run one learner through every fold; return (oof, test_pred, fold_scores)."""
    oof = np.zeros(len(X_tr))
    test_pred = np.zeros(len(X_te))
    fold_scores = []
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_tr, y_tr)):
        print(f"  [{tag}] {name} s={seed} fold={fold+1}")
        X_val, y_val = X_tr.iloc[val_idx], y_tr.iloc[val_idx]
        model = fit_fn(seed, X_tr.iloc[tr_idx], y_tr.iloc[tr_idx], X_val, y_val)
        oof[val_idx] = model.predict_proba(X_val)[:, 1]
        # Each fold model contributes 1/n_folds of the averaged test prediction.
        test_pred += model.predict_proba(X_te)[:, 1] / n_folds
        fold_scores.append(roc_auc_score(y_val, oof[val_idx]))
    print(f"   {name} s={seed}: {np.mean(fold_scores):.6f}")
    return oof, test_pred, fold_scores


def train_models(X_tr, y_tr, X_te, n_folds=5, seeds=(42, 123, 2024), tag=""):
    """Cross-validate XGBoost, LightGBM and CatBoost once per seed.

    For every seed, each of the three algorithms is fitted on every fold
    (``len(seeds) * 3 * n_folds`` fits in total). The fold assignment uses a
    fixed ``random_state=42`` and is therefore identical for all seeds and
    algorithms; ``seed`` only changes the model's own ``random_state``.

    The validation fold is used both for early stopping and for the reported
    fold AUC, so the printed scores are somewhat optimistic.

    Args:
        X_tr: Training features (pandas DataFrame; indexed with ``.iloc``).
        y_tr: Training target (pandas Series; indexed with ``.iloc``).
        X_te: Test features to predict with every fold model.
        n_folds: Number of stratified folds.
        seeds: Iterable of model random seeds. The default is an immutable
            tuple so it cannot be mutated between calls.
        tag: Label included in the per-fold progress lines.

    Returns:
        Tuple ``(all_oof, all_test, all_scores)`` of three lists with one entry
        per (seed, algorithm) pair, ordered XGB, LGB, CAT within each seed:
        out-of-fold positive-class probabilities for ``X_tr``, fold-averaged
        positive-class probabilities for ``X_te``, and the per-fold AUC list.
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    learners = (("XGB", _fit_xgb), ("LGB", _fit_lgb), ("CAT", _fit_cat))

    all_oof = []
    all_test = []
    all_scores = []
    for seed in seeds:
        for name, fit_fn in learners:
            oof, test_pred, fold_scores = _run_cv(
                fit_fn, name, seed, X_tr, y_tr, X_te, skf, n_folds, tag)
            all_oof.append(oof)
            all_test.append(test_pred)
            all_scores.append(fold_scores)

    return all_oof, all_test, all_scores


def best_ensemble(all_oof, all_test, y_tr):
    """Compare three ensembling schemes by OOF AUC and return the winner.

    The candidates are the plain mean of the model predictions, the mean of
    their percentile ranks, and a logistic-regression stacker trained on the
    OOF predictions. The stacker's OOF score is computed with cross-validated
    predictions: scoring it on the rows it was fitted on would be optimistic
    and would bias the selection towards "meta".

    Note that the scale of the returned predictions depends on the winner:
    "simple" yields averaged probabilities, "rank" yields percentile ranks in
    (0, 1], and "meta" yields the stacker's probabilities.

    Args:
        all_oof: List of OOF prediction arrays, one per base model, each
            aligned with ``y_tr``.
        all_test: List of test prediction arrays, in the same model order.
        y_tr: True binary labels for the OOF rows.

    Returns:
        Tuple ``(best_preds, best_auc)``: test predictions of the scheme with
        the highest OOF AUC, and that AUC.
    """
    # Local import: the file-level imports do not bring in cross_val_predict.
    from sklearn.model_selection import StratifiedKFold, cross_val_predict

    def rank_avg(preds_list):
        """Average the percentile ranks of each prediction vector."""
        r = np.zeros(len(preds_list[0]))
        for p in preds_list:
            r += pd.Series(p).rank(pct=True).values
        return r / len(preds_list)

    simple_oof = np.mean(all_oof, axis=0)
    simple_test = np.mean(all_test, axis=0)
    simple_auc = roc_auc_score(y_tr, simple_oof)

    rank_oof = rank_avg(all_oof)
    rank_test = rank_avg(all_test)
    rank_auc = roc_auc_score(y_tr, rank_oof)

    meta = LogisticRegression(C=0.01, max_iter=1000, random_state=42)
    mtr = pd.DataFrame({f'm{i}': p for i, p in enumerate(all_oof)})
    mte = pd.DataFrame({f'm{i}': p for i, p in enumerate(all_test)})

    # Stratified CV needs at least n_splits rows of every class; shrink the
    # number of splits for tiny inputs and fall back to in-sample scoring only
    # when cross-validation is impossible.
    _, class_counts = np.unique(np.asarray(y_tr), return_counts=True)
    n_splits = int(min(5, class_counts.min()))
    if n_splits >= 2:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        meta_oof = cross_val_predict(
            meta, mtr, y_tr, cv=cv, method='predict_proba')[:, 1]
        # cross_val_predict works on clones, so fit the stacker on all rows
        # before predicting the test set.
        meta.fit(mtr, y_tr)
    else:
        meta.fit(mtr, y_tr)
        meta_oof = meta.predict_proba(mtr)[:, 1]
    meta_test = meta.predict_proba(mte)[:, 1]
    meta_auc = roc_auc_score(y_tr, meta_oof)

    scores_dict = {
        'simple': (simple_auc, simple_test),
        'rank': (rank_auc, rank_test),
        'meta': (meta_auc, meta_test),
    }
    # On ties, max() keeps the first key in insertion order (simple, rank, meta).
    best_name = max(scores_dict, key=lambda k: scores_dict[k][0])
    best_auc, best_preds = scores_dict[best_name]
    print(f"  Simple={simple_auc:.6f} | Rank={rank_auc:.6f} | Meta={meta_auc:.6f}")
    print(f"   Best: {best_name} ({best_auc:.6f})")
    return best_preds, best_auc

## ROUND 1: Initial Training

In [None]:
# Round 1: cross-validate every base model on the labelled training rows only,
# then keep the ensembling scheme with the best out-of-fold AUC.
print("\n" + "=" * 50, "ROUND 1: Initial Training", "=" * 50, sep="\n")

all_oof_r1, all_test_r1, scores_r1 = train_models(
    X_tr=X_train, y_tr=y_train, X_te=X_test, tag="R1",
)

test_preds_r1, auc_r1 = best_ensemble(
    all_oof=all_oof_r1, all_test=all_test_r1, y_tr=y_train,
)
print("\nRound 1 Best OOF AUC: {:.6f}".format(auc_r1))

## Pseudo-Labeling

In [None]:
print("\n" + "="*50)
print("PSEUDO-LABELING")
print("="*50)

# Confidence has to be judged on probabilities. `test_preds_r1` is whatever
# scheme best_ensemble picked and may be percentile ranks (where a 0.05 cut
# always selects a fixed 10% of rows, confident or not) or the output of the
# heavily regularised stacker. Threshold the plain mean of the per-model
# probabilities instead; this equals `test_preds_r1` when "simple" won.
proba_r1 = np.mean(all_test_r1, axis=0)

# Keep only rows predicted within THRESHOLD of 0 or 1.
THRESHOLD = 0.05
pseudo_mask = (proba_r1 < THRESHOLD) | (proba_r1 > (1 - THRESHOLD))
pseudo_X = X_test[pseudo_mask].reset_index(drop=True)
pseudo_y = pd.Series(
    (proba_r1[pseudo_mask] > 0.5).astype(int)
).reset_index(drop=True)

print(f"Pseudo-labeled samples: {pseudo_mask.sum()} / {len(X_test)}")
print(f"  Pseudo 0 (Absence): {(pseudo_y == 0).sum()}")
print(f"  Pseudo 1 (Presence): {(pseudo_y == 1).sum()}")
if not pseudo_mask.any():
    print("  WARNING: no test row passed the confidence threshold; "
          "round 2 will train on the original data only.")

# Combine train + pseudo-labeled test. The original training rows stay first,
# which lets later code slice them back out by position.
X_train_r2 = pd.concat([X_train, pseudo_X], ignore_index=True)
y_train_r2 = pd.concat([y_train, pseudo_y], ignore_index=True)

print(f"\nRound 2 train size: {X_train_r2.shape}")

## ROUND 2: Training with Pseudo-Labels

In [None]:
# Round 2: retrain every base model on the labelled rows plus the
# pseudo-labelled test rows, still predicting the complete test set.
print("\n" + "=" * 50, "ROUND 2: Training with Pseudo-Labels", "=" * 50, sep="\n")

all_oof_r2, all_test_r2, scores_r2 = train_models(
    X_tr=X_train_r2, y_tr=y_train_r2, X_te=X_test, tag="R2",
)

# Score on the original training rows only so the AUC is comparable with
# round 1. Those rows are the leading block of the round-2 training set, so a
# positional slice recovers them.
all_oof_r2_orig = [oof_vec[:X_train.shape[0]] for oof_vec in all_oof_r2]
test_preds_r2, auc_r2 = best_ensemble(
    all_oof=all_oof_r2_orig, all_test=all_test_r2, y_tr=y_train,
)
print("\nRound 2 Best OOF AUC: {:.6f}".format(auc_r2))

## Final Blending

In [None]:
print("\n" + "="*50)
print("FINAL BLEND")
print("="*50)

# Report an out-of-fold estimate for each candidate blend weight. Only test
# predictions are kept from best_ensemble, so the estimate blends the plain
# mean of each round's OOF predictions on the original training rows. It is a
# proxy for the blend of the selected ensembles, not the exact same quantity.
oof_mean_r1 = np.mean(all_oof_r1, axis=0)
oof_mean_r2 = np.mean(all_oof_r2_orig, axis=0)
for w in [0.3, 0.4, 0.5]:
    blend_auc = roc_auc_score(y_train, w * oof_mean_r1 + (1 - w) * oof_mean_r2)
    print(f"  Blend R1×{w:.1f} + R2×{1-w:.1f} — "
          f"OOF AUC (simple-mean proxy)={blend_auc:.6f} | "
          f"R1 AUC={auc_r1:.6f}, R2 AUC={auc_r2:.6f}")

# Use round with better OOF as primary (weight 0.7), the other as secondary.
# NOTE(review): the two rounds may have selected different ensembling schemes
# (probabilities vs. percentile ranks), in which case this mixes scales.
if auc_r2 >= auc_r1:
    final_test = 0.3 * test_preds_r1 + 0.7 * test_preds_r2
    print(f"\n Using R2-dominant blend (R2 AUC higher: {auc_r2:.6f})")
else:
    final_test = 0.7 * test_preds_r1 + 0.3 * test_preds_r2
    print(f"\n Using R1-dominant blend (R1 AUC higher: {auc_r1:.6f})")

## Save Submission

In [None]:
# Assemble the submission: one row per test id with the blended score,
# clipped into [0, 1], then write it next to the notebook.
submission = pd.DataFrame({'id': test_ids}).assign(
    **{'Heart Disease': np.clip(final_test, 0, 1)}
)
submission.to_csv('submission.csv', index=False)

# Quick sanity report: first rows and the range of the submitted scores.
print("\nSubmission saved!")
print(submission.head(10))
print("Range: [{:.6f}, {:.6f}]".format(
    submission['Heart Disease'].min(),
    submission['Heart Disease'].max(),
))
print("File: submission.csv")