In [4]:
# ==============================================================================
# PROJECT: Kaggle Playground Series - S6E2 (Heart Disease)
# MISSION: The Power Ensemble (XGBoost + LightGBM)
# ==============================================================================

import pandas as pd
import numpy as np
from google.colab import drive
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# --- STEP 1: MOUNT & LOAD ---
drive.mount('/content/drive')

TRAIN_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/train.csv'
TEST_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/test.csv'

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print(f" Data Loaded. Training shape: {train.shape}")

# --- STEP 2: ENCODING ---
# Converting 'Absence'/'Presence' to binary integers
target_mapping = {'Absence': 0, 'Presence': 1}
y = train['Heart Disease'].map(target_mapping)

features = [col for col in train.columns if col not in ['id', 'Heart Disease']]
X = train[features].copy()
X_test = test[features].copy()

# Simple categorical encoding for string/object columns
for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category').cat.codes
    X_test[col] = X_test[col].astype('category').cat.codes

# --- STEP 3: THE ENSEMBLE CROSS-VALIDATION ---
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Storage for our predictions
final_test_preds = np.zeros(len(X_test))
cv_scores = []

print(" Starting the Multi-Model Ensemble (XGBoost + LightGBM)...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # --- MODEL A: XGBoost (Updated for CUDA/GPU) ---
    xgb = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=7,
        tree_method='hist',   # Use 'hist' instead of 'gpu_hist'
        device='cuda',        # Modern way to trigger GPU in Colab
        eval_metric='auc',
        early_stopping_rounds=50,
        random_state=42
    )
    xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
    xgb_probs = xgb.predict_proba(X_val)[:, 1]

    # --- MODEL B: LightGBM ---
    # Using 'gpu' as device; if it fails, Colab's default CPU is still very fast.
    lgb = LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=63,
        device='gpu',        # Uses GPU for LightGBM
        metric='auc',
        importance_type='gain',
        random_state=42,
        verbosity=-1
    )
    lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    lgb_probs = lgb.predict_proba(X_val)[:, 1]

    # --- BLENDING THE TWO ---
    # We use a 50/50 weighted average to combine their "opinions"
    combined_val_probs = (0.5 * xgb_probs) + (0.5 * lgb_probs)
    score = roc_auc_score(y_val, combined_val_probs)
    cv_scores.append(score)

    # Predict on test set for this fold
    fold_test_preds = (0.5 * xgb.predict_proba(X_test)[:, 1]) + (0.5 * lgb.predict_proba(X_test)[:, 1])
    final_test_preds += fold_test_preds / 5 # Average across 5 folds

    print(f" Fold {fold+1} Ensemble AUC: {score:.5f}")

print(f"\n Mean Ensemble CV AUC: {np.mean(cv_scores):.5f}")

# --- STEP 4: SUBMISSION ---
submission = pd.DataFrame({'id': test['id'], 'Heart Disease': final_test_preds})
submission.to_csv('submission_v3_ensemble.csv', index=False)
print(" 'submission_v3_ensemble.csv' has been generated and is ready for Kaggle!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data Loaded. Training shape: (630000, 15)
 Starting the Multi-Model Ensemble (XGBoost + LightGBM)...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


 Fold 1 Ensemble AUC: 0.95534
 Fold 2 Ensemble AUC: 0.95442
 Fold 3 Ensemble AUC: 0.95522
 Fold 4 Ensemble AUC: 0.95470
 Fold 5 Ensemble AUC: 0.95553

 Mean Ensemble CV AUC: 0.95504
 'submission_v3_ensemble.csv' has been generated and is ready for Kaggle!
