In [1]:
# ==============================================================================
# PROJECT: Kaggle Playground Series - S6E2 (Heart Disease)
# MISSION: The Grand Hybrid (Blending Stacking + Pseudo-Labeling)
# ==============================================================================

import pandas as pd
import numpy as np
from google.colab import drive
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# --- STEP 1: MOUNT & LOAD ---
# Mount Google Drive so the competition CSVs below are readable from this runtime.
drive.mount('/content/drive')
# NOTE(review): the notebook header says S6E2, but this folder is named
# "S6E1 - heart" — confirm these are the intended competition files.
TRAIN_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/train.csv'
TEST_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/test.csv'

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Encode the string target as 0/1.
target_mapping = {'Absence': 0, 'Presence': 1}
y = train['Heart Disease'].map(target_mapping)
# .map() silently produces NaN for any label that is not a key of
# target_mapping; fail here with a clear message instead of deep inside a
# model's fit().
if y.isna().any():
    unknown_labels = sorted(train.loc[y.isna(), 'Heart Disease'].astype(str).unique())
    raise ValueError(f"Unmapped 'Heart Disease' labels: {unknown_labels}")

# Feature matrices: drop the row identifier (and the target, for train).
X = train.drop(['id', 'Heart Disease'], axis=1)
X_test = test.drop(['id'], axis=1)

# Categorical Encoding
# Train and test must share ONE vocabulary per column. Encoding each frame
# independently assigns codes by that frame's own sorted categories, so a
# value present in only one of them shifts the codes and the same string ends
# up with different integers in train vs. test.
for col in X.select_dtypes(include=['object']).columns:
    shared_categories = (
        pd.concat([X[col], X_test[col]]).astype('category').cat.categories
    )
    # Missing values keep the code -1 in both frames, as before.
    X[col] = pd.Categorical(X[col], categories=shared_categories).codes
    X_test[col] = pd.Categorical(X_test[col], categories=shared_categories).codes

# ==============================================================================
# PART A: RECREATING THE GOLD MEDAL (Trial 07 Stacking)
# ==============================================================================
print("1. Regenerating Trial 07 (Stacking)...")

# Hyperparameters for the XGBoost base learner.
xgb_params = dict(
    n_estimators=3000,
    learning_rate=0.0524336,
    max_depth=4,
    subsample=0.817428,
    colsample_bytree=0.50969,
    min_child_weight=10,
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
)

# Hyperparameters for the LightGBM base learner.
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=63,
    verbosity=-1,
    random_state=42,
)

# Level-0 estimators, as (name, estimator) pairs.
xgb_learner = XGBClassifier(**xgb_params)
lgb_learner = LGBMClassifier(**lgb_params)
base_models = [('xgb', xgb_learner), ('lgb', lgb_learner)]

# Level-1: a logistic regression fitted on the base learners' 5-fold
# cross-validated predict_proba outputs.
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1,
)
stack_model.fit(X, y)

# Keep only the second probability column (index 1) for each test row.
stack_proba = stack_model.predict_proba(X_test)
pred_stack = stack_proba[:, 1]

# ==============================================================================
# PART B: RECREATING THE SILVER MEDAL (Trial 08 Pseudo-Labeling)
# ==============================================================================
print("2. Regenerating Trial 08 (Pseudo-Labeling)...")

# First pass: fit on the labelled training data only, then score the test set
# to find the rows the model is most certain about.
model_temp = XGBClassifier(**xgb_params)
model_temp.fit(X, y)
temp_probs = model_temp.predict_proba(X_test)[:, 1]

# Identify pseudo-labels: test rows whose predicted probability is above
# 1 - pseudo_limit or below pseudo_limit. This is a fixed probability
# threshold, not a quantile — the number of selected rows is whatever passes.
pseudo_limit = 0.02
high_conf_idx = (temp_probs > (1 - pseudo_limit)) | (temp_probs < pseudo_limit)

pseudo_X = X_test[high_conf_idx].copy()
# Hard label for each selected row: 1 if its probability is above 0.5, else 0.
pseudo_y = (temp_probs[high_conf_idx] > 0.5).astype(int)

# Augment Training Data
# ignore_index=True gives X_aug and y_aug the same fresh 0..n-1 index.
# Without it, X_aug carries the test-row labels for the pseudo rows while
# y_aug restarts at 0, leaving the two frames with mismatched, duplicated
# indices that would mis-pair rows under any index-aligned operation.
X_aug = pd.concat([X, pseudo_X], ignore_index=True)
y_aug = pd.concat([y, pd.Series(pseudo_y, name=y.name)], ignore_index=True)

# Final Retrain on Augmented Data
model_pseudo = XGBClassifier(**xgb_params)
model_pseudo.fit(X_aug, y_aug)
pred_pseudo = model_pseudo.predict_proba(X_test)[:, 1]

# ==============================================================================
# PART C: THE FINAL BLEND
# ==============================================================================
print("3. Blending Gold and Silver...")

# Fixed-weight blend of the two prediction vectors: 0.6 on the stacked model,
# 0.4 on the pseudo-label model.
stack_part = 0.6 * pred_stack
pseudo_part = 0.4 * pred_pseudo
final_preds = stack_part + pseudo_part

# Two-column submission: the test ids alongside the blended probabilities.
submission = test[['id']].assign(**{'Heart Disease': final_preds})
submission.to_csv('submission_v13_grand_hybrid.csv', index=False)

print("Grand Hybrid submission 'submission_v13_grand_hybrid.csv' generated.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
1. Regenerating Trial 07 (Stacking)...
2. Regenerating Trial 08 (Pseudo-Labeling)...
3. Blending Gold and Silver...
Grand Hybrid submission 'submission_v13_grand_hybrid.csv' generated.
