In [5]:
# ==============================================================================
# PROJECT: Kaggle Playground Series - S6E2 (Heart Disease)
# MISSION: The Diverse Trio (Restoring Standard LGBM + Adding Random Forest)
# ==============================================================================

import pandas as pd
import numpy as np
from google.colab import drive
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# --- STEP 1: MOUNT & LOAD ---
# Attach Google Drive so the competition CSVs are reachable from the Colab VM.
drive.mount('/content/drive')

# NOTE(review): the banner at the top of this cell says S6E2, but both paths
# point at an "S6E1 - heart" folder — confirm the intended dataset directory.
TRAIN_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/train.csv'
TEST_PATH = '/content/drive/MyDrive/Nihal Data/kaggle/S6E1 - heart/test.csv'

# Read both splits in one pass (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# --- STEP 2: PREPARATION ---
# Binary target: 'Absence' -> 0, 'Presence' -> 1.
target_mapping = {'Absence': 0, 'Presence': 1}
y = train['Heart Disease'].map(target_mapping)

# Series.map() silently produces NaN for any label that is not a key of
# target_mapping; surface that here instead of deep inside model fitting.
if y.isna().any():
    unexpected_labels = train.loc[y.isna(), 'Heart Disease'].unique().tolist()
    raise ValueError(
        f"Unmapped 'Heart Disease' labels in training data: {unexpected_labels}"
    )

# Feature matrices: drop the row identifier (and the target from train).
X = train.drop(['id', 'Heart Disease'], axis=1)
X_test = test.drop(['id'], axis=1)

# Categorical Encoding
# The category -> integer mapping is learned from the TRAIN column only and
# then re-used for the TEST column. Encoding each frame independently (as
# `.astype('category').cat.codes` on both would do) assigns codes from each
# frame's own set of observed values, so the same string could end up with
# different integers in X and X_test whenever those sets differ.
# Test values never seen in train are encoded as -1 (the same code pandas
# uses for missing values).
for col in X.select_dtypes(include=['object']).columns:
    train_categorical = X[col].astype('category')
    X[col] = train_categorical.cat.codes
    X_test[col] = X_test[col].astype(train_categorical.dtype).cat.codes

# --- STEP 3: DEFINING THE TRIO ---
# Each base learner's hyper-parameters live in a plain dict and are unpacked
# into the estimator constructor, keeping settings separate from construction.

# 1. OPTIMIZED XGBOOST (The MVP)
xgb_params = {
    'n_estimators': 3000,
    'learning_rate': 0.0524336,
    'max_depth': 4,
    'subsample': 0.817428,
    'colsample_bytree': 0.50969,
    'min_child_weight': 10,
    'tree_method': 'hist',
    'n_jobs': -1,  # Safe for CPU
    'random_state': 42,
}
# GPU hint: add 'device': 'cuda' to xgb_params when a GPU runtime is available.
xgb_model = XGBClassifier(**xgb_params)

# 2. STANDARD LIGHTGBM (Restored from Trial 07 because it scored higher!)
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.03,
    'num_leaves': 63,
    'verbosity': -1,
    'random_state': 42,
}
# GPU hint: add 'device': 'gpu' to lgb_params when a GPU runtime is available.
lgb_model = LGBMClassifier(**lgb_params)

# 3. RANDOM FOREST (The New Stabilizer)
# Included because Random Forest works very differently from Boosting models.
rf_params = {
    'n_estimators': 1000,
    'max_depth': 10,  # Constrained depth to prevent overfitting
    'min_samples_leaf': 5,
    'n_jobs': -1,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

# (name, estimator) pairs in the form StackingClassifier expects.
base_models = list(zip(
    ('xgb', 'lgb', 'rf'),
    (xgb_model, lgb_model, rf_model),
))

# --- STEP 4: THE META-LEARNER ---
# A default-configured logistic regression is the final estimator; it is fed
# the base models' predict_proba outputs, with cv=5 passed to the stacker.
meta_learner = LogisticRegression()

stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    stack_method='predict_proba',
    cv=5,
    n_jobs=1,  # Using 1 to be safe, change to -1 if using CPU only
)

# --- STEP 5: TRAINING ---
# Fit the full stack (base learners + meta-learner) on the training features.
print("Training the Diverse Trio... (XGB + LGBM + Random Forest)")
stack_model.fit(X, y)

# --- STEP 6: SUBMISSION ---
# Column 1 of predict_proba is the probability of the class encoded as 1
# ('Presence' in target_mapping).
class_probabilities = stack_model.predict_proba(X_test)
test_probs = class_probabilities[:, 1]

# Start from the test ids and attach the predicted probability column.
submission = test[['id']].copy()
submission['Heart Disease'] = test_probs
submission.to_csv('submission_v12_diverse_trio.csv', index=False)

print("Diverse Trio submission 'submission_v12_diverse_trio.csv' generated.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training the Diverse Trio... (XGB + LGBM + Random Forest)
Diverse Trio submission 'submission_v12_diverse_trio.csv' generated.
