# V18: Ensemble Baseline Solution

A foundational ensemble approach combining XGBoost, LightGBM, and CatBoost with ultra-heavy regularization. This version establishes the baseline ensemble architecture used across multiple variations.

**Key Features:**
- 3-model ensemble (XGB, LGBM, CB) with weighted averaging
- External feature encoding from 100K Diabetes dataset
- Medical domain features (BMI categories, BP categories, non-HDL cholesterol)
- 10-Fold Stratified Cross-Validation
- Isotonic calibration for probability refinement
- Feature selection using SelectFromModel

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.isotonic import IsotonicRegression
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Emit the notebook's version tag ("V18", matching the title) into the cell output.
print("V18")

## 2. Load the Data

In [None]:
# Name of the binary label column; it is also the column written to the submission.
TARGET = 'diagnosed_diabetes'

# Read the three competition files and the external dataset in a fixed order:
# train, test, sample submission, external ("orig") data.
train, test, sub, orig = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e12/train.csv',
        '/kaggle/input/playground-series-s5e12/test.csv',
        '/kaggle/input/playground-series-s5e12/sample_submission.csv',
        '/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv',
    )
)

# Quick sanity check of what was loaded.
train_shape, test_shape, orig_shape = train.shape, test.shape, orig.shape
print(f"Train shape: {train_shape}")
print(f"Test shape: {test_shape}")
print(f"External dataset shape: {orig_shape}")

## 3. External Encoding

Mean and count encodings from the 100K Diabetes Health Indicators Dataset provide external statistical relationships that improve model robustness.

In [None]:
# Every train column except the row id and the label is treated as a base feature.
non_feature_cols = ('id', TARGET)
base_cols = [name for name in train.columns if name not in non_feature_cols]
encoded = []

for col in base_cols:
    # Group the external dataset once per column and derive both lookups from it.
    by_level = orig.groupby(col)
    target_rate = by_level[TARGET].mean()   # level -> mean of TARGET in the external data
    level_size = by_level.size()            # level -> number of external rows with that level

    mean_name = f"enc_mean_{col}"
    count_name = f"enc_cnt_{col}"

    for frame in (train, test):
        # Levels absent from the external data map to NaN here; the mean
        # encoding is left as NaN at this stage.
        frame[mean_name] = frame[col].map(target_rate)
        # Unseen levels get a count of 1 before the log1p scaling.
        frame[count_name] = np.log1p(frame[col].map(level_size).fillna(1))

    encoded += [mean_name, count_name]

print(f"Generated {len(encoded)} external features")

## 4. Stable Feature Engineering

Medical domain features based on clinical standards:
- **BMI Categories**: WHO classification (underweight, normal, overweight, obese)
- **BP Categories**: JNC 7-style cut-offs (normal: below 120/80; elevated: systolic 120–139 or diastolic 80–89; high: systolic ≥140 or diastolic ≥90)
- **Non-HDL Cholesterol**: Total - HDL (cardiovascular risk indicator)

In [None]:
# Build the three clinical features with one code path so train and test
# can never diverge.
for frame in (train, test):
    # BMI classes: 0 = underweight (<18.5), 1 = normal (18.5 to <25),
    # 2 = overweight (25 to <30), 3 = obese (>=30).
    # right=False makes the bins left-closed, so a BMI of exactly 25 or 30
    # falls into the higher class, as the WHO cut-offs define it.
    frame['bmi_cat'] = pd.cut(
        frame['bmi'],
        bins=[0, 18.5, 25, 30, 999],
        labels=[0, 1, 2, 3],
        right=False,
    ).astype('int')

    # Blood-pressure classes: 0 = normal, 1 = elevated, 2 = high.
    systolic = frame['systolic_bp']
    diastolic = frame['diastolic_bp']
    is_elevated = (((systolic >= 120) & (systolic < 140)) |
                   ((diastolic >= 80) & (diastolic < 90)))
    is_high = (systolic >= 140) | (diastolic >= 90)

    frame['bp_cat'] = 0
    frame.loc[is_elevated, 'bp_cat'] = 1
    # "High" must be written last: a reading such as 150/85 or 130/95 also
    # satisfies the "elevated" mask, and the more severe class has to win.
    frame.loc[is_high, 'bp_cat'] = 2

    # Non-HDL cholesterol = total cholesterol minus HDL cholesterol.
    frame['non_hdl'] = frame['cholesterol_total'] - frame['hdl_cholesterol']

print("Medical features created: BMI categories, BP categories, Non-HDL cholesterol")

## 5. Final Feature Set Preparation

In [None]:
# Final column order: raw features, then the medical features, then the
# external encodings.
features = base_cols + ['bmi_cat', 'bp_cat', 'non_hdl'] + encoded

# Fill NaNs in the external encodings. The median is taken from train only and
# reused for test, so both frames are imputed with the same value.
for f in encoded:
    train_median = train[f].median()
    train[f] = train[f].fillna(train_median)
    test[f]  = test[f].fillna(train_median)

X      = train[features].copy()
y      = train[TARGET]
X_test = test[features].copy()

# Label encode categoricals: the two derived category columns plus every
# object-dtype column of train that is part of the feature set.
cat_cols = ['bmi_cat', 'bp_cat'] + train.select_dtypes('object').columns.tolist()
for col in cat_cols:
    if col not in X.columns:
        continue
    le = LabelEncoder()
    # Fit on the union of train and test values: LabelEncoder.transform raises
    # ValueError on labels it has not seen, so fitting on train alone would
    # crash on any test-only category. When test has no extra labels the
    # resulting codes are identical to fitting on train alone.
    le.fit(pd.concat([X[col], X_test[col]]).astype(str))
    X[col]      = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

print(f"Total features: {X.shape[1]}")
print(f"Training set shape: {X.shape}")
print(f"Test set shape: {X_test.shape}")

## 6. 10-Fold Ensemble + Out-of-Fold (OOF) Predictions

Ultra-heavy regularization on all three models to prevent overfitting on 700K training samples:
- **XGBoost**: L1=3.0, L2=3.5
- **LightGBM**: L1=3.0, L2=3.5
- **CatBoost**: L2=10.0

Ensemble weights: 50% XGB + 35% LGBM + 15% CB

In [None]:
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Blend weights in the order (XGBoost, LightGBM, CatBoost). Defined once so the
# out-of-fold blend and the test blend always use the same numbers.
ENSEMBLE_WEIGHTS = (0.50, 0.35, 0.15)


def _blend_proba(models, data):
    """Return the weighted positive-class probability of `models` on `data`.

    `models` must be ordered like ENSEMBLE_WEIGHTS.
    """
    return sum(
        weight * model.predict_proba(data)[:, 1]
        for weight, model in zip(ENSEMBLE_WEIGHTS, models)
    )


oof = np.zeros(len(X))               # out-of-fold blended probability per training row
test_preds = np.zeros(len(X_test))   # fold-averaged blended probability per test row

print(f"\nStarting {n_splits}-fold training...\n")

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}/{n_splits}", end=" → ")

    X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE: all three models early-stop on the same validation fold that is
    # scored below, so the per-fold and overall AUC are somewhat optimistic.

    # XGBoost – Ultra Heavy Reg
    # early_stopping_rounds is a constructor argument here: passing it to
    # fit() was deprecated in XGBoost 1.6 and is rejected by newer releases.
    model1 = xgb.XGBClassifier(
        n_estimators=5000,
        max_depth=4,
        learning_rate=0.008,
        subsample=0.7,
        colsample_bytree=0.6,
        reg_alpha=3.0,
        reg_lambda=3.5,
        random_state=42,
        tree_method='hist',
        n_jobs=-1,
        verbosity=0,
        early_stopping_rounds=300
    )
    model1.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=False)

    # LightGBM – Ultra Heavy Reg
    # subsample_freq=1 is required for subsample=0.7 to take effect: LightGBM
    # only performs bagging when the bagging frequency is non-zero.
    model2 = lgb.LGBMClassifier(
        n_estimators=5000,
        max_depth=4,
        learning_rate=0.008,
        num_leaves=20,
        subsample=0.7,
        subsample_freq=1,
        colsample_bytree=0.6,
        reg_alpha=3.0,
        reg_lambda=3.5,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    model2.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(300)])

    # CatBoost – Ultra Heavy Reg
    model3 = cb.CatBoostClassifier(
        iterations=5000,
        depth=4,
        learning_rate=0.008,
        l2_leaf_reg=10.0,
        random_seed=42,
        verbose=False,
        early_stopping_rounds=300
    )
    model3.fit(X_trn, y_trn, eval_set=(X_val, y_val), verbose=False)

    fold_models = (model1, model2, model3)

    # OOF: store the blended prediction for the rows held out in this fold.
    val_pred = _blend_proba(fold_models, X_val)
    oof[val_idx] = val_pred
    print(f"AUC = {roc_auc_score(y_val, val_pred):.6f}")

    # Test: each fold contributes 1/n_splits of the final test average.
    test_preds += _blend_proba(fold_models, X_test) / n_splits

# After the loop, model1/model2/model3 still refer to the models of the last fold.
print(f"\nFinal CV AUC: {roc_auc_score(y, oof):.6f}")

## 7. Feature Selection

SelectFromModel reduces feature dimensionality by keeping only the features whose importance is at or above the median, using the XGBoost model (`model1`) left over from the last cross-validation fold.

In [None]:
# Reuse the already-fitted XGBoost model held in `model1` (prefit=True, so no
# refit happens) and apply the same median-importance mask to both matrices.
selector = SelectFromModel(estimator=model1, prefit=True, threshold='median')
X_sel, X_test_sel = (selector.transform(matrix) for matrix in (X, X_test))
print(f"Selected {X_sel.shape[1]} features")

## 8. Final Model on Selected Features

In [None]:
# Hyper-parameters of the single XGBoost model trained on the selected features.
final_params = {
    'n_estimators': 2000,
    'max_depth': 4,
    'learning_rate': 0.01,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'reg_alpha': 2.5,
    'reg_lambda': 3.0,
    'random_state': 42,
    'tree_method': 'hist',
    'n_jobs': -1,
    'verbosity': 0,
}

# Fit on every training row (no hold-out, hence a fixed number of trees).
final_model = xgb.XGBClassifier(**final_params)
final_model.fit(X_sel, y)

# Column 1 of predict_proba is the probability of the positive class.
test_proba = final_model.predict_proba(X_test_sel)
final_pred = test_proba[:, 1]
print(f"Final predictions generated: {final_pred.shape[0]} samples")

## 9. Isotonic Calibration

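Refines the final model's test predictions with an isotonic regression fitted on the ensemble's out-of-fold (OOF) predictions. Note that the calibrator is learned from the blended ensemble's scores but applied to the scores of the single final XGBoost model, so the resulting calibration is only approximate.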

In [None]:
# Learn a monotone mapping from out-of-fold scores to the observed labels;
# scores outside the fitted range are clipped to its end points.
# NOTE(review): `oof` holds the blended XGB/LGBM/CatBoost scores, while
# `final_pred` comes from the separate final XGBoost model, so the mapping is
# fitted on one score distribution and applied to another — confirm this is intended.
calibrator = IsotonicRegression(out_of_bounds='clip').fit(oof, y)
final_pred = calibrator.transform(final_pred)
print("Predictions calibrated successfully")

## 10. Submission

In [None]:
# Put the calibrated probabilities into the sample-submission frame and write it out.
sub[TARGET] = final_pred
sub.to_csv('submission.csv', index=False)
print("\nsubmission.csv saved!")

# Summary statistics as a quick sanity check of the submitted probabilities.
pred_mean = final_pred.mean()
pred_min = final_pred.min()
pred_max = final_pred.max()
print(f"Mean prediction: {pred_mean:.5f}")
print(f"Min prediction: {pred_min:.5f}")
print(f"Max prediction: {pred_max:.5f}")

print("\nFirst few predictions:")
# Last expression of the cell, so the notebook renders the first rows.
sub.head()

## Summary

**V18 Architecture:**
- 24 base + 48 external (one mean and one count encoding per base feature) + 3 medical = 75 total features initially
- 10-Fold CV with 3-model ensemble (XGB/LGBM/CB)
- Ultra-heavy regularization prevents overfitting
- Feature selection reduces to 38 features
- Final model with 2000 estimators
- Isotonic calibration for probability refinement

This version establishes the baseline ensemble approach that subsequent versions refine and experiment with.