# V8: Anti-Overfit with Meta-Learner Stacking

Advanced approach combining Optuna-tuned hyperparameters with lifestyle risk features and Logistic Regression meta-learner stacking. This version experiments with enhanced feature engineering and meta-learning to reduce overfitting.

**Key Features:**
- 77 total features (24 base + 5 engineered + 48 external)
- Advanced lifestyle risk feature engineering
- Optuna-tuned hyperparameters (Trial 42)
- 5-Fold Stratified Cross-Validation
- Logistic Regression meta-learner stacking
- Out-of-Fold prediction combining
- Anti-overfitting regularization strategy

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

print("V8: Anti-Overfit with Stacking")

## 2. Load the Data

In [None]:
# Competition files plus the original dataset that the external features are built from.
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test  = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
sub   = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')
orig  = pd.read_csv('/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv')

TARGET = 'diagnosed_diabetes'

print('Train Shape:', train.shape)
print('Test Shape:', test.shape)
print('Orig Shape:', orig.shape)

# BASE: every train column except the row id and the label.
BASE = [name for name in train.columns if name != 'id' and name != TARGET]
# CATS: object-dtype columns of train; NUMS: the remaining base features.
CATS = list(train.select_dtypes('object').columns)
NUMS = [name for name in BASE if name not in CATS]

print(f'{len(BASE)} Base Features.')

## 3. External Features from Original Dataset

In [None]:
# For every base feature, look up two statistics in the original dataset keyed
# by the feature's value: the mean of TARGET and the number of rows.
ORIG = []
orig_target_mean = orig[TARGET].mean()  # fallback for values not present in orig

for col in BASE:
    grouped = orig.groupby(col)
    mean_map = grouped[TARGET].mean()
    count_map = grouped.size()

    new_mean = f"orig_mean_{col}"
    new_count = f"orig_count_{col}"

    for frame in (train, test):
        # Values unseen in orig fall back to the global mean / a zero count.
        frame[new_mean] = frame[col].map(mean_map).fillna(orig_target_mean)
        frame[new_count] = frame[col].map(count_map).fillna(0)

    ORIG.extend([new_mean, new_count])

print(f'{len(ORIG)} External Features.')

## 4. Advanced Feature Engineering

Adds BMI and blood-pressure categories, non-HDL cholesterol, a composite lifestyle risk score, and an age × BMI interaction to give the models extra signal.

In [None]:
def _add_engineered_features(df):
    """Add the five engineered feature columns to ``df`` in place.

    Columns written: ``bmi_cat``, ``bp_cat``, ``non_hdl``, ``lifestyle_risk``
    and ``age_bmi``. The same function is applied to train and test so the two
    frames cannot drift apart.

    Returns:
        The same ``df`` object, for convenience.
    """
    # BMI Categories: right-closed bins (0, 18.5], (18.5, 25], (25, 30], (30, 100].
    # Values outside (0, 100] become NaN.
    df['bmi_cat'] = pd.cut(df['bmi'], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3])

    # BP Categories:
    #   2 = systolic >= 140 or diastolic >= 90
    #   1 = systolic in [120, 140) or diastolic in [80, 90)
    #   0 = everything else
    systolic = df['systolic_bp']
    diastolic = df['diastolic_bp']
    elevated = ((systolic >= 120) & (systolic < 140)) | ((diastolic >= 80) & (diastolic < 90))
    high = (systolic >= 140) | (diastolic >= 90)
    df['bp_cat'] = 0
    df.loc[elevated, 'bp_cat'] = 1
    # Assigned last so category 2 wins when one reading is in the elevated band
    # and the other is in the high band (previously such rows ended up as 1).
    df.loc[high, 'bp_cat'] = 2

    # Non-HDL
    df['non_hdl'] = df['cholesterol_total'] - df['hdl_cholesterol']

    # Lifestyle Risk Score: count of the four flags below (0-4).
    df['lifestyle_risk'] = (
        (df['diet_score'] < 5).astype(int) +
        (df['physical_activity_minutes_per_week'] < 150).astype(int) +
        (df['smoking_status'] == 'Current').astype(int) +
        (df['alcohol_consumption_per_week'] > 14).astype(int)
    )

    # Age x BMI Interaction
    df['age_bmi'] = df['age'] * df['bmi']

    return df


train = _add_engineered_features(train)
test = _add_engineered_features(test)

NEW_FEATS = ['bmi_cat', 'bp_cat', 'non_hdl', 'lifestyle_risk', 'age_bmi']
for feat in NEW_FEATS:
    # Guard against duplicates so re-running this cell does not grow BASE.
    if feat not in BASE:
        BASE.append(feat)

print(f'{len(NEW_FEATS)} New FE Features.')

## 5. Memory Optimization

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    * Signed integer columns are cast to the smallest of int8/int16/int32
      whose range (bounds inclusive) contains the column's min and max;
      otherwise they are left unchanged.
    * Float columns are cast to float32 when their finite range fits.
      float16 is deliberately never used: it cannot represent integers
      above 2048 exactly and would corrupt count-like or product features.
    * Every other dtype (object, category and other extension dtypes, bool,
      unsigned ints, datetimes) is left untouched.

    Columns whose min/max is NaN (empty or all-NaN) keep their dtype.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object.
    """
    int_targets = (np.int8, np.int16, np.int32)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings) are not np.dtype
        # instances; skip them rather than guessing at a conversion.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for target in int_targets:
                info = np.iinfo(target)
                if info.min <= c_min and c_max <= info.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            # Comparisons with NaN are False, so all-NaN columns are skipped.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)

    return df

# Shrink both frames (train first, then test) and release the memory that the
# replaced wider columns occupied.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)
gc.collect()

print("Memory optimization complete")

## 6. Final Features & Preparation

In [None]:
FEATURES = BASE + ORIG
print(f'{len(FEATURES)} Total Features.')

# Explicit copy: X is modified column by column below, and writing into a
# selection of `train` is chained assignment (its warning is hidden by the
# global warnings filter).
X = train[FEATURES].copy()
y = train[TARGET]

# Safe Label Encoding: fit each encoder on the union of train and test values
# so transform() never meets an unseen label. Values are compared as strings,
# so NaN is encoded as the label 'nan'.
ALL_CATS = CATS + ['bmi_cat', 'bp_cat']
for col in ALL_CATS:
    if col not in X.columns:
        continue
    train_as_str = X[col].astype(str)
    test_as_str = test[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_as_str, test_as_str]))
    X[col] = le.transform(train_as_str)
    test[col] = le.transform(test_as_str)

# Taken after encoding so X_test sees the encoded test columns.
X_test = test[FEATURES]

## 7. 5-Fold Ensemble with Optuna Hyperparameters

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros(len(X))
pred_xgb = np.zeros(len(X_test))
pred_lgb = np.zeros(len(X_test))
pred_cb = np.zeros(len(X_test))

best_xgb_params = {
    'n_estimators': 1342,
    'max_depth': 6,
    'learning_rate': 0.02535288408263534,
    'subsample': 0.7904573035331046,
    'colsample_bytree': 0.7693297580314381,
    'reg_alpha': 0.9678790554111332,
    'reg_lambda': 0.4496537845892851
}

print("\nTraining 5-Fold Ensemble (Optuna Params + Lifestyle Features)...\n")

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}/5 → ", end="")
    
    X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]
    
    # Tuned XGBoost (with categorical support)
    m1 = xgb.XGBClassifier(**best_xgb_params, random_state=42, tree_method="hist", 
                           n_jobs=-1, verbosity=0, enable_categorical=True)
    m1.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    
    # Tuned LightGBM
    m2 = lgb.LGBMClassifier(n_estimators=1342, max_depth=6, learning_rate=0.025,
                            num_leaves=64, subsample=0.79, colsample_bytree=0.77,
                            reg_alpha=0.97, reg_lambda=0.45, random_state=42, n_jobs=-1, verbose=-1)
    m2.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(100)])
    
    # Tuned CatBoost
    m3 = cb.CatBoostClassifier(iterations=1342, depth=6, learning_rate=0.025,
                               l2_leaf_reg=0.45, random_seed=42, verbose=0, 
                               early_stopping_rounds=100)
    m3.fit(X_trn, y_trn, eval_set=(X_val, y_val))
    
    # Collect OOF
    oof[val_idx] = m1.predict_proba(X_val)[:,1]  # Will use for stacking
    
    pred_xgb += m1.predict_proba(X_test)[:,1] / skf.n_splits
    pred_lgb += m2.predict_proba(X_test)[:,1] / skf.n_splits
    pred_cb  += m3.predict_proba(X_test)[:,1] / skf.n_splits
    
    fold_auc = roc_auc_score(y_val, m1.predict_proba(X_val)[:,1])
    print(f"AUC = {fold_auc:.6f}")

print(f"\nFinal CV AUC: {roc_auc_score(y, oof):.6f}")

## 8. Stacking Meta Model

Train Logistic Regression on base model predictions to learn optimal ensemble weights and reduce overfitting.

In [None]:
stack_X = np.column_stack([pred_xgb, pred_lgb, pred_cb])
stack_oof = np.column_stack([m1.predict_proba(X)[:,1], m2.predict_proba(X)[:,1], m3.predict_proba(X)[:,1]])

meta = LogisticRegression(random_state=42)
meta.fit(stack_oof, y)
final_pred = meta.predict_proba(stack_X)[:,1]

print("\nMeta-learner stacking applied")
print(f"Meta weights: XGB={meta.coef_[0][0]:.4f}, LGBM={meta.coef_[0][1]:.4f}, CB={meta.coef_[0][2]:.4f}")

## 9. Final Submission

In [None]:
# Store the stacked test predictions in the sample-submission frame and write it out.
sub[TARGET] = final_pred
sub.to_csv('submission.csv', index=False)

# Summary statistics of the submitted probabilities as a quick sanity check.
pred_mean = final_pred.mean()
pred_min = final_pred.min()
pred_max = final_pred.max()

print("\nsubmission.csv saved!")
print(f'Mean predicted: {pred_mean:.5f}')
print(f'Min predicted: {pred_min:.5f}')
print(f'Max predicted: {pred_max:.5f}')

print("\nFirst few predictions:")
sub.head()

## Summary

**V8 Anti-Overfit Architecture:**
- 77 total features (24 base + 5 engineered + 48 external)
- Advanced lifestyle risk scoring (4-feature composite)
- Age × BMI interaction feature
- Optuna-tuned hyperparameters
- 5-Fold Stratified Cross-Validation
- Logistic Regression meta-learner stacking
- Learns ensemble weights from base model OOF
- Expected CV AUC: ~0.7304

V8 combines enhanced feature engineering with meta-learning stacking to reduce overfitting and discover optimal ensemble blending.