# Diabetes Prediction Model V2 - Advanced
## Kaggle Playground Series S5E12

Improvements:
1. Optuna hyperparameter tuning
2. Advanced feature engineering  
3. Target encoding
4. Stacking ensemble with meta-learner
5. Optimized ensemble weights

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, TargetEncoder
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

print("Libraries loaded!")

Libraries loaded!


  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Data

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(f"Train: {train.shape}, Test: {test.shape}")

Train: (700000, 26), Test: (300000, 25)


## 2. Advanced Feature Engineering

In [3]:
def advanced_feature_engineering(df):
    """Advanced feature engineering with more interactions and domain knowledge"""
    df = df.copy()
    
    # === BASIC FEATURES (from v1) ===
    # BMI categories
    df['bmi_category'] = pd.cut(df['bmi'], bins=[0, 18.5, 25, 30, 100], 
                                 labels=['underweight', 'normal', 'overweight', 'obese'])
    
    # Age groups
    df['age_group'] = pd.cut(df['age'], bins=[0, 30, 45, 60, 100], 
                              labels=['young', 'middle', 'senior', 'elderly'])
    
    # Blood pressure categories
    df['bp_category'] = pd.cut(df['systolic_bp'], bins=[0, 120, 130, 140, 200], 
                                labels=['normal', 'elevated', 'high_stage1', 'high_stage2'])
    
    # Cholesterol ratios
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1)
    df['total_hdl_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1)
    df['non_hdl_cholesterol'] = df['cholesterol_total'] - df['hdl_cholesterol']
    
    # Risk score
    df['medical_risk_score'] = (df['family_history_diabetes'] + 
                                 df['hypertension_history'] + 
                                 df['cardiovascular_history'])
    
    # Activity and lifestyle
    df['activity_score'] = df['physical_activity_minutes_per_week'] / (df['screen_time_hours_per_day'] + 1)
    df['lifestyle_score'] = (df['diet_score'] + df['sleep_hours_per_day'] + 
                              df['physical_activity_minutes_per_week'] / 60 - 
                              df['screen_time_hours_per_day'] - df['alcohol_consumption_per_week'])
    
    # Blood pressure
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['mean_arterial_pressure'] = df['diastolic_bp'] + (df['pulse_pressure'] / 3)
    
    # Interactions
    df['age_bmi_interaction'] = df['age'] * df['bmi']
    df['physical_activity_per_day'] = df['physical_activity_minutes_per_week'] / 7
    
    # === NEW ADVANCED FEATURES ===
    
    # More interaction features
    df['age_systolic_interaction'] = df['age'] * df['systolic_bp']
    df['bmi_triglycerides'] = df['bmi'] * df['triglycerides']
    df['age_cholesterol'] = df['age'] * df['cholesterol_total']
    df['activity_bmi_ratio'] = df['physical_activity_minutes_per_week'] / (df['bmi'] + 1)
    
    # Family history interactions (most important feature!)
    df['family_age_risk'] = df['family_history_diabetes'] * df['age']
    df['family_bmi_risk'] = df['family_history_diabetes'] * df['bmi']
    df['family_activity_protection'] = df['family_history_diabetes'] * df['physical_activity_minutes_per_week']
    
    # Metabolic syndrome indicators
    df['metabolic_risk'] = ((df['bmi'] > 30).astype(int) + 
                            (df['triglycerides'] > 150).astype(int) + 
                            (df['hdl_cholesterol'] < 40).astype(int) +
                            (df['systolic_bp'] > 130).astype(int))
    
    # Triglyceride to HDL ratio (insulin resistance marker)
    df['tg_hdl_ratio'] = df['triglycerides'] / (df['hdl_cholesterol'] + 1)
    
    # Cardiovascular risk score
    df['cv_risk_score'] = (df['systolic_bp'] / 10 + df['ldl_cholesterol'] / 10 + 
                           df['age'] / 5 - df['hdl_cholesterol'] / 10)
    
    # Lifestyle quality score
    df['healthy_lifestyle'] = ((df['physical_activity_minutes_per_week'] > 150).astype(int) +
                                (df['diet_score'] > 7).astype(int) +
                                (df['sleep_hours_per_day'] >= 7).astype(int) +
                                (df['alcohol_consumption_per_week'] < 7).astype(int) +
                                (df['screen_time_hours_per_day'] < 4).astype(int))
    
    # Age-related risk categories
    df['age_risk'] = np.where(df['age'] < 40, 0, 
                     np.where(df['age'] < 50, 1,
                     np.where(df['age'] < 60, 2, 3)))
    
    # BMI squared (non-linear relationship)
    df['bmi_squared'] = df['bmi'] ** 2
    
    # Log transformations for skewed features
    df['log_triglycerides'] = np.log1p(df['triglycerides'])
    df['log_physical_activity'] = np.log1p(df['physical_activity_minutes_per_week'])
    
    # Waist-hip and BMI combined risk
    df['obesity_indicator'] = df['bmi'] * df['waist_to_hip_ratio']
    
    # Sleep quality indicator
    df['poor_sleep'] = ((df['sleep_hours_per_day'] < 6) | (df['sleep_hours_per_day'] > 9)).astype(int)
    
    # Sedentary lifestyle
    df['sedentary'] = ((df['physical_activity_minutes_per_week'] < 75) & 
                        (df['screen_time_hours_per_day'] > 6)).astype(int)
    
    return df

# Apply feature engineering
train_fe = advanced_feature_engineering(train)
test_fe = advanced_feature_engineering(test)

print(f"Train shape after FE: {train_fe.shape}")
print(f"Test shape after FE: {test_fe.shape}")
print(f"New features created: {train_fe.shape[1] - train.shape[1]}")

Train shape after FE: (700000, 57)
Test shape after FE: (300000, 56)
New features created: 31


## 3. Target Encoding for Categorical Variables

In [7]:
# Prepare features
target = 'diagnosed_diabetes'
drop_cols = ['id', target, 'bmi_category', 'age_group', 'bp_category']
cat_cols = ['bmi_category', 'age_group', 'bp_category']

# Identify original string categorical columns that need encoding
original_cat_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']

# Separate features
X = train_fe.drop(columns=[c for c in drop_cols if c in train_fe.columns])
y = train_fe[target]
X_test = test_fe.drop(columns=[c for c in drop_cols if c in test_fe.columns])

# Label encode the original categorical columns
from sklearn.preprocessing import LabelEncoder

for col in original_cat_cols:
    le = LabelEncoder()
    # Fit on all unique values from both train and test
    all_values = pd.concat([X[col], X_test[col]], axis=0).unique()
    le.fit(all_values)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

print(f"Label encoded {len(original_cat_cols)} categorical columns")

# Target encoding for engineered categorical columns
from sklearn.preprocessing import TargetEncoder

# Get categorical columns that exist in X
existing_cat_cols = [c for c in cat_cols if c in train_fe.columns]

# Fit target encoder on full training data
target_encoder = TargetEncoder(target_type='binary', smooth='auto')

# Convert categorical columns back to original data for encoding
train_cats = train_fe[existing_cat_cols].copy()
test_cats = test_fe[existing_cat_cols].copy()

# Fit on training data
target_encoder.fit(train_cats, y)

# Transform both train and test
train_encoded = target_encoder.transform(train_cats)
test_encoded = target_encoder.transform(test_cats)

# Add encoded features to X
for i, col in enumerate(existing_cat_cols):
    X[f'{col}_target_enc'] = train_encoded[:, i]
    X_test[f'{col}_target_enc'] = test_encoded[:, i]

print(f"Features after target encoding: {X.shape[1]}")
print(f"Target encoded columns: {[f'{c}_target_enc' for c in existing_cat_cols]}")
print(f"\nFeature dtypes:\n{X.dtypes.value_counts()}")

Label encoded 6 categorical columns
Features after target encoding: 55
Target encoded columns: ['bmi_category_target_enc', 'age_group_target_enc', 'bp_category_target_enc']

Feature dtypes:
int64      31
float64    24
Name: count, dtype: int64


## 4. Optuna Hyperparameter Tuning for LightGBM

In [8]:
import optuna
from optuna_integration import LightGBMPruningCallback
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_lgb(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
        'random_state': 42,
        
        # Hyperparameters to tune
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    
    cv_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[
                lgb.early_stopping(50, verbose=False),
                LightGBMPruningCallback(trial, 'auc')
            ]
        )
        
        preds = model.predict_proba(X_val)[:, 1]
        score = roc_auc_score(y_val, preds)
        cv_scores.append(score)
    
    return np.mean(cv_scores)

# Run Optuna optimization (50 trials for speed, increase for better results)
print("Starting Optuna optimization for LightGBM...")
study_lgb = optuna.create_study(direction='maximize', study_name='lgb_diabetes')
study_lgb.optimize(objective_lgb, n_trials=50, show_progress_bar=True)

print(f"\nBest LightGBM CV AUC: {study_lgb.best_value:.5f}")
print(f"Best parameters: {study_lgb.best_params}")

Starting Optuna optimization for LightGBM...


Best trial: 2. Best value: 0.725647: 100%|██████████| 50/50 [45:12<00:00, 54.25s/it]  


Best LightGBM CV AUC: 0.72565
Best parameters: {'learning_rate': 0.16650679360420056, 'n_estimators': 606, 'max_depth': 7, 'num_leaves': 20, 'min_child_samples': 94, 'subsample': 0.7718478966632039, 'colsample_bytree': 0.5805949017577372, 'reg_alpha': 1.4322521289019877e-07, 'reg_lambda': 1.1952348229966282e-07}





## 5. Train Optimized Models

In [9]:
# Get best LightGBM params from Optuna
best_lgb_params = study_lgb.best_params.copy()
best_lgb_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
    'random_state': 42
})

# XGBoost params (similar structure)
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_estimators': best_lgb_params.get('n_estimators', 500),
    'learning_rate': best_lgb_params.get('learning_rate', 0.05),
    'max_depth': best_lgb_params.get('max_depth', 6),
    'subsample': best_lgb_params.get('subsample', 0.8),
    'colsample_bytree': best_lgb_params.get('colsample_bytree', 0.8),
    'reg_alpha': best_lgb_params.get('reg_alpha', 0.1),
    'reg_lambda': best_lgb_params.get('reg_lambda', 0.1),
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0
}

# CatBoost params
cat_params = {
    'iterations': best_lgb_params.get('n_estimators', 500),
    'learning_rate': best_lgb_params.get('learning_rate', 0.05),
    'depth': min(best_lgb_params.get('max_depth', 6), 10),
    'l2_leaf_reg': best_lgb_params.get('reg_lambda', 0.1),
    'random_seed': 42,
    'verbose': False,
    'eval_metric': 'AUC',
    'early_stopping_rounds': 50
}

print("Model parameters configured based on Optuna optimization")

Model parameters configured based on Optuna optimization


In [10]:
# Train all models with CV and collect OOF predictions for stacking
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Storage for OOF predictions (for stacking) and test predictions
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

test_preds_lgb = np.zeros(len(X_test))
test_preds_xgb = np.zeros(len(X_test))
test_preds_cat = np.zeros(len(X_test))

cv_scores_lgb = []
cv_scores_xgb = []
cv_scores_cat = []

print("Training models with 5-fold CV...\n")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"=== Fold {fold + 1} ===")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # LightGBM
    lgb_model = lgb.LGBMClassifier(**best_lgb_params)
    lgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    oof_lgb[val_idx] = lgb_model.predict_proba(X_val)[:, 1]
    test_preds_lgb += lgb_model.predict_proba(X_test)[:, 1] / n_splits
    score_lgb = roc_auc_score(y_val, oof_lgb[val_idx])
    cv_scores_lgb.append(score_lgb)
    print(f"  LightGBM: {score_lgb:.5f}")
    
    # XGBoost
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    test_preds_xgb += xgb_model.predict_proba(X_test)[:, 1] / n_splits
    score_xgb = roc_auc_score(y_val, oof_xgb[val_idx])
    cv_scores_xgb.append(score_xgb)
    print(f"  XGBoost:  {score_xgb:.5f}")
    
    # CatBoost
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        verbose=False
    )
    oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
    test_preds_cat += cat_model.predict_proba(X_test)[:, 1] / n_splits
    score_cat = roc_auc_score(y_val, oof_cat[val_idx])
    cv_scores_cat.append(score_cat)
    print(f"  CatBoost: {score_cat:.5f}")
    print()

# Overall CV scores
print("=" * 50)
print(f"LightGBM CV AUC: {np.mean(cv_scores_lgb):.5f} (+/- {np.std(cv_scores_lgb):.5f})")
print(f"XGBoost CV AUC:  {np.mean(cv_scores_xgb):.5f} (+/- {np.std(cv_scores_xgb):.5f})")
print(f"CatBoost CV AUC: {np.mean(cv_scores_cat):.5f} (+/- {np.std(cv_scores_cat):.5f})")

Training models with 5-fold CV...

=== Fold 1 ===
  LightGBM: 0.72634
  XGBoost:  0.71528
  CatBoost: 0.72329

=== Fold 2 ===
  LightGBM: 0.72471
  XGBoost:  0.71456
  CatBoost: 0.72074

=== Fold 3 ===
  LightGBM: 0.72474
  XGBoost:  0.71497
  CatBoost: 0.72205

=== Fold 4 ===
  LightGBM: 0.72598
  XGBoost:  0.71681
  CatBoost: 0.72320

=== Fold 5 ===
  LightGBM: 0.72645
  XGBoost:  0.71558
  CatBoost: 0.72279

LightGBM CV AUC: 0.72565 (+/- 0.00076)
XGBoost CV AUC:  0.71544 (+/- 0.00076)
CatBoost CV AUC: 0.72241 (+/- 0.00094)


## 6. Stacking Ensemble

In [11]:
# Create stacking features from OOF predictions
stack_train = np.column_stack([oof_lgb, oof_xgb, oof_cat])
stack_test = np.column_stack([test_preds_lgb, test_preds_xgb, test_preds_cat])

# Train meta-model (Logistic Regression) with CV
from sklearn.linear_model import LogisticRegression

meta_model = LogisticRegression(C=1.0, random_state=42, max_iter=1000)
meta_scores = []

# Cross-validate the meta-model
for fold, (train_idx, val_idx) in enumerate(skf.split(stack_train, y)):
    X_meta_train, X_meta_val = stack_train[train_idx], stack_train[val_idx]
    y_meta_train, y_meta_val = y.iloc[train_idx], y.iloc[val_idx]
    
    meta_model.fit(X_meta_train, y_meta_train)
    meta_preds = meta_model.predict_proba(X_meta_val)[:, 1]
    meta_scores.append(roc_auc_score(y_meta_val, meta_preds))

print(f"Stacking Meta-Model CV AUC: {np.mean(meta_scores):.5f} (+/- {np.std(meta_scores):.5f})")

# Final meta-model training on all data
meta_model.fit(stack_train, y)
stack_preds = meta_model.predict_proba(stack_test)[:, 1]

print(f"\nMeta-model coefficients: LGB={meta_model.coef_[0][0]:.3f}, XGB={meta_model.coef_[0][1]:.3f}, CAT={meta_model.coef_[0][2]:.3f}")

Stacking Meta-Model CV AUC: 0.72624 (+/- 0.00074)

Meta-model coefficients: LGB=3.369, XGB=0.678, CAT=0.915


## 7. Optimize Ensemble Weights with Optuna

In [12]:
# Find optimal weights for simple weighted average ensemble
def objective_weights(trial):
    w_lgb = trial.suggest_float('w_lgb', 0, 1)
    w_xgb = trial.suggest_float('w_xgb', 0, 1)
    w_cat = trial.suggest_float('w_cat', 0, 1)
    
    # Normalize weights
    total = w_lgb + w_xgb + w_cat
    w_lgb, w_xgb, w_cat = w_lgb/total, w_xgb/total, w_cat/total
    
    # Weighted ensemble using OOF predictions
    ensemble_oof = w_lgb * oof_lgb + w_xgb * oof_xgb + w_cat * oof_cat
    return roc_auc_score(y, ensemble_oof)

print("Optimizing ensemble weights...")
study_weights = optuna.create_study(direction='maximize')
study_weights.optimize(objective_weights, n_trials=100, show_progress_bar=True)

# Get optimal weights
best_weights = study_weights.best_params
total = best_weights['w_lgb'] + best_weights['w_xgb'] + best_weights['w_cat']
w_lgb = best_weights['w_lgb'] / total
w_xgb = best_weights['w_xgb'] / total
w_cat = best_weights['w_cat'] / total

print(f"\nOptimal weights: LGB={w_lgb:.3f}, XGB={w_xgb:.3f}, CAT={w_cat:.3f}")
print(f"Weighted Ensemble CV AUC: {study_weights.best_value:.5f}")

# Create weighted ensemble predictions
weighted_ensemble = w_lgb * test_preds_lgb + w_xgb * test_preds_xgb + w_cat * test_preds_cat

Optimizing ensemble weights...


Best trial: 53. Best value: 0.726248: 100%|██████████| 100/100 [00:17<00:00,  5.69it/s]


Optimal weights: LGB=0.716, XGB=0.150, CAT=0.134
Weighted Ensemble CV AUC: 0.72625





## 8. Final Predictions & Submission

In [14]:
# Compare all ensemble methods
from sklearn.model_selection import cross_val_predict

simple_avg = (test_preds_lgb + test_preds_xgb + test_preds_cat) / 3

# Validate on OOF predictions
simple_avg_oof = (oof_lgb + oof_xgb + oof_cat) / 3
weighted_oof = w_lgb * oof_lgb + w_xgb * oof_xgb + w_cat * oof_cat
stack_oof = cross_val_predict(
    LogisticRegression(C=1.0, random_state=42, max_iter=1000),
    stack_train, y, cv=5, method='predict_proba'
)[:, 1]

print("Final OOF CV AUC Scores:")
print(f"  Simple Average:    {roc_auc_score(y, simple_avg_oof):.5f}")
print(f"  Weighted Average:  {roc_auc_score(y, weighted_oof):.5f}")
print(f"  Stacking:          {roc_auc_score(y, stack_oof):.5f}")

# Choose the best ensemble
best_method = max([
    ('simple_avg', roc_auc_score(y, simple_avg_oof), simple_avg),
    ('weighted', roc_auc_score(y, weighted_oof), weighted_ensemble),
    ('stacking', roc_auc_score(y, stack_oof), stack_preds)
], key=lambda x: x[1])

print(f"\nBest method: {best_method[0]} with CV AUC: {best_method[1]:.5f}")
final_preds = best_method[2]

Final OOF CV AUC Scores:
  Simple Average:    0.72539
  Weighted Average:  0.72625
  Stacking:          0.72622

Best method: weighted with CV AUC: 0.72625


In [None]:
# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'diagnosed_diabetes': final_preds
})
submission.to_csv('submission_v2.csv', index=False)

print(f"Submission saved to submission_v2.csv")
print(f"Shape: {submission.shape}")
print(f"\nPrediction statistics:")
print(submission['diagnosed_diabetes'].describe())

Submission saved to submission_v2.csv
Shape: (300000, 2)

Prediction statistics:
count    300000.000000
mean          0.601488
std           0.192705
min           0.042057
25%           0.463048
50%           0.606288
75%           0.747484
max           0.994777
Name: diagnosed_diabetes, dtype: float64


: 