# Model V3: Seed Averaging + Full Optuna Tuning

**Improvements:**
1. Optuna tuning for ALL models (LightGBM, XGBoost, CatBoost)
2. Seed averaging (train with multiple seeds, average predictions)
3. More Optuna trials for better hyperparameters

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna
from optuna_integration import LightGBMPruningCallback
optuna.logging.set_verbosity(optuna.logging.WARNING)
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded!")

Libraries loaded!


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(f"Train: {train.shape}, Test: {test.shape}")

Train: (700000, 26), Test: (300000, 25)


In [3]:
def advanced_feature_engineering(df):
    """Advanced feature engineering"""
    df = df.copy()
    
    # BMI categories
    df['bmi_category'] = pd.cut(df['bmi'], bins=[0, 18.5, 25, 30, 100], 
                                 labels=['underweight', 'normal', 'overweight', 'obese'])
    
    # Age groups
    df['age_group'] = pd.cut(df['age'], bins=[0, 30, 45, 60, 100], 
                              labels=['young', 'middle', 'senior', 'elderly'])
    
    # Blood pressure categories
    df['bp_category'] = pd.cut(df['systolic_bp'], bins=[0, 120, 130, 140, 200], 
                                labels=['normal', 'elevated', 'high_stage1', 'high_stage2'])
    
    # Cholesterol ratios
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1)
    df['total_hdl_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1)
    df['non_hdl_cholesterol'] = df['cholesterol_total'] - df['hdl_cholesterol']
    
    # Risk score
    df['medical_risk_score'] = (df['family_history_diabetes'] + 
                                 df['hypertension_history'] + 
                                 df['cardiovascular_history'])
    
    # Activity and lifestyle
    df['activity_score'] = df['physical_activity_minutes_per_week'] / (df['screen_time_hours_per_day'] + 1)
    df['lifestyle_score'] = (df['diet_score'] + df['sleep_hours_per_day'] + 
                              df['physical_activity_minutes_per_week'] / 60 - 
                              df['screen_time_hours_per_day'] - df['alcohol_consumption_per_week'])
    
    # Blood pressure
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['mean_arterial_pressure'] = df['diastolic_bp'] + (df['pulse_pressure'] / 3)
    
    # Interactions
    df['age_bmi_interaction'] = df['age'] * df['bmi']
    df['physical_activity_per_day'] = df['physical_activity_minutes_per_week'] / 7
    df['age_systolic_interaction'] = df['age'] * df['systolic_bp']
    df['bmi_triglycerides'] = df['bmi'] * df['triglycerides']
    df['age_cholesterol'] = df['age'] * df['cholesterol_total']
    df['activity_bmi_ratio'] = df['physical_activity_minutes_per_week'] / (df['bmi'] + 1)
    
    # Family history interactions
    df['family_age_risk'] = df['family_history_diabetes'] * df['age']
    df['family_bmi_risk'] = df['family_history_diabetes'] * df['bmi']
    df['family_activity_protection'] = df['family_history_diabetes'] * df['physical_activity_minutes_per_week']
    
    # Metabolic syndrome indicators
    df['metabolic_risk'] = ((df['bmi'] > 30).astype(int) + 
                            (df['triglycerides'] > 150).astype(int) + 
                            (df['hdl_cholesterol'] < 40).astype(int) +
                            (df['systolic_bp'] > 130).astype(int))
    
    # Triglyceride to HDL ratio
    df['tg_hdl_ratio'] = df['triglycerides'] / (df['hdl_cholesterol'] + 1)
    
    # Cardiovascular risk score
    df['cv_risk_score'] = (df['systolic_bp'] / 10 + df['ldl_cholesterol'] / 10 + 
                           df['age'] / 5 - df['hdl_cholesterol'] / 10)
    
    # Lifestyle quality
    df['healthy_lifestyle'] = ((df['physical_activity_minutes_per_week'] > 150).astype(int) +
                                (df['diet_score'] > 7).astype(int) +
                                (df['sleep_hours_per_day'] >= 7).astype(int) +
                                (df['alcohol_consumption_per_week'] < 7).astype(int) +
                                (df['screen_time_hours_per_day'] < 4).astype(int))
    
    # Age risk
    df['age_risk'] = np.where(df['age'] < 40, 0, np.where(df['age'] < 50, 1, np.where(df['age'] < 60, 2, 3)))
    
    # Non-linear
    df['bmi_squared'] = df['bmi'] ** 2
    df['log_triglycerides'] = np.log1p(df['triglycerides'])
    df['log_physical_activity'] = np.log1p(df['physical_activity_minutes_per_week'])
    df['obesity_indicator'] = df['bmi'] * df['waist_to_hip_ratio']
    df['poor_sleep'] = ((df['sleep_hours_per_day'] < 6) | (df['sleep_hours_per_day'] > 9)).astype(int)
    df['sedentary'] = ((df['physical_activity_minutes_per_week'] < 75) & 
                        (df['screen_time_hours_per_day'] > 6)).astype(int)
    
    return df

# Apply feature engineering
train_fe = advanced_feature_engineering(train)
test_fe = advanced_feature_engineering(test)
print(f"Train shape after FE: {train_fe.shape}, Test shape: {test_fe.shape}")

Train shape after FE: (700000, 57), Test shape: (300000, 56)


In [4]:
# Prepare features
target = 'diagnosed_diabetes'
drop_cols = ['id', target, 'bmi_category', 'age_group', 'bp_category']
cat_cols = ['bmi_category', 'age_group', 'bp_category']
original_cat_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']

X = train_fe.drop(columns=[c for c in drop_cols if c in train_fe.columns])
y = train_fe[target]
X_test = test_fe.drop(columns=[c for c in drop_cols if c in test_fe.columns])

# Label encode categorical columns
for col in original_cat_cols:
    le = LabelEncoder()
    all_values = pd.concat([X[col], X_test[col]], axis=0).unique()
    le.fit(all_values)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Target encoding
existing_cat_cols = [c for c in cat_cols if c in train_fe.columns]
target_encoder = TargetEncoder(target_type='binary', smooth='auto')
train_cats = train_fe[existing_cat_cols].copy()
test_cats = test_fe[existing_cat_cols].copy()
target_encoder.fit(train_cats, y)
train_encoded = target_encoder.transform(train_cats)
test_encoded = target_encoder.transform(test_cats)

for i, col in enumerate(existing_cat_cols):
    X[f'{col}_target_enc'] = train_encoded[:, i]
    X_test[f'{col}_target_enc'] = test_encoded[:, i]

print(f"Features: {X.shape[1]}")

Features: 55


## Optuna Tuning for All Models

In [5]:
# LightGBM Optuna objective
def objective_lgb(trial):
    params = {
        'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
        'verbosity': -1, 'n_jobs': -1, 'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    
    cv_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(50, verbose=False), LightGBMPruningCallback(trial, 'auc')])
        cv_scores.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    return np.mean(cv_scores)

print("Tuning LightGBM (50 trials)...")
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=50, show_progress_bar=True)
print(f"Best LightGBM AUC: {study_lgb.best_value:.5f}")

Tuning LightGBM (50 trials)...


Best trial: 16. Best value: 0.725883: 100%|██████████| 50/50 [36:18<00:00, 43.58s/it]   

Best LightGBM AUC: 0.72588





In [6]:
# XGBoost Optuna objective
def objective_xgb(trial):
    params = {
        'objective': 'binary:logistic', 'eval_metric': 'auc',
        'verbosity': 0, 'n_jobs': -1, 'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    
    cv_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        cv_scores.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    return np.mean(cv_scores)

print("Tuning XGBoost (50 trials)...")
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)
print(f"Best XGBoost AUC: {study_xgb.best_value:.5f}")

Tuning XGBoost (50 trials)...


Best trial: 11. Best value: 0.725713: 100%|██████████| 50/50 [2:30:23<00:00, 180.46s/it]  

Best XGBoost AUC: 0.72571





In [7]:
# CatBoost Optuna objective
def objective_cat(trial):
    params = {
        'eval_metric': 'AUC', 'verbose': False, 'random_seed': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
    }
    
    cv_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)
        cv_scores.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    return np.mean(cv_scores)

print("Tuning CatBoost (30 trials - slower)...")
study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=30, show_progress_bar=True)
print(f"Best CatBoost AUC: {study_cat.best_value:.5f}")

Tuning CatBoost (30 trials - slower)...


Best trial: 27. Best value: 0.725749: 100%|██████████| 30/30 [1:17:14<00:00, 154.50s/it]

Best CatBoost AUC: 0.72575





## Seed Averaging Training

In [8]:
# Prepare best params from Optuna
best_lgb_params = study_lgb.best_params.copy()
best_lgb_params.update({'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt', 'verbosity': -1, 'n_jobs': -1})

best_xgb_params = study_xgb.best_params.copy()
best_xgb_params.update({'objective': 'binary:logistic', 'eval_metric': 'auc', 'verbosity': 0, 'n_jobs': -1})

best_cat_params = study_cat.best_params.copy()
best_cat_params.update({'eval_metric': 'AUC', 'verbose': False})

print("Best params configured:")
print(f"LightGBM: {study_lgb.best_value:.5f}")
print(f"XGBoost: {study_xgb.best_value:.5f}")
print(f"CatBoost: {study_cat.best_value:.5f}")

Best params configured:
LightGBM: 0.72588
XGBoost: 0.72571
CatBoost: 0.72575


In [9]:
# Seed averaging - train with multiple seeds and average predictions
SEEDS = [42, 123, 456, 789, 2024]
n_splits = 5

# Storage for predictions
all_oof_lgb = np.zeros((len(SEEDS), len(X)))
all_oof_xgb = np.zeros((len(SEEDS), len(X)))
all_oof_cat = np.zeros((len(SEEDS), len(X)))

all_test_lgb = np.zeros((len(SEEDS), len(X_test)))
all_test_xgb = np.zeros((len(SEEDS), len(X_test)))
all_test_cat = np.zeros((len(SEEDS), len(X_test)))

print("Training with seed averaging (5 seeds x 5 folds)...\n")

for seed_idx, seed in enumerate(SEEDS):
    print(f"=== Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ===")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    
    oof_lgb = np.zeros(len(X))
    oof_xgb = np.zeros(len(X))
    oof_cat = np.zeros(len(X))
    test_lgb = np.zeros(len(X_test))
    test_xgb = np.zeros(len(X_test))
    test_cat = np.zeros(len(X_test))
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # LightGBM
        lgb_params = best_lgb_params.copy()
        lgb_params['random_state'] = seed
        lgb_model = lgb.LGBMClassifier(**lgb_params)
        lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
        oof_lgb[val_idx] = lgb_model.predict_proba(X_val)[:, 1]
        test_lgb += lgb_model.predict_proba(X_test)[:, 1] / n_splits
        
        # XGBoost
        xgb_params = best_xgb_params.copy()
        xgb_params['random_state'] = seed
        xgb_model = xgb.XGBClassifier(**xgb_params)
        xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
        test_xgb += xgb_model.predict_proba(X_test)[:, 1] / n_splits
        
        # CatBoost
        cat_params = best_cat_params.copy()
        cat_params['random_seed'] = seed
        cat_model = CatBoostClassifier(**cat_params)
        cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)
        oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
        test_cat += cat_model.predict_proba(X_test)[:, 1] / n_splits
    
    all_oof_lgb[seed_idx] = oof_lgb
    all_oof_xgb[seed_idx] = oof_xgb
    all_oof_cat[seed_idx] = oof_cat
    all_test_lgb[seed_idx] = test_lgb
    all_test_xgb[seed_idx] = test_xgb
    all_test_cat[seed_idx] = test_cat
    
    print(f"  LGB: {roc_auc_score(y, oof_lgb):.5f}, XGB: {roc_auc_score(y, oof_xgb):.5f}, CAT: {roc_auc_score(y, oof_cat):.5f}")

# Average across seeds
oof_lgb_avg = all_oof_lgb.mean(axis=0)
oof_xgb_avg = all_oof_xgb.mean(axis=0)
oof_cat_avg = all_oof_cat.mean(axis=0)
test_lgb_avg = all_test_lgb.mean(axis=0)
test_xgb_avg = all_test_xgb.mean(axis=0)
test_cat_avg = all_test_cat.mean(axis=0)

print("\n" + "="*50)
print("Seed-Averaged CV AUC:")
print(f"  LightGBM: {roc_auc_score(y, oof_lgb_avg):.5f}")
print(f"  XGBoost:  {roc_auc_score(y, oof_xgb_avg):.5f}")
print(f"  CatBoost: {roc_auc_score(y, oof_cat_avg):.5f}")

Training with seed averaging (5 seeds x 5 folds)...

=== Seed 42 (1/5) ===
  LGB: 0.72588, XGB: 0.72571, CAT: 0.72574
=== Seed 123 (2/5) ===
  LGB: 0.72568, XGB: 0.72605, CAT: 0.72541
=== Seed 456 (3/5) ===
  LGB: 0.72566, XGB: 0.72594, CAT: 0.72579
=== Seed 789 (4/5) ===
  LGB: 0.72565, XGB: 0.72596, CAT: 0.72555
=== Seed 2024 (5/5) ===
  LGB: 0.72581, XGB: 0.72561, CAT: 0.72581

Seed-Averaged CV AUC:
  LightGBM: 0.72731
  XGBoost:  0.72750
  CatBoost: 0.72666


## Optimize Ensemble Weights

In [10]:
# Optimize ensemble weights
def objective_weights(trial):
    w_lgb = trial.suggest_float('w_lgb', 0, 1)
    w_xgb = trial.suggest_float('w_xgb', 0, 1)
    w_cat = trial.suggest_float('w_cat', 0, 1)
    total = w_lgb + w_xgb + w_cat
    w_lgb, w_xgb, w_cat = w_lgb/total, w_xgb/total, w_cat/total
    ensemble_oof = w_lgb * oof_lgb_avg + w_xgb * oof_xgb_avg + w_cat * oof_cat_avg
    return roc_auc_score(y, ensemble_oof)

print("Optimizing ensemble weights...")
study_weights = optuna.create_study(direction='maximize')
study_weights.optimize(objective_weights, n_trials=200, show_progress_bar=True)

# Get optimal weights
bw = study_weights.best_params
total = bw['w_lgb'] + bw['w_xgb'] + bw['w_cat']
w_lgb, w_xgb, w_cat = bw['w_lgb']/total, bw['w_xgb']/total, bw['w_cat']/total

print(f"\nOptimal weights: LGB={w_lgb:.3f}, XGB={w_xgb:.3f}, CAT={w_cat:.3f}")
print(f"Weighted Ensemble CV AUC: {study_weights.best_value:.5f}")

Optimizing ensemble weights...


Best trial: 189. Best value: 0.728072: 100%|██████████| 200/200 [00:28<00:00,  6.93it/s]


Optimal weights: LGB=0.468, XGB=0.532, CAT=0.000
Weighted Ensemble CV AUC: 0.72807





In [None]:
# Create final predictions
final_preds = w_lgb * test_lgb_avg + w_xgb * test_xgb_avg + w_cat * test_cat_avg

# Create submission
submission = pd.DataFrame({'id': test['id'], 'diagnosed_diabetes': final_preds})
submission.to_csv('submission_v3.csv', index=False)

print(f"Submission saved to submission_v3.csv")
print(f"Shape: {submission.shape}")
print(f"\nPrediction stats:\n{submission['diagnosed_diabetes'].describe()}")

Submission saved to submission_v3.csv
Shape: (300000, 2)

Prediction stats:
count    300000.000000
mean          0.601070
std           0.194198
min           0.042612
25%           0.460732
50%           0.606089
75%           0.749255
max           0.994643
Name: diagnosed_diabetes, dtype: float64


: 