In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import rankdata
import warnings
import os

# Silence every warning for the rest of the session so library chatter does not flood cell output.
# NOTE(review): this is a blanket filter, so genuinely useful warnings are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

# Shared configuration read by the cells below.
SEED = 42  # base random seed passed to the CV splitters and models
N_SPLITS = 10  # number of stratified CV folds for the base models
TARGET = 'diagnosed_diabetes'  # label column in the training file

In [3]:
# Load data
# Both files are read from the local data/ directory relative to the notebook.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Sanity report, printed before any column is dropped (train still carries id and the target here).
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target rate: {train[TARGET].mean():.4f}")

# Keep the test ids for the submission files, then drop id from both frames so it is not used as a feature.
test_ids = test['id']
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

def prepare_data(df):
    """Return a copy of ``df`` with every object-dtype column cast to ``category``.

    The input frame is left untouched; columns of any other dtype are copied
    through unchanged.
    """
    converted = df.copy()
    object_columns = [name for name in converted.columns if converted[name].dtype == 'object']
    for name in object_columns:
        converted[name] = converted[name].astype('category')
    return converted

# Cast object columns to category in both frames.
# NOTE(review): train and test are converted independently, so their category lists
# (and therefore their integer codes) can differ if a value occurs in only one file.
train = prepare_data(train)
test = prepare_data(test)

# Split the label off the training features; the test frame is copied as-is.
y = train[TARGET]
X = train.drop(columns=[TARGET])
X_test = test.copy()

Train shape: (700000, 26)
Test shape: (300000, 25)
Target rate: 0.6233


## Part 1: Path Smoothing Regularization

Path smoothing reduces overfitting by shrinking each leaf's weight toward its parent node's weight.
This matters most for leaves with few samples, whose estimates are the noisiest and the least likely to hold up under distribution shift.

In [4]:
def train_lgb_path_smooth(X, y, X_test, n_splits=10, path_smooth=10.0, seed=42):
    """Cross-validate a LightGBM classifier with ``path_smooth`` regularization.

    Args:
        X: training features (indexed positionally with ``.iloc``).
        y: training labels (indexed positionally with ``.iloc``).
        X_test: test features scored by every fold model.
        n_splits: number of stratified, shuffled CV folds.
        path_smooth: value forwarded to LightGBM's ``path_smooth`` parameter.
        seed: random state for both the fold splitter and the model.

    Returns:
        ``(oof_preds, test_preds, oof_auc)``: out-of-fold positive-class
        probabilities for ``X``, the fold-averaged probabilities for ``X_test``,
        and the ROC AUC of the out-of-fold predictions against ``y``.
    """
    lgb_params = dict(
        objective='binary',
        metric='auc',
        boosting_type='gbdt',
        n_estimators=5000,
        learning_rate=0.01,
        num_leaves=31,
        max_depth=6,
        min_child_samples=50,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=5,
        reg_alpha=0.5,
        reg_lambda=0.5,
        path_smooth=path_smooth,  # the knob under study in this experiment
        random_state=seed,
        verbose=-1,
    )

    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for train_rows, valid_rows in splitter.split(X, y):
        X_valid, y_valid = X.iloc[valid_rows], y.iloc[valid_rows]

        clf = lgb.LGBMClassifier(**lgb_params)
        # The validation fold doubles as the early-stopping set (patience: 100 rounds).
        clf.fit(
            X.iloc[train_rows], y.iloc[train_rows],
            eval_set=[(X_valid, y_valid)],
            callbacks=[lgb.early_stopping(100, verbose=False)],
        )

        oof_preds[valid_rows] = clf.predict_proba(X_valid)[:, 1]
        # Each fold contributes an equal share of the final test prediction.
        test_preds += clf.predict_proba(X_test)[:, 1] / n_splits

    return oof_preds, test_preds, roc_auc_score(y, oof_preds)

# Try different path_smooth values
# Each setting reruns the full CV, writes its own submission file, and reports the OOF AUC.
# NOTE(review): the recorded run printed the same AUC (0.72853) for all three values, which is
# also the seed-42 figure of the multi-seed run that sets no path_smooth — verify the parameter
# actually changes the fitted model before drawing conclusions from this sweep.
print("=== Path Smooth Regularization ===")
for ps in [5.0, 10.0, 20.0]:
    # oof is returned but not used further in this cell.
    oof, preds, auc = train_lgb_path_smooth(X, y, X_test, n_splits=N_SPLITS, path_smooth=ps, seed=SEED)
    out = f'submission_v7_pathsmooth_{int(ps)}.csv'
    pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': preds}).to_csv(out, index=False)
    print(f"path_smooth={ps:.0f} -> CV AUC {auc:.5f} | saved {out}")

=== Path Smooth Regularization ===
path_smooth=5 -> CV AUC 0.72853 | saved submission_v7_pathsmooth_5.csv
path_smooth=10 -> CV AUC 0.72853 | saved submission_v7_pathsmooth_10.csv
path_smooth=20 -> CV AUC 0.72853 | saved submission_v7_pathsmooth_20.csv


## Part 2: Multi-Seed Averaging

Train the same model with different random seeds and average predictions.
This reduces variance and often improves generalization.

In [5]:
def train_lgb_multiseed(X, y, X_test, n_splits=10, n_seeds=5, base_seed=42):
    """Run the LightGBM CV once per seed and average the predictions across seeds.

    Seeds are ``base_seed, base_seed + 1000, ...`` (``n_seeds`` of them). Each
    seed drives both the fold split and the model's random state.

    Args:
        X: training features (indexed positionally with ``.iloc``).
        y: training labels (indexed positionally with ``.iloc``).
        X_test: test features scored by every fold model.
        n_splits: number of stratified, shuffled CV folds per seed.
        n_seeds: how many seeds to train.
        base_seed: first seed of the sequence.

    Returns:
        ``(avg_oof, avg_test, avg_auc, all_oof, all_test)``: the seed-averaged
        out-of-fold and test predictions, the ROC AUC of the averaged
        out-of-fold predictions, and the per-seed prediction lists.
    """
    shared_params = dict(
        objective='binary',
        metric='auc',
        boosting_type='gbdt',
        n_estimators=5000,
        learning_rate=0.01,
        num_leaves=31,
        max_depth=6,
        min_child_samples=50,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=5,
        reg_alpha=0.5,
        reg_lambda=0.5,
        verbose=-1,
    )

    seeds = [base_seed + 1000 * k for k in range(n_seeds)]
    print(f"Training with seeds: {seeds}")

    all_oof, all_test = [], []
    for seed in seeds:
        # Only the random state differs between seeds; everything else is shared.
        seed_params = {**shared_params, 'random_state': seed}
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        seed_oof = np.zeros(len(X))
        seed_test = np.zeros(len(X_test))

        for train_rows, valid_rows in splitter.split(X, y):
            X_valid, y_valid = X.iloc[valid_rows], y.iloc[valid_rows]

            clf = lgb.LGBMClassifier(**seed_params)
            # The validation fold doubles as the early-stopping set (patience: 100 rounds).
            clf.fit(
                X.iloc[train_rows], y.iloc[train_rows],
                eval_set=[(X_valid, y_valid)],
                callbacks=[lgb.early_stopping(100, verbose=False)],
            )

            seed_oof[valid_rows] = clf.predict_proba(X_valid)[:, 1]
            seed_test += clf.predict_proba(X_test)[:, 1] / n_splits

        seed_auc = roc_auc_score(y, seed_oof)
        print(f"  Seed {seed}: CV AUC = {seed_auc:.5f}")
        all_oof.append(seed_oof)
        all_test.append(seed_test)

    # Element-wise mean over the per-seed prediction vectors.
    avg_oof = np.mean(all_oof, axis=0)
    avg_test = np.mean(all_test, axis=0)

    return avg_oof, avg_test, roc_auc_score(y, avg_oof), all_oof, all_test

print("\n=== Multi-Seed Averaging (5 seeds) ===")
# all_oof / all_test hold the per-seed predictions; oof_ms / test_ms are their means.
# test_ms is reused by the calibration and final-blend cells later in the notebook.
oof_ms, test_ms, auc_ms, all_oof, all_test = train_lgb_multiseed(X, y, X_test, n_splits=N_SPLITS, n_seeds=5)
print(f"Averaged CV AUC: {auc_ms:.5f}")

# Write the seed-averaged test predictions as a submission file.
pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_ms}).to_csv('submission_v7_multiseed_5.csv', index=False)
print("Saved: submission_v7_multiseed_5.csv")


=== Multi-Seed Averaging (5 seeds) ===
Training with seeds: [42, 1042, 2042, 3042, 4042]
  Seed 42: CV AUC = 0.72853
  Seed 1042: CV AUC = 0.72836
  Seed 2042: CV AUC = 0.72839
  Seed 3042: CV AUC = 0.72844
  Seed 4042: CV AUC = 0.72838
Averaged CV AUC: 0.72871
Saved: submission_v7_multiseed_5.csv


## Part 3: Stacked Generalization (Level-2 Meta-Model)

Use OOF predictions from diverse base models as features for a meta-learner.
This can capture complementary patterns from different algorithms.

In [6]:
def train_base_models(X, y, X_test, n_splits=10, seed=42):
    """Train three base models with a shared CV split and collect their predictions.

    LightGBM, CatBoost and XGBoost are each cross-validated on the same
    stratified, shuffled folds. LightGBM and CatBoost receive the frames as
    given; XGBoost receives integer-encoded copies of the category columns.

    Args:
        X: training features (indexed positionally with ``.iloc``); category
            dtype columns are treated as categorical features.
        y: training labels (indexed positionally with ``.iloc``).
        X_test: test features with the same columns as ``X``.
        n_splits: number of CV folds.
        seed: random state for the splitter; LightGBM uses ``seed``, CatBoost
            and XGBoost use ``seed + fold``.

    Returns:
        ``(base_oof, base_test)``: two dicts keyed ``'lgb'``, ``'cat'``,
        ``'xgb'`` holding each model's out-of-fold predictions for ``X`` and
        its fold-averaged predictions for ``X_test``.
    """

    base_oof = {}
    base_test = {}

    # One splitter for all three models: with a fixed random_state every
    # skf.split(...) call below yields the same folds.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Model 1: LightGBM (default)
    print("Training LightGBM...")
    lgb_oof = np.zeros(len(X))
    lgb_test = np.zeros(len(X_test))
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
        model = lgb.LGBMClassifier(
            n_estimators=3000, learning_rate=0.01, num_leaves=31, max_depth=6,
            min_child_samples=50, feature_fraction=0.7, bagging_fraction=0.7,
            bagging_freq=5, reg_alpha=0.5, reg_lambda=0.5, random_state=seed, verbose=-1
        )
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx], eval_set=[(X.iloc[va_idx], y.iloc[va_idx])],
                  callbacks=[lgb.early_stopping(100, verbose=False)])
        lgb_oof[va_idx] = model.predict_proba(X.iloc[va_idx])[:, 1]
        lgb_test += model.predict_proba(X_test)[:, 1] / n_splits
    base_oof['lgb'] = lgb_oof
    base_test['lgb'] = lgb_test
    print(f"  LGB CV: {roc_auc_score(y, lgb_oof):.5f}")

    # Model 2: CatBoost
    print("Training CatBoost...")
    cat_cols = X.select_dtypes(include=['category']).columns.tolist()
    cat_oof = np.zeros(len(X))
    cat_test = np.zeros(len(X_test))
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
        model = cb.CatBoostClassifier(
            iterations=2000, learning_rate=0.02, depth=5, l2_leaf_reg=5,
            random_seed=seed+fold, verbose=False, early_stopping_rounds=150
        )
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx], cat_features=cat_cols,
                  eval_set=(X.iloc[va_idx], y.iloc[va_idx]))
        cat_oof[va_idx] = model.predict_proba(X.iloc[va_idx])[:, 1]
        cat_test += model.predict_proba(X_test)[:, 1] / n_splits
    base_oof['cat'] = cat_oof
    base_test['cat'] = cat_test
    print(f"  CAT CV: {roc_auc_score(y, cat_oof):.5f}")

    # Model 3: XGBoost
    print("Training XGBoost...")
    X_enc = X.copy()
    X_test_enc = X_test.copy()
    for col in X_enc.select_dtypes(include=['category']).columns:
        # Encode test with the TRAIN category list. Taking .cat.codes on each frame
        # separately gives the same string different integers whenever a level is
        # present in only one of the two frames. Levels unseen in train become -1,
        # the same code pandas uses for missing values.
        train_categories = X_enc[col].cat.categories
        X_test_enc[col] = X_test_enc[col].cat.set_categories(train_categories).cat.codes
        X_enc[col] = X_enc[col].cat.codes

    xgb_oof = np.zeros(len(X))
    xgb_test = np.zeros(len(X_test))
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_enc, y)):
        model = xgb.XGBClassifier(
            n_estimators=2000, learning_rate=0.02, max_depth=5, subsample=0.7,
            colsample_bytree=0.5, reg_lambda=2.0, reg_alpha=0.5,
            random_state=seed+fold, tree_method='hist', early_stopping_rounds=150
        )
        model.fit(X_enc.iloc[tr_idx], y.iloc[tr_idx],
                  eval_set=[(X_enc.iloc[va_idx], y.iloc[va_idx])], verbose=False)
        xgb_oof[va_idx] = model.predict_proba(X_enc.iloc[va_idx])[:, 1]
        xgb_test += model.predict_proba(X_test_enc)[:, 1] / n_splits
    base_oof['xgb'] = xgb_oof
    base_test['xgb'] = xgb_test
    print(f"  XGB CV: {roc_auc_score(y, xgb_oof):.5f}")

    return base_oof, base_test

print("\n=== Training Base Models for Stacking ===")
# base_oof / base_test are dicts keyed 'lgb', 'cat', 'xgb'; the next cell turns them into DataFrames.
base_oof, base_test = train_base_models(X, y, X_test, n_splits=N_SPLITS, seed=SEED)


=== Training Base Models for Stacking ===
Training LightGBM...
  LGB CV: 0.72707
Training CatBoost...
  CAT CV: 0.72287
Training XGBoost...
  XGB CV: 0.72627


In [7]:
# Create stacking features: one column per base model ('lgb', 'cat', 'xgb')
stack_train = pd.DataFrame(base_oof)
stack_test = pd.DataFrame(base_test)

print("\n=== Meta-Model Training ===")

# Method 1: Simple averaging (baseline)
avg_oof = stack_train.mean(axis=1)
avg_test = stack_test.mean(axis=1)
print(f"Simple average: CV AUC = {roc_auc_score(y, avg_oof):.5f}")

# Method 2: Rank averaging
# Each model's predictions are replaced by their normalized rank within the
# frame before averaging, so models with different score scales weigh equally.
rank_oof = stack_train.apply(lambda x: rankdata(x) / len(x)).mean(axis=1)
rank_test = stack_test.apply(lambda x: rankdata(x) / len(x)).mean(axis=1)
print(f"Rank average: CV AUC = {roc_auc_score(y, rank_oof):.5f}")

# Method 3: Logistic Regression meta-model
# The fold count is named once so the test-averaging divisor below cannot
# drift away from the splitter's n_splits.
N_META_SPLITS = 5
skf = StratifiedKFold(n_splits=N_META_SPLITS, shuffle=True, random_state=SEED)
lr_oof = np.zeros(len(y))
lr_test = np.zeros(len(X_test))

for fold, (tr_idx, va_idx) in enumerate(skf.split(stack_train, y)):
    meta = LogisticRegression(C=1.0, max_iter=1000, random_state=SEED)
    meta.fit(stack_train.iloc[tr_idx], y.iloc[tr_idx])
    lr_oof[va_idx] = meta.predict_proba(stack_train.iloc[va_idx])[:, 1]
    # Every fold's meta-model contributes an equal share of the test prediction.
    lr_test += meta.predict_proba(stack_test)[:, 1] / N_META_SPLITS

print(f"LR meta-model: CV AUC = {roc_auc_score(y, lr_oof):.5f}")

# Save stacking submissions
pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': avg_test}).to_csv('submission_v7_stack_avg.csv', index=False)
pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': rank_test}).to_csv('submission_v7_stack_rank.csv', index=False)
pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': lr_test}).to_csv('submission_v7_stack_lr.csv', index=False)
print("\nSaved: submission_v7_stack_avg.csv, submission_v7_stack_rank.csv, submission_v7_stack_lr.csv")


=== Meta-Model Training ===
Simple average: CV AUC = 0.72599
Rank average: CV AUC = 0.72595
LR meta-model: CV AUC = 0.72778

Saved: submission_v7_stack_avg.csv, submission_v7_stack_rank.csv, submission_v7_stack_lr.csv


## Part 4: Target Distribution Calibration

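If the test target distribution differs from train, calibrating predictions can help. The transform used here is monotonic, so it changes rank-based metrics such as AUC only through the ties it creates at the clipped extremes; it mainly matters for metrics that depend on the probability values themselves.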

In [8]:
def calibrate_to_prior(preds, target_prior):
    """Monotonically remap predictions so that their mean equals ``target_prior``.

    Predictions are converted to normalized ranks, mapped to z-scores with the
    normal quantile function, shifted by a constant, and mapped back through
    the normal CDF. The constant is found numerically so that the mean of the
    result matches ``target_prior``. Shifting the mean z-score to
    ``norm.ppf(target_prior)`` does not achieve this, because the CDF is
    non-linear: a 0.60 target would end up with a mean near 0.57.

    Args:
        preds: 1-D array-like of predicted probabilities.
        target_prior: desired mean prediction, strictly between 0 and 1.

    Returns:
        ``preds`` itself when its mean is already within 0.001 of
        ``target_prior``; otherwise a new float array of the same length whose
        ordering follows that of ``preds`` (ranks are clipped to
        [0.001, 0.999], so the most extreme values may become tied).

    Raises:
        ValueError: if ``target_prior`` is not strictly between 0 and 1.
    """
    from scipy.optimize import brentq
    from scipy.stats import norm

    if not 0.0 < target_prior < 1.0:
        raise ValueError(f"target_prior must be strictly between 0 and 1, got {target_prior!r}")

    values = np.asarray(preds, dtype=float)
    if values.size == 0:
        # Nothing to calibrate; avoid taking the mean of an empty array.
        return values
    if abs(values.mean() - target_prior) < 0.001:
        return preds

    # Rank-based transform: only the ordering of the input matters from here on.
    ranks = rankdata(values) / len(values)
    # Clip so that the top rank (exactly 1.0) does not map to an infinite z-score.
    z_scores = norm.ppf(np.clip(ranks, 0.001, 0.999))

    def mean_gap(shift):
        """Signed distance between the shifted mean probability and the target."""
        return norm.cdf(z_scores + shift).mean() - target_prior

    # z_scores lie within about +/-3.1, so a shift of +/-20 drives every
    # probability to ~0 or ~1; the root is therefore always bracketed.
    shift = brentq(mean_gap, -20.0, 20.0)

    return norm.cdf(z_scores + shift)

# Train target rate
train_prior = y.mean()
print(f"Train positive rate: {train_prior:.4f}")

# Try different target priors (in case test has different rate)
print("\n=== Target Distribution Calibration ===")
for target_prior in [0.60, 0.62, 0.65]:
    calib_preds = calibrate_to_prior(test_ms, target_prior)
    # Round before int(): float products can land just below the integer
    # (0.29 * 100 == 28.999999999999996), which int() alone would truncate
    # and so mislabel the file.
    out = f'submission_v7_calib_{int(round(target_prior * 100))}.csv'
    pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': calib_preds}).to_csv(out, index=False)
    print(f"Calibrated to {target_prior:.0%} -> saved {out}")

Train positive rate: 0.6233

=== Target Distribution Calibration ===
Calibrated to 60% -> saved submission_v7_calib_60.csv
Calibrated to 62% -> saved submission_v7_calib_62.csv
Calibrated to 65% -> saved submission_v7_calib_65.csv


## Part 5: Blend Best Previous Submissions with V7

Combine best previous submissions with new V7 predictions.

In [9]:
# Load best previous submissions
# Each entry is (file name, short label, public LB score or None if unknown).
prev_subs = {}
prev_files = [
    ('submission_v5_lgb_simple.csv', 'V5_LGB', 0.69771),
    ('submission.csv', 'V1', 0.69720),
    ('submission_v6_quantile_norm.csv', 'V6_QN', None),
]

for fname, name, lb in prev_files:
    if not os.path.exists(fname):
        print(f"Skipping {name}: {fname} not found")
        continue

    sub = pd.read_csv(fname)
    if 'id' in sub.columns:
        # Align on id rather than row position, so a file saved in a different
        # row order is not silently blended against the wrong rows.
        sub_preds = sub.set_index('id')['diagnosed_diabetes'].reindex(test_ids.values).values
    else:
        sub_preds = sub['diagnosed_diabetes'].values
    if len(sub_preds) != len(test_ids) or pd.isna(sub_preds).any():
        raise ValueError(f"{fname} does not provide a prediction for every current test id")

    prev_subs[name] = sub_preds
    print(f"Loaded {name}" + (f" (LB: {lb})" if lb is not None else ""))

# Create final blend
print("\n=== Final Blend ===")
if 'V5_LGB' in prev_subs and 'V1' in prev_subs:
    # Blend: V7 multi-seed (new) + V5_LGB (best LB) + V1 (stable)
    final_blend = (
        test_ms * 0.4 +           # V7 multi-seed
        prev_subs['V5_LGB'] * 0.35 +  # V5 LGB (best LB)
        prev_subs['V1'] * 0.25       # V1 (stable)
    )
    pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': final_blend}).to_csv('submission_v7_final_blend.csv', index=False)
    print("Saved: submission_v7_final_blend.csv (V7*0.4 + V5_LGB*0.35 + V1*0.25)")

    # Also try rank blend: same weights applied to normalized ranks instead of raw scores
    r1 = rankdata(test_ms) / len(test_ms)
    r2 = rankdata(prev_subs['V5_LGB']) / len(prev_subs['V5_LGB'])
    r3 = rankdata(prev_subs['V1']) / len(prev_subs['V1'])
    rank_final = (r1 * 0.4 + r2 * 0.35 + r3 * 0.25)
    pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': rank_final}).to_csv('submission_v7_final_rank.csv', index=False)
    print("Saved: submission_v7_final_rank.csv (rank blend)")
else:
    # Say so explicitly instead of finishing the cell with no blend and no message.
    print("Skipped: the final blend needs both the V5_LGB and V1 submissions")

Loaded V5_LGB (LB: 0.69771)
Loaded V1 (LB: 0.6972)
Loaded V6_QN

=== Final Blend ===
Saved: submission_v7_final_blend.csv (V7*0.4 + V5_LGB*0.35 + V1*0.25)
Saved: submission_v7_final_rank.csv (rank blend)


## Summary

In [10]:
# Final report: list the V7 submission files that exist on disk, then the suggested picks.
banner = "=" * 60

print("\n" + banner)
print("V7 SUBMISSIONS READY FOR APPROVAL")
print(banner)

v7_files = [
    'submission_v7_pathsmooth_5.csv',
    'submission_v7_pathsmooth_10.csv',
    'submission_v7_pathsmooth_20.csv',
    'submission_v7_multiseed_5.csv',
    'submission_v7_stack_avg.csv',
    'submission_v7_stack_rank.csv',
    'submission_v7_stack_lr.csv',
    'submission_v7_calib_60.csv',
    'submission_v7_calib_62.csv',
    'submission_v7_calib_65.csv',
    'submission_v7_final_blend.csv',
    'submission_v7_final_rank.csv',
]

print("\nSubmissions created:")
# Numbering follows the position in v7_files, so a missing file leaves a gap in the list.
found = [(i, fname) for i, fname in enumerate(v7_files, 1) if os.path.exists(fname)]
for i, fname in found:
    print(f"  {i}. {fname}")

print("\n" + banner)
print("TOP PICKS TO TRY:")
print("  1. submission_v7_multiseed_5.csv - reduces variance")
print("  2. submission_v7_stack_rank.csv - diverse model blend")
print("  3. submission_v7_final_blend.csv - combines best of all")
print(banner)


V7 SUBMISSIONS READY FOR APPROVAL

Submissions created:
  1. submission_v7_pathsmooth_5.csv
  2. submission_v7_pathsmooth_10.csv
  3. submission_v7_pathsmooth_20.csv
  4. submission_v7_multiseed_5.csv
  5. submission_v7_stack_avg.csv
  6. submission_v7_stack_rank.csv
  7. submission_v7_stack_lr.csv
  8. submission_v7_calib_60.csv
  9. submission_v7_calib_62.csv
  10. submission_v7_calib_65.csv
  11. submission_v7_final_blend.csv
  12. submission_v7_final_rank.csv

TOP PICKS TO TRY:
  1. submission_v7_multiseed_5.csv - reduces variance
  2. submission_v7_stack_rank.csv - diverse model blend
  3. submission_v7_final_blend.csv - combines best of all
