In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

SEED = 42
N_SPLITS = 10
TARGET = 'diagnosed_diabetes'
COMP = 'playground-series-s5e12'


In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, KFold
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

SEED = 42
N_SPLITS = 10
TARGET = 'diagnosed_diabetes'
DATA_DIR = 'data'

In [None]:
# Load data
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

print('Train shape:', train.shape)
print('Test shape :', test.shape)
print('Target rate:', train[TARGET].mean())

test_ids = test['id']
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

def to_category(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c in out.columns:
        if out[c].dtype == 'object':
            out[c] = out[c].astype('category')
    return out

train = to_category(train)
test = to_category(test)

y = train[TARGET].astype(int)
X = train.drop(columns=[TARGET])
X_test = test.copy()

cat_cols = X.select_dtypes(include=['category']).columns.tolist()
print('Categorical cols:', cat_cols)

## 1) Domain classifier → test-likeness score

We learn `p_test(x) = P(is_test=1 | x)` using a domain classifier (train vs test).

In [None]:
def fit_domain_classifier_get_p_test(X_train: pd.DataFrame, X_test: pd.DataFrame, seed: int = 42):
    X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)
    y_dom = np.concatenate([np.zeros(len(X_train), dtype=int), np.ones(len(X_test), dtype=int)])

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 4000,
        'learning_rate': 0.02,
        'num_leaves': 63,
        'max_depth': -1,
        'min_child_samples': 100,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'reg_alpha': 0.0,
        'reg_lambda': 1.0,
        'random_state': seed,
        'verbose': -1,
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros(len(X_all))
    for tr, va in kf.split(X_all, y_dom):
        m = lgb.LGBMClassifier(**params)
        m.fit(
            X_all.iloc[tr], y_dom[tr],
            eval_set=[(X_all.iloc[va], y_dom[va])],
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        oof[va] = m.predict_proba(X_all.iloc[va])[:, 1]

    dom_auc = roc_auc_score(y_dom, oof)
    print(f'Domain (train vs test) CV AUC: {dom_auc:.5f}')

    # fit final model for consistent scoring
    final = lgb.LGBMClassifier(**params)
    final.fit(X_all, y_dom)
    p_all = final.predict_proba(X_all)[:, 1]
    p_train = p_all[: len(X_train)]
    p_test = p_all[len(X_train):]
    return p_train, p_test, dom_auc

p_test_train, p_test_test, dom_auc = fit_domain_classifier_get_p_test(X, X_test, seed=SEED)
print('p_test(train) quantiles:', np.quantile(p_test_train, [0, 0.1, 0.5, 0.9, 1]))
print('p_test(test)  quantiles:', np.quantile(p_test_test, [0, 0.1, 0.5, 0.9, 1]))

## 2) Shift-aware CV metrics

We report:
- Standard AUC
- Weighted AUC using density-ratio weights (proxy for test risk)
- AUC on the most test-like slice within each fold (top 30% by `p_test`)

In [None]:
def density_ratio_weights(p: np.ndarray, clip=(0.2, 5.0)):
    eps = 1e-6
    p = np.clip(p, eps, 1 - eps)
    w = p / (1 - p)
    w = np.clip(w, clip[0], clip[1])
    w = w / np.mean(w)
    return w

w_train = density_ratio_weights(p_test_train, clip=(0.2, 5.0))
print('weights quantiles:', np.quantile(w_train, [0, 0.1, 0.5, 0.9, 1]))

def make_shift_groups(p_test: np.ndarray, n_bins: int = 50):
    qs = np.quantile(p_test, np.linspace(0, 1, n_bins + 1))
    qs = np.unique(qs)
    if len(qs) <= 2:
        return np.zeros_like(p_test, dtype=int)
    groups = np.digitize(p_test, qs[1:-1], right=True)
    return groups

def weighted_auc(y_true, y_score, sample_weight):
    return roc_auc_score(y_true, y_score, sample_weight=sample_weight)

def cv_lgb_shift_metrics(
    X: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    p_test_train: np.ndarray,
    sample_weight: np.ndarray | None = None,
    seed: int = 42,
    n_splits: int = 10,
    use_shift_groups: bool = True,
    label: str = 'model',
):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 8000,
        'learning_rate': 0.01,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 80,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': seed,
        'verbose': -1,
    }

    if use_shift_groups:
        groups = make_shift_groups(p_test_train, n_bins=50)
        splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        splits = splitter.split(X, y, groups=groups)
    else:
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        splits = splitter.split(X, y)

    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    fold_top = []

    w_proxy = density_ratio_weights(p_test_train, clip=(0.2, 5.0))

    for tr, va in splits:
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]
        fit_w = sample_weight[tr] if sample_weight is not None else None

        m = lgb.LGBMClassifier(**params)
        m.fit(
            X_tr, y_tr,
            sample_weight=fit_w,
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(200, verbose=False)]
        )
        p_va = m.predict_proba(X_va)[:, 1]
        oof[va] = p_va
        test_pred += m.predict_proba(X_test)[:, 1] / n_splits

        thr = np.quantile(p_test_train[va], 0.70)
        idx_top = va[p_test_train[va] >= thr]
        if len(idx_top) > 50:
            fold_top.append(roc_auc_score(y.iloc[idx_top], oof[idx_top]))
        else:
            fold_top.append(np.nan)

    overall_std = roc_auc_score(y, oof)
    overall_w = weighted_auc(y, oof, w_proxy)
    overall_top = np.nanmean(fold_top)

    print(f'[{label}] CV AUC std: {overall_std:.5f} | weighted: {overall_w:.5f} | val-top30%: {overall_top:.5f}')
    return oof, test_pred

In [None]:
# Step 2.0: baseline shift-aware model
oof_v8_base, pred_v8_base = cv_lgb_shift_metrics(
    X, y, X_test, p_test_train,
    sample_weight=None, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True,
    label='V8_BASE_SHIFT_CV'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_base}).to_csv('submission_v8_base_shiftcv.csv', index=False)
print('Saved: submission_v8_base_shiftcv.csv')

## 3) Mixture training: full vs top-50% test-like subset

Train a model only on the most test-like rows (by `p_test(train)`) and blend predictions with the full-data model.

In [None]:
def cv_lgb_subset_model(
    X: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    p_test_train: np.ndarray,
    subset_quantile: float = 0.5,
    seed: int = 42,
    n_splits: int = 10,
    label: str = 'subset',
):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 12000,
        'learning_rate': 0.008,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 120,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.8,
        'reg_lambda': 1.0,
        'random_state': seed,
        'verbose': -1,
    }

    groups = make_shift_groups(p_test_train, n_bins=50)
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    splits = splitter.split(X, y, groups=groups)

    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    q_thr = np.quantile(p_test_train, subset_quantile)
    subset_mask = p_test_train >= q_thr
    print(f'[{label}] subset_quantile={subset_quantile} -> train rows used ~{subset_mask.mean():.3f}')

    for tr, va in splits:
        tr_sub = tr[subset_mask[tr]]
        if len(tr_sub) < 1000:
            tr_sub = tr

        m = lgb.LGBMClassifier(**params)
        m.fit(
            X.iloc[tr_sub], y.iloc[tr_sub],
            eval_set=[(X.iloc[va], y.iloc[va])],
            callbacks=[lgb.early_stopping(250, verbose=False)]
        )
        oof[va] = m.predict_proba(X.iloc[va])[:, 1]
        test_pred += m.predict_proba(X_test)[:, 1] / n_splits

    std_auc = roc_auc_score(y, oof)
    w_auc = roc_auc_score(y, oof, sample_weight=density_ratio_weights(p_test_train, clip=(0.2, 5.0)))
    print(f'[{label}] CV AUC std: {std_auc:.5f} | weighted: {w_auc:.5f}')
    return oof, test_pred

oof_v8_sub50, pred_v8_sub50 = cv_lgb_subset_model(
    X, y, X_test, p_test_train, subset_quantile=0.50, seed=SEED, n_splits=N_SPLITS, label='V8_SUBSET_TOP50'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_sub50}).to_csv('submission_v8_subset_top50.csv', index=False)
print('Saved: submission_v8_subset_top50.csv')

pred_v8_mixture_60_40 = pred_v8_base * 0.6 + pred_v8_sub50 * 0.4
pd.DataFrame({'id': test_ids, TARGET: pred_v8_mixture_60_40}).to_csv('submission_v8_mixture_60_40.csv', index=False)
print('Saved: submission_v8_mixture_60_40.csv')

## 4) Importance-weighted training

Train on all rows with sample weights `w(x) = p/(1-p)` (clipped + normalized).

In [None]:
oof_v8_w, pred_v8_w = cv_lgb_shift_metrics(
    X, y, X_test, p_test_train,
    sample_weight=w_train, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True,
    label='V8_IMPORTANCE_WEIGHTED'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_w}).to_csv('submission_v8_importance_weighted.csv', index=False)
print('Saved: submission_v8_importance_weighted.csv')

blend_bw = pred_v8_base * 0.6 + pred_v8_w * 0.4
pd.DataFrame({'id': test_ids, TARGET: blend_bw}).to_csv('submission_v8_blend_base_weighted_60_40.csv', index=False)
print('Saved: submission_v8_blend_base_weighted_60_40.csv')

## 5) Soft de-emphasis of shifted features (quantile binning)

We coarsen selected (shift-prone) numeric features by quantile binning.
Implementation note: we store bins as integer codes (not Interval categoricals) to avoid LightGBM JSON serialization issues.

In [None]:
def quantile_bin_joint(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    cols: list[str],
    n_bins: int = 50,
    drop_original: bool = True,
    suffix: str = '__qbin',
) -> tuple[pd.DataFrame, pd.DataFrame]:
    Xt = X_train.copy()
    Xs = X_test.copy()

    for c in cols:
        if c not in Xt.columns or c not in Xs.columns:
            continue
        if not pd.api.types.is_numeric_dtype(Xt[c]):
            continue

        s_all = pd.concat([Xt[c], Xs[c]], axis=0, ignore_index=True)
        try:
            b_all = pd.qcut(s_all, q=n_bins, duplicates='drop')
        except Exception:
            continue

        # Store as stable integer codes (avoids Interval categoricals)
        codes = b_all.cat.codes.astype('int16')
        Xt[c + suffix] = codes.iloc[: len(Xt)].reset_index(drop=True)
        Xs[c + suffix] = codes.iloc[len(Xt):].reset_index(drop=True)

        if drop_original:
            Xt = Xt.drop(columns=[c])
            Xs = Xs.drop(columns=[c])

    return Xt, Xs

shift_candidates = [
    'triglycerides',
    'cholesterol_total',
    'cholesterol_hdl',
    'cholesterol_ldl',
    'glucose',
    'hba1c',
    'insulin',
    'bmi',
    'waist_circumference',
    'hip_circumference',
    'waist_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'age',
    'income',
    'education_years',
    'physical_activity',
    'sleep_hours',
    'smoking_pack_years',
    'alcohol_units',
    'family_history_diabetes',
    'diet_score',
    'stress_level',
    'heart_rate',
    'creatinine',
    'egfr',
    'alt',
    'ast',
    'crp',
]

X_soft, X_test_soft = quantile_bin_joint(X, X_test, shift_candidates, n_bins=50, drop_original=True)
print('Soft-binned shapes:', X_soft.shape, X_test_soft.shape)

oof_v8_soft, pred_v8_soft = cv_lgb_shift_metrics(
    X_soft, y, X_test_soft, p_test_train,
    sample_weight=None, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True,
    label='V8_SOFT_BIN'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_soft}).to_csv('submission_v8_soft_bin.csv', index=False)
print('Saved: submission_v8_soft_bin.csv')

pred_v8_blend_base_soft = 0.6 * pred_v8_base + 0.4 * pred_v8_soft
pd.DataFrame({'id': test_ids, TARGET: pred_v8_blend_base_soft}).to_csv('submission_v8_blend_base_softbin_60_40.csv', index=False)
print('Saved: submission_v8_blend_base_softbin_60_40.csv')

## 6) Gated blend on test (row-wise mixture)

Blend base vs subset predictions with a gate `α(x)` based on `p_test(test)` (more test-like → more weight on subset model).

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Gate centered at test q70 (reasonable default)
q50, q70, q85 = np.quantile(p_test_test, [0.5, 0.7, 0.85])
print('p_test(test) q50/q70/q85:', (q50, q70, q85))

t0 = float(q70)
temp = 0.06
alpha = sigmoid((p_test_test - t0) / temp)
pred_v8_gated = (1 - alpha) * pred_v8_base + alpha * pred_v8_sub50
pd.DataFrame({'id': test_ids, TARGET: pred_v8_gated}).to_csv('submission_v8_gated_base_sub50_sigmoid.csv', index=False)
print('Saved: submission_v8_gated_base_sub50_sigmoid.csv')

temp2 = 0.04
alpha2 = sigmoid((p_test_test - t0) / temp2)
pred_v8_gated2 = (1 - alpha2) * pred_v8_base + alpha2 * pred_v8_sub50
pd.DataFrame({'id': test_ids, TARGET: pred_v8_gated2}).to_csv('submission_v8_gated_base_sub50_sigmoid_sharp.csv', index=False)
print('Saved: submission_v8_gated_base_sub50_sigmoid_sharp.csv')

In [2]:
# Load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print('Train shape:', train.shape)
print('Test shape :', test.shape)
print('Target rate:', train[TARGET].mean())

test_ids = test['id']
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

def to_category(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c in out.columns:
        if out[c].dtype == 'object':
            out[c] = out[c].astype('category')
    return out

train = to_category(train)
test = to_category(test)

y = train[TARGET].astype(int)
X = train.drop(columns=[TARGET])
X_test = test.copy()

cat_cols = X.select_dtypes(include=['category']).columns.tolist()
print('Categorical cols:', cat_cols)

Train shape: (700000, 26)
Test shape : (300000, 25)
Target rate: 0.6232957142857143
Categorical cols: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


## 1) Shift-aware evaluation: adversarial test-likeness score

We train a domain classifier to predict `is_test` using features only, then use:
- `p_test(x)` as a *test-likeness* score
- density-ratio weights $w(x) pprox rac{p_{test}(x)}{p_{train}(x)} = rac{p}{1-p}$
- shift-aware fold grouping via quantile bins of `p_test(x)`

In [3]:
def fit_domain_classifier_get_p_test(X_train: pd.DataFrame, X_test: pd.DataFrame, seed: int = 42):
    X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)
    y_dom = np.concatenate([np.zeros(len(X_train), dtype=int), np.ones(len(X_test), dtype=int)])

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 4000,
        'learning_rate': 0.02,
        'num_leaves': 63,
        'max_depth': -1,
        'min_child_samples': 100,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'reg_alpha': 0.0,
        'reg_lambda': 1.0,
        'random_state': seed,
        'verbose': -1,
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros(len(X_all))
    for tr, va in kf.split(X_all, y_dom):
        m = lgb.LGBMClassifier(**params)
        m.fit(
            X_all.iloc[tr], y_dom[tr],
            eval_set=[(X_all.iloc[va], y_dom[va])],
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        oof[va] = m.predict_proba(X_all.iloc[va])[:, 1]
    dom_auc = roc_auc_score(y_dom, oof)
    print(f'Domain (train vs test) CV AUC: {dom_auc:.5f}')

    # Fit final model on all data to score train/test consistently
    final = lgb.LGBMClassifier(**params)
    final.fit(X_all, y_dom)
    p_all = final.predict_proba(X_all)[:, 1]
    p_train = p_all[: len(X_train)]
    p_test = p_all[len(X_train) :]
    return p_train, p_test, dom_auc

p_test_train, p_test_test, dom_auc = fit_domain_classifier_get_p_test(X, X_test, seed=SEED)
print('p_test(train) summary:', np.quantile(p_test_train, [0, 0.1, 0.5, 0.9, 1]))
print('p_test(test)  summary:', np.quantile(p_test_test, [0, 0.1, 0.5, 0.9, 1]))

Domain (train vs test) CV AUC: 0.64956
p_test(train) summary: [0.08491359 0.19052777 0.24689076 0.37697885 0.95289083]
p_test(test)  summary: [0.1139838  0.22369724 0.32782366 0.59714959 0.98700743]


In [4]:
def density_ratio_weights(p: np.ndarray, clip=(0.2, 5.0)):
    eps = 1e-6
    p = np.clip(p, eps, 1 - eps)
    w = p / (1 - p)
    w = np.clip(w, clip[0], clip[1])
    # normalize mean weight to 1
    w = w / np.mean(w)
    return w

w_train = density_ratio_weights(p_test_train, clip=(0.2, 5.0))
print('weights summary:', np.quantile(w_train, [0, 0.1, 0.5, 0.9, 1]))

weights summary: [ 0.50547551  0.594876    0.82854668  1.52927051 12.63688783]


In [5]:
def make_shift_groups(p_test: np.ndarray, n_bins: int = 50):
    # quantile bins (groups) for StratifiedGroupKFold
    qs = np.quantile(p_test, np.linspace(0, 1, n_bins + 1))
    # guard duplicates
    qs = np.unique(qs)
    # if too many duplicates, fall back
    if len(qs) <= 2:
        return np.zeros_like(p_test, dtype=int)
    groups = np.digitize(p_test, qs[1:-1], right=True)
    return groups

shift_groups = make_shift_groups(p_test_train, n_bins=50)
print('unique shift groups:', len(np.unique(shift_groups)))

unique shift groups: 50


### Shift-aware CV metrics

We report:
- Standard AUC
- Weighted AUC using density-ratio weights (proxy for test risk)
- AUC on the most test-like slice of each validation fold (top 30% by `p_test`)

In [6]:
def weighted_auc(y_true, y_score, sample_weight):
    # sklearn roc_auc_score supports sample_weight
    return roc_auc_score(y_true, y_score, sample_weight=sample_weight)

def cv_lgb_shift_metrics(
    X: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    p_test_train: np.ndarray,
    sample_weight: np.ndarray | None = None,
    seed: int = 42,
    n_splits: int = 10,
    use_shift_groups: bool = True,
    label: str = 'model',
):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 8000,
        'learning_rate': 0.01,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 80,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': seed,
        'verbose': -1,
    }

    if use_shift_groups:
        groups = make_shift_groups(p_test_train, n_bins=50)
        splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        splits = splitter.split(X, y, groups=groups)
    else:
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        splits = splitter.split(X, y)

    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    fold_std = []
    fold_w = []
    fold_top = []

    w_proxy = density_ratio_weights(p_test_train, clip=(0.2, 5.0))

    for fold, (tr, va) in enumerate(splits):
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]

        fit_w = sample_weight[tr] if sample_weight is not None else None

        m = lgb.LGBMClassifier(**params)
        m.fit(
            X_tr, y_tr,
            sample_weight=fit_w,
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(200, verbose=False)]
        )
        p_va = m.predict_proba(X_va)[:, 1]
        oof[va] = p_va
        test_pred += m.predict_proba(X_test)[:, 1] / n_splits

        auc_std = roc_auc_score(y_va, p_va)
        auc_w = weighted_auc(y_va, p_va, w_proxy[va])
        # top test-like slice within validation
        thr = np.quantile(p_test_train[va], 0.70)
        idx_top = va[p_test_train[va] >= thr]
        if len(idx_top) > 50:
            auc_top = roc_auc_score(y.iloc[idx_top], oof[idx_top])
        else:
            auc_top = np.nan

        fold_std.append(auc_std)
        fold_w.append(auc_w)
        fold_top.append(auc_top)

    overall_std = roc_auc_score(y, oof)
    overall_w = weighted_auc(y, oof, w_proxy)
    overall_top = np.nanmean(fold_top)

    print(f'[{label}] CV AUC std: {overall_std:.5f} | weighted: {overall_w:.5f} | val-top30%: {overall_top:.5f}')
    return oof, test_pred, {'std': overall_std, 'weighted': overall_w, 'top30': overall_top}

# Baseline shift-aware evaluation (no special weights, shift-group folds)
oof_v8_base, pred_v8_base, m_v8_base = cv_lgb_shift_metrics(
    X, y, X_test, p_test_train, sample_weight=None, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True, label='V8_BASE_SHIFT_CV'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_base}).to_csv('submission_v8_base_shiftcv.csv', index=False)
print('Saved: submission_v8_base_shiftcv.csv')

[V8_BASE_SHIFT_CV] CV AUC std: 0.72897 | weighted: 0.73988 | val-top30%: 0.74692
Saved: submission_v8_base_shiftcv.csv


## 2) Mixture training: full-train + test-like subset + blend

We train:
- Model A: on all training rows
- Model B: on top-X% most test-like training rows (by `p_test(train)`)
Then blend predictions.

In [10]:
# Step 2: Mixture training (full-train + test-like subset)
def cv_lgb_subset_model(
    X: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    p_test_train: np.ndarray,
    subset_quantile: float = 0.5,
    seed: int = 42,
    n_splits: int = 10,
    label: str = 'subset',
):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 12000,
        'learning_rate': 0.008,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 120,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.8,
        'reg_lambda': 1.0,
        'random_state': seed,
        'verbose': -1,
    }

    groups = make_shift_groups(p_test_train, n_bins=50)
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    splits = splitter.split(X, y, groups=groups)

    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    q_thr = np.quantile(p_test_train, subset_quantile)
    subset_mask = p_test_train >= q_thr
    print(f'[{label}] subset_quantile={subset_quantile} -> train rows used ~{subset_mask.mean():.3f}')

    for fold, (tr, va) in enumerate(splits):
        tr_sub = tr[subset_mask[tr]]
        if len(tr_sub) < 1000:
            tr_sub = tr  # fallback

        m = lgb.LGBMClassifier(**params)
        m.fit(
            X.iloc[tr_sub], y.iloc[tr_sub],
            eval_set=[(X.iloc[va], y.iloc[va])],
            callbacks=[lgb.early_stopping(250, verbose=False)]
        )
        oof[va] = m.predict_proba(X.iloc[va])[:, 1]
        test_pred += m.predict_proba(X_test)[:, 1] / n_splits

    std_auc = roc_auc_score(y, oof)
    w_auc = roc_auc_score(y, oof, sample_weight=density_ratio_weights(p_test_train, clip=(0.2, 5.0)))
    print(f'[{label}] CV AUC std: {std_auc:.5f} | weighted: {w_auc:.5f}')
    return oof, test_pred, {'std': std_auc, 'weighted': w_auc}

oof_v8_sub50, pred_v8_sub50, m_v8_sub50 = cv_lgb_subset_model(
    X, y, X_test, p_test_train, subset_quantile=0.50, seed=SEED, n_splits=N_SPLITS, label='V8_SUBSET_TOP50'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_sub50}).to_csv('submission_v8_subset_top50.csv', index=False)
print('Saved: submission_v8_subset_top50.csv')

# Simple global blend with base shift-aware model
pred_v8_mixture_60_40 = pred_v8_base * 0.6 + pred_v8_sub50 * 0.4
pd.DataFrame({'id': test_ids, TARGET: pred_v8_mixture_60_40}).to_csv('submission_v8_mixture_60_40.csv', index=False)
print('Saved: submission_v8_mixture_60_40.csv')

[V8_SUBSET_TOP50] subset_quantile=0.5 -> train rows used ~0.500
[V8_SUBSET_TOP50] CV AUC std: 0.72665 | weighted: 0.73815
Saved: submission_v8_subset_top50.csv
Saved: submission_v8_mixture_60_40.csv


## 3) Importance-weighted training (density ratio weights)

Train a model on all rows but weight examples by clipped density ratio $w(x)=rac{p}{1-p}$ where $p=p(istest=1|x)$.

In [11]:
# Importance-weighted model
oof_v8_w, pred_v8_w, m_v8_w = cv_lgb_shift_metrics(
    X, y, X_test, p_test_train, sample_weight=w_train, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True, label='V8_IMPORTANCE_WEIGHTED'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_w}).to_csv('submission_v8_importance_weighted.csv', index=False)
print('Saved: submission_v8_importance_weighted.csv')

# Blend base and weighted
blend_bw = pred_v8_base * 0.6 + pred_v8_w * 0.4
pd.DataFrame({'id': test_ids, TARGET: blend_bw}).to_csv('submission_v8_blend_base_weighted_60_40.csv', index=False)
print('Saved: submission_v8_blend_base_weighted_60_40.csv')

[V8_IMPORTANCE_WEIGHTED] CV AUC std: 0.72811 | weighted: 0.73929 | val-top30%: 0.74641
Saved: submission_v8_importance_weighted.csv
Saved: submission_v8_blend_base_weighted_60_40.csv


## 4) Soft de-emphasis of shifted features (quantile binning)

Instead of dropping shifted features (which hurt CV), we *coarsen* them by quantile-binning to reduce over-sensitivity.

In [15]:
def quantile_bin_columns(df: pd.DataFrame, cols: list[str], n_bins: int = 50) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        if c not in out.columns:
            continue
        if pd.api.types.is_numeric_dtype(out[c]):
            # qcut to category bins
            try:
                binned = pd.qcut(out[c], q=n_bins, duplicates='drop')
                out[c + '__qbin'] = binned.astype('category')
            except Exception:
                pass
    return out

# Candidate shifted features from prior analysis (keep only if present)
shift_candidates = [
    'triglycerides',
    'cholesterol_total',

SyntaxError: incomplete input (3936525092.py, line 18)