# V8 — shift-first pipeline (clean rebuild)

This notebook reimplements V8 end-to-end on disk (no corrupted cells).

Steps:
1. Domain classifier → p_test(train), p_test(test)
2. Shift-aware CV metrics
3. Mixture: full vs top-50% test-like subset + blend
4. Importance-weighted training
5. Soft de-emphasis: quantile binning (integer codes)
6. Gated blend on test (row-wise)


In [7]:
import os
import warnings

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, KFold
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

SEED = 42
N_SPLITS = 10
TARGET = 'diagnosed_diabetes'
DATA_DIR = 'data'


In [8]:
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

print('Train shape:', train.shape)
print('Test shape :', test.shape)
print('Target rate:', train[TARGET].mean())

test_ids = test['id']
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

def to_category(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c in out.columns:
        if out[c].dtype == 'object':
            out[c] = out[c].astype('category')
    return out

train = to_category(train)
test = to_category(test)

y = train[TARGET].astype(int)
X = train.drop(columns=[TARGET])
X_test = test.copy()

cat_cols = X.select_dtypes(include=['category']).columns.tolist()
print('Categorical cols:', cat_cols)


Train shape: (700000, 26)
Test shape : (300000, 25)
Target rate: 0.6232957142857143
Categorical cols: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


## 1) Domain classifier → test-likeness score


In [9]:
def fit_domain_classifier_get_p_test(X_train: pd.DataFrame, X_test: pd.DataFrame, seed: int = 42):
    X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)
    y_dom = np.concatenate([np.zeros(len(X_train), dtype=int), np.ones(len(X_test), dtype=int)])

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 4000,
        'learning_rate': 0.02,
        'num_leaves': 63,
        'max_depth': -1,
        'min_child_samples': 100,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'reg_alpha': 0.0,
        'reg_lambda': 1.0,
        'random_state': seed,
        'verbose': -1,
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros(len(X_all))
    for tr, va in kf.split(X_all, y_dom):
        m = lgb.LGBMClassifier(**params)
        m.fit(
            X_all.iloc[tr], y_dom[tr],
            eval_set=[(X_all.iloc[va], y_dom[va])],
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        oof[va] = m.predict_proba(X_all.iloc[va])[:, 1]

    dom_auc = roc_auc_score(y_dom, oof)
    print(f'Domain (train vs test) CV AUC: {dom_auc:.5f}')

    final = lgb.LGBMClassifier(**params)
    final.fit(X_all, y_dom)
    p_all = final.predict_proba(X_all)[:, 1]
    p_train = p_all[: len(X_train)]
    p_test = p_all[len(X_train):]
    return p_train, p_test, dom_auc

p_test_train, p_test_test, dom_auc = fit_domain_classifier_get_p_test(X, X_test, seed=SEED)
print('p_test(train) quantiles:', np.quantile(p_test_train, [0, 0.1, 0.5, 0.9, 1]))
print('p_test(test)  quantiles:', np.quantile(p_test_test, [0, 0.1, 0.5, 0.9, 1]))


Domain (train vs test) CV AUC: 0.64956
p_test(train) quantiles: [0.08491359 0.19052777 0.24689076 0.37697885 0.95289083]
p_test(test)  quantiles: [0.1139838  0.22369724 0.32782366 0.59714959 0.98700743]


## 2) Shift-aware CV metrics + baseline


In [10]:
def density_ratio_weights(p: np.ndarray, clip=(0.2, 5.0)):
    eps = 1e-6
    p = np.clip(p, eps, 1 - eps)
    w = p / (1 - p)
    w = np.clip(w, clip[0], clip[1])
    w = w / np.mean(w)
    return w

w_train = density_ratio_weights(p_test_train, clip=(0.2, 5.0))
print('weights quantiles:', np.quantile(w_train, [0, 0.1, 0.5, 0.9, 1]))

def make_shift_groups(p_test: np.ndarray, n_bins: int = 50):
    qs = np.quantile(p_test, np.linspace(0, 1, n_bins + 1))
    qs = np.unique(qs)
    if len(qs) <= 2:
        return np.zeros_like(p_test, dtype=int)
    return np.digitize(p_test, qs[1:-1], right=True)

def cv_lgb_shift_metrics(
    X: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    p_test_train: np.ndarray,
    sample_weight: np.ndarray | None = None,
    seed: int = 42,
    n_splits: int = 10,
    use_shift_groups: bool = True,
    label: str = 'model',
):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 8000,
        'learning_rate': 0.01,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 80,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': seed,
        'verbose': -1,
    }

    if use_shift_groups:
        groups = make_shift_groups(p_test_train, n_bins=50)
        splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        splits = splitter.split(X, y, groups=groups)
    else:
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        splits = splitter.split(X, y)

    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    fold_top = []

    w_proxy = density_ratio_weights(p_test_train, clip=(0.2, 5.0))

    for tr, va in splits:
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]
        fit_w = sample_weight[tr] if sample_weight is not None else None

        m = lgb.LGBMClassifier(**params)
        m.fit(
            X_tr, y_tr,
            sample_weight=fit_w,
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(200, verbose=False)]
        )
        p_va = m.predict_proba(X_va)[:, 1]
        oof[va] = p_va
        test_pred += m.predict_proba(X_test)[:, 1] / n_splits

        thr = np.quantile(p_test_train[va], 0.70)
        idx_top = va[p_test_train[va] >= thr]
        if len(idx_top) > 50:
            fold_top.append(roc_auc_score(y.iloc[idx_top], oof[idx_top]))
        else:
            fold_top.append(np.nan)

    overall_std = roc_auc_score(y, oof)
    overall_w = roc_auc_score(y, oof, sample_weight=w_proxy)
    overall_top = np.nanmean(fold_top)

    print(f'[{label}] CV AUC std: {overall_std:.5f} | weighted: {overall_w:.5f} | val-top30%: {overall_top:.5f}')
    return oof, test_pred

oof_v8_base, pred_v8_base = cv_lgb_shift_metrics(
    X, y, X_test, p_test_train,
    sample_weight=None, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True,
    label='V8_BASE_SHIFT_CV'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_base}).to_csv('submission_v8_base_shiftcv.csv', index=False)
print('Saved: submission_v8_base_shiftcv.csv')


weights quantiles: [ 0.50547551  0.594876    0.82854668  1.52927051 12.63688783]
[V8_BASE_SHIFT_CV] CV AUC std: 0.72897 | weighted: 0.73988 | val-top30%: 0.74692
Saved: submission_v8_base_shiftcv.csv


## 3) Mixture: top-50% test-like subset + blend


In [11]:
def cv_lgb_subset_model(
    X: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    p_test_train: np.ndarray,
    subset_quantile: float = 0.5,
    seed: int = 42,
    n_splits: int = 10,
    label: str = 'subset',
):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 12000,
        'learning_rate': 0.008,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 120,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.8,
        'reg_lambda': 1.0,
        'random_state': seed,
        'verbose': -1,
    }

    groups = make_shift_groups(p_test_train, n_bins=50)
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    splits = splitter.split(X, y, groups=groups)

    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    q_thr = np.quantile(p_test_train, subset_quantile)
    subset_mask = p_test_train >= q_thr
    print(f'[{label}] subset_quantile={subset_quantile} -> train rows used ~{subset_mask.mean():.3f}')

    for tr, va in splits:
        tr_sub = tr[subset_mask[tr]]
        if len(tr_sub) < 1000:
            tr_sub = tr

        m = lgb.LGBMClassifier(**params)
        m.fit(
            X.iloc[tr_sub], y.iloc[tr_sub],
            eval_set=[(X.iloc[va], y.iloc[va])],
            callbacks=[lgb.early_stopping(250, verbose=False)]
        )
        oof[va] = m.predict_proba(X.iloc[va])[:, 1]
        test_pred += m.predict_proba(X_test)[:, 1] / n_splits

    std_auc = roc_auc_score(y, oof)
    w_auc = roc_auc_score(y, oof, sample_weight=density_ratio_weights(p_test_train, clip=(0.2, 5.0)))
    print(f'[{label}] CV AUC std: {std_auc:.5f} | weighted: {w_auc:.5f}')
    return oof, test_pred

oof_v8_sub50, pred_v8_sub50 = cv_lgb_subset_model(
    X, y, X_test, p_test_train, subset_quantile=0.50, seed=SEED, n_splits=N_SPLITS, label='V8_SUBSET_TOP50'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_sub50}).to_csv('submission_v8_subset_top50.csv', index=False)
print('Saved: submission_v8_subset_top50.csv')

pred_v8_mixture_60_40 = 0.6 * pred_v8_base + 0.4 * pred_v8_sub50
pd.DataFrame({'id': test_ids, TARGET: pred_v8_mixture_60_40}).to_csv('submission_v8_mixture_60_40.csv', index=False)
print('Saved: submission_v8_mixture_60_40.csv')


[V8_SUBSET_TOP50] subset_quantile=0.5 -> train rows used ~0.500
[V8_SUBSET_TOP50] CV AUC std: 0.72665 | weighted: 0.73815
Saved: submission_v8_subset_top50.csv
Saved: submission_v8_mixture_60_40.csv


## 4) Importance-weighted + blend


In [12]:
oof_v8_w, pred_v8_w = cv_lgb_shift_metrics(
    X, y, X_test, p_test_train,
    sample_weight=w_train, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True,
    label='V8_IMPORTANCE_WEIGHTED'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_w}).to_csv('submission_v8_importance_weighted.csv', index=False)
print('Saved: submission_v8_importance_weighted.csv')

blend_bw = 0.6 * pred_v8_base + 0.4 * pred_v8_w
pd.DataFrame({'id': test_ids, TARGET: blend_bw}).to_csv('submission_v8_blend_base_weighted_60_40.csv', index=False)
print('Saved: submission_v8_blend_base_weighted_60_40.csv')


[V8_IMPORTANCE_WEIGHTED] CV AUC std: 0.72811 | weighted: 0.73929 | val-top30%: 0.74641
Saved: submission_v8_importance_weighted.csv
Saved: submission_v8_blend_base_weighted_60_40.csv


## 5) Soft de-emphasis via quantile binning (integer bin codes)


In [13]:
def quantile_bin_joint(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    cols: list[str],
    n_bins: int = 50,
    drop_original: bool = True,
    suffix: str = '__qbin',
) -> tuple[pd.DataFrame, pd.DataFrame]:
    Xt = X_train.copy()
    Xs = X_test.copy()

    for c in cols:
        if c not in Xt.columns or c not in Xs.columns:
            continue
        if not pd.api.types.is_numeric_dtype(Xt[c]):
            continue

        s_all = pd.concat([Xt[c], Xs[c]], axis=0, ignore_index=True)
        try:
            b_all = pd.qcut(s_all, q=n_bins, duplicates='drop')
        except Exception:
            continue

        codes = b_all.cat.codes.astype('int16')
        Xt[c + suffix] = codes.iloc[: len(Xt)].reset_index(drop=True)
        Xs[c + suffix] = codes.iloc[len(Xt):].reset_index(drop=True)

        if drop_original:
            Xt = Xt.drop(columns=[c])
            Xs = Xs.drop(columns=[c])

    return Xt, Xs

shift_candidates = [
    'triglycerides', 'cholesterol_total', 'cholesterol_hdl', 'cholesterol_ldl',
    'glucose', 'hba1c', 'insulin', 'bmi',
    'waist_circumference', 'hip_circumference', 'waist_hip_ratio',
    'systolic_bp', 'diastolic_bp',
    'age', 'income', 'education_years',
    'physical_activity', 'sleep_hours',
    'smoking_pack_years', 'alcohol_units',
    'family_history_diabetes', 'diet_score', 'stress_level',
    'heart_rate', 'creatinine', 'egfr', 'alt', 'ast', 'crp',
]

X_soft, X_test_soft = quantile_bin_joint(X, X_test, shift_candidates, n_bins=50, drop_original=True)
print('Soft-binned shapes:', X_soft.shape, X_test_soft.shape)

oof_v8_soft, pred_v8_soft = cv_lgb_shift_metrics(
    X_soft, y, X_test_soft, p_test_train,
    sample_weight=None, seed=SEED, n_splits=N_SPLITS, use_shift_groups=True,
    label='V8_SOFT_BIN'
)
pd.DataFrame({'id': test_ids, TARGET: pred_v8_soft}).to_csv('submission_v8_soft_bin.csv', index=False)
print('Saved: submission_v8_soft_bin.csv')

pred_v8_blend_base_soft = 0.6 * pred_v8_base + 0.4 * pred_v8_soft
pd.DataFrame({'id': test_ids, TARGET: pred_v8_blend_base_soft}).to_csv('submission_v8_blend_base_softbin_60_40.csv', index=False)
print('Saved: submission_v8_blend_base_softbin_60_40.csv')


Soft-binned shapes: (700000, 24) (300000, 24)
[V8_SOFT_BIN] CV AUC std: 0.69189 | weighted: 0.70316 | val-top30%: 0.70965
Saved: submission_v8_soft_bin.csv
Saved: submission_v8_blend_base_softbin_60_40.csv


## 6) Gated blend (row-wise)


In [14]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

q50, q70, q85 = np.quantile(p_test_test, [0.5, 0.7, 0.85])
print('p_test(test) q50/q70/q85:', (q50, q70, q85))

t0 = float(q70)
temp = 0.06
alpha = sigmoid((p_test_test - t0) / temp)
pred_v8_gated = (1 - alpha) * pred_v8_base + alpha * pred_v8_sub50
pd.DataFrame({'id': test_ids, TARGET: pred_v8_gated}).to_csv('submission_v8_gated_base_sub50_sigmoid.csv', index=False)
print('Saved: submission_v8_gated_base_sub50_sigmoid.csv')

temp2 = 0.04
alpha2 = sigmoid((p_test_test - t0) / temp2)
pred_v8_gated2 = (1 - alpha2) * pred_v8_base + alpha2 * pred_v8_sub50
pd.DataFrame({'id': test_ids, TARGET: pred_v8_gated2}).to_csv('submission_v8_gated_base_sub50_sigmoid_sharp.csv', index=False)
print('Saved: submission_v8_gated_base_sub50_sigmoid_sharp.csv')


p_test(test) q50/q70/q85: (np.float64(0.32782365731136753), np.float64(0.4123312768067911), np.float64(0.5313312422923425))
Saved: submission_v8_gated_base_sub50_sigmoid.csv
Saved: submission_v8_gated_base_sub50_sigmoid_sharp.csv
