In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
import os

warnings.filterwarnings('ignore')

SEED = 42
N_SPLITS = 10
TARGET = 'diagnosed_diabetes'

In [2]:
# Load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

test_ids = test['id']
train_ids = train['id']
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

Train shape: (700000, 26)
Test shape: (300000, 25)


## Part 1: Submission Blending

Blend the best performing submissions to potentially get a better result

In [3]:
# Load existing submissions
submissions = {}
submission_files = [
    ('submission.csv', 'V1', 0.69720),
    ('submission_v2.csv', 'V2', 0.69714),
    ('submission_v3.csv', 'V3', 0.69668),
    ('submission_v4.csv', 'V4', 0.69484),
    ('submission_v4_raw.csv', 'V4_raw', 0.69560),
    ('submission_v5_lgb_simple.csv', 'V5_LGB', 0.69771),
    ('submission_v5_catboost.csv', 'V5_CAT', 0.69585),
]

for filename, name, lb_score in submission_files:
    if os.path.exists(filename):
        submissions[name] = {
            'data': pd.read_csv(filename),
            'lb_score': lb_score
        }
        print(f"Loaded {name}: LB={lb_score:.5f}")

print(f"\nLoaded {len(submissions)} submissions")

Loaded V1: LB=0.69720
Loaded V2: LB=0.69714
Loaded V3: LB=0.69668
Loaded V4: LB=0.69484
Loaded V4_raw: LB=0.69560
Loaded V5_LGB: LB=0.69771
Loaded V5_CAT: LB=0.69585

Loaded 7 submissions


In [4]:
# Create blended submissions
print("\n=== Creating Blended Submissions ===")

# Blend 1: Top 2 LB scores (V5_LGB + V1)
if 'V5_LGB' in submissions and 'V1' in submissions:
    blend_top2 = (
        submissions['V5_LGB']['data']['diagnosed_diabetes'] * 0.5 +
        submissions['V1']['data']['diagnosed_diabetes'] * 0.5
    )
    sub_blend_top2 = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': blend_top2})
    sub_blend_top2.to_csv('submission_v6_blend_top2.csv', index=False)
    print("Created: submission_v6_blend_top2.csv (V5_LGB + V1, 50/50)")

# Blend 2: Top 3 LB scores
if all(k in submissions for k in ['V5_LGB', 'V1', 'V2']):
    blend_top3 = (
        submissions['V5_LGB']['data']['diagnosed_diabetes'] * 0.4 +
        submissions['V1']['data']['diagnosed_diabetes'] * 0.3 +
        submissions['V2']['data']['diagnosed_diabetes'] * 0.3
    )
    sub_blend_top3 = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': blend_top3})
    sub_blend_top3.to_csv('submission_v6_blend_top3.csv', index=False)
    print("Created: submission_v6_blend_top3.csv (V5_LGB=40% + V1=30% + V2=30%)")

# Blend 3: All submissions with LB-weighted average
if len(submissions) > 0:
    # Higher LB score = higher weight (inverse of score since lower is worse)
    total_weight = 0
    blend_all = np.zeros(len(test_ids))
    for name, sub in submissions.items():
        weight = sub['lb_score']  # Use LB score as weight
        blend_all += sub['data']['diagnosed_diabetes'].values * weight
        total_weight += weight
    blend_all /= total_weight
    
    sub_blend_all = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': blend_all})
    sub_blend_all.to_csv('submission_v6_blend_all.csv', index=False)
    print(f"Created: submission_v6_blend_all.csv (LB-weighted blend of {len(submissions)} submissions)")


=== Creating Blended Submissions ===
Created: submission_v6_blend_top2.csv (V5_LGB + V1, 50/50)
Created: submission_v6_blend_top3.csv (V5_LGB=40% + V1=30% + V2=30%)
Created: submission_v6_blend_all.csv (LB-weighted blend of 7 submissions)


## Part 2: Adversarial Weighted Training

Use adversarial validation to create sample weights that upweight training samples similar to test data

In [5]:
def prepare_data_simple(df):
    """Minimal preprocessing - convert object cols to category dtype"""
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')
    return df

train_simple = prepare_data_simple(train.copy())
test_simple = prepare_data_simple(test.copy())

y = train_simple[TARGET]
X = train_simple.drop(columns=[TARGET])
X_test = test_simple.copy()

In [6]:
def compute_adversarial_weights(X_train, X_test):
    """Compute sample weights based on how test-like each training sample is"""
    
    # Combine train and test
    X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)
    y_domain = np.concatenate([np.zeros(len(X_train)), np.ones(len(X_test))])
    
    # Train adversarial classifier
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X_all))
    
    print("Computing adversarial weights...")
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_domain)):
        model = lgb.LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            num_leaves=31,
            random_state=SEED,
            verbose=-1
        )
        model.fit(
            X_all.iloc[tr_idx], y_domain[tr_idx],
            eval_set=[(X_all.iloc[va_idx], y_domain[va_idx])],
            callbacks=[lgb.early_stopping(50, verbose=False)]
        )
        oof_preds[va_idx] = model.predict_proba(X_all.iloc[va_idx])[:, 1]
    
    adv_auc = roc_auc_score(y_domain, oof_preds)
    print(f"Adversarial AUC: {adv_auc:.5f}")
    
    # Convert probabilities to weights
    train_probs = oof_preds[:len(X_train)]
    weights = (train_probs + 1e-6) / (1 - train_probs + 1e-6)
    weights = np.clip(weights, np.percentile(weights, 1), np.percentile(weights, 99))
    weights = weights / weights.mean()
    
    print(f"Weight stats: min={weights.min():.3f}, max={weights.max():.3f}, mean={weights.mean():.3f}")
    
    return weights, adv_auc

adv_weights, adv_auc = compute_adversarial_weights(X, X_test)

Computing adversarial weights...
Adversarial AUC: 0.63259
Weight stats: min=0.559, max=3.746, mean=1.000


In [7]:
def train_lgb_weighted(X, y, X_test, weights, n_splits=10):
    """Train LightGBM with adversarial sample weights"""
    
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 5000,
        'learning_rate': 0.01,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 50,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': SEED,
        'verbose': -1
    }
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    
    print(f"Training LightGBM with adversarial weights ({n_splits}-fold)...")
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        w_tr = weights[train_idx]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(100, verbose=False)],
            sample_weight=w_tr
        )
        
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / n_splits
        
        fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")
    
    oof_auc = roc_auc_score(y, oof_preds)
    print(f"Overall OOF AUC: {oof_auc:.5f}")
    
    return oof_preds, test_preds, oof_auc

print("\n=== Training with Adversarial Weights ===")
oof_adv, test_adv, auc_adv = train_lgb_weighted(X, y, X_test, adv_weights, n_splits=N_SPLITS)

# Save submission
sub_adv = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_adv})
sub_adv.to_csv('submission_v6_adversarial.csv', index=False)
print(f"\nSaved: submission_v6_adversarial.csv (CV AUC: {auc_adv:.5f})")


=== Training with Adversarial Weights ===
Training LightGBM with adversarial weights (10-fold)...
  Fold 1 AUC: 0.72720
  Fold 2 AUC: 0.72827
  Fold 3 AUC: 0.72623
  Fold 4 AUC: 0.72637
  Fold 5 AUC: 0.72734
  Fold 6 AUC: 0.72730
  Fold 7 AUC: 0.72561
  Fold 8 AUC: 0.73014
  Fold 9 AUC: 0.72958
  Fold 10 AUC: 0.72622
Overall OOF AUC: 0.72742

Saved: submission_v6_adversarial.csv (CV AUC: 0.72742)


## Part 3: Pseudo-Labeling

Use high-confidence predictions on test data as additional training samples

In [8]:
def train_with_pseudo_labels(X, y, X_test, initial_preds, confidence_threshold=0.9, n_splits=10):
    """
    Train with pseudo-labeling:
    1. Use initial predictions to identify high-confidence test samples
    2. Add these as training data with their predicted labels
    3. Retrain the model
    """
    
    # Identify high-confidence samples
    high_conf_pos = initial_preds >= confidence_threshold  # Confident positive
    high_conf_neg = initial_preds <= (1 - confidence_threshold)  # Confident negative
    high_conf_mask = high_conf_pos | high_conf_neg
    
    n_pseudo = high_conf_mask.sum()
    print(f"High-confidence samples (threshold={confidence_threshold}): {n_pseudo} ({100*n_pseudo/len(X_test):.1f}%)")
    print(f"  - Positive: {high_conf_pos.sum()}")
    print(f"  - Negative: {high_conf_neg.sum()}")
    
    if n_pseudo == 0:
        print("No high-confidence samples found. Skipping pseudo-labeling.")
        return None, None, None
    
    # Create pseudo-labeled data
    X_pseudo = X_test[high_conf_mask].copy()
    y_pseudo = (initial_preds[high_conf_mask] >= 0.5).astype(float)
    
    # Combine original training data with pseudo-labeled data
    X_combined = pd.concat([X, X_pseudo], axis=0, ignore_index=True)
    y_combined = pd.concat([y, pd.Series(y_pseudo)], axis=0, ignore_index=True)
    
    print(f"Combined training size: {len(X_combined)} (original: {len(X)}, pseudo: {n_pseudo})")
    
    # Train on combined data
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 5000,
        'learning_rate': 0.01,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 50,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': SEED,
        'verbose': -1
    }
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))  # OOF only for original data
    test_preds = np.zeros(len(X_test))
    
    print(f"Training with pseudo-labels ({n_splits}-fold)...")
    
    # We need to be careful - only validate on original data
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        # Get original data splits
        X_tr_orig = X.iloc[train_idx]
        y_tr_orig = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        # Add all pseudo-labeled data to training
        X_tr = pd.concat([X_tr_orig, X_pseudo], axis=0, ignore_index=True)
        y_tr = pd.concat([y_tr_orig, pd.Series(y_pseudo)], axis=0, ignore_index=True)
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / n_splits
        
        fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")
    
    oof_auc = roc_auc_score(y, oof_preds)
    print(f"Overall OOF AUC: {oof_auc:.5f}")
    
    return oof_preds, test_preds, oof_auc

# Use our best submission as initial predictions for pseudo-labeling
if 'V5_LGB' in submissions:
    initial_preds = submissions['V5_LGB']['data']['diagnosed_diabetes'].values
    
    print("\n=== Pseudo-Labeling (threshold=0.90) ===")
    oof_pseudo_90, test_pseudo_90, auc_pseudo_90 = train_with_pseudo_labels(
        X, y, X_test, initial_preds, confidence_threshold=0.90, n_splits=N_SPLITS
    )
    
    if test_pseudo_90 is not None:
        sub_pseudo_90 = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_pseudo_90})
        sub_pseudo_90.to_csv('submission_v6_pseudo_90.csv', index=False)
        print(f"Saved: submission_v6_pseudo_90.csv (CV AUC: {auc_pseudo_90:.5f})")


=== Pseudo-Labeling (threshold=0.90) ===
High-confidence samples (threshold=0.9): 18621 (6.2%)
  - Positive: 18418
  - Negative: 203
Combined training size: 718621 (original: 700000, pseudo: 18621)
Training with pseudo-labels (10-fold)...
  Fold 1 AUC: 0.72831
  Fold 2 AUC: 0.72958
  Fold 3 AUC: 0.72716
  Fold 4 AUC: 0.72747
  Fold 5 AUC: 0.72854
  Fold 6 AUC: 0.72859
  Fold 7 AUC: 0.72705
  Fold 8 AUC: 0.73125
  Fold 9 AUC: 0.73052
  Fold 10 AUC: 0.72752
Overall OOF AUC: 0.72860
Saved: submission_v6_pseudo_90.csv (CV AUC: 0.72860)


In [9]:
# Try with lower confidence threshold
if 'V5_LGB' in submissions:
    print("\n=== Pseudo-Labeling (threshold=0.85) ===")
    oof_pseudo_85, test_pseudo_85, auc_pseudo_85 = train_with_pseudo_labels(
        X, y, X_test, initial_preds, confidence_threshold=0.85, n_splits=N_SPLITS
    )
    
    if test_pseudo_85 is not None:
        sub_pseudo_85 = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_pseudo_85})
        sub_pseudo_85.to_csv('submission_v6_pseudo_85.csv', index=False)
        print(f"Saved: submission_v6_pseudo_85.csv (CV AUC: {auc_pseudo_85:.5f})")


=== Pseudo-Labeling (threshold=0.85) ===
High-confidence samples (threshold=0.85): 37119 (12.4%)
  - Positive: 35617
  - Negative: 1502
Combined training size: 737119 (original: 700000, pseudo: 37119)
Training with pseudo-labels (10-fold)...
  Fold 1 AUC: 0.72818
  Fold 2 AUC: 0.72947
  Fold 3 AUC: 0.72706
  Fold 4 AUC: 0.72748
  Fold 5 AUC: 0.72846
  Fold 6 AUC: 0.72846
  Fold 7 AUC: 0.72686
  Fold 8 AUC: 0.73130
  Fold 9 AUC: 0.73049
  Fold 10 AUC: 0.72739
Overall OOF AUC: 0.72851
Saved: submission_v6_pseudo_85.csv (CV AUC: 0.72851)


## Part 4: Different Ensemble Strategy

Try rank averaging instead of probability averaging

In [10]:
from scipy.stats import rankdata

def rank_average_blend(submission_dict):
    """Blend submissions using rank averaging instead of probability averaging"""
    ranks = []
    for name, sub in submission_dict.items():
        preds = sub['data']['diagnosed_diabetes'].values
        rank = rankdata(preds) / len(preds)  # Normalize to [0, 1]
        ranks.append(rank)
    
    avg_rank = np.mean(ranks, axis=0)
    return avg_rank

if len(submissions) > 1:
    print("\n=== Rank Average Ensemble ===")
    rank_blend = rank_average_blend(submissions)
    
    sub_rank = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': rank_blend})
    sub_rank.to_csv('submission_v6_rank_blend.csv', index=False)
    print(f"Saved: submission_v6_rank_blend.csv (rank average of {len(submissions)} submissions)")


=== Rank Average Ensemble ===
Saved: submission_v6_rank_blend.csv (rank average of 7 submissions)


## Part 5: Train with More Regularization

Try even more aggressive regularization since CV-LB gap suggests overfitting

In [11]:
def train_lgb_ultra_conservative(X, y, X_test, n_splits=10):
    """Train with very aggressive regularization"""
    
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 3000,
        'learning_rate': 0.005,  # Even lower
        'num_leaves': 15,  # Much fewer leaves
        'max_depth': 4,  # Shallower
        'min_child_samples': 100,  # More samples per leaf
        'feature_fraction': 0.5,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'reg_alpha': 2.0,  # Strong L1
        'reg_lambda': 2.0,  # Strong L2
        'min_gain_to_split': 0.1,  # Need more gain to split
        'random_state': SEED,
        'verbose': -1
    }
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    
    print(f"Training Ultra-Conservative LightGBM ({n_splits}-fold)...")
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / n_splits
        
        fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")
    
    oof_auc = roc_auc_score(y, oof_preds)
    print(f"Overall OOF AUC: {oof_auc:.5f}")
    
    return oof_preds, test_preds, oof_auc

print("\n=== Ultra-Conservative Model ===")
oof_ultra, test_ultra, auc_ultra = train_lgb_ultra_conservative(X, y, X_test, n_splits=N_SPLITS)

sub_ultra = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_ultra})
sub_ultra.to_csv('submission_v6_ultra_conservative.csv', index=False)
print(f"Saved: submission_v6_ultra_conservative.csv (CV AUC: {auc_ultra:.5f})")


=== Ultra-Conservative Model ===
Training Ultra-Conservative LightGBM (10-fold)...
  Fold 1 AUC: 0.71851
  Fold 2 AUC: 0.71978
  Fold 3 AUC: 0.71766
  Fold 4 AUC: 0.71645
  Fold 5 AUC: 0.71887
  Fold 6 AUC: 0.71746
  Fold 7 AUC: 0.71659
  Fold 8 AUC: 0.72097
  Fold 9 AUC: 0.72099
  Fold 10 AUC: 0.71692
Overall OOF AUC: 0.71841
Saved: submission_v6_ultra_conservative.csv (CV AUC: 0.71841)


## Part 6: Domain-Shift Tactics (New)

These aim to reduce the CV→LB gap caused by train/test distribution shift:

1. **Shift-feature dropping**: identify features that best separate train vs test, then drop the most-shifted.
2. **Train on most test-like rows**: only train on the subset of training rows that look most like the test distribution.
3. **Numeric distribution alignment**: quantile-transform numeric features using train+test (unsupervised).

In [13]:
from sklearn.preprocessing import QuantileTransformer

def train_lgb_generic(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    n_splits: int = 10,
    sample_weight: np.ndarray | None = None,
    seed: int = 42,
    params: dict | None = None,
):
    if params is None:
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'n_estimators': 5000,
            'learning_rate': 0.01,
            'num_leaves': 31,
            'max_depth': 6,
            'min_child_samples': 50,
            'feature_fraction': 0.7,
            'bagging_fraction': 0.7,
            'bagging_freq': 5,
            'reg_alpha': 0.5,
            'reg_lambda': 0.5,
            'random_state': seed,
            'verbose': -1,
        }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X_train))
    test_preds = np.zeros(len(X_test))

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]
        w_tr = sample_weight[tr_idx] if sample_weight is not None else None

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(100, verbose=False)],
            sample_weight=w_tr,
        )

        oof_preds[va_idx] = model.predict_proba(X_va)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / n_splits

    oof_auc = roc_auc_score(y_train, oof_preds)
    return oof_preds, test_preds, oof_auc


def compute_domain_shift_importance(X_train: pd.DataFrame, X_test: pd.DataFrame, n_splits: int = 5, seed: int = 42):
    """Train a domain classifier (train=0, test=1) and return mean gain importances."""
    X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)
    y_domain = np.concatenate([np.zeros(len(X_train)), np.ones(len(X_test))])

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    importances = []
    oof = np.zeros(len(X_all))

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_domain)):
        X_tr, X_va = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        y_tr, y_va = y_domain[tr_idx], y_domain[va_idx]

        dom = lgb.LGBMClassifier(
            n_estimators=800,
            learning_rate=0.05,
            num_leaves=63,
            min_child_samples=50,
            feature_fraction=0.8,
            bagging_fraction=0.8,
            bagging_freq=5,
            random_state=seed + fold,
            verbose=-1,
        )
        dom.fit(
            X_tr,
            y_tr,
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(80, verbose=False)],
        )

        oof[va_idx] = dom.predict_proba(X_va)[:, 1]
        booster = dom.booster_
        gain = booster.feature_importance(importance_type='gain')
        importances.append(gain)

    auc = roc_auc_score(y_domain, oof)
    mean_gain = np.mean(np.vstack(importances), axis=0)
    imp = pd.DataFrame({'feature': X_all.columns, 'gain': mean_gain}).sort_values('gain', ascending=False)
    return imp, auc


def compute_testlikeness_scores(X_train: pd.DataFrame, X_test: pd.DataFrame, n_splits: int = 5, seed: int = 42):
    """Return p(test|x) for each training row (higher => more test-like)."""
    X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)
    y_domain = np.concatenate([np.zeros(len(X_train)), np.ones(len(X_test))])

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros(len(X_all))

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_domain)):
        X_tr, X_va = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        y_tr, y_va = y_domain[tr_idx], y_domain[va_idx]

        dom = lgb.LGBMClassifier(
            n_estimators=800,
            learning_rate=0.05,
            num_leaves=63,
            min_child_samples=50,
            feature_fraction=0.8,
            bagging_fraction=0.8,
            bagging_freq=5,
            random_state=seed + 100 + fold,
            verbose=-1,
        )
        dom.fit(
            X_tr,
            y_tr,
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(80, verbose=False)],
        )
        oof[va_idx] = dom.predict_proba(X_va)[:, 1]

    auc = roc_auc_score(y_domain, oof)
    train_scores = oof[: len(X_train)]
    return train_scores, auc

### 6.1 Shift-feature dropping

Idea: if a feature mostly helps distinguish train vs test, it can encourage learning spurious train-only patterns.

We’ll compute domain-model importances and drop the most-shifted features at a few cutoffs.

In [14]:
shift_imp, shift_auc = compute_domain_shift_importance(X, X_test, n_splits=5, seed=SEED)
print(f"Domain AUC (feature-shift model): {shift_auc:.5f}")

print("\nTop shifted features:")
display(shift_imp.head(15))

# QUICK grid (keeps runtime reasonable). Expand if these look promising.
quick = True
k_list = [3, 5] if quick else [1, 2, 3, 5, 8]

for k in k_list:
    drop_feats = shift_imp['feature'].head(k).tolist()
    cols = [c for c in X.columns if c not in drop_feats]

    oof, preds, auc = train_lgb_generic(X[cols], y, X_test[cols], n_splits=N_SPLITS, seed=SEED)
    out = f"submission_v6_drop_shift_top{k}.csv"
    pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': preds}).to_csv(out, index=False)
    print(f"Dropped top {k} shifted feats -> CV AUC {auc:.5f} | saved {out}")

Domain AUC (feature-shift model): 0.64125

Top shifted features:


Unnamed: 0,feature,gain
2,physical_activity_minutes_per_week,194870.28633
14,triglycerides,118628.283649
6,bmi,70277.446444
11,cholesterol_total,66719.265039
10,heart_rate,44523.692213
13,ldl_cholesterol,40157.67669
0,age,34998.297314
8,systolic_bp,30760.607453
5,screen_time_hours_per_day,25906.422665
12,hdl_cholesterol,25886.692568


Dropped top 3 shifted feats -> CV AUC 0.66962 | saved submission_v6_drop_shift_top3.csv
Dropped top 5 shifted feats -> CV AUC 0.66773 | saved submission_v6_drop_shift_top5.csv


### 6.2 Train only on most test-like rows

Idea: if the test set is drawn from a different mixture of subpopulations, training on the most test-like slice can reduce mismatch.

We compute a test-likeness score $p(\text{test}|x)$ via a domain classifier and train on the top percentiles.

In [15]:
train_scores, score_auc = compute_testlikeness_scores(X, X_test, n_splits=5, seed=SEED)
print(f"Domain AUC (score model): {score_auc:.5f}")

# QUICK grid (keeps runtime reasonable). Expand if these look promising.
quick = True
pct_list = [0.50] if quick else [0.30, 0.50, 0.70]
order = np.argsort(train_scores)  # ascending

for pct in pct_list:
    k = int(len(order) * pct)
    idx = order[-k:]

    X_sub = X.iloc[idx].reset_index(drop=True)
    y_sub = y.iloc[idx].reset_index(drop=True)

    # Option A: no weights
    oof, preds, auc = train_lgb_generic(X_sub, y_sub, X_test, n_splits=N_SPLITS, seed=SEED)
    out = f"submission_v6_top{int(pct*100)}pct_testlike.csv"
    pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': preds}).to_csv(out, index=False)
    print(f"Top {int(pct*100)}% test-like -> CV AUC {auc:.5f} | saved {out}")

for pct in pct_list:
    k = int(len(order) * pct)
    idx = order[-k:]

    X_sub = X.iloc[idx].reset_index(drop=True)
    y_sub = y.iloc[idx].reset_index(drop=True)
    w_sub = adv_weights[idx]

    oof, preds, auc = train_lgb_generic(X_sub, y_sub, X_test, n_splits=N_SPLITS, seed=SEED, sample_weight=w_sub)
    out = f"submission_v6_top{int(pct*100)}pct_testlike_weighted.csv"
    pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': preds}).to_csv(out, index=False)
    print(f"Top {int(pct*100)}% test-like + weights -> CV AUC {auc:.5f} | saved {out}")

Domain AUC (score model): 0.64113
Top 50% test-like -> CV AUC 0.74238 | saved submission_v6_top50pct_testlike.csv
Top 50% test-like + weights -> CV AUC 0.74127 | saved submission_v6_top50pct_testlike_weighted.csv


### 6.3 Numeric distribution alignment (Quantile transform)

Idea: apply a monotonic transform to numeric features so their marginal distributions match better between train and test.

This is unsupervised (uses $X$ only), and often helps under covariate shift.

In [None]:
num_cols = X.select_dtypes(include=['number', 'int64', 'float64']).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# Fit transform on combined train+test numerics (subsample for speed)
qt = QuantileTransformer(
    n_quantiles=2000,
    output_distribution='normal',
    subsample=200_000,
    random_state=SEED,
)

X_num_all = pd.concat([X[num_cols], X_test[num_cols]], axis=0, ignore_index=True)
qt.fit(X_num_all)

X_q = X.copy()
X_test_q = X_test.copy()
X_q[num_cols] = qt.transform(X[num_cols])
X_test_q[num_cols] = qt.transform(X_test[num_cols])

# Keep categoricals as-is
for c in cat_cols:
    if str(X_q[c].dtype) == 'category':
        continue

oof, preds, auc = train_lgb_generic(X_q, y, X_test_q, n_splits=N_SPLITS, seed=SEED)
out = 'submission_v6_quantile_norm.csv'
pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': preds}).to_csv(out, index=False)
print(f"Quantile-normalized numerics -> CV AUC {auc:.5f} | saved {out}")

Quantile-normalized numerics -> CV AUC 0.72856 | saved submission_v6_quantile_norm.csv


: 

## Summary

In [12]:
print("\n" + "="*60)
print("V6 SUBMISSIONS READY FOR APPROVAL")
print("="*60)

v6_submissions = [
    ('submission_v6_blend_top2.csv', 'Blend V5_LGB + V1 (50/50)'),
    ('submission_v6_blend_top3.csv', 'Blend V5_LGB + V1 + V2'),
    ('submission_v6_blend_all.csv', 'LB-weighted blend of all'),
    ('submission_v6_adversarial.csv', f'Adversarial weights (CV: {auc_adv:.5f})'),
    ('submission_v6_rank_blend.csv', 'Rank average blend'),
    ('submission_v6_ultra_conservative.csv', f'Ultra-conservative (CV: {auc_ultra:.5f})'),
]

if 'auc_pseudo_90' in dir() and auc_pseudo_90 is not None:
    v6_submissions.append(('submission_v6_pseudo_90.csv', f'Pseudo-label 90% (CV: {auc_pseudo_90:.5f})'))
if 'auc_pseudo_85' in dir() and auc_pseudo_85 is not None:
    v6_submissions.append(('submission_v6_pseudo_85.csv', f'Pseudo-label 85% (CV: {auc_pseudo_85:.5f})'))

print("\nSubmissions created:")
for i, (filename, desc) in enumerate(v6_submissions, 1):
    if os.path.exists(filename):
        print(f"  {i}. {filename}")
        print(f"     → {desc}")

print("\n" + "="*60)
print("WAITING FOR YOUR APPROVAL TO SUBMIT")
print("="*60)


V6 SUBMISSIONS READY FOR APPROVAL

Submissions created:
  1. submission_v6_blend_top2.csv
     → Blend V5_LGB + V1 (50/50)
  2. submission_v6_blend_top3.csv
     → Blend V5_LGB + V1 + V2
  3. submission_v6_blend_all.csv
     → LB-weighted blend of all
  4. submission_v6_adversarial.csv
     → Adversarial weights (CV: 0.72742)
  5. submission_v6_rank_blend.csv
     → Rank average blend
  6. submission_v6_ultra_conservative.csv
     → Ultra-conservative (CV: 0.71841)
  7. submission_v6_pseudo_90.csv
     → Pseudo-label 90% (CV: 0.72860)
  8. submission_v6_pseudo_85.csv
     → Pseudo-label 85% (CV: 0.72851)

WAITING FOR YOUR APPROVAL TO SUBMIT
