In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

# Suppress every Python warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation and library warnings are hidden too.
warnings.filterwarnings('ignore')

# Base random seed shared by the CV splitters and models defined below
SEED = 42
N_SPLITS = 10  # Use 10 folds like top solutions
# Name of the label column that is dropped from the features and predicted
TARGET = 'diagnosed_diabetes'

In [2]:
# Read the raw train / test tables from the local data directory
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Keep the test ids for the submission files, then strip the id column
# from both frames so it is never used as a feature.
test_ids = test['id']
train, test = (frame.drop(columns=['id']) for frame in (train, test))

Train shape: (700000, 26)
Test shape: (300000, 25)


## Part 1: Adversarial Validation
Check whether the train and test feature distributions are similar by training a classifier to tell them apart.

In [3]:
# Adversarial validation to check train-test similarity
def run_adversarial_validation(train_df, test_df, target_col):
    """Fit a train-vs-test classifier and derive importance weights from it.

    A LightGBM model is cross-validated (5 stratified folds) on the stacked
    train and test features, with the label being "row came from test".

    AUC close to 0.5 = similar distributions (good)
    AUC >> 0.5 = different distributions (may need domain adaptation)

    Parameters
    ----------
    train_df : DataFrame containing the features plus ``target_col``.
    test_df : DataFrame containing the features only.
    target_col : name of the label column removed from ``train_df``.

    Returns
    -------
    (adv_auc, weights)
        ``adv_auc`` is the out-of-fold ROC AUC of the domain classifier.
        ``weights`` has one entry per training row: the odds p/(1-p) of the
        row looking like test, clipped to its 1st-99th percentile range and
        rescaled to mean 1.
    """
    features_train = train_df.drop(columns=[target_col])
    features_test = test_df.copy()

    # Give every object column one shared category set so that train and
    # test rows are encoded consistently after stacking.
    for name in features_train.select_dtypes(include=['object']).columns.tolist():
        pooled = pd.concat([features_train[name], features_test[name]], axis=0)
        shared = pd.Categorical(pooled).categories
        features_train[name] = pd.Categorical(features_train[name], categories=shared)
        features_test[name] = pd.Categorical(features_test[name], categories=shared)

    n_train = len(features_train)
    n_test = len(features_test)

    # Domain labels: 0=train, 1=test (train rows come first in the stack)
    stacked = pd.concat([features_train, features_test], axis=0, ignore_index=True)
    y_domain = np.concatenate([np.zeros(n_train), np.ones(n_test)])

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(stacked))

    print("Running adversarial validation...")
    for fit_idx, hold_idx in splitter.split(stacked, y_domain):
        X_hold = stacked.iloc[hold_idx]
        y_hold = y_domain[hold_idx]

        clf = lgb.LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            num_leaves=31,
            random_state=SEED,
            verbose=-1
        )
        clf.fit(
            stacked.iloc[fit_idx], y_domain[fit_idx],
            eval_set=[(X_hold, y_hold)],
            callbacks=[lgb.early_stopping(50, verbose=False)]
        )
        oof_preds[hold_idx] = clf.predict_proba(X_hold)[:, 1]

    adv_auc = roc_auc_score(y_domain, oof_preds)
    print(f"\nAdversarial AUC: {adv_auc:.5f}")
    print("(Close to 0.5 = train/test are similar, >> 0.5 = distribution shift)")

    # Odds of "looks like test" for the training rows only; the epsilon
    # keeps the ratio finite when a probability hits 0 or 1.
    p_test_like = oof_preds[:n_train]
    odds = (p_test_like + 1e-6) / (1 - p_test_like + 1e-6)
    lower, upper = np.percentile(odds, 1), np.percentile(odds, 99)
    odds = np.clip(odds, lower, upper)

    return adv_auc, odds / odds.mean()

# Run the check on the id-stripped frames.
# NOTE(review): `sample_weights` is not passed to any training call visible in this
# notebook (train_lgb_simple is always invoked with its default use_weights=False).
adv_auc, sample_weights = run_adversarial_validation(train, test, TARGET)

Running adversarial validation...

Adversarial AUC: 0.63259
(Close to 0.5 = train/test are similar, >> 0.5 = distribution shift)


## Part 2: Data Preparation (Minimal - Following Top Solutions)

In [4]:
# Simple categorical encoding - let the models handle it natively
def prepare_data_simple(df):
    """Return a copy of ``df`` with every object-dtype column recast as ``category``.

    All other columns are left as they are; the input frame is not modified.
    """
    to_category = {
        name: 'category'
        for name in df.columns
        if df[name].dtype == 'object'
    }
    return df.copy().astype(to_category)

# Simple ordinal mapping (from top-34 solution)
def prepare_data_mapped(df):
    """Return a copy of ``df`` with selected categoricals mapped to ordinals (for XGBoost).

    ``smoking_status`` and ``gender`` are replaced by integer codes; any value
    missing from the mapping becomes 0. The three history flags are cast to
    int. Columns that are absent from ``df`` are skipped.
    """
    out = df.copy()

    # Column -> {label: ordinal code}
    ordinal_maps = {
        'smoking_status': {'Never': 0, 'Former': 1, 'Current': 2},
        'gender': {'Female': 0, 'Male': 1},
    }
    for name, mapping in ordinal_maps.items():
        if name in out.columns:
            # Unmapped labels turn into NaN in map() and are then filled with 0
            out[name] = out[name].map(mapping).fillna(0)

    # Boolean flags -> 0/1 integers
    flag_cols = ('family_history_diabetes', 'hypertension_history', 'cardiovascular_history')
    for name in flag_cols:
        if name not in out.columns:
            continue
        out[name] = out[name].astype(int)

    return out

# Build the category-typed versions of both frames in one pass
train_simple, test_simple = (
    prepare_data_simple(frame.copy()) for frame in (train, test)
)

print("Data types (simple):")
print(train_simple.dtypes)

Data types (simple):
age                                      int64
alcohol_consumption_per_week             int64
physical_activity_minutes_per_week       int64
diet_score                             float64
sleep_hours_per_day                    float64
screen_time_hours_per_day              float64
bmi                                    float64
waist_to_hip_ratio                     float64
systolic_bp                              int64
diastolic_bp                             int64
heart_rate                               int64
cholesterol_total                        int64
hdl_cholesterol                          int64
ldl_cholesterol                          int64
triglycerides                            int64
gender                                category
ethnicity                             category
education_level                       category
income_level                          category
smoking_status                        category
employment_status                     c

## Part 3: Minimal Feature Engineering (Only Proven Features)

In [5]:
def add_minimal_features(df):
    """Return a copy of ``df`` with five derived columns appended.

    Added columns, in order: ``MAP``, ``Pulse_Pressure``, ``Total_HDL_Ratio``,
    ``TG_HDL_Ratio`` and ``Metabolic_Index``. The input frame is not modified.
    """
    systolic = df['systolic_bp']
    diastolic = df['diastolic_bp']
    # Small offset keeps the lipid ratios finite when HDL is zero
    hdl_denominator = df['hdl_cholesterol'] + 1e-5

    derived = {
        # Blood-pressure summaries (from the top-34 solution)
        'MAP': (systolic + 2 * diastolic) / 3,
        'Pulse_Pressure': systolic - diastolic,
        # Lipid ratios
        'Total_HDL_Ratio': df['cholesterol_total'] / hdl_denominator,
        'TG_HDL_Ratio': df['triglycerides'] / hdl_denominator,
        # Body-composition interaction term
        'Metabolic_Index': df['bmi'] * df['waist_to_hip_ratio'],
    }
    return df.assign(**derived)

# Apply the same feature set to the category-typed train and test frames
train_fe, test_fe = (
    add_minimal_features(frame.copy()) for frame in (train_simple, test_simple)
)

# shape[1] counts every column of train_fe
print(f"Features after minimal FE: {train_fe.shape[1]}")

Features after minimal FE: 30


## Part 4: Model Training - Simple Baseline (No FE)

In [6]:
def train_lgb_simple(X, y, X_test, n_splits=10, use_weights=False, weights=None):
    """Train LightGBM with native categorical handling (like AbyssSkb).

    Runs stratified K-fold cross-validation. Each fold fits one
    ``LGBMClassifier`` with early stopping on the held-out fold, stores the
    held-out probabilities, and adds its share of the test prediction.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Columns with ``category`` dtype are passed through
        unchanged, so LightGBM treats them as categorical features.
    y : pandas.Series
        Binary target, positionally aligned with ``X``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    n_splits : int
        Number of CV folds.
    use_weights : bool
        When True and ``weights`` is given, the per-row weights are used
        while fitting. When ``weights`` is None the flag has no effect.
    weights : array-like or None
        One weight per row of ``X``, in row order.

    Returns
    -------
    (oof_preds, test_preds, oof_auc)
        Out-of-fold probabilities for ``X``, fold-averaged probabilities for
        ``X_test``, and the ROC AUC of the out-of-fold predictions.

    Raises
    ------
    ValueError
        If weights are requested and their length differs from ``len(X)``.
    """
    # Very conservative parameters
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 5000,
        'learning_rate': 0.01,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 50,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': SEED,
        'verbose': -1
    }

    # Convert once to a plain array so the fold indices below are always
    # applied positionally (a pandas Series would be indexed by label).
    weight_array = None
    if use_weights and weights is not None:
        weight_array = np.asarray(weights)
        if len(weight_array) != len(X):
            raise ValueError(
                f"weights has length {len(weight_array)} but X has {len(X)} rows"
            )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    print(f"Training LightGBM Simple ({n_splits}-fold)...")

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        sample_weight = weight_array[train_idx] if weight_array is not None else None

        model = lgb.LGBMClassifier(**params)
        # The held-out fold doubles as the early-stopping set, so the fold
        # AUC reported below is measured on data used to pick the iteration.
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(100, verbose=False)],
            sample_weight=sample_weight
        )

        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        # Each fold contributes 1/n_splits of the final test prediction
        test_preds += model.predict_proba(X_test)[:, 1] / n_splits

        fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")

    oof_auc = roc_auc_score(y, oof_preds)
    print(f"Overall OOF AUC: {oof_auc:.5f}")

    return oof_preds, test_preds, oof_auc

# Baseline inputs: category-typed frames with no engineered columns.
# `y` is reused by every later training and ensembling cell.
X_simple = train_simple.drop(columns=[TARGET])
X_test_simple = test_simple.copy()
y = train_simple[TARGET]

print("\n=== Training WITHOUT feature engineering (like AbyssSkb) ===")
oof_lgb_simple, test_lgb_simple, auc_lgb_simple = train_lgb_simple(
    X_simple,
    y,
    X_test_simple,
    n_splits=N_SPLITS,
)


=== Training WITHOUT feature engineering (like AbyssSkb) ===
Training LightGBM Simple (10-fold)...
  Fold 1 AUC: 0.72824
  Fold 2 AUC: 0.72937
  Fold 3 AUC: 0.72719
  Fold 4 AUC: 0.72729
  Fold 5 AUC: 0.72845
  Fold 6 AUC: 0.72873
  Fold 7 AUC: 0.72697
  Fold 8 AUC: 0.73115
  Fold 9 AUC: 0.73044
  Fold 10 AUC: 0.72751
Overall OOF AUC: 0.72853


## Part 5: Model Training - With Minimal FE

In [7]:
# Same LightGBM setup, this time on the frames that carry the engineered columns
X_test_fe = test_fe.copy()
X_fe = train_fe.drop(columns=[TARGET])

print("\n=== Training WITH minimal feature engineering ===")
oof_lgb_fe, test_lgb_fe, auc_lgb_fe = train_lgb_simple(
    X_fe,
    y,
    X_test_fe,
    n_splits=N_SPLITS,
)


=== Training WITH minimal feature engineering ===
Training LightGBM Simple (10-fold)...
  Fold 1 AUC: 0.72782
  Fold 2 AUC: 0.72888
  Fold 3 AUC: 0.72706
  Fold 4 AUC: 0.72674
  Fold 5 AUC: 0.72773
  Fold 6 AUC: 0.72822
  Fold 7 AUC: 0.72650
  Fold 8 AUC: 0.73091
  Fold 9 AUC: 0.73012
  Fold 10 AUC: 0.72671
Overall OOF AUC: 0.72807


## Part 6: CatBoost Model (From Top-34)

In [8]:
def train_catboost(X, y, X_test, n_splits=10):
    """Cross-validate a CatBoost classifier using its native categorical support.

    Every ``category`` or ``object`` column of ``X`` is declared to CatBoost
    as a categorical feature. Each fold is early-stopped on its held-out
    part and uses its own seed (``SEED + fold``).

    Returns
    -------
    (oof_preds, test_preds, oof_auc)
        Out-of-fold probabilities for ``X``, fold-averaged probabilities for
        ``X_test``, and the ROC AUC of the out-of-fold predictions.
    """
    categorical_names = list(X.select_dtypes(include=['category', 'object']).columns)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    test_preds = np.zeros(len(X_test))
    oof_preds = np.zeros(len(X))

    print(f"Training CatBoost ({n_splits}-fold)...")

    fold = 0
    for fit_idx, hold_idx in splitter.split(X, y):
        X_hold = X.iloc[hold_idx]
        y_hold = y.iloc[hold_idx]

        booster = cb.CatBoostClassifier(
            iterations=3000,
            learning_rate=0.01,
            depth=5,
            l2_leaf_reg=5,
            random_seed=SEED + fold,
            verbose=False,
            early_stopping_rounds=200
        )
        booster.fit(
            X.iloc[fit_idx], y.iloc[fit_idx],
            cat_features=categorical_names,
            eval_set=(X_hold, y_hold)
        )

        hold_scores = booster.predict_proba(X_hold)[:, 1]
        oof_preds[hold_idx] = hold_scores
        # Accumulate this fold's share of the averaged test prediction
        test_preds += booster.predict_proba(X_test)[:, 1] / n_splits

        fold_auc = roc_auc_score(y_hold, hold_scores)
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")
        fold += 1

    oof_auc = roc_auc_score(y, oof_preds)
    print(f"Overall OOF AUC: {oof_auc:.5f}")

    return oof_preds, test_preds, oof_auc

# CatBoost on the un-engineered frames, reusing the target `y` and the fold
# count N_SPLITS defined in earlier cells.
print("\n=== Training CatBoost (simple data) ===")
oof_cat_simple, test_cat_simple, auc_cat_simple = train_catboost(
    X_simple, y, X_test_simple, n_splits=N_SPLITS
)


=== Training CatBoost (simple data) ===
Training CatBoost (10-fold)...
  Fold 1 AUC: 0.72109
  Fold 2 AUC: 0.72197
  Fold 3 AUC: 0.72000
  Fold 4 AUC: 0.71904
  Fold 5 AUC: 0.72112
  Fold 6 AUC: 0.72044
  Fold 7 AUC: 0.71949
  Fold 8 AUC: 0.72358
  Fold 9 AUC: 0.72321
  Fold 10 AUC: 0.71946
Overall OOF AUC: 0.72093


## Part 7: XGBoost Model

In [9]:
def train_xgboost(X, y, X_test, n_splits=10):
    """Train XGBoost (requires numeric encoding).

    Categorical columns are replaced by integer codes before training. The
    test codes are derived from the *training* categories, so a given code
    always refers to the same level in both frames. Test values that do not
    occur among the training categories (and missing values) get code -1.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; ``category`` columns are integer-encoded here.
    y : pandas.Series
        Binary target, positionally aligned with ``X``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    n_splits : int
        Number of stratified CV folds.

    Returns
    -------
    (oof_preds, test_preds, oof_auc)
        Out-of-fold probabilities for ``X``, fold-averaged probabilities for
        ``X_test``, and the ROC AUC of the out-of-fold predictions.
    """
    # Convert categories to codes for XGBoost
    X_enc = X.copy()
    X_test_enc = X_test.copy()

    for col in X_enc.select_dtypes(include=['category']).columns:
        train_categories = X_enc[col].cat.categories
        # Re-express the test column in the training category set first.
        # Taking .cat.codes independently on each frame would assign
        # different integers to the same level whenever the sets differ.
        X_test_enc[col] = (
            X_test_enc[col]
            .astype('category')
            .cat.set_categories(train_categories)
            .cat.codes
        )
        X_enc[col] = X_enc[col].cat.codes

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    print(f"Training XGBoost ({n_splits}-fold)...")

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_enc, y)):
        X_tr, X_val = X_enc.iloc[train_idx], X_enc.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = xgb.XGBClassifier(
            n_estimators=3000,
            learning_rate=0.01,
            max_depth=5,
            subsample=0.7,
            colsample_bytree=0.5,
            reg_lambda=2.0,
            reg_alpha=0.5,
            random_state=SEED + fold,  # a different seed per fold
            tree_method='hist',
            early_stopping_rounds=200
        )

        # The held-out fold is also the early-stopping set
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        # Each fold contributes 1/n_splits of the final test prediction
        test_preds += model.predict_proba(X_test_enc)[:, 1] / n_splits

        fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")

    oof_auc = roc_auc_score(y, oof_preds)
    print(f"Overall OOF AUC: {oof_auc:.5f}")

    return oof_preds, test_preds, oof_auc

# XGBoost on the un-engineered frames; the category columns are converted
# to integer codes inside train_xgboost.
print("\n=== Training XGBoost (simple data) ===")
oof_xgb_simple, test_xgb_simple, auc_xgb_simple = train_xgboost(
    X_simple, y, X_test_simple, n_splits=N_SPLITS
)


=== Training XGBoost (simple data) ===
Training XGBoost (10-fold)...
  Fold 1 AUC: 0.72508
  Fold 2 AUC: 0.72704
  Fold 3 AUC: 0.72381
  Fold 4 AUC: 0.72370
  Fold 5 AUC: 0.72510
  Fold 6 AUC: 0.72488
  Fold 7 AUC: 0.72377
  Fold 8 AUC: 0.72843
  Fold 9 AUC: 0.72747
  Fold 10 AUC: 0.72421
Overall OOF AUC: 0.72534


## Part 8: Ensemble (Equal Weights - Avoid Overfitting)

In [10]:
# Build both blends up front; the reporting below only reads the results.

# Ensemble 1: LGB + CatBoost + XGBoost (all simple, equal weights)
oof_ensemble = (oof_lgb_simple + oof_cat_simple + oof_xgb_simple) / 3
test_ensemble = (test_lgb_simple + test_cat_simple + test_xgb_simple) / 3

# Ensemble 2: XGB + CatBoost blend (like top-34 solution: 60/40)
oof_blend_6040 = 0.6 * oof_xgb_simple + 0.4 * oof_cat_simple
test_blend_6040 = 0.6 * test_xgb_simple + 0.4 * test_cat_simple

# Fixed weights only: nothing here is tuned against the CV score
auc_ensemble = roc_auc_score(y, oof_ensemble)
auc_blend_6040 = roc_auc_score(y, oof_blend_6040)

print("\n=== Model Comparison ===")
print(f"LGB Simple (no FE):  CV AUC = {auc_lgb_simple:.5f}")
print(f"LGB with FE:         CV AUC = {auc_lgb_fe:.5f}")
print(f"CatBoost Simple:     CV AUC = {auc_cat_simple:.5f}")
print(f"XGBoost Simple:      CV AUC = {auc_xgb_simple:.5f}")

print("\n=== Ensemble Results ===")
print(f"Ensemble (LGB+CAT+XGB, equal):    CV AUC = {auc_ensemble:.5f}")
print(f"Blend XGB+CAT (60/40):            CV AUC = {auc_blend_6040:.5f}")
# Ensemble 3: Just LGB (simplest)
print(f"LGB only:                         CV AUC = {auc_lgb_simple:.5f}")


=== Model Comparison ===
LGB Simple (no FE):  CV AUC = 0.72853
LGB with FE:         CV AUC = 0.72807
CatBoost Simple:     CV AUC = 0.72093
XGBoost Simple:      CV AUC = 0.72534

=== Ensemble Results ===
Ensemble (LGB+CAT+XGB, equal):    CV AUC = 0.72593
Blend XGB+CAT (60/40):            CV AUC = 0.72415
LGB only:                         CV AUC = 0.72853


## Part 9: Create Submissions

In [None]:
# Save multiple submissions for testing

def save_submission(filename, predictions, cv_auc):
    """Write one submission CSV and report the CV AUC of the model behind it.

    Parameters
    ----------
    filename : path of the CSV file to create.
    predictions : predicted probabilities, one per row of ``test_ids``.
    cv_auc : cross-validated AUC shown in the log line.

    Returns
    -------
    The DataFrame that was written (columns: ``id`` and the target column).

    Raises
    ------
    ValueError
        If the number of predictions does not match the number of test ids.
    """
    if len(predictions) != len(test_ids):
        raise ValueError(
            f"{filename}: got {len(predictions)} predictions for {len(test_ids)} test ids"
        )
    # Use the shared TARGET constant so the column name cannot drift
    submission = pd.DataFrame({'id': test_ids, TARGET: predictions})
    submission.to_csv(filename, index=False)
    print(f"Saved {filename} (CV AUC: {cv_auc:.5f})")
    return submission


# 1. LGB Simple (no FE) - most conservative
sub_lgb = save_submission('submission_v5_lgb_simple.csv', test_lgb_simple, auc_lgb_simple)

# 2. Equal weight ensemble
sub_ensemble = save_submission('submission_v5_ensemble.csv', test_ensemble, auc_ensemble)

# 3. XGB+CAT 60/40 blend (top-34 style)
sub_blend = save_submission('submission_v5_blend_6040.csv', test_blend_6040, auc_blend_6040)

# 4. CatBoost only
sub_cat = save_submission('submission_v5_catboost.csv', test_cat_simple, auc_cat_simple)

Saved submission_v5_lgb_simple.csv (CV AUC: 0.72853)
Saved submission_v5_ensemble.csv (CV AUC: 0.72593)
Saved submission_v5_blend_6040.csv (CV AUC: 0.72415)
Saved submission_v5_catboost.csv (CV AUC: 0.72093)


: 

## Summary

Key insights from top Kaggle solutions:

1. **Simpler models often generalize better** - The top solutions avoided complex feature engineering
2. **Native categorical handling** works well (LightGBM/CatBoost)
3. **10-fold CV** provides more stable estimates
4. **Conservative hyperparameters** (low learning rate, strong regularization)
5. **Equal weight ensembles** are safer than optimized weights (which overfit to CV)

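Submissions to try (in order of expected LB performance; note that this order differs from the CV ranking above: LGB 0.72853 > equal-weight ensemble 0.72593 > 60/40 blend 0.72415 > CatBoost 0.72093):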
1. `submission_v5_lgb_simple.csv` - Most conservative, likely best LB
2. `submission_v5_catboost.csv` - CatBoost handles categories well
3. `submission_v5_ensemble.csv` - Equal weight blend
4. `submission_v5_blend_6040.csv` - Top-34 style blend