# Model V4: Back to Basics - Focus on Generalization

**Problem:** CV scores improved but LB scores got worse = overfitting

**Strategy:**
1. Minimal feature engineering (only domain-relevant features)
2. Strong regularization
3. Simpler models with fewer hyperparameters
4. Proper CV-based target encoding (with fold-out)
5. Remove target encoding entirely (it often leaks)
6. Use original data + only most robust features

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded!")

Libraries loaded!


In [2]:
# Load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(f"Train: {train.shape}, Test: {test.shape}")

Train: (700000, 26), Test: (300000, 25)


## Minimal Feature Engineering
Only add features that are clearly domain-relevant and unlikely to overfit.

In [3]:
def minimal_feature_engineering(df):
    """Only add robust, domain-relevant features"""
    df = df.copy()
    
    # Simple cholesterol ratios (medically meaningful)
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1)
    
    # Combined risk factors (simple counts, not complex scores)
    df['risk_factor_count'] = (df['family_history_diabetes'] + 
                                df['hypertension_history'] + 
                                df['cardiovascular_history'])
    
    # Blood pressure derived
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    
    # Simple BMI-age interaction (well-known diabetes risk factor)
    df['age_bmi'] = df['age'] * df['bmi']
    
    return df

train_fe = minimal_feature_engineering(train)
test_fe = minimal_feature_engineering(test)
print(f"Train: {train_fe.shape}, Test: {test_fe.shape}")
print(f"Added features: {train_fe.shape[1] - train.shape[1]}")

Train: (700000, 30), Test: (300000, 29)
Added features: 4


In [4]:
# Prepare features - NO target encoding (common source of leakage)
target = 'diagnosed_diabetes'
cat_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']

X = train_fe.drop(columns=['id', target])
y = train_fe[target]
X_test = test_fe.drop(columns=['id'])

# Simple label encoding for categoricals
for col in cat_cols:
    le = LabelEncoder()
    all_vals = pd.concat([X[col], X_test[col]]).unique()
    le.fit(all_vals)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

print(f"Features: {X.shape[1]}")
print(f"Feature list: {list(X.columns)}")

Features: 28
Feature list: ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history', 'ldl_hdl_ratio', 'risk_factor_count', 'pulse_pressure', 'age_bmi']


## Conservative Model Parameters
Use stronger regularization to prevent overfitting.

In [5]:
# Conservative parameters - prioritize generalization over CV score
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.03,  # Lower learning rate
    'n_estimators': 500,
    'max_depth': 5,  # Shallower trees
    'num_leaves': 20,  # Fewer leaves
    'min_child_samples': 100,  # More samples per leaf
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_alpha': 1.0,  # Strong L1 regularization
    'reg_lambda': 1.0,  # Strong L2 regularization
    'random_state': 42,
    'verbosity': -1,
    'n_jobs': -1
}

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.03,
    'n_estimators': 500,
    'max_depth': 5,
    'min_child_weight': 100,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_alpha': 1.0,
    'reg_lambda': 1.0,
    'random_state': 42,
    'verbosity': 0,
    'n_jobs': -1
}

cat_params = {
    'iterations': 500,
    'learning_rate': 0.03,
    'depth': 5,
    'l2_leaf_reg': 10,  # Strong regularization
    'min_data_in_leaf': 100,
    'random_seed': 42,
    'verbose': False,
    'eval_metric': 'AUC'
}

print("Conservative parameters configured")

Conservative parameters configured


In [6]:
# Train with 5-fold CV
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))

print("Training with conservative parameters...\n")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # LightGBM
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_lgb[val_idx] = lgb_model.predict_proba(X_val)[:, 1]
    test_lgb += lgb_model.predict_proba(X_test)[:, 1] / n_splits
    
    # XGBoost
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    test_xgb += xgb_model.predict_proba(X_test)[:, 1] / n_splits
    
    # CatBoost
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), 
                  early_stopping_rounds=50, verbose=False)
    oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
    test_cat += cat_model.predict_proba(X_test)[:, 1] / n_splits
    
    print(f"  LGB: {roc_auc_score(y_val, oof_lgb[val_idx]):.5f}, "
          f"XGB: {roc_auc_score(y_val, oof_xgb[val_idx]):.5f}, "
          f"CAT: {roc_auc_score(y_val, oof_cat[val_idx]):.5f}")

print("\n" + "="*50)
print(f"LightGBM CV AUC: {roc_auc_score(y, oof_lgb):.5f}")
print(f"XGBoost CV AUC:  {roc_auc_score(y, oof_xgb):.5f}")
print(f"CatBoost CV AUC: {roc_auc_score(y, oof_cat):.5f}")

Training with conservative parameters...

Fold 1
  LGB: 0.72092, XGB: 0.72113, CAT: 0.71380
Fold 2
  LGB: 0.71913, XGB: 0.71891, CAT: 0.71206
Fold 3
  LGB: 0.72010, XGB: 0.72002, CAT: 0.71227
Fold 4
  LGB: 0.72075, XGB: 0.72114, CAT: 0.71355
Fold 5
  LGB: 0.72041, XGB: 0.72089, CAT: 0.71359

LightGBM CV AUC: 0.72026
XGBoost CV AUC:  0.72041
CatBoost CV AUC: 0.71304


In [7]:
# Simple averaging ensemble (equal weights - less prone to overfitting)
oof_ensemble = (oof_lgb + oof_xgb + oof_cat) / 3
test_ensemble = (test_lgb + test_xgb + test_cat) / 3

print(f"Simple Average Ensemble CV AUC: {roc_auc_score(y, oof_ensemble):.5f}")

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'diagnosed_diabetes': test_ensemble
})
submission.to_csv('submission_v4.csv', index=False)
print(f"\nSubmission saved: submission_v4.csv")
print(f"Prediction stats:\n{submission['diagnosed_diabetes'].describe()}")

Simple Average Ensemble CV AUC: 0.71862

Submission saved: submission_v4.csv
Prediction stats:
count    300000.000000
mean          0.603035
std           0.180007
min           0.065644
25%           0.476365
50%           0.604267
75%           0.733117
max           0.985545
Name: diagnosed_diabetes, dtype: float64


## Also try: Pure raw features (no engineering)

In [None]:
# Try with ZERO feature engineering - just raw data
X_raw = train.drop(columns=['id', target])
X_test_raw = test.drop(columns=['id'])

# Label encode categoricals
for col in cat_cols:
    le = LabelEncoder()
    all_vals = pd.concat([X_raw[col], X_test_raw[col]]).unique()
    le.fit(all_vals)
    X_raw[col] = le.transform(X_raw[col])
    X_test_raw[col] = le.transform(X_test_raw[col])

print(f"Raw features: {X_raw.shape[1]}")

# Train just LightGBM (fastest) with raw features
oof_raw = np.zeros(len(X_raw))
test_raw = np.zeros(len(X_test_raw))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_raw, y)):
    X_train, X_val = X_raw.iloc[train_idx], X_raw.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_raw[val_idx] = lgb_model.predict_proba(X_val)[:, 1]
    test_raw += lgb_model.predict_proba(X_test_raw)[:, 1] / n_splits

print(f"Raw LightGBM CV AUC: {roc_auc_score(y, oof_raw):.5f}")

# Save raw submission
submission_raw = pd.DataFrame({'id': test['id'], 'diagnosed_diabetes': test_raw})
submission_raw.to_csv('submission_v4_raw.csv', index=False)
print("Saved submission_v4_raw.csv")

Raw features: 24
Raw LightGBM CV AUC: 0.72070
Saved submission_v4_raw.csv


: 