In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")
import gc

# 1. Load Data
train_path = "/kaggle/input/playground-series-s5e11/train.csv"
test_path  = "/kaggle/input/playground-series-s5e11/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Identify target
target_col = [c for c in train.columns if c.lower() in ["target", "label", "y", "loan_status"]]
target_col = target_col[0] if target_col else train.columns[-1]

# Identify ID
id_col = [c for c in test.columns if c.lower() in ["id", "loan_id", "customer_id"]]
id_col = id_col[0] if id_col else None

print("Target:", target_col)
if id_col:
    print("ID:", id_col)

# 2. Prepare Data
X = train.drop(columns=[target_col])
y = train[target_col]
X_test = test.copy()

# Label Encode Categorical Features
for col in X.columns:
    if X[col].dtype == "object":
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
    else:
        X[col] = X[col].fillna(X[col].median())
        X_test[col] = X_test[col].fillna(X[col].median())

# 3. Model Parameters (Optimized)

lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "num_leaves": 45,
    "learning_rate": 0.02,
    "feature_fraction": 0.85,
    "bagging_fraction": 0.85,
    "bagging_freq": 5,
    "min_data_in_leaf": 25,
    "lambda_l1": 1,
    "lambda_l2": 1,
    "verbose": -1,
    "seed": 42
}

xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.02,
    "max_depth": 7,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "tree_method": "hist",
    "lambda": 1,
    "alpha": 1,
    "seed": 42,
    "verbosity": 0
}

# 4. Cross-Validation / Training

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))

pred_lgb = np.zeros(len(X_test))
pred_xgb = np.zeros(len(X_test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):

    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    # ===== LightGBM =====
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval   = lgb.Dataset(X_val, label=y_val)

    model_lgb = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dtrain, dval],
        callbacks=[lgb.early_stopping(200)]
    )

    oof_lgb[val_idx] = model_lgb.predict(X_val)
    pred_lgb += model_lgb.predict(X_test) / folds.n_splits

    # ===== XGBoost =====
    dtrain_xgb = xgb.DMatrix(X_tr, label=y_tr)
    dval_xgb   = xgb.DMatrix(X_val, label=y_val)

    model_xgb = xgb.train(
        xgb_params,
        dtrain_xgb,
        num_boost_round=5000,
        evals=[(dval_xgb, "eval")],
        early_stopping_rounds=200,
        verbose_eval=False
    )

    oof_xgb[val_idx] = model_xgb.predict(xgb.DMatrix(X_val))
    pred_xgb += model_xgb.predict(xgb.DMatrix(X_test)) / folds.n_splits

    gc.collect()

# 5. Ensemble Blending (Only LGB + XGB)
final_oof = 0.60 * oof_lgb + 0.40 * oof_xgb
final_pred = 0.60 * pred_lgb + 0.40 * pred_xgb

final_auc = roc_auc_score(y, final_oof)
print("\n======================================")
print(f" Final Cross-Validated AUC: {final_auc:.6f}")
print("======================================\n")

# 6. Save Submission
submission = pd.DataFrame({
    id_col if id_col else "id": test[id_col] if id_col else np.arange(len(test)),
    "target": final_pred
})
submission.to_csv("submission.csv", index=False)
print("Submission saved → submission.csv")
print(submission.head())


Train shape: (593994, 13)
Test shape: (254569, 12)
Target: loan_paid_back
ID: id
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2449]	training's auc: 0.939407	valid_1's auc: 0.923462
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2860]	training's auc: 0.941638	valid_1's auc: 0.923091
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2900]	training's auc: 0.942601	valid_1's auc: 0.921963
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3066]	training's auc: 0.943317	valid_1's auc: 0.922904
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2534]	training's auc: 0.940143	valid_1's auc: 0.922253

 Final Cross-Validated AUC: 0.922540

Submission saved → submission.csv
       id    target
0  593994  0.936804
1  593995  0.982529
2  593996  0.521333
3