In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


In [2]:

import pandas as pd
import numpy as np
import seaborn as sns

# -------------------------
# LOAD DATA
# -------------------------
dataTrain = pd.read_csv("/kaggle/input/playground-series-s5e12/train.csv")
dataTest  = pd.read_csv("/kaggle/input/playground-series-s5e12/test.csv")

dfTrain = pd.DataFrame(dataTrain)
dfTest  = pd.DataFrame(dataTest)

# -------------------------
# FEATURE ENGINEERING (your original features)
# -------------------------
dfTrain["age_bmi"] = dfTrain["age"] * dfTrain["bmi"]
dfTrain["waist_hip_ratio&age"] = dfTrain["waist_to_hip_ratio"] * dfTrain["age"]
dfTrain["bmi_waist"] = dfTrain["waist_to_hip_ratio"] * dfTrain["bmi"]
dfTrain["age_bmi_waist"] = dfTrain["age"] * dfTrain["bmi"] * dfTrain["waist_to_hip_ratio"]

dfTest["age_bmi"] = dfTest["age"] * dfTest["bmi"]
dfTest["waist_hip_ratio&age"] = dfTest["waist_to_hip_ratio"] * dfTest["age"]
dfTest["bmi_waist"] = dfTest["waist_to_hip_ratio"] * dfTest["bmi"]
dfTest["age_bmi_waist"] = dfTest["age"] * dfTest["bmi"] * dfTest["waist_to_hip_ratio"]

# -------------------------
# Separate categorical and numerical columns
# -------------------------
# Note: ensure the true target column name is 'diagnosed_diabetes' as in your original code
TARGET = "diagnosed_diabetes"

cat_cols = dfTrain.select_dtypes(exclude=np.number).columns.tolist()
# If there are no categorical cols this will be empty; OneHotEncoder will handle it.
num_cols = dfTrain.select_dtypes(include=np.number).columns.drop(TARGET).tolist()

cat_train = dfTrain[cat_cols]
cat_test  = dfTest[cat_cols]

num_train = dfTrain[num_cols]
num_test  = dfTest[num_cols]

# -------------------------
# ONE-HOT ENCODING (your original approach)
# -------------------------
from sklearn.preprocessing import OneHotEncoder
# older sklearn uses sparse=False; using that for broad compatibility
ohe = OneHotEncoder(drop="first", handle_unknown="ignore", sparse=False)
if len(cat_cols) > 0:
    X_cat = ohe.fit_transform(cat_train)
    Y_cat = ohe.transform(cat_test)
else:
    # no categorical columns -> create empty arrays with correct number of rows
    X_cat = np.zeros((len(dfTrain), 0))
    Y_cat = np.zeros((len(dfTest), 0))

# combine OHE output with numeric arrays
dfTrain_final = np.hstack([X_cat, num_train.to_numpy()])
dfTest_final  = np.hstack([Y_cat,  num_test.to_numpy()])

# Quick check shapes
print("Train final shape:", dfTrain_final.shape)
print("Test final shape :", dfTest_final.shape)

# -------------------------
# Prepare data variables used by stacking
# -------------------------
X = dfTrain_final            # numpy array
y = dfTrain[TARGET].values   # 1D numpy array
X_test = dfTest_final        # numpy array

# -------------------------
# Imports for stacking models
# -------------------------
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# -------------------------
# BEST XGB PARAMS (manually provided by you)
# -------------------------
best_params = {
    "n_estimators": 2000,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8
}
print("Using best xgb params:", best_params)

# -------------------------
# Prepare OOF and test prediction containers
# -------------------------
NFOLDS = 5
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

oof_xgb = np.zeros(len(X), dtype=float)
oof_lgb = np.zeros(len(X), dtype=float)
oof_cat = np.zeros(len(X), dtype=float)

test_pred_xgb = np.zeros(len(X_test), dtype=float)
test_pred_lgb = np.zeros(len(X_test), dtype=float)
test_pred_cat = np.zeros(len(X_test), dtype=float)

# -------------------------
# K-FOLD TRAINING LOOP
# -------------------------
fold = 1
for train_idx, val_idx in skf.split(X, y):
    print(f"\n========== FOLD {fold} ==========")
    # select rows (X is numpy array)
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # ---- XGBoost
    xgb_model = XGBClassifier(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        subsample=best_params["subsample"],
        colsample_bytree=best_params["colsample_bytree"],
        eval_metric="auc",
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1
    )
    xgb_model.fit(X_tr, y_tr)
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    test_pred_xgb += xgb_model.predict_proba(X_test)[:, 1] / NFOLDS
    print("XGB Fold AUC:", roc_auc_score(y_val, oof_xgb[val_idx]))

    # ---- LightGBM
    lgb_model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.7,
        max_depth=-1,
        random_state=42,
        objective="binary"
    )
    lgb_model.fit(X_tr, y_tr)
    oof_lgb[val_idx] = lgb_model.predict_proba(X_val)[:, 1]
    test_pred_lgb += lgb_model.predict_proba(X_test)[:, 1] / NFOLDS
    print("LGB Fold AUC:", roc_auc_score(y_val, oof_lgb[val_idx]))

    # ---- CatBoost
    cat_model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.07,
        l2_leaf_reg=1,
        depth=6,
        loss_function="Logloss",
        verbose=False,
        random_state=42
    )
    cat_model.fit(X_tr, y_tr)
    oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
    test_pred_cat += cat_model.predict_proba(X_test)[:, 1] / NFOLDS
    print("CAT Fold AUC:", roc_auc_score(y_val, oof_cat[val_idx]))

    fold += 1

# -------------------------
# Print OOF performance for each base model
# -------------------------
print("\n===== OOF SCORES =====")
print("XGB OOF AUC:", roc_auc_score(y, oof_xgb))
print("LGB OOF AUC:", roc_auc_score(y, oof_lgb))
print("CAT OOF AUC:", roc_auc_score(y, oof_cat))

# -------------------------
# META-MODEL TRAINING (stacking)
# -------------------------
meta_train = np.vstack([oof_xgb, oof_lgb, oof_cat]).T
meta_test  = np.vstack([test_pred_xgb, test_pred_lgb, test_pred_cat]).T

meta_model = LogisticRegression(max_iter=1000, solver="lbfgs")
meta_model.fit(meta_train, y)

stacked_pred_test = meta_model.predict_proba(meta_test)[:, 1]

# -------------------------
# Create submission
# -------------------------
submission = pd.read_csv("/kaggle/input/playground-series-s5e12/sample_submission.csv")
# Ensure sample_submission has 'id' column; use same name as file if different adjust below
submission["diagnosed_diabetes"] = stacked_pred_test
submission = submission[["id", "diagnosed_diabetes"]]
submission.to_csv("stacking_submission.csv", index=False)
print("\nSaved stacking_submission.csv")
display(submission.head())




Train final shape: (700000, 41)
Test final shape : (300000, 41)
Using best xgb params: {'n_estimators': 2000, 'max_depth': 4, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8}

XGB Fold AUC: 0.7273424257258376
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2927
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
LGB Fold AUC: 0.7265178849300913
CAT Fold AUC: 0.7277981155035234

XGB Fold AUC: 0.7255888559640764
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise m

Unnamed: 0,id,diagnosed_diabetes
0,700000,0.500515
1,700001,0.730713
2,700002,0.802217
3,700003,0.38669
4,700004,0.888286
