In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# Load the raw train and test tables from the local data directory.
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [4]:


# --- Config ---
TARGET_COL = "y"
ID_COL_CANDIDATES = ["id", "ID", "Id"]
N_SPLITS = 5
RANDOM_STATE = 42

# --- Safety checks & light cleaning ---
# Work on copies so the frames loaded from disk stay untouched and this cell
# can be re-run without reloading the data.
train = data_train.copy()
test  = data_test.copy()

# Map y from "yes"/"no" to 1/0 if needed.
# The check is "not numeric" rather than `dtype == "O"` so that a text target
# stored with pandas' dedicated string dtype is mapped as well.
if not pd.api.types.is_numeric_dtype(train[TARGET_COL]):
    y_encoded = train[TARGET_COL].str.strip().str.lower().map({"yes": 1, "no": 0})
    unmapped = y_encoded.isna()
    if unmapped.any():
        # Fail here, naming the offending labels, instead of at the int cast below.
        examples = train.loc[unmapped, TARGET_COL].unique()[:5].tolist()
        raise ValueError(
            f"{TARGET_COL!r} has {int(unmapped.sum())} value(s) that are not 'yes'/'no' "
            f"(e.g. {examples})"
        )
    train[TARGET_COL] = y_encoded

# Identify ID column: first candidate present in test, else a positional index.
id_col = next((c for c in ID_COL_CANDIDATES if c in test.columns), None)
if id_col is None:
    id_col = "id"
    test[id_col] = np.arange(len(test))

# Features: drop target + id if present
drop_cols = {TARGET_COL, id_col} & set(train.columns)
features = [c for c in train.columns if c not in drop_cols]

# Every training feature must also exist in test; report all missing names at once
# rather than failing on the first one during the column selection below.
missing_in_test = [c for c in features if c not in test.columns]
if missing_in_test:
    raise KeyError(f"test is missing feature column(s) present in train: {missing_in_test}")

# Find categoricals and cast to category (LightGBM-native handling).
# Object, string and already-categorical columns all count as categorical.
cat_cols = [
    c for c in features
    if pd.api.types.is_object_dtype(train[c])
    or pd.api.types.is_string_dtype(train[c])
    or isinstance(train[c].dtype, pd.CategoricalDtype)
]
for c in cat_cols:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")

# Align categorical levels across train/test so both frames share one level set per column.
for c in cat_cols:
    all_cats = sorted(set(train[c].cat.categories) | set(test[c].cat.categories))
    train[c] = train[c].cat.set_categories(all_cats)
    test[c]  = test[c].cat.set_categories(all_cats)

X = train[features]
y = train[TARGET_COL].astype(int)
X_test = test[features]

# --- CV & Training ---
# Shuffled, seeded stratified folds over (X, y).
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# LightGBM settings shared by every fold.
params = dict(
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    learning_rate=0.03,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    min_data_in_leaf=50,
    max_depth=-1,
    verbose=-1,
    is_unbalance=True,
    seed=RANDOM_STATE,
)

# Accumulators: out-of-fold scores for the training rows, the running
# fold-average for the test rows, and one AUC per fold.
oof = np.zeros(len(X))
test_pred = np.zeros(len(X_test))
fold_aucs = []

for fold, (train_rows, valid_rows) in enumerate(skf.split(X, y), start=1):
    X_fit, X_hold = X.iloc[train_rows], X.iloc[valid_rows]
    y_fit, y_hold = y.iloc[train_rows], y.iloc[valid_rows]

    fit_set = lgb.Dataset(
        X_fit, label=y_fit, categorical_feature=cat_cols, free_raw_data=False
    )
    hold_set = lgb.Dataset(
        X_hold,
        label=y_hold,
        categorical_feature=cat_cols,
        reference=fit_set,
        free_raw_data=False,
    )

    # Fresh callbacks per fold: early stopping with a 200-round patience and a
    # progress line every 200 rounds.
    fold_callbacks = [lgb.early_stopping(200), lgb.log_evaluation(200)]
    model = lgb.train(
        params,
        fit_set,
        num_boost_round=5000,
        valid_sets=[fit_set, hold_set],
        valid_names=["train", "valid"],
        callbacks=fold_callbacks,
    )

    # Score the held-out rows at the best iteration found for this fold.
    oof[valid_rows] = model.predict(X_hold, num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y_hold, oof[valid_rows])
    fold_aucs.append(fold_auc)
    print(f"[Fold {fold}] AUC: {fold_auc:.5f}")

    # Each fold contributes an equal 1/N_SPLITS share to the test prediction.
    test_pred += model.predict(X_test, num_iteration=model.best_iteration) / N_SPLITS

# AUC over all out-of-fold predictions, alongside the per-fold values.
cv_auc = roc_auc_score(y, oof)
print(f"\nOOF AUC: {cv_auc:.5f} | Folds: {', '.join(f'{a:.5f}' for a in fold_aucs)}")

# --- Submission file ---
# Two columns: the test identifiers and the fold-averaged prediction.
sub_path = "submission_lgbm_baseline.csv"
submission_columns = {
    id_col: test[id_col].values,
    TARGET_COL: test_pred,
}
sub = pd.DataFrame(submission_columns)
sub.to_csv(sub_path, index=False)
print(f"Saved: {sub_path}")


Training until validation scores don't improve for 200 rounds
[200]	train's auc: 0.966324	valid's auc: 0.965469
[400]	train's auc: 0.970058	valid's auc: 0.967781
[600]	train's auc: 0.972043	valid's auc: 0.968412
[800]	train's auc: 0.973696	valid's auc: 0.968809
[1000]	train's auc: 0.975083	valid's auc: 0.968999
[1200]	train's auc: 0.976365	valid's auc: 0.969159
[1400]	train's auc: 0.97753	valid's auc: 0.969264
[1600]	train's auc: 0.978596	valid's auc: 0.969339
[1800]	train's auc: 0.979618	valid's auc: 0.9694
[2000]	train's auc: 0.980567	valid's auc: 0.969438
[2200]	train's auc: 0.981454	valid's auc: 0.969458
[2400]	train's auc: 0.982299	valid's auc: 0.969465
Early stopping, best iteration is:
[2315]	train's auc: 0.981942	valid's auc: 0.969475
[Fold 1] AUC: 0.96947
Training until validation scores don't improve for 200 rounds
[200]	train's auc: 0.966612	valid's auc: 0.96448
[400]	train's auc: 0.970275	valid's auc: 0.966665
[600]	train's auc: 0.972338	valid's auc: 0.967366
[800]	train's 

In [5]:
import pandas as pd

# Predictions file to post-process.
sub_path = "submission_lgbm_baseline.csv"

# Scores at or above this cut-off become 1; everything else becomes 0.
cutoff = 0.55

# NOTE(review): the file is rewritten in place, so the original scores are no
# longer on disk afterwards; changing the cut-off later has no effect unless
# the file is regenerated first.
sub = pd.read_csv(sub_path)
sub["y"] = sub["y"].ge(cutoff).astype(int)

# Write the 0/1 labels back to the same path.
sub.to_csv(sub_path, index=False)
print(f"Updated predictions in {sub_path} to binary 0/1")


Updated predictions in submission_lgbm_baseline.csv to binary 0/1
