# Tabular Data Experiment

This notebook is a template for tabular-data competitions using gradient-boosted decision tree (GBDT) models; the default implementation below uses LightGBM.

In [None]:
import gc
import pickle
import warnings
from pathlib import Path

import config
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from lightgbm import LGBMClassifier, LGBMRegressor
from metric import score
from seed import seed_everything
from sklearn.model_selection import KFold, StratifiedKFold

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")

In [None]:
# =============================================================================
# Configuration
# =============================================================================
class CFG:
    """Central experiment settings, read as ``CFG.<NAME>`` by the other cells.

    Values are plain class attributes; edit them here rather than in the
    cells that consume them.
    """

    SEED = 42  # shared by seed_everything, the fold splitter and LightGBM
    N_FOLDS = 5
    TARGET_COL = "target"  # Update with your target column

    # Paths from config.py
    DATA_PATH = config.COMP_DATASET_DIR
    OUTPUT_DIR = config.OUTPUT_DIR
    MODEL_PATH = config.OUTPUT_DIR / "models"

    # LightGBM parameters
    lgb_params = {
        "objective": "regression",  # or "binary", "multiclass"
        "metric": "rmse",
        "learning_rate": 0.05,
        "max_depth": 6,
        "num_leaves": 31,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        # LightGBM only performs row subsampling when the bagging frequency is
        # non-zero (its default is 0); without this key the "subsample" value
        # above is silently ignored.
        "subsample_freq": 1,
        "seed": SEED,
        "verbosity": -1,
    }

    NUM_BOOST_ROUND = 10000  # upper bound on boosting rounds per fold
    EARLY_STOPPING_ROUND = 100  # stopping_rounds for the early-stopping callback


# Seed the RNGs and make sure the model directory exists before any training.
seed_everything(CFG.SEED)
CFG.MODEL_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# =============================================================================
# Load Data
# =============================================================================
# Resolve both CSV locations first, then read them into polars frames.
data_dir = CFG.DATA_PATH
train_csv = data_dir / "train.csv"
test_csv = data_dir / "test.csv"

train = pl.read_csv(train_csv)
test = pl.read_csv(test_csv)

# Quick sanity check on what was loaded.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

In [None]:
# =============================================================================
# Feature Engineering
# =============================================================================
# TODO: Add your feature engineering here

# Every training column is a feature except the row identifier and the target.
NON_FEATURE_COLS = ("id", CFG.TARGET_COL)
FEATURES = [name for name in train.columns if name not in NON_FEATURE_COLS]
print(f"Number of features: {len(FEATURES)}")

In [None]:
# =============================================================================
# Cross-Validation Training
# =============================================================================
# K-fold training loop: fits one LightGBM booster per fold, fills the
# out-of-fold (OOF) prediction for every training row, and averages the
# per-fold test predictions.
train_pd = train.to_pandas()
test_pd = test.to_pandas()

oof_predictions = np.zeros(len(train_pd))
test_predictions = np.zeros(len(test_pd))

kf = KFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.SEED)
# For classification: kf = StratifiedKFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.SEED)

# The target is passed to split() so the StratifiedKFold alternative above is
# a drop-in replacement (it requires y); plain KFold ignores the argument.
y_all = train_pd[CFG.TARGET_COL]

# The test feature matrix is the same for every fold, so build it once.
X_test = test_pd[FEATURES]

for fold, (train_idx, val_idx) in enumerate(kf.split(train_pd, y_all), 1):
    print(f"\n{'='*50}")
    print(f"Fold {fold}")
    print(f"{'='*50}")

    # split() yields row positions; .loc is used with them on the assumption
    # that train_pd still has the default 0..n-1 index from to_pandas().
    X_train = train_pd.loc[train_idx, FEATURES]
    y_train = train_pd.loc[train_idx, CFG.TARGET_COL]
    X_val = train_pd.loc[val_idx, FEATURES]
    y_val = train_pd.loc[val_idx, CFG.TARGET_COL]

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Train model
    model = lgb.train(
        CFG.lgb_params,
        train_data,
        num_boost_round=CFG.NUM_BOOST_ROUND,
        valid_sets=[train_data, val_data],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=CFG.EARLY_STOPPING_ROUND),
            lgb.log_evaluation(500),
        ],
    )

    # Predict the held-out rows once and reuse the result for the fold score.
    val_pred = model.predict(X_val)
    oof_predictions[val_idx] = val_pred
    # Each fold contributes an equal share to the averaged test prediction.
    test_predictions += model.predict(X_test) / CFG.N_FOLDS

    # Save model
    model_path = CFG.MODEL_PATH / f"lgb_fold{fold}.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    # Fold score
    fold_score = score(y_val.values, val_pred)
    print(f"Fold {fold} Score: {fold_score:.6f}")

    # Drop every fold-local object, including the LightGBM Datasets, before
    # the next fold allocates its own.
    del X_train, y_train, X_val, y_val, train_data, val_data, model
    gc.collect()

# Overall CV score
cv_score = score(train_pd[CFG.TARGET_COL].values, oof_predictions)
print(f"\n{'='*50}")
print(f"Overall CV Score: {cv_score:.6f}")
print(f"{'='*50}")

In [None]:
# =============================================================================
# Save OOF and Test Predictions
# =============================================================================
output_dir = CFG.OUTPUT_DIR

# Out-of-fold predictions, stored next to the row id and the true target.
oof_columns = {
    "id": train_pd["id"],
    "oof_pred": oof_predictions,
    "true": train_pd[CFG.TARGET_COL],
}
oof_df = pd.DataFrame(oof_columns)
oof_df.to_csv(output_dir / "oof_predictions.csv", index=False)

# Fold-averaged test predictions (for local use)
test_pred_columns = {
    "id": test_pd["id"],
    "pred": test_predictions,
}
test_pred_df = pd.DataFrame(test_pred_columns)
test_pred_df.to_csv(output_dir / "test_predictions.csv", index=False)

print(f"OOF predictions saved to: {CFG.OUTPUT_DIR / 'oof_predictions.csv'}")
print(f"Test predictions saved to: {CFG.OUTPUT_DIR / 'test_predictions.csv'}")

In [None]:
# =============================================================================
# Create Submission (for local testing)
# =============================================================================
# Start from the competition's sample submission so the layout matches.
sub_df = pl.read_csv(CFG.DATA_PATH / "sample_submission.csv")
# NOTE: until the TODO below is completed, the file written here is the
# unmodified sample submission.
# TODO: Update with your target column and predictions
# sub_df = sub_df.with_columns(pl.Series(CFG.TARGET_COL, test_predictions))
submission_path = CFG.OUTPUT_DIR / "submission.csv"
sub_df.write_csv(submission_path)

print(f"Submission saved to: {CFG.OUTPUT_DIR / 'submission.csv'}")
print(sub_df.head())