In [1]:
import json
import logging

import config
import lightgbm as lgb
import polars as pl
from preprocess import fe, load_data

from src.customs.fold import add_kfold
from src.customs.metrics import LGBMMetric, Metric
from src.model.sklearn_like import (
    LightGBMWapper,
)
from src.trainer.tabular.simple import single_inference_fn, single_train_fn

# Configure the root logger to emit INFO-and-above records, then create the
# logger (named after this module) that the rest of the notebook writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Base frame produced by the project loader (presumably train and test rows
# together, judging by the name -- confirm against preprocess.load_data).
train_test_df = load_data(config=config, valid_ratio=config.VALID_RATIO)

# Externally generated targets: every column whose name starts with "t_".
target_df = pl.read_csv("./data/extr_output/101/1/101.csv")
target_cols = [col for col in target_df.columns if col.startswith("t_")]

# Attach the target columns by ID; the left join keeps every row of
# train_test_df even when no target row matches.
train_test_df = train_test_df.join(
    target_df.select([config.ID_COL] + target_cols),
    on=config.ID_COL,
    how="left",
)

# Register the target columns as meta columns on the config module.
# NOTE: this turns config.META_COLS into a set, so its iteration order is
# not stable across kernel sessions.
config.META_COLS = set(config.META_COLS).union(target_cols)

# Build the feature frame and derive the model inputs from column names:
# features carry config.FEATURE_PREFIX, categorical ones add a "c_" marker.
features_df = fe(config=config, train_test_df=train_test_df)
feature_names = sorted(
    col for col in features_df.columns if col.startswith(config.FEATURE_PREFIX)
)
cat_features = [
    name for name in feature_names if name.startswith(f"{config.FEATURE_PREFIX}c_")
]

In [None]:
# ------------------------------
# training: one LightGBM run per seed
# ------------------------------
# Each run returns a validation-result frame and its scores; the frames are
# collected in a list and concatenated once after the loop instead of
# re-concatenating the growing frame on every iteration.
va_result_frames, va_scores = [], {}
for i, seed in enumerate(config.SEEDS):
    name = f"lgb_{seed}"
    _va_result_df, _va_scores, trained_models = single_train_fn(
        model=LightGBMWapper(
            name=name,
            model=lgb.LGBMModel(
                objective="tweedie",
                boosting="gbdt",
                # Effectively "train until early stopping fires".
                n_estimators=100000,
                learning_rate=0.01,
                # Tree size grows with the seed index to diversify the runs.
                num_leaves=11 + (i * 5),
                colsample_bytree=0.2,
                # NOTE(review): LightGBM only applies row bagging when
                # subsample_freq (bagging_freq) > 0; it is not set here, so
                # subsample=0.5 may have no effect -- confirm intent.
                subsample=0.5,
                importance_type="gain",
                # Disable built-in metrics; evaluation uses the custom
                # eval_metric passed through fit_params below.
                metric="None",
                random_state=seed,
            ),
            fit_params={
                "callbacks": [
                    lgb.early_stopping(1000, first_metric_only=True),
                    lgb.log_evaluation(period=100),
                ],
                "eval_metric": LGBMMetric(),
                "categorical_feature": cat_features,
                "feature_name": feature_names,
            },
        ),
        # Fold assignment is re-drawn per seed.
        features_df=add_kfold(
            features_df,
            n_splits=config.N_SPLITS,
            random_state=seed,
            fold_col=config.FOLD_COL,
        ),
        feature_cols=feature_names,
        target_col="t_kmf_event_scaled",
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        # weight_col="weight",
        out_dir=config.OUTPUT_DIR,
        train_folds=None,
        eval_fn=Metric(),
        overwrite=True,
        use_eval_metric_extra_va_df=True,
    )
    va_result_frames.append(_va_result_df)
    va_scores[name] = _va_scores

# Single concat; fall back to an empty frame when config.SEEDS is empty so
# the name is always defined (matches the previous starting value).
va_result_df = (
    pl.concat(va_result_frames, how="diagonal_relaxed") if va_result_frames else pl.DataFrame()
)

# ------------------------------
# final score
# ------------------------------
# Average the predictions of all seeds per ID, then re-attach the meta
# columns needed by the metric.
va_result_agg_df = (
    va_result_df.group_by(config.ID_COL)
    .agg(pl.col("pred").mean())
    # Sort by the configured ID column (was hard-coded "ID", which fails as
    # soon as config.ID_COL is named differently).
    .sort(config.ID_COL)
    # config.META_COLS is a set; sorting makes the column order of the
    # saved CSV deterministic across kernel sessions.
    .join(train_test_df.select(sorted(config.META_COLS)), on=config.ID_COL, how="left")
)
final_score = Metric()(input_df=va_result_agg_df)
logger.info("✅ final score: %s", final_score)
va_scores["final"] = final_score

# save
va_result_agg_df.write_csv(f"{config.OUTPUT_DIR}/va_result.csv")
with open(f"{config.OUTPUT_DIR}/va_scores.json", "w", encoding="utf-8") as f:
    json.dump(va_scores, f, indent=4)

In [None]:
# debug test
# Build the TEST feature frame and run inference once per seed.
# NOTE(review): LightGBMWapper is created without a model here; the fitted
# models are presumably loaded from model_dir by single_inference_fn --
# confirm against src.trainer.tabular.simple.
test_features_df = fe(config=config, train_test_df=train_test_df, output_dataset="TEST")
te_result_frames = []
for seed in config.SEEDS:
    name = f"lgb_{seed}"

    _te_result_df = single_inference_fn(
        model=LightGBMWapper(name=name),
        features_df=test_features_df,
        feature_names=feature_names,
        model_dir=config.ARTIFACT_EXP_DIR(),
        inference_folds=list(range(config.N_SPLITS)),
        out_dir=config.OUTPUT_DIR,
    )
    te_result_frames.append(_te_result_df)

# Concatenate all per-seed results once; an empty frame keeps the name
# defined when config.SEEDS is empty (matches the previous starting value).
te_result_df = (
    pl.concat(te_result_frames, how="diagonal_relaxed") if te_result_frames else pl.DataFrame()
)

# Average the predictions of all seeds per ID and re-attach the meta columns.
te_result_agg_df = (
    te_result_df.group_by(config.ID_COL)
    .agg(pl.col("pred").mean())
    # Sort by the configured ID column rather than a hard-coded "ID".
    .sort(config.ID_COL)
    # sorted() gives a deterministic column order even though
    # config.META_COLS is a set.
    .join(train_test_df.select(sorted(config.META_COLS)), on=config.ID_COL, how="left")
)


print(te_result_agg_df["pred"].to_list())