In [1]:
import json
import logging

import config
import polars as pl
from catboost import CatBoostRegressor
from preprocess import fe, load_data

from src.customs.fold import add_kfold
from src.customs.metrics import CatBoostMetric, Metric
from src.model.sklearn_like import (
    CatBoostRegressorWrapper,
)
from src.trainer.tabular.simple import single_inference_fn, single_train_fn

# Configure root logging at INFO level and create this notebook's module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Load the combined train/test frame; the validation ratio is taken from config.
train_test_df = load_data(config=config, valid_ratio=config.VALID_RATIO)

# Read the target table from CSV and double its "t_event_pred" column.
target_df = pl.read_csv("./data/extr_output/101/1/101.csv")
target_df = target_df.with_columns(pl.col("t_event_pred") * 2)
target_cols = [col for col in target_df.columns if col.startswith("t_")]

# Attach every "t_"-prefixed target column to the train/test frame, matched by ID.
train_test_df = train_test_df.join(
    target_df.select([config.ID_COL, *target_cols]),
    on=config.ID_COL,
    how="left",
)

# Register the target columns as meta columns.
# NOTE(review): this mutates the shared config module and turns META_COLS into a set,
# so its iteration order is not stable between runs — confirm downstream users accept a set.
config.META_COLS = set(config.META_COLS).union(target_cols)

# Build features; feature columns are identified by prefix, categorical ones by "<prefix>c_".
features_df = fe(config=config, train_test_df=train_test_df)
feature_names = sorted(col for col in features_df.columns if col.startswith(config.FEATURE_PREFIX))
categorical_prefix = f"{config.FEATURE_PREFIX}c_"
cat_features = [col for col in feature_names if col.startswith(categorical_prefix)]


def make_new_targets_by_race(
    df: pl.DataFrame,
    base_target_names: tuple[str, ...] = ("t_kmf", "t_bfhf"),
    lower_bound_pos: float = 0.0,
    lower_bound_neg: float = 0.0,
    pred_col: str = "t_event_pred",
    race_group_col: str = "race_group",
) -> pl.DataFrame:
    """Add ``<base>_event_scaled2`` targets, rescaled per race group and event status.

    For every base target a scaling factor is computed inside each race group,
    separately for rows with ``config.EVENT_COL == 1`` ("pos") and ``== 0`` ("neg")::

        factor = min(log(pred_col)) / (lower_bound - min(base_target))

    The new target is ``log(pred_col) / factor + base_target``, using the "pos"
    factor for event rows and the "neg" factor for every other row.

    Args:
        df: Frame containing ``config.EVENT_COL``, ``race_group_col``, ``pred_col``
            and every column named in ``base_target_names``.
        base_target_names: Names of the base target columns to rescale.
        lower_bound_pos: Lower bound used in the factor for event == 1 rows.
        lower_bound_neg: Lower bound used in the factor for event == 0 rows.
        pred_col: Column whose logarithm drives the rescaling.
        race_group_col: Column identifying the race group.

    Returns:
        ``df`` with, for each base target, the columns
        ``<base>_scaling_factor_pos``, ``<base>_scaling_factor_neg`` and
        ``<base>_event_scaled2``. The legacy columns ``scaling_factor_pos`` and
        ``scaling_factor_neg`` are also present and hold the factors of the
        first base target, as existing callers expect.

    Note:
        No guard is applied when ``lower_bound - min(base_target)`` is zero; the
        factor is then non-finite.
    """
    log_pred = pl.col(pred_col).log()
    event_col = pl.col(config.EVENT_COL)

    for idx, base_target_name in enumerate(base_target_names):
        new_target_name = f"{base_target_name}_event_scaled2"
        # Factor columns are named per target. With a shared name, the second join would
        # receive a "_right" suffix and the new target would silently reuse the factors
        # of the first base target.
        pos_factor_col = f"{base_target_name}_scaling_factor_pos"
        neg_factor_col = f"{base_target_name}_scaling_factor_neg"
        base_target = pl.col(base_target_name)

        # Scaling factor of event == 1 rows, per race group.
        agg_pos = (
            df.filter(event_col == 1)
            .group_by(race_group_col)
            .agg((log_pred.min() / (lower_bound_pos - base_target.min())).alias(pos_factor_col))
        )

        # Scaling factor of event == 0 rows, per race group.
        agg_neg = (
            df.filter(event_col == 0)
            .group_by(race_group_col)
            .agg((log_pred.min() / (lower_bound_neg - base_target.min())).alias(neg_factor_col))
        )

        # Attach both factors to every row of the matching race group.
        df = df.join(agg_pos, on=race_group_col, how="left").join(agg_neg, on=race_group_col, how="left")

        # Compute the new target in place; no round trip through an ID join is needed.
        df = df.with_columns(
            pl.when(event_col == 1)
            .then(log_pred / pl.col(pos_factor_col) + base_target)
            .otherwise(log_pred / pl.col(neg_factor_col) + base_target)
            .alias(new_target_name)
        )

        if idx == 0:
            # Keep the historical column names, which always carried the first target's factors.
            df = df.with_columns(
                pl.col(pos_factor_col).alias("scaling_factor_pos"),
                pl.col(neg_factor_col).alias("scaling_factor_neg"),
            )

    return df


# Compute the rescaled targets from target_df (both lower bounds at 0.0), then attach the
# scaling factors and the exponentiated "t_kmf_event_scaled2" column to the features by ID.
scaled_target_df = make_new_targets_by_race(
    target_df,
    lower_bound_pos=0.0,
    lower_bound_neg=0.0,
)
scaled_target_exprs = [
    pl.col(config.ID_COL),
    pl.col("scaling_factor_pos"),
    pl.col("scaling_factor_neg"),
    pl.col("t_kmf_event_scaled2").exp(),
]
features_df = features_df.join(
    scaled_target_df.select(*scaled_target_exprs),
    on=config.ID_COL,
    how="left",
)


def add_weight_by_race_group(
    features_df: pl.DataFrame,
    race_group: str,
    neg_weight_range: tuple[float, float] = (0.1, 0.5),
    pos_weight_range: tuple[float, float] = (1.0, 1.5),
) -> pl.DataFrame:
    """Add a ``weight`` column derived from survival time, scaled within each race group.

    The survival time (``config.SURVIVAL_TIME_COL``) is min-max scaled per race group,
    once with the min/max of event == 0 rows and once with the min/max of event == 1
    rows:

    * rows with ``config.EVENT_COL == 0`` get
      ``(1 - scaled_neg) * (neg_hi - neg_lo) + neg_lo``;
    * every other row gets ``scaled_pos * (pos_hi - pos_lo) + pos_lo``.

    Args:
        features_df: Frame containing ``config.SURVIVAL_TIME_COL``,
            ``config.EVENT_COL`` and the ``race_group`` column.
        race_group: Name of the column identifying the race group.
        neg_weight_range: ``(low, high)`` weight bounds for event == 0 rows.
        pos_weight_range: ``(low, high)`` weight bounds for the remaining rows.

    Returns:
        ``features_df`` with the helper columns ``tmp_time``, ``min_time_neg``,
        ``max_time_neg``, ``min_time_pos``, ``max_time_pos``,
        ``scaled_survival_time``, ``scaled_survival_time_inv`` and the final
        ``weight`` column appended.

    Note:
        No guard is applied when a group's min and max times coincide; the
        denominator is then zero and the scaled value is non-finite.
    """
    neg_lo, neg_hi = neg_weight_range
    pos_lo, pos_hi = pos_weight_range

    features_df = features_df.with_columns(pl.col(config.SURVIVAL_TIME_COL).alias("tmp_time"))
    time = pl.col("tmp_time")

    # Min and max time of event == 0 (negative) rows, per race group.
    agg_neg = (
        features_df.filter(pl.col(config.EVENT_COL) == 0)
        .group_by(race_group)
        .agg([time.min().alias("min_time_neg"), time.max().alias("max_time_neg")])
    )

    # Min and max time of event == 1 (positive) rows, per race group.
    agg_pos = (
        features_df.filter(pl.col(config.EVENT_COL) == 1)
        .group_by(race_group)
        .agg([time.min().alias("min_time_pos"), time.max().alias("max_time_pos")])
    )

    features_df = features_df.join(agg_neg, on=race_group, how="left").join(agg_pos, on=race_group, how="left")

    # Both scalings are computed for every row; the event flag picks one of them below.
    scaled_neg = (time - pl.col("min_time_neg")) / (pl.col("max_time_neg") - pl.col("min_time_neg"))
    scaled_pos = (time - pl.col("min_time_pos")) / (pl.col("max_time_pos") - pl.col("min_time_pos"))

    features_df = features_df.with_columns(
        [
            # Inverted scaling against the negative rows: the longest time maps to 0.
            (1 - scaled_neg).alias("scaled_survival_time"),
            # Plain scaling against the positive rows: the longest time maps to 1.
            scaled_pos.alias("scaled_survival_time_inv"),
        ]
    ).with_columns(
        # Map the scaled time into the weight range that matches the event flag.
        pl.when(pl.col(config.EVENT_COL) == 0)
        .then(pl.col("scaled_survival_time") * (neg_hi - neg_lo) + neg_lo)
        .otherwise(pl.col("scaled_survival_time_inv") * (pos_hi - pos_lo) + pos_lo)
        .alias("weight")
    )

    return features_df


# Append the per-race-group "weight" column (and its helper columns) to the features.
features_df = add_weight_by_race_group(
    features_df=features_df,
    race_group="race_group",
)

In [None]:
# Train one model per seed and collect the per-seed validation predictions and scores.
va_result_dfs: list[pl.DataFrame] = []
va_scores = {}
for seed in config.SEEDS:
    name = f"cat_{seed}"
    _va_result_df, _va_scores, trained_models = single_train_fn(
        model=CatBoostRegressorWrapper(
            name=name,
            model=CatBoostRegressor(
                loss_function="Tweedie:variance_power=1.5",
                grow_policy="SymmetricTree",
                learning_rate=0.05,
                n_estimators=100000,
                early_stopping_rounds=3000,
                eval_metric=CatBoostMetric(),
                verbose=100,
                random_state=seed,
                colsample_bylevel=0.2,
            ),
            multi_output=False,
            feature_names=feature_names,
            cat_features=cat_features,
        ),
        # The fold assignment is re-drawn with the same seed as the model.
        features_df=add_kfold(
            features_df,
            n_splits=config.N_SPLITS,
            random_state=seed,
            fold_col=config.FOLD_COL,
        ),
        feature_cols=feature_names,
        target_col="t_kmf_event_scaled2",
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        weight_col="weight",
        out_dir=config.OUTPUT_DIR,
        train_folds=None,
        eval_fn=Metric(),
        overwrite=True,
        use_eval_metric_extra_va_df=True,
    )
    va_result_dfs.append(_va_result_df)
    va_scores[name] = _va_scores

# Concatenate once instead of growing a frame inside the loop.
va_result_df = pl.concat(va_result_dfs, how="diagonal_relaxed") if va_result_dfs else pl.DataFrame()

# ------------------------------
# final score
# ------------------------------
# Average the predictions over seeds per ID, then re-attach the meta columns.
# Sorting uses config.ID_COL (not a hard-coded "ID") to stay consistent with the group key.
# config.META_COLS is a set at this point, so it is sorted to give a stable column order.
va_result_agg_df = (
    va_result_df.group_by(config.ID_COL)
    .agg(pl.col("pred").mean())
    .sort(config.ID_COL)
    .join(train_test_df.select(sorted(config.META_COLS)), on=config.ID_COL, how="left")
)
final_score = Metric()(input_df=va_result_agg_df)
logger.info("✅ final score: %s", final_score)
va_scores["final"] = final_score

# save
va_result_agg_df.write_csv(f"{config.OUTPUT_DIR}/va_result.csv")
with open(f"{config.OUTPUT_DIR}/va_scores.json", "w", encoding="utf-8") as f:
    json.dump(va_scores, f, indent=4)


In [None]:
# debug test
# Run inference on the TEST split with every seed's saved fold models and average per ID.
test_features_df = fe(config=config, train_test_df=train_test_df, output_dataset="TEST")
te_result_dfs: list[pl.DataFrame] = []
for seed in config.SEEDS:
    name = f"cat_{seed}"

    _te_result_df = single_inference_fn(
        model=CatBoostRegressorWrapper(name=name, feature_names=feature_names, cat_features=cat_features),
        features_df=test_features_df,
        feature_names=feature_names,
        model_dir=config.ARTIFACT_EXP_DIR(),
        inference_folds=list(range(config.N_SPLITS)),
        out_dir=config.OUTPUT_DIR,
    )
    te_result_dfs.append(_te_result_df)

# Concatenate once instead of growing a frame inside the loop.
te_result_df = pl.concat(te_result_dfs, how="diagonal_relaxed") if te_result_dfs else pl.DataFrame()

# Sorting uses config.ID_COL (not a hard-coded "ID") to stay consistent with the group key.
# config.META_COLS is a set at this point, so it is sorted to give a stable column order.
te_result_agg_df = (
    te_result_df.group_by(config.ID_COL)
    .agg(pl.col("pred").mean())
    .sort(config.ID_COL)
    .join(train_test_df.select(sorted(config.META_COLS)), on=config.ID_COL, how="left")
)


print(te_result_agg_df["pred"].to_list())