In [None]:
import json
import logging

import config
import lightgbm as lgb
import polars as pl
from catboost import CatBoostRegressor
from preprocess import fe, load_data

from src.customs.fold import add_kfold
from src.customs.metrics import CatBoostMetric, LGBMMetric, Metric
from src.model.sklearn_like import (
    CatBoostRegressorWrapper,
    LightGBMWapper,
)
from src.trainer.tabular.simple import single_inference_fn, single_train_fn

# Set up INFO-level logging on the root logger, then grab a logger named after
# this module for the rest of the notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# CSV holding the "t_*" target columns, and the factor applied to its
# "t_event_pred" column right after loading.
TARGET_CSV_PATH = "./data/extr_output/101/1/101.csv"
EVENT_PRED_SCALE = 2

train_test_df = load_data(config=config, valid_ratio=config.VALID_RATIO)

# Read the target table and multiply "t_event_pred" by the scale factor.
target_df = pl.read_csv(TARGET_CSV_PATH).with_columns(pl.col("t_event_pred") * EVENT_PRED_SCALE)
target_cols = [col for col in target_df.columns if col.startswith("t_")]

# Left-join every "t_" column onto the loaded frame, matching rows on the ID column.
target_subset_df = target_df.select([config.ID_COL, *target_cols])
train_test_df = train_test_df.join(target_subset_df, on=config.ID_COL, how="left")

# Extend config.META_COLS (stored as a set from here on) with the joined target columns.
config.META_COLS = set(config.META_COLS) | set(target_cols)

# Build the feature frame; feature columns are identified by config.FEATURE_PREFIX and
# categorical ones by the additional "c_" marker right after that prefix.
features_df = fe(config=config, train_test_df=train_test_df)
feature_names = sorted(col for col in features_df.columns if col.startswith(config.FEATURE_PREFIX))
categorical_prefix = f"{config.FEATURE_PREFIX}c_"
cat_features = [col for col in feature_names if col.startswith(categorical_prefix)]


def _event_group_scaling_factor(
    df: pl.DataFrame,
    event_value: int,
    base_target_name: str,
    lower_bound: float,
    pred_col: str,
) -> float:
    """Return min(log(pred_col)) / (lower_bound - min(base_target_name)) over rows whose event equals event_value."""
    return (
        df.filter(pl.col(config.EVENT_COL) == event_value)
        # The result column keeps the name of the left-most input column, i.e. pred_col.
        .select(pl.col(pred_col).log().min() / (lower_bound - pl.col(base_target_name).min()))[pred_col]
        .to_numpy()[0]
    )


def make_new_targets(
    df: pl.DataFrame,
    base_target_names: tuple[str, ...] = ("t_kmf", "t_bfhf"),
    lower_bound_pos: float = 0.0,
    lower_bound_neg: float = 0.0,
    pred_col: str = "t_event_pred",
) -> pl.DataFrame:
    """Add a "<base>_event_scaled2" column for every base target.

    For each base target the new value is

        log(pred_col) / scaling_factor + base_target

    where the scaling factor is computed separately for rows with
    ``config.EVENT_COL == 1`` (using ``lower_bound_pos``) and rows with
    ``config.EVENT_COL == 0`` (using ``lower_bound_neg``) as
    ``min(log(pred_col)) / (lower_bound - min(base_target))`` within that group.
    Rows whose event value is not 1 (including nulls) use the event == 0 factor.

    Args:
        df: Frame containing ``config.EVENT_COL``, ``pred_col`` and every
            column named in ``base_target_names``.
        base_target_names: Names of the base target columns to transform.
        lower_bound_pos: Lower bound used in the event == 1 scaling factor.
        lower_bound_neg: Lower bound used in the event == 0 scaling factor.
        pred_col: Column whose logarithm is scaled and added to the base target.

    Returns:
        ``df`` with one additional column per base target, appended in order.
    """
    for base_target_name in base_target_names:
        new_target_name = f"{base_target_name}_event_scaled2"

        scaling_factor_pos = _event_group_scaling_factor(df, 1, base_target_name, lower_bound_pos, pred_col)
        scaling_factor_neg = _event_group_scaling_factor(df, 0, base_target_name, lower_bound_neg, pred_col)

        print(scaling_factor_pos, scaling_factor_neg)

        log_pred = pl.col(pred_col).log()
        base_target = pl.col(base_target_name)
        # Attach the column directly instead of self-joining on the ID column: this keeps
        # the row count and row order intact even if IDs are not unique.
        df = df.with_columns(
            pl.when(pl.col(config.EVENT_COL) == 1)
            .then(log_pred / scaling_factor_pos + base_target)
            .otherwise(log_pred / scaling_factor_neg + base_target)
            .alias(new_target_name)
        )
    return df


# Derive "t_kmf_event_scaled2" from the target table and exponentiate it.
scaled_target_df = make_new_targets(
    target_df,
    lower_bound_pos=0.0,
    lower_bound_neg=0.0,
).select(
    pl.col(config.ID_COL),
    pl.col("t_kmf_event_scaled2").exp(),
)

# Left-join it onto the features by ID, then divide by the column maximum to obtain
# "t_kmf_event_scaled3".
scaled2 = pl.col("t_kmf_event_scaled2")
features_df = features_df.join(scaled_target_df, on=config.ID_COL, how="left").with_columns(
    (scaled2 / scaled2.max()).alias("t_kmf_event_scaled3")
)


def add_weight(
    features_df: pl.DataFrame,
    survival_time_col: str,
    event_col: str,
    neg_min: float = 0.01,  # lower weight bound for event == 0
    neg_max: float = 0.5,  # upper weight bound for event == 0
    pos_min: float = 1.0,  # lower weight bound for event == 1
    pos_max: float = 1.5,  # upper weight bound for event == 1
) -> pl.DataFrame:
    """Add a "weight" column derived from the survival time, scaled per event group.

    The survival time is min-max normalised separately for each distinct value
    of ``event_col``:

    * rows with ``event_col == 0`` get ``(1 - scaled) * (neg_max - neg_min) + neg_min``,
      i.e. the shortest time in the group receives ``neg_max`` and the longest ``neg_min``;
    * every other row gets ``scaled * (pos_max - pos_min) + pos_min``,
      i.e. the shortest time receives ``pos_min`` and the longest ``pos_max``.

    When all times in a group are identical (``group_max == group_min``) the
    normalised position is taken as 0 instead of the 0/0 division, which would
    otherwise turn every weight of that group into NaN.

    Args:
        features_df: Frame containing ``survival_time_col`` and ``event_col``.
        survival_time_col: Column holding the time used for scaling.
        event_col: Column holding the event indicator used for grouping.
        neg_min: Lower weight bound for the event == 0 group.
        neg_max: Upper weight bound for the event == 0 group.
        pos_min: Lower weight bound for the remaining rows.
        pos_max: Upper weight bound for the remaining rows.

    Returns:
        ``features_df`` with the helper columns "tmp_time", "group_min",
        "group_max" and the resulting "weight" column added.
    """
    features_df = features_df.with_columns(pl.col(survival_time_col).alias("tmp_time"))

    # Minimum and maximum of tmp_time within each event group.
    features_df = features_df.with_columns(
        [
            pl.col("tmp_time").min().over(event_col).alias("group_min"),
            pl.col("tmp_time").max().over(event_col).alias("group_max"),
        ]
    )

    # Position of each row inside its group's [min, max] range; a zero-width range
    # (constant or single-row group) maps to 0 rather than dividing 0 by 0.
    group_range = pl.col("group_max") - pl.col("group_min")
    scaled_time = (
        pl.when(group_range == 0)
        .then(pl.lit(0.0))
        .otherwise((pl.col("tmp_time") - pl.col("group_min")) / group_range)
    )

    # Turn the scaled position into a weight for each group.
    features_df = features_df.with_columns(
        pl.when(pl.col(event_col) == 0)
        .then(
            # event == 0: inverted scaling, so longer times get smaller weights.
            (1 - scaled_time) * (neg_max - neg_min) + neg_min
        )
        .otherwise(
            # event == 1: direct scaling, so longer times get larger weights.
            scaled_time * (pos_max - pos_min) + pos_min
        )
        .alias("weight")
    )

    return features_df


In [None]:
# ------------------------------
# training: one LightGBM run per seed
# ------------------------------
# Validation frames are collected in a list and concatenated once after the loop.
va_result_frames, va_scores = [], {}
for i, seed in enumerate(config.SEEDS):
    name = f"lgb_{seed}"
    _va_result_df, _va_scores, trained_models = single_train_fn(
        model=LightGBMWapper(
            name=name,
            model=lgb.LGBMModel(
                objective="cross_entropy",
                boosting="gbdt",
                n_estimators=100000,
                learning_rate=0.01,
                # Tree size grows by 4 leaves for each successive seed.
                num_leaves=11 + (4 * i),
                colsample_bytree=0.2,
                # NOTE(review): LightGBM only applies row subsampling when subsample_freq
                # (bagging_freq) is > 0; it is left at its default here - confirm intent.
                subsample=0.9,
                importance_type="gain",
                # Built-in metrics are disabled; evaluation uses the custom eval_metric below.
                metric="None",
            ),
            fit_params={
                "callbacks": [
                    lgb.early_stopping(3000, first_metric_only=True),
                    lgb.log_evaluation(period=100),
                ],
                "eval_metric": LGBMMetric(),
                "categorical_feature": cat_features,
                "feature_name": feature_names,
            },
        ),
        features_df=add_kfold(
            # min == max in both ranges, so the weight is a constant 0.5 for
            # event == 0 rows and 1.0 for all other rows.
            add_weight(
                features_df,
                survival_time_col=config.SURVIVAL_TIME_COL,
                event_col=config.EVENT_COL,
                neg_min=0.5,
                neg_max=0.5,
                pos_min=1.0,
                pos_max=1.0,
            ),
            n_splits=config.N_SPLITS,
            random_state=seed,
            fold_col=config.FOLD_COL,
        ),
        feature_cols=feature_names,
        target_col="t_kmf_event_scaled3",
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        weight_col="weight",
        out_dir=config.OUTPUT_DIR,
        train_folds=None,
        eval_fn=Metric(),
        overwrite=True,
        use_eval_metric_extra_va_df=True,
    )
    va_result_frames.append(_va_result_df)
    va_scores[name] = _va_scores

va_result_df = pl.concat(va_result_frames, how="diagonal_relaxed")

# ------------------------------
# final score
# ------------------------------
# Average the predictions per ID across seeds, then re-attach the meta columns.
# config.META_COLS may be a set, so it is sorted to keep the column order of the
# saved CSV identical from run to run.
va_result_agg_df = (
    va_result_df.group_by(config.ID_COL)
    .agg(pl.col("pred").mean())
    .sort(config.ID_COL)
    .join(train_test_df.select(sorted(config.META_COLS)), on=config.ID_COL, how="left")
)
final_score = Metric()(input_df=va_result_agg_df)
logger.info(f"✅ final score: {final_score}")
va_scores["final"] = final_score

# save
va_result_agg_df.write_csv(f"{config.OUTPUT_DIR}/va_result.csv")
with open(f"{config.OUTPUT_DIR}/va_scores.json", "w") as f:
    json.dump(va_scores, f, indent=4)


In [None]:
# debug test
# Build the TEST feature frame and run inference with each seed's models over all folds.
test_features_df = fe(config=config, train_test_df=train_test_df, output_dataset="TEST")
te_result_frames = []
for seed in config.SEEDS:
    name = f"lgb_{seed}"

    _te_result_df = single_inference_fn(
        model=LightGBMWapper(name=name),
        features_df=test_features_df,
        feature_names=feature_names,
        model_dir=config.ARTIFACT_EXP_DIR(),
        inference_folds=list(range(config.N_SPLITS)),
        out_dir=config.OUTPUT_DIR,
    )
    te_result_frames.append(_te_result_df)

# Concatenate once after the loop instead of growing a frame per iteration.
te_result_df = pl.concat(te_result_frames, how="diagonal_relaxed")

# Average the predictions per ID, then re-attach the meta columns. config.META_COLS
# may be a set, so it is sorted to get a deterministic column order.
te_result_agg_df = (
    te_result_df.group_by(config.ID_COL)
    .agg(pl.col("pred").mean())
    .sort(config.ID_COL)
    .join(train_test_df.select(sorted(config.META_COLS)), on=config.ID_COL, how="left")
)


print(te_result_agg_df["pred"].to_list())