In [None]:
import logging

import config
import lightgbm as lgb
import polars as pl
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from lifelines import BreslowFlemingHarringtonFitter, KaplanMeierFitter
from sklearn import linear_model

from src.customs.fold import add_kfold
from src.customs.metrics import CatBoostMetric, Metric, ROCAUCMetric
from src.feature.tabular import AggregateEncoder, OrdinalEncoder, RawEncoder
from src.model.sklearn_like import (
    CatBoostClassifierWrapper,
    CatBoostRegressorWrapper,
    LightGBMWapper,
    LinearWrapper,
    WeightedAverageModel,
    WeightedAverageModelWrapper,
    XGBoostRegressorWrapper,
)
from src.trainer.tabular.simple import single_inference_fn, single_train_fn

# Notebook-wide logger; basicConfig sets the root logger's level to INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
class CreateTargetFn:
    """Callable that appends survival-derived target columns to a dataframe.

    The Kaplan-Meier and Breslow-Fleming-Harrington estimators behind ``target_kmf``
    and ``target_bfhf`` are fitted on the rows whose ``config.DATASET_COL`` equals
    ``"TRAIN"`` and evaluated at the time value of every row. ``target_kmf_oof`` is
    computed out-of-fold across all rows passed in.

    Args:
        time_col: Name of the column holding the observed time.
        event_col: Name of the event-indicator column; rows where it equals 0 get the
            sign-flipped time (``target_xgboost_cox``) and the penalised KMF value
            (``target_kmf_with_penalty``).
        censor_penalty: Amount subtracted from ``target_kmf`` for rows whose event
            value is 0 when building ``target_kmf_with_penalty``.
    """

    def __init__(self, time_col: str, event_col: str, censor_penalty: float = 0.15):
        self.time_col = time_col
        self.event_col = event_col
        self.censor_penalty = censor_penalty
        # Names of every column added by ``__call__``.
        self.target_cols = [
            "target_kmf",
            "target_kmf_oof",
            "target_xgboost_cox",
            "target_bfhf",
            "target_kmf_with_penalty",
        ]

    def __call__(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return a copy of ``df`` extended with the columns listed in ``target_cols``."""
        # Estimators are fitted on TRAIN rows only (no grouping is applied).
        train_df = df.filter(pl.col(config.DATASET_COL) == "TRAIN")
        target_df = df.clone()

        # add kmf target
        kmf = KaplanMeierFitter()
        kmf.fit(train_df[self.time_col], event_observed=train_df[self.event_col])
        target_df = target_df.with_columns(
            pl.Series("target_kmf", kmf.survival_function_at_times(target_df[self.time_col])),
        )

        # add oof kmf target
        # The number of folds created must match the number of folds iterated below;
        # a hard-coded split count would leave rows without an OOF value (or produce
        # empty validation folds) whenever config.N_SPLITS differs from it.
        fold_df = add_kfold(
            target_df,
            n_splits=config.N_SPLITS,
            random_state=config.SEED,
            fold_col="_fold",
        )
        oof_frames = []
        for fold in range(config.N_SPLITS):
            _tr_df = fold_df.filter(pl.col("_fold") != fold)
            _va_df = fold_df.filter(pl.col("_fold") == fold)
            fold_kmf = KaplanMeierFitter()
            fold_kmf.fit(_tr_df[self.time_col], event_observed=_tr_df[self.event_col])
            oof_frames.append(
                _va_df.select(
                    pl.col(config.ID_COL),
                    pl.Series("target_kmf_oof", fold_kmf.survival_function_at_times(_va_df[self.time_col])),
                )
            )
        # Concatenate once instead of growing a frame inside the loop.
        oof_df = pl.concat(oof_frames, how="diagonal_relaxed")
        target_df = target_df.join(oof_df, on=config.ID_COL, how="left")

        # add xgboost cox target: time with its sign flipped where the event value is 0
        target_df = target_df.with_columns(
            pl.when(target_df[self.event_col] == 0)
            .then(-1 * target_df[self.time_col])
            .otherwise(target_df[self.time_col])
            .alias("target_xgboost_cox"),
        )

        # add bfhf target
        bfhf = BreslowFlemingHarringtonFitter()
        bfhf.fit(train_df[self.time_col], event_observed=train_df[self.event_col])
        target_df = target_df.with_columns(
            pl.Series("target_bfhf", bfhf.survival_function_at_times(target_df[self.time_col])),
        )

        # add kmf target with penalty: shift the KMF value down where the event value is 0
        target_df = target_df.with_columns(
            pl.when(target_df[self.event_col] == 0)
            .then(target_df["target_kmf"] - self.censor_penalty)
            .otherwise(target_df["target_kmf"])
            .alias("target_kmf_with_penalty"),
        )

        return target_df


def load_data(valid_ratio: float | None = None) -> pl.DataFrame:
    """Load the competition train/test CSVs into one dataframe with target columns.

    Train rows are tagged ``"TRAIN"`` in ``config.DATASET_COL`` (and get ``-1`` in
    ``config.FOLD_COL``); test rows are tagged ``"TEST"``. When ``valid_ratio`` is
    given, that share of the train rows is re-tagged ``"VALID"`` before the targets
    are computed.

    Side effect: ``config.META_COLS`` is replaced by the sorted union of its previous
    content and the target column names created by ``CreateTargetFn``.

    Args:
        valid_ratio: Fraction of the train rows to hold out as ``"VALID"``; ``None``
            keeps every train row as ``"TRAIN"``.

    Returns:
        Train (+ valid) and test rows concatenated, sorted by dataset label
        (descending) and then by ``config.ID_COL`` (ascending).
    """
    raw_train_df = pl.read_csv(config.COMP_DATASET_DIR / "train.csv").with_columns(
        pl.lit("TRAIN").alias(config.DATASET_COL),
        pl.lit(-1).alias(config.FOLD_COL),
    )
    if valid_ratio is not None:
        from sklearn.model_selection import train_test_split

        # Honour the requested ratio (it used to be ignored in favour of a fixed 0.2).
        _tr_df, _va_df = train_test_split(raw_train_df, test_size=valid_ratio, random_state=config.SEED)
        _va_df = _va_df.with_columns(
            pl.lit("VALID").alias(config.DATASET_COL),
        )
        raw_train_df = pl.concat([_tr_df, _va_df], how="diagonal_relaxed")

    create_target_fn = CreateTargetFn(time_col=config.SURVIVAL_TIME_COL, event_col=config.EVENT_COL)
    raw_train_df = create_target_fn(raw_train_df)

    raw_test_df = pl.read_csv(config.COMP_DATASET_DIR / "test.csv").with_columns(
        pl.lit("TEST").alias(config.DATASET_COL)
    )

    # Register the generated targets as meta columns so downstream steps carry them along.
    config.META_COLS = sorted(set(config.META_COLS + create_target_fn.target_cols))

    train_test_df = pl.concat([raw_train_df, raw_test_df], how="diagonal_relaxed").sort(
        config.DATASET_COL, config.ID_COL, descending=[True, False]
    )
    return train_test_df


def load_ensemble_source(
    train_test_df: pl.DataFrame,
    exp_names: dict[str, str],
    source_prefix: str = "",
    output_dataset: str = "TRAIN",
) -> pl.DataFrame:
    """Attach the stored predictions of earlier experiments to ``train_test_df``.

    For every ``exp_name -> model_name`` pair the validation and test result CSVs are
    read, a min-max scaled rank of ``pred`` is added as ``rank_pred_<exp_name>`` and
    ``pred`` itself is renamed to ``pred_<exp_name>``. All experiments are joined on
    ``config.ID_COL``, every non-id column is prefixed with ``source_prefix``, and the
    result is left-joined onto ``train_test_df``.

    Args:
        train_test_df: Frame holding ``config.ID_COL`` and ``config.DATASET_COL``.
        exp_names: Mapping of experiment name to model name.
        source_prefix: Prefix put in front of every prediction column.
        output_dataset: Value of ``config.DATASET_COL`` whose rows are returned.

    Returns:
        The rows of ``train_test_df`` belonging to ``output_dataset`` with the
        prediction columns appended.
    """

    def _te_result_path(exp_name, model_name):
        # Test results are read from the output dir on Kaggle, from the artifact dir otherwise.
        if config.IS_KAGGLE_ENV:
            return config.OUTPUT_DIR / exp_name / model_name / "te_result.csv"
        return config.ARTIFACT_EXP_DIR(exp_name) / model_name / "te_result.csv"

    def _read_result(path, exp_name):
        # Read (id, pred), add the rank scaled by (rank - 1) / (max rank - 1), rename pred.
        pred_rank = pl.col("pred").rank()
        scaled_rank = (pred_rank - 1) / (pl.col("pred").rank().max() - 1)
        result_df = pl.read_csv(path, columns=[config.ID_COL, "pred"])
        result_df = result_df.with_columns(scaled_rank.alias(f"rank_pred_{exp_name}"))
        return result_df.rename({"pred": f"pred_{exp_name}"})

    per_experiment = []
    for exp_name, model_name in exp_names.items():
        va_part = _read_result(config.ARTIFACT_EXP_DIR(exp_name) / model_name / "va_result.csv", exp_name)
        te_part = _read_result(_te_result_path(exp_name, model_name), exp_name)
        per_experiment.append(pl.concat([va_part, te_part], how="diagonal_relaxed"))

    # The first experiment is the join base; the others are left-joined onto it.
    source_df = per_experiment[0]
    for other_df in per_experiment[1:]:
        source_df = source_df.join(other_df, on=config.ID_COL, how="left")

    # add prefix
    prefixed_cols = pl.all().exclude(config.ID_COL).name.prefix(source_prefix)
    source_df = source_df.select([config.ID_COL, prefixed_cols])

    joined_df = train_test_df.join(source_df, on=config.ID_COL, how="left")
    return joined_df.filter(pl.col(config.DATASET_COL) == output_dataset)


def fe(
    train_test_df: pl.DataFrame,
    output_dataset: str = "TRAIN",
) -> pl.DataFrame:
    """Build the feature frame for one dataset split.

    Three encoders are fitted on the ``"TRAIN"`` rows and applied to the rows of
    ``output_dataset``: meta columns passed through without a prefix, numerical
    columns passed through with an ``n_`` feature prefix, and categorical columns
    ordinal-encoded with a ``c_`` feature prefix. The encoder outputs are
    concatenated horizontally.

    Args:
        train_test_df: Frame containing ``config.DATASET_COL`` and the configured columns.
        output_dataset: Value of ``config.DATASET_COL`` whose rows are transformed.

    Returns:
        Horizontally concatenated encoder outputs for ``output_dataset``.
    """

    def _rows_of(dataset: str) -> pl.DataFrame:
        return train_test_df.filter(pl.col(config.DATASET_COL) == dataset)

    meta_encoder = RawEncoder(columns=config.META_COLS, prefix="")
    numerical_encoder = RawEncoder(
        columns=list(config.NUMERICAL_COLS),
        prefix=f"{config.FEATURE_PREFIX}n_",
    )
    categorical_encoder = OrdinalEncoder(
        columns=list(config.CATEGORICAL_COLS),
        prefix=f"{config.FEATURE_PREFIX}c_",
    )
    encoders = [meta_encoder, numerical_encoder, categorical_encoder]

    # Fit on TRAIN rows only, whatever split is requested for output.
    for enc in encoders:
        enc.fit(_rows_of("TRAIN"))

    encoded_parts = []
    for enc in encoders:
        encoded_parts.append(enc.transform(_rows_of(output_dataset)))

    return pl.concat(encoded_parts, how="horizontal")

In [None]:
# train_test_df = load_data()
# features_df = fe(train_test_df)

# feature_names = sorted([x for x in features_df.columns if x.startswith(config.FEATURE_PREFIX)])
# cat_features = [x for x in feature_names if x.startswith(f"{config.FEATURE_PREFIX}c_")]


# logger.info(f"# of features: {len(feature_names)}")
# logger.info(f"# of cat_features: {len(cat_features)}")

# train_features_df = add_kfold(
#     features_df,
#     n_splits=config.N_SPLITS,
#     random_state=config.SEED,
#     fold_col=config.FOLD_COL,
# )

# va_result_df_cls_cat, _, _ = single_train_fn(
#     model=CatBoostClassifierWrapper(
#         name="cat",
#         model=CatBoostClassifier(
#             loss_function="Logloss",
#             eval_metric="AUC",
#             learning_rate=0.05,
#             n_estimators=10000,
#             early_stopping_rounds=500,
#             verbose=100,
#             # subsample=0.5,
#             colsample_bylevel=0.2,
#             random_state=config.SEED,
#         ),
#         multi_output=False,
#         feature_names=feature_names,
#         cat_features=cat_features,
#     ),
#     features_df=train_features_df,
#     feature_cols=feature_names,
#     target_col=config.EVENT_COL,
#     fold_col=config.FOLD_COL,
#     meta_cols=config.META_COLS,
#     out_dir=config.TMP_DIR,
#     train_folds=None,
#     eval_fn=ROCAUCMetric(),
#     overwrite=False,
#     use_eval_metric_extra_va_df=False,
# )

# va_result_df_cls_lgb, _, _ = single_train_fn(
#     model=LightGBMWapper(
#         name="lgb",
#         model=lgb.LGBMModel(
#             objective="binary",
#             boosting="gbdt",
#             n_estimators=10000,
#             learning_rate=0.02,
#             num_leaves=31,
#             colsample_bytree=0.2,
#             subsample=0.5,
#             importance_type="gain",
#             random_state=config.SEED,
#         ),
#         fit_params={
#             "callbacks": [
#                 lgb.early_stopping(500, first_metric_only=True),
#                 lgb.log_evaluation(period=100),
#             ],
#             "categorical_feature": cat_features,
#             "feature_name": feature_names,
#             "eval_metric": "auc",
#         },
#     ),
#     features_df=train_features_df,
#     feature_cols=feature_names,
#     target_col=config.EVENT_COL,
#     fold_col=config.FOLD_COL,
#     meta_cols=config.META_COLS,
#     out_dir=config.TMP_DIR,
#     train_folds=None,
#     eval_fn=ROCAUCMetric(),
#     overwrite=False,
#     use_eval_metric_extra_va_df=False,
# )


# va_result_df_1st = (
#     va_result_df_cls_cat.select(
#         [
#             *config.META_COLS,
#             pl.col("pred").alias("pred_cat"),
#         ]
#     )
#     .join(
#         va_result_df_cls_lgb.select(pl.col(config.ID_COL), pl.col("pred").alias("pred_lgb")),
#         on=config.ID_COL,
#         how="inner",
#     )
#     .with_columns(
#         pl.mean_horizontal(
#             [
#                 "pred_cat",
#                 "pred_lgb",
#             ]
#         ).alias("pred")
#     )
# )
# logger.info(f"{ROCAUCMetric()._name}: {ROCAUCMetric()(va_result_df_1st)}")


In [None]:
# # make target
# lower_bound = 0.0
# scaling_factor = va_result_df_1st.select(pl.col("pred").log().min() / (lower_bound - pl.col("target_kmf").min()))[
#     "pred"
# ].to_numpy()[0]
# logger.info(f"scaling_factor: {scaling_factor}")
# target_df = va_result_df_1st.select(
#     pl.col(config.ID_COL),
#     (1 - ((1 - pl.col("pred")).pow(2)).sqrt()).alias("sample_weight") + 1,
#     # (pl.col(config.SURVIVAL_TIME_COL) * (2 * pl.col("pred") - 1)).alias("target"),
#     pl.when(pl.col(config.EVENT_COL) == 0)
#     .then(pl.col("pred").log() / scaling_factor + pl.col("target_kmf"))
#     .otherwise(pl.col("pred").log() / scaling_factor + pl.col("target_kmf"))
#     .alias("target"),
# )


# stacking: previous experiments' predictions become the feature frame (TRAIN rows)
train_test_df = load_data()
features_df = load_ensemble_source(
    train_test_df,
    config.ENSEMBLE_EXP_NAMES,
    source_prefix=config.FEATURE_PREFIX,
)
# features_df = load_ensemble_source(train_test_df, config.ENSEMBLE_EXP_NAMES, source_prefix=config.FEATURE_PREFIX).join(
#     target_df.select([config.ID_COL, "target", "sample_weight"]), on=config.ID_COL, how="left"
# )

# Keep only the rank-scaled prediction columns, in sorted order.
feature_names = sorted(
    col for col in features_df.columns if col.startswith(f"{config.FEATURE_PREFIX}rank_pred_")
)

cat_features = list(filter(lambda col: col.startswith(f"{config.FEATURE_PREFIX}c_"), feature_names))

logger.info(f"# of features: {len(feature_names)}")
logger.info(f"# of cat_features: {len(cat_features)}")

# train_features_df = add_kfold(
#     features_df,
#     n_splits=config.N_SPLITS,
#     random_state=config.N_SPLITS,
#     fold_col=config.FOLD_COL,
# )

# va_result_df, va_scores, trained_models = single_train_fn(
#     model=LinearWrapper(
#         name="ridge",
#         model=linear_model.Ridge(
#             alpha=10,
#             random_state=config.SEED,
#         ),
#         scaling=False,
#         feature_names=feature_names,
#     ),
#     features_df=train_features_df,
#     feature_cols=feature_names,
#     target_col="target_kmf_with_penalty",
#     # target_col="target",
#     fold_col=config.FOLD_COL,
#     meta_cols=config.META_COLS,
#     out_dir=config.OUTPUT_DIR,
#     # weight_col="sample_weight",
#     train_folds=None,
#     eval_fn=Metric(),
#     overwrite=True,
#     use_eval_metric_extra_va_df=False,
# )

In [None]:
from collections.abc import Callable

import numpy as np
import optuna
from sklearn.base import BaseEstimator, RegressorMixin

from src.customs.metrics import metric


class WeightedAverageModel(BaseEstimator, RegressorMixin):
    """Weighted average of the columns of ``X`` with weights tuned by Optuna.

    NOTE(review): this class shadows the ``WeightedAverageModel`` imported from
    ``src.model.sklearn_like`` in the notebook's first cell — confirm which one is
    meant to be used.

    Args:
        weights: Per-column weights; ``None`` until set here or by ``fit``.
        eval_metric: Callable invoked as
            ``eval_metric(y_time=..., y_pred=..., y_event=..., race_group=...)`` that
            returns the score to optimise. Required by ``fit``.
        is_max_optimal: ``True`` if a larger ``eval_metric`` value is better.

    Attributes set by ``fit``:
        weights: Best weights found, normalised to sum to 1 (overwrites the
            constructor value).
        best_score_: Best objective value of the study.
        study_: The Optuna study, kept for inspection.
    """

    def __init__(
        self,
        weights: list[float] | np.ndarray | None = None,
        eval_metric: Callable | None = None,
        is_max_optimal: bool = False,
    ):
        # Constructor arguments are stored untouched: scikit-learn's get_params/clone
        # require __init__ not to modify its parameters. Conversion happens in predict.
        self.weights = weights
        self.eval_metric = eval_metric
        self.is_max_optimal = is_max_optimal

    def fit(self, X, y, event, race_group, n_trials=10000, seed=None):
        """Search for the column weights that optimise ``eval_metric``.

        Args:
            X: 2-D array; one column per model whose predictions are blended.
            y: Passed to ``eval_metric`` as ``y_time``.
            event: Passed to ``eval_metric`` as ``y_event``.
            race_group: Passed to ``eval_metric`` as ``race_group``.
            n_trials: Number of Optuna trials.
            seed: Seed for the TPE sampler; ``None`` leaves the search unseeded.

        Returns:
            ``self``.

        Raises:
            ValueError: If no ``eval_metric`` was supplied.
        """
        if self.eval_metric is None:
            raise ValueError("eval_metric is required for optimization.")

        n_columns = X.shape[1]

        def objective(trial):
            weights = np.array([trial.suggest_float(f"w{i}", 0, 1) for i in range(n_columns)])
            weights /= np.sum(weights)

            predictions = np.average(X, axis=1, weights=weights)
            score = self.eval_metric(y_time=y, y_pred=predictions, y_event=event, race_group=race_group)
            return score

        # Optuna optimisation. TPESampler is Optuna's default single-objective sampler,
        # so passing it explicitly only adds the ability to seed it.
        study = optuna.create_study(
            direction="maximize" if self.is_max_optimal else "minimize",
            sampler=optuna.samplers.TPESampler(seed=seed),
        )
        study.optimize(objective, n_trials=n_trials)

        # Store the optimised weights, normalised so they sum to 1.
        best_weights = np.array([study.best_trial.params[f"w{i}"] for i in range(n_columns)])
        self.weights = best_weights / np.sum(best_weights)
        self.best_score_ = study.best_value
        self.study_ = study  # kept so the Optuna results can be inspected after fitting
        return self

    def predict(self, X):
        """Return the weighted average of each row of ``X``.

        Raises:
            ValueError: If no weights were supplied and ``fit`` has not been called.
        """
        if self.weights is None:
            raise ValueError("Model is not fitted yet.")
        return np.average(X, axis=1, weights=np.asarray(self.weights))


# Tune the blend weights on the TRAIN feature frame; a higher metric value is better.
wam = WeightedAverageModel(eval_metric=metric, is_max_optimal=True)
wam.fit(
    features_df.select(feature_names).to_numpy(),
    features_df[config.SURVIVAL_TIME_COL].to_numpy(),
    event=features_df[config.EVENT_COL].to_numpy(),
    race_group=features_df["race_group"].to_numpy(),
)
# Best objective value first, then the normalised weights.
print(wam.best_score_)
print(wam.weights)

In [None]:
# debug test: run the tuned weighted average on the TEST rows
test_features_df = load_ensemble_source(
    train_test_df,
    exp_names=config.ENSEMBLE_EXP_NAMES,
    output_dataset="TEST",
    source_prefix=config.FEATURE_PREFIX,
)

te_result_df = single_inference_fn(
    model=WeightedAverageModelWrapper(
        name="wam",
        # Reuse the weights found by the fitted `wam` instead of searching again.
        model=WeightedAverageModel(weights=wam.weights),
        feature_names=feature_names,
    ),
    features_df=test_features_df,
    feature_names=feature_names,
    model_dir=config.ARTIFACT_EXP_DIR(),
    out_dir=config.OUTPUT_DIR,
    inference_folds=[*range(config.N_SPLITS)],
)
print(te_result_df["pred"].to_list())