In [1]:
import logging

import config
import lightgbm as lgb
import polars as pl
from catboost import CatBoostClassifier
from lifelines import BreslowFlemingHarringtonFitter, KaplanMeierFitter
from sklearn.model_selection import train_test_split

from src.customs.fold import add_kfold
from src.customs.metrics import ROCAUCMetric
from src.feature.tabular import OrdinalEncoder, RawEncoder
from src.model.sklearn_like import (
    CatBoostClassifierWrapper,
    LightGBMWapper,
)
from src.trainer.tabular.simple import single_train_fn

# Notebook-wide logger; basicConfig configures root logging at INFO level so the
# logger.info(...) calls in later cells are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [2]:
class CreateTargetFn:
    """Callable that appends survival-analysis target columns to a frame.

    The columns added are listed in ``target_cols``:

    * ``target_kmf``  - Kaplan-Meier survival estimate evaluated at each row's time.
    * ``target_cox``  - signed time: the time is negated when the event column
      equals 0 and kept as-is otherwise (labelled "xgboost cox target" by the
      original author).
    * ``target_bfhf`` - Breslow-Fleming-Harrington survival estimate evaluated
      at each row's time.

    Both estimators are fitted once on all rows whose ``config.DATASET_COL``
    equals ``"TRAIN"`` (no per-group fitting) and are then evaluated for every
    row of the input frame.
    """

    def __init__(self, time_col: str, event_col: str):
        """Remember which columns hold the duration and the event indicator."""
        self.time_col = time_col
        self.event_col = event_col
        self.target_cols = ["target_kmf", "target_cox", "target_bfhf"]

    def _survival_target(self, fitter, fit_df: pl.DataFrame, eval_df: pl.DataFrame, name: str) -> pl.Series:
        """Fit ``fitter`` on ``fit_df`` and return its survival estimate at ``eval_df``'s times."""
        fitter.fit(fit_df[self.time_col], event_observed=fit_df[self.event_col])
        return pl.Series(name, fitter.survival_function_at_times(eval_df[self.time_col]))

    def __call__(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with the three target columns appended.

        Args:
            df: Frame containing ``config.DATASET_COL``, the time column and
                the event column.

        Returns:
            A new frame; the input frame is not modified.
        """
        # Estimators only ever see the TRAIN rows.
        fit_df = df.filter(pl.col(config.DATASET_COL) == "TRAIN")
        time = pl.col(self.time_col)

        kmf_target = self._survival_target(KaplanMeierFitter(), fit_df, df, "target_kmf")
        # Rows with event == 0 get a negative time; all other rows keep the time unchanged.
        cox_target = pl.when(pl.col(self.event_col) == 0).then(-1 * time).otherwise(time).alias("target_cox")
        bfhf_target = self._survival_target(BreslowFlemingHarringtonFitter(), fit_df, df, "target_bfhf")

        return df.with_columns(kmf_target, cox_target, bfhf_target)


def load_data(valid_ratio: float | None = None) -> pl.DataFrame:
    """Read the train/test CSVs, optionally hold out a validation split, and attach targets.

    Args:
        valid_ratio: Passed to ``train_test_split`` as ``test_size``; the rows
            that land in the hold-out part are relabelled ``"VALID"`` in
            ``config.DATASET_COL``. ``None`` skips the split so every training
            row stays ``"TRAIN"``.

    Returns:
        Train (and valid) rows with target columns, concatenated diagonally
        with the test rows and sorted by ``config.DATASET_COL`` descending,
        then ``config.ID_COL`` ascending.

    Side effects:
        Rebinds ``config.META_COLS`` to the sorted union of its previous value
        and the generated target column names.
    """
    raw_train_df = pl.read_csv(config.COMP_DATASET_DIR / "train.csv").with_columns(
        pl.lit("TRAIN").alias(config.DATASET_COL),
        pl.lit(-1).alias(config.FOLD_COL),
    )
    if valid_ratio is not None:
        # Use the caller-supplied ratio for the hold-out size instead of a fixed 0.2,
        # so the `valid_ratio` argument actually controls the split.
        _tr_df, _va_df = train_test_split(raw_train_df, test_size=valid_ratio, random_state=config.SEED)
        _va_df = _va_df.with_columns(
            pl.lit("VALID").alias(config.DATASET_COL),
        )
        raw_train_df = pl.concat([_tr_df, _va_df], how="diagonal_relaxed")

    # Targets are computed after the split so the estimators are fitted on TRAIN rows only.
    create_target_fn = CreateTargetFn(time_col=config.SURVIVAL_TIME_COL, event_col=config.EVENT_COL)
    raw_train_df = create_target_fn(raw_train_df)

    raw_test_df = pl.read_csv(config.COMP_DATASET_DIR / "test.csv").with_columns(
        pl.lit("TEST").alias(config.DATASET_COL)
    )

    # Going through a set keeps this idempotent when load_data is called more than once.
    config.META_COLS = sorted(list(set(config.META_COLS + create_target_fn.target_cols)))

    train_test_df = pl.concat([raw_train_df, raw_test_df], how="diagonal_relaxed").sort(
        config.DATASET_COL, config.ID_COL, descending=[True, False]
    )
    return train_test_df


def fe(
    train_test_df: pl.DataFrame,
    output_dataset: str | None = "TRAIN",
) -> pl.DataFrame:
    """Encode meta, numerical and categorical columns into one feature frame.

    Encoders are fitted on the ``"TRAIN"`` rows only and then applied to every
    row of ``train_test_df``; their outputs are concatenated horizontally.

    Args:
        train_test_df: Frame holding all datasets, labelled via ``config.DATASET_COL``.
        output_dataset: Dataset label to keep in the result. ``None`` returns
            the rows of every dataset.

    Returns:
        The encoded frame, restricted to ``output_dataset`` unless it is ``None``.
    """
    feature_prefix = config.FEATURE_PREFIX
    encoders = [
        # Meta columns are passed through without a prefix.
        RawEncoder(columns=config.META_COLS, prefix=""),
        RawEncoder(columns=list(config.NUMERICAL_COLS), prefix=f"{feature_prefix}n_"),
        OrdinalEncoder(columns=list(config.CATEGORICAL_COLS), prefix=f"{feature_prefix}c_"),
    ]

    # Fit every encoder first (on TRAIN rows only), then transform the full frame.
    fit_df = train_test_df.filter(pl.col(config.DATASET_COL) == "TRAIN")
    for enc in encoders:
        enc.fit(fit_df)
    encoded_parts = [enc.transform(train_test_df) for enc in encoders]

    features_df = pl.concat(encoded_parts, how="horizontal")
    if output_dataset is not None:
        features_df = features_df.filter(pl.col(config.DATASET_COL) == output_dataset)
    return features_df

In [None]:
# Load all datasets, then build features; fe() keeps only the "TRAIN" rows by default.
train_test_df = load_data(valid_ratio=config.VALID_RATIO)
features_df = fe(train_test_df)

# Model inputs are the columns carrying config.FEATURE_PREFIX; the categorical
# subset additionally carries the "c_" marker right after that prefix.
feature_names = sorted(col for col in features_df.columns if col.startswith(config.FEATURE_PREFIX))
cat_features = list(filter(lambda col: col.startswith(f"{config.FEATURE_PREFIX}c_"), feature_names))

logger.info(f"# of features: {len(feature_names)}")
logger.info(f"# of cat_features: {len(cat_features)}")

In [None]:
# Train one LightGBM binary classifier per seed (target: config.EVENT_COL) and
# gather the validation predictions returned by single_train_fn.
# Frames are collected in a list and concatenated once after the loop instead of
# re-concatenating the growing result on every iteration. The leading empty
# frame keeps `all_val_result_df` an empty DataFrame when config.SEEDS is empty.
lgb_val_frames = [pl.DataFrame()]
val_result_cols = [config.ID_COL, "pred", "name", config.FOLD_COL]

for seed in config.SEEDS:
    lgb_model = LightGBMWapper(
        name=f"lgb_{seed}",
        model=lgb.LGBMModel(
            objective="binary",
            boosting="gbdt",
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=31,
            colsample_bytree=0.2,
            # NOTE(review): LightGBM applies row subsampling only when
            # subsample_freq (bagging_freq) > 0; with the default of 0 this
            # value has no effect. Left unchanged to keep results stable - confirm intent.
            subsample=0.5,
            importance_type="gain",
            random_state=seed,
        ),
        fit_params={
            "callbacks": [
                lgb.early_stopping(500, first_metric_only=True),
                lgb.log_evaluation(period=100),
            ],
            "categorical_feature": cat_features,
            "feature_name": feature_names,
            "eval_metric": "auc",
        },
    )
    # Fold assignment is re-drawn with the same seed as the model.
    seed_features_df = add_kfold(
        features_df,
        n_splits=config.N_SPLITS,
        random_state=seed,
        fold_col=config.FOLD_COL,
    )
    _va_result_df, _, _ = single_train_fn(
        model=lgb_model,
        features_df=seed_features_df,
        feature_cols=feature_names,
        target_col=config.EVENT_COL,
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        out_dir=config.TMP_DIR,
        train_folds=None,
        eval_fn=ROCAUCMetric(),
        overwrite=False,
        use_eval_metric_extra_va_df=False,
    )
    lgb_val_frames.append(_va_result_df.select(val_result_cols))

all_val_result_df = pl.concat(lgb_val_frames, how="diagonal_relaxed")

In [None]:
# Train one CatBoost classifier per seed and append its validation predictions
# to the rows already held in `all_val_result_df`.
# NOTE(review): this cell extends `all_val_result_df` in place of the previous
# value, so running it twice without re-running the cell that creates
# `all_val_result_df` adds the CatBoost rows twice and skews the mean below.
# Frames are collected in a list and concatenated once after the loop.
cat_val_frames = [all_val_result_df]
cat_result_cols = [config.ID_COL, "pred", "name", config.FOLD_COL]

for seed in config.SEEDS:
    cat_model = CatBoostClassifierWrapper(
        name=f"cat_{seed}",
        model=CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            learning_rate=0.05,
            n_estimators=10000,
            early_stopping_rounds=500,
            verbose=100,
            # subsample=0.5,
            colsample_bylevel=0.2,
            random_state=seed,
        ),
        multi_output=False,
        feature_names=feature_names,
        cat_features=cat_features,
    )
    # Fold assignment is re-drawn with the same seed as the model.
    seed_features_df = add_kfold(
        features_df,
        n_splits=config.N_SPLITS,
        random_state=seed,
        fold_col=config.FOLD_COL,
    )
    _va_result_df, _, _ = single_train_fn(
        model=cat_model,
        features_df=seed_features_df,
        feature_cols=feature_names,
        target_col=config.EVENT_COL,
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        out_dir=config.TMP_DIR,
        train_folds=None,
        eval_fn=ROCAUCMetric(),
        overwrite=False,
        use_eval_metric_extra_va_df=False,
    )
    cat_val_frames.append(_va_result_df.select(cat_result_cols))

all_val_result_df = pl.concat(cat_val_frames, how="diagonal_relaxed")

# Average the predictions of every collected model per id, then re-attach the
# meta columns (which must include config.ID_COL for the join to work).
agg_va_result_df = (
    all_val_result_df.group_by([config.ID_COL])
    .agg(pl.col("pred").mean().alias("pred"))
    .join(features_df.select(config.META_COLS), on=config.ID_COL, how="left")
)

# Build the metric once and reuse it for both its name and its value.
ensemble_metric = ROCAUCMetric()
logger.info(f"{ensemble_metric._name}: {ensemble_metric(agg_va_result_df)}")

In [None]:
def make_new_targets(
    df: pl.DataFrame,
    base_target_names: tuple[str, ...] = ("target_kmf", "target_bfhf"),
    lower_bound: float = 0.0,
) -> pl.DataFrame:
    """Blend ``log(pred)`` into each base target and add a z-scored copy.

    For every base target ``t`` two columns are joined onto ``df``:

    * ``{t}_{config.EXP_NAME}``: ``log(pred) / scaling_factor + t`` where
      ``scaling_factor = min(log(pred)) / (lower_bound - min(t))``. With this
      factor the smallest ``log(pred)`` contributes exactly
      ``lower_bound - min(t)``.
    * ``{t}_{config.EXP_NAME}_normalized``: the column above minus its mean,
      divided by its ``.std()``.

    Args:
        df: Frame with ``config.ID_COL``, a ``pred`` column and every column
            named in ``base_target_names``. The new columns are attached with
            a left join on ``config.ID_COL``.
        base_target_names: Names of the base target columns to transform.
        lower_bound: Value used in the scaling-factor denominator; defaults to 0.0.

    Returns:
        ``df`` with two additional columns per base target.
    """
    for base_target_name in base_target_names:
        new_target_name = f"{base_target_name}_{config.EXP_NAME}"

        # Alias the aggregate explicitly rather than relying on the expression's
        # implicit output name.
        scaling_factor = df.select(
            (pl.col("pred").log().min() / (lower_bound - pl.col(base_target_name).min())).alias("scaling_factor")
        )["scaling_factor"].to_numpy()[0]
        logger.info(f"{new_target_name}: scaling_factor={scaling_factor}")

        # A zero, infinite or NaN factor (e.g. pred == 0, or min(base target) equal
        # to lower_bound) makes the blended column degenerate; surface it instead
        # of failing silently. The NaN case fails the comparison against infinity.
        factor_is_usable = scaling_factor != 0 and float("inf") > abs(scaling_factor)
        if not factor_is_usable:
            logger.warning(
                f"{new_target_name}: degenerate scaling_factor={scaling_factor}; "
                "resulting target values may be NaN/inf or equal to the base target"
            )

        new_df = df.select(
            pl.col(config.ID_COL),
            (pl.col("pred").log() / scaling_factor + pl.col(base_target_name)).alias(new_target_name),
        ).with_columns(
            # normalize
            ((pl.col(new_target_name) - pl.col(new_target_name).mean()) / pl.col(new_target_name).std()).alias(
                f"{new_target_name}_normalized"
            ),
        )
        df = df.join(new_df, on=config.ID_COL, how="left")
    return df


# Derive the experiment-specific targets from the per-id averaged predictions.
target_df = make_new_targets(
    agg_va_result_df,
    base_target_names=("target_kmf", "target_bfhf"),
)

# Every column whose name starts with "target_" is exported next to the id column.
target_cols = [name for name in target_df.columns if name.startswith("target_")]

target_df.select(pl.col(config.ID_COL), *target_cols).write_csv(
    config.INPUT_DIR / f"{config.EXP_NAME}_target.csv"
)


In [None]:
# Show the resulting target frame as this cell's output.
target_df