In [1]:
import logging

import config
import lightgbm as lgb
import polars as pl
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from lifelines import BreslowFlemingHarringtonFitter, KaplanMeierFitter
from sklearn.model_selection import train_test_split

from src.customs.fold import add_kfold
from src.customs.metrics import CatBoostMetric, Metric, ROCAUCMetric
from src.feature.tabular import AggregateEncoder, OrdinalEncoder, RawEncoder
from src.model.sklearn_like import (
    CatBoostClassifierWrapper,
    CatBoostRegressorWrapper,
    LightGBMWapper,
    XGBoostRegressorWrapper,
)
from src.trainer.tabular.simple import single_inference_fn, single_inference_fn_v2, single_train_fn

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [2]:
def load_data(valid_ratio: float | None = None) -> pl.DataFrame:
    """Read the competition train/test CSVs and stack them into one labelled frame.

    Every row carries a dataset label in ``config.DATASET_COL``: ``"TRAIN"``,
    ``"VALID"`` (only when ``valid_ratio`` is given) or ``"TEST"``. Train rows
    additionally get ``config.FOLD_COL`` initialised to ``-1`` and are
    left-joined on ``config.ID_COL`` with the table read from
    ``config.INPUT_DIR / config.EXTRA_TARGET_CSV``.

    Side effect:
        ``config.META_COLS`` is reassigned to the sorted, de-duplicated union
        of its previous value and every column of the joined train frame whose
        name starts with ``"target_"``.

    Args:
        valid_ratio: Passed as ``test_size`` to ``train_test_split`` (seeded
            with ``config.SEED``); the held-out part is relabelled ``"VALID"``.
            ``None`` skips the split so every train row stays ``"TRAIN"``.

    Returns:
        The stacked train(+valid)/test frame, sorted by dataset label
        (descending) and then by ``config.ID_COL`` (ascending).
    """
    labelled_train = pl.read_csv(config.COMP_DATASET_DIR / "train.csv").with_columns(
        pl.lit("TRAIN").alias(config.DATASET_COL),
        pl.lit(-1).alias(config.FOLD_COL),
    )

    if valid_ratio is not None:
        kept_part, held_out_part = train_test_split(
            labelled_train,
            test_size=valid_ratio,
            random_state=config.SEED,
        )
        relabelled_holdout = held_out_part.with_columns(pl.lit("VALID").alias(config.DATASET_COL))
        labelled_train = pl.concat([kept_part, relabelled_holdout], how="diagonal_relaxed")

    # Attach the externally prepared target columns to the train rows.
    extra_targets = pl.read_csv(config.INPUT_DIR / config.EXTRA_TARGET_CSV)
    labelled_train = labelled_train.join(extra_targets, on=config.ID_COL, how="left")

    target_cols = []
    for column_name in labelled_train.columns:
        if column_name.startswith("target_"):
            target_cols.append(column_name)

    labelled_test = pl.read_csv(config.COMP_DATASET_DIR / "test.csv").with_columns(
        pl.lit("TEST").alias(config.DATASET_COL)
    )

    # Register the target columns as meta columns so fe() passes them through.
    config.META_COLS = sorted(set(config.META_COLS + target_cols))

    # "diagonal_relaxed" lets train-only columns (fold, targets) coexist with test rows.
    stacked = pl.concat([labelled_train, labelled_test], how="diagonal_relaxed")
    return stacked.sort(config.DATASET_COL, config.ID_COL, descending=[True, False])


def fe(
    train_test_df: pl.DataFrame,
    output_dataset: str | None = "TRAIN",
) -> pl.DataFrame:
    """Encode the stacked frame into meta, numerical and categorical feature columns.

    Three encoders are used: a ``RawEncoder`` for ``config.META_COLS`` (empty
    prefix), a ``RawEncoder`` for ``config.NUMERICAL_COLS`` (prefix
    ``f"{config.FEATURE_PREFIX}n_"``) and an ``OrdinalEncoder`` for
    ``config.CATEGORICAL_COLS`` (prefix ``f"{config.FEATURE_PREFIX}c_"``).
    Each encoder is fitted on the ``"TRAIN"`` rows only and then applied to the
    whole input; the outputs are concatenated horizontally.

    Args:
        train_test_df: Frame holding all datasets, labelled in
            ``config.DATASET_COL``.
        output_dataset: Dataset label whose rows are returned. ``None``
            returns the rows of every dataset.

    Returns:
        The encoded frame, restricted to ``output_dataset`` unless it is ``None``.
    """
    numerical_prefix = f"{config.FEATURE_PREFIX}n_"
    categorical_prefix = f"{config.FEATURE_PREFIX}c_"
    encoders = [
        RawEncoder(columns=config.META_COLS, prefix=""),
        RawEncoder(columns=[*config.NUMERICAL_COLS], prefix=numerical_prefix),
        OrdinalEncoder(columns=[*config.CATEGORICAL_COLS], prefix=categorical_prefix),
    ]

    # Fit on TRAIN rows only so VALID/TEST rows never influence the encoders.
    is_train_row = pl.col(config.DATASET_COL) == "TRAIN"
    for encoder in encoders:
        encoder.fit(train_test_df.filter(is_train_row))

    encoded_parts = [encoder.transform(train_test_df) for encoder in encoders]
    features_df = pl.concat(encoded_parts, how="horizontal")

    if output_dataset is not None:
        return features_df.filter(pl.col(config.DATASET_COL) == output_dataset)
    return features_df

In [None]:
# Load the labelled train/valid/test frame and encode it; fe() returns only the
# "TRAIN" rows by default.
train_test_df = load_data(valid_ratio=config.VALID_RATIO)
features_df = fe(train_test_df)

# Model inputs are the encoded columns carrying the feature prefix; the
# categorical subset additionally has "c_" right after that prefix.
feature_names = sorted(col for col in features_df.columns if col.startswith(config.FEATURE_PREFIX))
cat_features = [col for col in feature_names if col.startswith(f"{config.FEATURE_PREFIX}c_")]

logger.info(f"# of features: {len(feature_names)}")
logger.info(f"# of cat_features: {len(cat_features)}")

In [None]:
# Nested cross-validation for the "cat2" CatBoost regressor.
#
# Outer loop: each outer fold is held out once. Inner loop: single_train_fn
# receives the remaining rows with a fresh fold assignment in config.FOLD_COL.
# The models it returns are applied to the held-out outer fold, and the
# per-fold predictions are stacked, scored with Metric() and written to
# <OUTPUT_DIR>/<base_name>/va_result.csv.
nested_fold_col = f"{config.FOLD_COL}_nested"
train_features_df = add_kfold(
    features_df,
    n_splits=config.N_SPLITS,
    random_state=config.SEED,
    fold_col=nested_fold_col,
)

base_name = "cat2"
# Collect per-fold frames and concatenate once after the loop instead of
# re-concatenating a growing DataFrame on every iteration.
fold_result_dfs = []
# Series.unique() does not guarantee any order; sorting keeps the log order and
# the row order of va_result.csv reproducible across runs.
for i_fold in train_features_df[nested_fold_col].unique().sort().to_list():
    logger.info(f"Outer Fold: {i_fold}")
    tr_df = train_features_df.filter(pl.col(nested_fold_col) != i_fold)
    va_df = train_features_df.filter(pl.col(nested_fold_col) == i_fold)
    # -------------------
    # cat
    # Only the third return value (the trained models) is used below.
    _, _, trained_models = single_train_fn(
        model=CatBoostRegressorWrapper(
            # The outer fold id is part of the model name, so each outer fold
            # gets its own set of models.
            name=f"{base_name}_{i_fold}",
            model=CatBoostRegressor(
                loss_function="Tweedie:variance_power=1.5",
                learning_rate=0.05,
                n_estimators=10000,
                early_stopping_rounds=300,
                eval_metric=CatBoostMetric(),
                verbose=100,
                # subsample=0.5,
                colsample_bylevel=0.2,
                random_state=config.SEED,
                reg_lambda=5,
                # task_type="GPU",
            ),
            multi_output=False,
            feature_names=feature_names,
            cat_features=cat_features,
        ),
        # Inner folds are assigned on the outer-training rows only.
        features_df=add_kfold(
            tr_df,
            n_splits=config.N_SPLITS,
            random_state=config.SEED,
            fold_col=config.FOLD_COL,
        ),
        feature_cols=feature_names,
        target_col="target_kmf_event_pred_scaled",
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        # weight_col="sample_weight",
        out_dir=config.OUTPUT_DIR,
        train_folds=None,
        eval_fn=Metric(),
        overwrite=False,
        use_eval_metric_extra_va_df=True,
    )
    va_result_df = single_inference_fn_v2(
        models=trained_models,
        features_df=va_df,
        feature_names=feature_names,
    )
    # Keep only the columns selected for scoring and for the saved CSV.
    fold_result_dfs.append(
        va_result_df.select(
            pl.col(config.ID_COL),
            pl.col(config.EVENT_COL),
            pl.col(config.SURVIVAL_TIME_COL),
            pl.col("race_group"),
            pl.col("pred"),
        )
    )

# Fall back to an empty frame when there were no folds, as the original
# accumulator did.
all_result_df = pl.concat(fold_result_dfs, how="diagonal_relaxed") if fold_result_dfs else pl.DataFrame()

score = Metric()(all_result_df)
logger.info(f"Score: {score}")
# save

va_result_filepath = config.OUTPUT_DIR / base_name / "va_result.csv"
va_result_filepath.parent.mkdir(parents=True, exist_ok=True)
all_result_df.write_csv(va_result_filepath)

In [5]:
# Score the hold-out ("VALID") rows, if load_data() produced any, with the
# "cat2" models of every outer fold: per-fold predictions are printed and
# scored, then averaged per ID and scored once more as an ensemble.
if "VALID" in train_test_df[config.DATASET_COL].unique():
    # Loop-invariant, so set once instead of on every iteration.
    base_name = "cat2"
    ext_va_features_df = fe(train_test_df, output_dataset="VALID")
    # Collect per-fold predictions and concatenate once after the loop instead
    # of re-concatenating a growing DataFrame on every iteration.
    ext_va_fold_preds = []
    for i_fold in range(config.N_SPLITS):
        ext_va_result_df = single_inference_fn(
            model=CatBoostRegressorWrapper(
                # Same naming scheme as the models trained for outer fold i_fold.
                name=f"{base_name}_{i_fold}",
                feature_names=feature_names,
                cat_features=cat_features,
            ),
            features_df=ext_va_features_df,
            feature_names=feature_names,
            model_dir=config.ARTIFACT_EXP_DIR(),
            inference_folds=list(range(config.N_SPLITS)),
            out_dir=config.OUTPUT_DIR,
        )
        print(ext_va_result_df["pred"].to_list())
        print(Metric(prediction_label="pred")(ext_va_result_df))
        ext_va_fold_preds.append(
            ext_va_result_df.select(
                pl.col(config.ID_COL),
                pl.col("pred"),
            )
        )
    # Fall back to an empty frame when there were no folds, as the original
    # accumulator did.
    all_extr_va_result_df = (
        pl.concat(ext_va_fold_preds, how="diagonal_relaxed") if ext_va_fold_preds else pl.DataFrame()
    )
    # Ensemble: mean prediction per ID across the outer-fold models.
    all_extr_va_result_df = all_extr_va_result_df.group_by(config.ID_COL).agg(pl.col("pred").mean().alias("pred"))
    # Join the encoded VALID rows back on so Metric receives the columns it
    # reads besides "pred".
    all_extr_va_result_df = all_extr_va_result_df.join(
        ext_va_features_df,
        on=config.ID_COL,
        how="left",
    )
    print(Metric(prediction_label="pred")(all_extr_va_result_df))


In [None]:
# debug test
# Predict the TEST rows with the "cat2" models of every outer fold, average the
# predictions per ID and write them to <OUTPUT_DIR>/<base_name>/te_result.csv.

# Loop-invariant; also used after the loop to build the output path.
base_name = "cat2"
# fe() re-fits every encoder and re-transforms the whole frame on each call, so
# build the TEST features once instead of once per fold.
te_features_df = fe(train_test_df, output_dataset="TEST")
# Collect per-fold predictions and concatenate once after the loop instead of
# re-concatenating a growing DataFrame on every iteration.
te_fold_preds = []
for i_fold in range(config.N_SPLITS):
    te_result_df = single_inference_fn(
        model=CatBoostRegressorWrapper(
            # Same naming scheme as the models trained for outer fold i_fold.
            name=f"{base_name}_{i_fold}",
            feature_names=feature_names,
            cat_features=cat_features,
        ),
        features_df=te_features_df,
        feature_names=feature_names,
        model_dir=config.ARTIFACT_EXP_DIR(),
        inference_folds=list(range(config.N_SPLITS)),
        out_dir=config.OUTPUT_DIR,
    )
    te_fold_preds.append(
        te_result_df.select(
            pl.col(config.ID_COL),
            pl.col("pred"),
        )
    )
# Fall back to an empty frame when there were no folds, as the original
# accumulator did.
all_te_result_df = pl.concat(te_fold_preds, how="diagonal_relaxed") if te_fold_preds else pl.DataFrame()
# Ensemble: mean prediction per ID across the outer-fold models, in ID order.
all_te_result_df = all_te_result_df.group_by(config.ID_COL).agg(pl.col("pred").mean().alias("pred")).sort(config.ID_COL)

te_result_filepath = config.OUTPUT_DIR / base_name / "te_result.csv"
te_result_filepath.parent.mkdir(parents=True, exist_ok=True)
all_te_result_df.write_csv(te_result_filepath)
print(all_te_result_df["pred"].to_list())
