In [None]:
import logging
from pathlib import Path

import config
import polars as pl
import polars.selectors as cs
import rootutils
from xgboost import XGBRegressor

# Locate the project root starting from "." and, per the flags passed, make it the
# working directory (cwd), add it to the Python import path (pythonpath) and load its
# .env file (dotenv). Presumably this is what lets the `src.*` imports below resolve;
# confirm against the project layout.
rootutils.setup_root(".", cwd=True, pythonpath=True, dotenv=True)


from src.customs.fold import add_kfold
from src.customs.metrics import CustomMetric
from src.feature.tabular import RawEncoder
from src.model.sklearn_like import XGBoostRegressorWrapper
from src.trainer.simple import single_train_fn

# Configure root logging at INFO level, then grab the logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
def add_cox_target(
    train_df: pl.DataFrame,
    event_col: str,
    time_col: str,
    target_col: str = "cox_target",
) -> pl.DataFrame:
    """Append a signed survival-time column that encodes the event indicator.

    The sign of the new column carries the event flag:

    * ``event == 0`` -> ``-time``
    * any other value (including a null event) -> ``time``

    This is the label layout XGBoost's ``survival:cox`` objective expects, where
    negative labels are treated as right-censored observations.

    Args:
        train_df: Frame holding the event and time columns.
        event_col: Name of the event-indicator column.
        time_col: Name of the survival-time column.
        target_col: Name of the column to add (an existing column of the same
            name is replaced).

    Returns:
        A new frame equal to ``train_df`` plus ``target_col``.
    """
    # Build the target from column expressions instead of eagerly indexed Series:
    # this is the idiomatic polars form and keeps the whole computation inside the
    # expression engine.
    time = pl.col(time_col)
    signed_time = pl.when(pl.col(event_col) == 0).then(-time).otherwise(time)
    return train_df.with_columns(signed_time.alias(target_col))


# --- Load the competition files -------------------------------------------------
# Train rows are tagged "TRAIN" and receive a placeholder fold id of -1.
raw_train_df = pl.read_csv(config.COMPETITION_DATASET_DIR / "train.csv")
raw_train_df = raw_train_df.with_columns(
    pl.lit("TRAIN").alias(config.DATASET_COL),
    pl.lit(-1).alias(config.FOLD_COL),
)

# Test rows are only tagged "TEST"; no fold column is added here.
raw_test_df = pl.read_csv(config.COMPETITION_DATASET_DIR / "test.csv")
raw_test_df = raw_test_df.with_columns(pl.lit("TEST").alias(config.DATASET_COL))

# Column descriptions shipped with the competition (not used in the visible cells).
data_dictionary_df = pl.read_csv(config.COMPETITION_DATASET_DIR / "data_dictionary.csv")

# --- Build the combined frame ---------------------------------------------------
# 1. Stack train and test; "diagonal_relaxed" takes the union of the columns and
#    null-fills the ones a frame does not have.
# 2. Turn every categorical column into integer codes: nulls become the "NAN"
#    category, then the categorical's physical representation is cast to Int32.
# 3. Add the signed survival-time target used for Cox training.
train_test_df = (
    pl.concat([raw_train_df, raw_test_df], how="diagonal_relaxed")
    .with_columns(
        pl.col(config.CATEGORICAL_COLS).fill_null("NAN").cast(pl.Categorical).to_physical().cast(pl.Int32)
    )
    .pipe(
        add_cox_target,
        event_col=config.EVENT_COL,
        time_col=config.SURVIVAL_TIME_COL,
    )
)

In [None]:
# Column-name prefixes that mark numerical and categorical model features.
numerical_prefix = f"{config.FEATURE_PREFIX}n_"
categorical_prefix = f"{config.FEATURE_PREFIX}c_"

# One encoder per column group. Meta columns get an empty prefix; presumably
# RawEncoder forwards the columns and prepends the prefix, confirm in
# src.feature.tabular.
encoders = [
    RawEncoder(columns=config.META_COLS, prefix=""),
    RawEncoder(columns=list(config.NUMERICAL_COLS), prefix=numerical_prefix),
    RawEncoder(columns=list(config.CATEGORICAL_COLS), prefix=categorical_prefix),
]

# NOTE(review): the encoders are fitted on raw_train_df (the CSV as read, before
# categorical integer coding and before the cox target is added) but applied to
# train_df below. Confirm RawEncoder.fit does not depend on the data it sees.
for encoder in encoders:
    encoder.fit(raw_train_df)

# Keep only the training rows of the combined frame and assemble the encoder
# outputs side by side.
train_df = train_test_df.filter(pl.col(config.DATASET_COL) == "TRAIN")
encoded_frames = [enc.transform(train_df) for enc in encoders]
train_features_df = pl.concat(encoded_frames, how="horizontal")

# Model inputs are the columns carrying the feature prefix, in sorted order;
# the categorical subset is identified by its own prefix.
feature_names = sorted(name for name in train_features_df.columns if name.startswith(config.FEATURE_PREFIX))
cat_features = [name for name in feature_names if name.startswith(f"{config.FEATURE_PREFIX}c_")]

In [None]:
# XGBoost hyper-parameters for Cox proportional-hazards training.
# early_stopping_rounds=1000 is currently disabled.
xgb_params = {
    "n_estimators": 2000,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.5,
    "objective": "survival:cox",
    "learning_rate": 0.02,
    "eval_metric": "cox-nloglik",
    "enable_categorical": True,
}

# cat_features=cat_features is currently not passed to the wrapper.
model = XGBoostRegressorWrapper(
    model=XGBRegressor(**xgb_params),
    fit_params={"verbose": 100},
    feature_names=feature_names,
)


# Assign fold ids (5 splits, seed 42) into the fold column, then run training with
# the signed survival time as the target. single_train_fn returns the validation
# result frame, the validation scores and the trained models.
train_features_df = add_kfold(
    train_features_df,
    n_splits=5,
    random_state=42,
    fold_col=config.FOLD_COL,
)
va_result_df, va_scores, trained_models = single_train_fn(
    model=model,
    features_df=train_features_df,
    feature_cols=feature_names,
    target_col="cox_target",
    fold_col=config.FOLD_COL,
    meta_cols=config.META_COLS,
    out_dir=config.OUTPUT_DIR,
    train_folds=None,
    eval_fn=CustomMetric(),
    overwrite=True,
)