In [1]:
import logging

import config
import lightgbm as lgb
import polars as pl
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from lifelines import BreslowFlemingHarringtonFitter, KaplanMeierFitter
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from torch import nn

from src.customs.fold import add_kfold
from src.customs.metrics import CatBoostMetric, Metric, ROCAUCMetric, XGBMetric
from src.feature.tabular import AggregateEncoder, OrdinalEncoder, RawEncoder
from src.model.sklearn_like import (
    CatBoostClassifierWrapper,
    CatBoostRegressorWrapper,
    LightGBMWapper,
    TabMRegressor,
    TabMRegressorWrapper,
    XGBoostRegressorWrapper,
)
from src.trainer.tabular.simple import single_inference_fn, single_train_fn

# Configure root logging at INFO level, then create this notebook's module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
def load_data(valid_ratio: float | None = None) -> pl.DataFrame:
    """Read the competition train/test CSVs and stack them into a single frame.

    Train rows are tagged ``"TRAIN"`` in ``config.DATASET_COL`` and receive a
    placeholder fold value of ``-1``; test rows are tagged ``"TEST"``.  When
    ``valid_ratio`` is given, that share of the train rows is split off with
    ``train_test_split`` (seeded by ``config.SEED``) and re-tagged ``"VALID"``.
    The extra target CSV is left-joined onto the train/valid rows on
    ``config.ID_COL``.

    Side effect: the names of all extra-target columns starting with
    ``"target_"`` are merged into ``config.META_COLS`` (deduplicated, sorted).

    Args:
        valid_ratio: Forwarded as ``test_size`` to ``train_test_split``;
            ``None`` disables the hold-out split.

    Returns:
        The combined frame, sorted by dataset label (descending) and then by
        id (ascending).
    """
    dataset_col, id_col = config.DATASET_COL, config.ID_COL

    labelled_train = pl.read_csv(config.COMP_DATASET_DIR / "train.csv").with_columns(
        pl.lit("TRAIN").alias(dataset_col),
        pl.lit(-1).alias(config.FOLD_COL),
    )
    if valid_ratio is not None:
        kept_rows, held_out_rows = train_test_split(
            labelled_train, test_size=valid_ratio, random_state=config.SEED
        )
        held_out_rows = held_out_rows.with_columns(pl.lit("VALID").alias(dataset_col))
        labelled_train = pl.concat([kept_rows, held_out_rows], how="diagonal_relaxed")

    # Attach the externally computed targets to the train/valid rows only.
    extra_targets = pl.read_csv(config.INPUT_DIR / config.EXTRA_TARGET_CSV)
    extra_target_names = [name for name in extra_targets.columns if name.startswith("target_")]
    labelled_train = labelled_train.join(extra_targets, on=id_col, how="left")

    labelled_test = pl.read_csv(config.COMP_DATASET_DIR / "test.csv").with_columns(
        pl.lit("TEST").alias(dataset_col)
    )

    # Register the joined target columns as meta columns (module-level state).
    config.META_COLS = sorted(set(config.META_COLS + extra_target_names))

    stacked = pl.concat([labelled_train, labelled_test], how="diagonal_relaxed")
    return stacked.sort(dataset_col, id_col, descending=[True, False])


def fe(
    train_test_df: pl.DataFrame,
    output_dataset: str | None = "TRAIN",
) -> pl.DataFrame:
    """Encode meta, numerical and categorical columns into one feature frame.

    Three encoders are fitted on the rows labelled ``"TRAIN"`` and then applied
    to every row of ``train_test_df``; their outputs are concatenated
    horizontally.  Numerical features get the prefix
    ``f"{config.FEATURE_PREFIX}n_"`` and categorical ones
    ``f"{config.FEATURE_PREFIX}c_"``; meta columns use an empty prefix.

    Args:
        train_test_df: Combined frame containing ``config.DATASET_COL``.
        output_dataset: Dataset label to keep in the result; ``None`` returns
            all rows.

    Returns:
        The encoded features, optionally restricted to ``output_dataset``.
        The final filter assumes ``config.DATASET_COL`` survives encoding
        (presumably because it is one of ``config.META_COLS`` -- confirm).
    """
    numeric_prefix = f"{config.FEATURE_PREFIX}n_"
    categorical_prefix = f"{config.FEATURE_PREFIX}c_"
    encoders = [
        RawEncoder(columns=config.META_COLS, prefix=""),
        RawEncoder(columns=[*config.NUMERICAL_COLS], prefix=numeric_prefix),
        OrdinalEncoder(columns=[*config.CATEGORICAL_COLS], prefix=categorical_prefix),
    ]

    # Fit on TRAIN rows only so VALID/TEST rows never influence the encoders.
    fit_rows = train_test_df.filter(pl.col(config.DATASET_COL) == "TRAIN")
    for encoder in encoders:
        encoder.fit(fit_rows)

    encoded_parts = []
    for encoder in encoders:
        encoded_parts.append(encoder.transform(train_test_df))
    features_df = pl.concat(encoded_parts, how="horizontal")

    if output_dataset is not None:
        return features_df.filter(pl.col(config.DATASET_COL) == output_dataset)
    return features_df

In [None]:
# Load all rows, then build features for the TRAIN rows only
# (fe() defaults to output_dataset="TRAIN").
train_test_df = load_data(valid_ratio=config.VALID_RATIO)
features_df = fe(train_test_df)

# Model inputs are the columns carrying the feature prefix; the "c_" sub-prefix
# marks the categorical ones.
feature_names = sorted(
    filter(lambda name: name.startswith(config.FEATURE_PREFIX), features_df.columns)
)
cat_features = [name for name in feature_names if name.startswith(f"{config.FEATURE_PREFIX}c_")]

logger.info(f"# of features: {len(feature_names)}")
logger.info(f"# of cat_features: {len(cat_features)}")

In [4]:
def preprocess(features_df: pl.DataFrame, output_dataset: None | str = "TRAIN") -> pl.DataFrame:
    """Mean-impute the numerical feature columns and optionally keep one dataset.

    The imputer is fitted on the rows labelled ``"TRAIN"`` in
    ``config.DATASET_COL`` and applied to every row of ``features_df``.  The
    imputed columns replace the originals and are appended after the remaining
    columns, so column order changes.

    Args:
        features_df: Output of ``fe``; must contain ``config.DATASET_COL`` and
            the ``f"{config.FEATURE_PREFIX}n_<col>"`` columns for every entry
            of ``config.NUMERICAL_COLS``.
        output_dataset: Dataset label to keep in the result; a falsy value
            (``None`` or ``""``) returns all rows.

    Returns:
        The imputed frame, optionally restricted to ``output_dataset``.
    """
    numerical_cols = [f"{config.FEATURE_PREFIX}n_{col}" for col in config.NUMERICAL_COLS]

    # SimpleImputer cannot be fitted on a zero-column selection, so skip the
    # imputation step entirely when no numerical columns are configured.
    if numerical_cols:
        imputer = SimpleImputer(strategy="mean")
        imputer.fit(features_df.filter(pl.col(config.DATASET_COL) == "TRAIN").select(numerical_cols))

        imputed_df = pl.DataFrame(
            imputer.transform(features_df.select(numerical_cols)),
            schema=numerical_cols,
        )
        features_df = pl.concat(
            [features_df.drop(numerical_cols), imputed_df],
            how="horizontal",
        )

    if not output_dataset:
        return features_df
    return features_df.filter(pl.col(config.DATASET_COL) == output_dataset)


# Impute the TRAIN feature rows, refresh the feature-name lists from the
# preprocessed frame, then assign cross-validation folds.
preprocessed_features_df = preprocess(features_df)
feature_names = sorted(
    name for name in preprocessed_features_df.columns if name.startswith(config.FEATURE_PREFIX)
)
cat_features = list(
    filter(lambda name: name.startswith(f"{config.FEATURE_PREFIX}c_"), feature_names)
)
train_features_df = add_kfold(
    preprocessed_features_df,
    n_splits=config.N_SPLITS,
    random_state=config.SEED,
    fold_col=config.FOLD_COL,
)

In [None]:
# Min-max scale the target into [0, 1]; the model below is trained with
# BCEWithLogitsLoss, whose targets are expected to lie in that range.
TARGET_COL = "target_pred_bfhf_target_001"
SCALED_TARGET_COL = f"{TARGET_COL}_scaled"

# Series.max()/min() return plain scalars, so the expression below works with
# scalar literals instead of 1-element numpy arrays.
max_target = train_features_df[TARGET_COL].max()
min_target = train_features_df[TARGET_COL].min()
if max_target is None or min_target is None or max_target == min_target:
    # A constant or all-null target would turn the scaled column into NaN/inf.
    raise ValueError(
        f"Cannot min-max scale {TARGET_COL!r}: min={min_target!r}, max={max_target!r}"
    )
train_features_df = train_features_df.with_columns(
    ((pl.col(TARGET_COL) - min_target) / (max_target - min_target)).alias(SCALED_TARGET_COL)
)

# Train one TabM model per fold on the scaled target and collect the
# validation predictions, per-fold scores and fitted models.
va_result_df, va_scores, trained_models = single_train_fn(
    model=TabMRegressorWrapper(
        name="tabm2",
        model=TabMRegressor(
            arch_type="tabm-mini",
            backbone={"type": "MLP", "n_blocks": 3, "d_block": 512, "dropout": 0.1},
            categorical_features=cat_features,
            random_state=config.SEED,
            patience=30,
            eval_metric=XGBMetric(),  # NOTE: use tabm's eval_metric
            loss_fn=nn.BCEWithLogitsLoss(),
            # loss_fn=nn.MSELoss(),
            k=48,
            max_epochs=500,
            checkpoint_path=config.TMP_DIR / "best_model.pth",  # tmp path
        ),
        feature_names=feature_names,
    ),
    features_df=train_features_df,
    feature_cols=feature_names,
    target_col=SCALED_TARGET_COL,
    fold_col=config.FOLD_COL,
    meta_cols=config.META_COLS,
    # weight_col="sample_weight",
    out_dir=config.OUTPUT_DIR,
    train_folds=None,
    eval_fn=Metric(),
    overwrite=True,
    use_eval_metric_extra_va_df=True,
    enable_plot_feature_importance=False,
)

In [6]:
# Score the saved fold models on the held-out VALID rows, if load_data created any.
if "VALID" in train_test_df[config.DATASET_COL].unique():
    # Build features for all rows so preprocess() can fit its imputer on TRAIN
    # before filtering down to VALID.
    ext_va_features_df = fe(train_test_df, output_dataset=None)
    preprocessed_ext_va_features_df = preprocess(ext_va_features_df, output_dataset="VALID")
    feature_names = sorted(
        name
        for name in preprocessed_ext_va_features_df.columns
        if name.startswith(config.FEATURE_PREFIX)
    )
    cat_features = [name for name in feature_names if name.startswith(f"{config.FEATURE_PREFIX}c_")]

    ext_va_model = TabMRegressorWrapper(
        name="tabm2",
        model=TabMRegressor(
            categorical_features=cat_features,
            random_state=config.SEED,
        ),
        feature_names=feature_names,
    )
    ext_va_result_df = single_inference_fn(
        model=ext_va_model,
        features_df=preprocessed_ext_va_features_df,
        feature_names=feature_names,
        model_dir=config.ARTIFACT_EXP_DIR(),
        inference_folds=list(range(config.N_SPLITS)),
        out_dir=config.OUTPUT_DIR,
    )
    print(ext_va_result_df["pred"].to_list())
    ext_va_score = Metric(prediction_label="pred")(ext_va_result_df)
    print(ext_va_score)

In [None]:
# Debug run: predict the TEST rows with every saved fold model.
# Features are built for all rows so preprocess() can fit its imputer on TRAIN
# before filtering down to TEST.
test_features_df = fe(train_test_df, output_dataset=None)
preprocessed_test_features_df = preprocess(test_features_df, output_dataset="TEST")
feature_names = sorted(
    name
    for name in preprocessed_test_features_df.columns
    if name.startswith(config.FEATURE_PREFIX)
)
cat_features = [name for name in feature_names if name.startswith(f"{config.FEATURE_PREFIX}c_")]

te_model = TabMRegressorWrapper(
    name="tabm2",
    model=TabMRegressor(
        categorical_features=cat_features,
        random_state=config.SEED,
    ),
    feature_names=feature_names,
)
te_result_df = single_inference_fn(
    model=te_model,
    features_df=preprocessed_test_features_df,
    feature_names=feature_names,
    model_dir=config.ARTIFACT_EXP_DIR(),
    inference_folds=list(range(config.N_SPLITS)),
    out_dir=config.OUTPUT_DIR,
)
print(te_result_df["pred"].to_list())