In [None]:
import json
import logging

import config
import lightgbm as lgb
import polars as pl
from catboost import CatBoostClassifier, CatBoostRegressor
from preprocess import fe, load_data

from src.customs.fold import add_kfold
from src.customs.metrics import CatBoostMetric, Metric, ROCAUCMetric
from src.model.sklearn_like import CatBoostClassifierWrapper, CatBoostRegressorWrapper, LightGBMWapper
from src.trainer.tabular.simple import single_inference_fn, single_train_fn

# Configure root logging at INFO, then create this notebook's module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Load the raw train/test frame and attach the precomputed "t_*" target columns read from CSV.
train_test_df = load_data(config=config, valid_ratio=config.VALID_RATIO)
# NOTE(review): the stored event prediction is multiplied by 2 here; the reason for the factor
# is not visible in this notebook -- confirm and consider a named constant.
target_df = pl.read_csv("./data/extr_output/101/1/101.csv").with_columns(pl.col("t_event_pred") * 2)
target_cols = [name for name in target_df.columns if name.startswith("t_")]
train_test_df = train_test_df.join(
    target_df.select([config.ID_COL, *target_cols]),
    on=config.ID_COL,
    how="left",
)
# NOTE(review): this rebinds config.META_COLS to a set, so every later
# `select(config.META_COLS)` yields columns in set-iteration order.
config.META_COLS = set(config.META_COLS).union(target_cols)

# Build the feature frame; feature columns are identified by prefix, and categorical ones
# by the additional "c_" marker right after the prefix.
features_df = fe(config=config, train_test_df=train_test_df)
feature_names = sorted(col for col in features_df.columns if col.startswith(config.FEATURE_PREFIX))
cat_features = [col for col in feature_names if col.startswith(f"{config.FEATURE_PREFIX}c_")]


# Train the event classifiers: for every seed, one LightGBM binary model and one CatBoost
# classifier, both targeting config.EVENT_COL. The first value returned by single_train_fn
# (presumably the validation predictions -- confirm against the trainer) is reduced to
# id / pred / name / fold and appended to `all_val_result_df`.
all_val_result_df = pl.DataFrame()
for i, seed in enumerate([0, 1, 2, 3, 4]):
    print(f"start lgb1 {seed}")
    _va_result_df, _, _ = single_train_fn(
        model=LightGBMWapper(
            name=f"lgb1_{seed}",
            model=lgb.LGBMModel(
                objective="binary",
                boosting="gbdt",
                n_estimators=10000,
                learning_rate=0.005,
                # Tree size depends on the loop index, so each seed also uses a different num_leaves.
                num_leaves=11 + (i * 5),
                colsample_bytree=0.2,
                # NOTE(review): LightGBM documents that bagging (`subsample`) is only applied when
                # `subsample_freq` > 0; none is set here -- confirm this value has the intended effect.
                subsample=0.5,
                importance_type="gain",
                random_state=seed,
            ),
            fit_params={
                "callbacks": [
                    lgb.early_stopping(300, first_metric_only=True),
                    lgb.log_evaluation(period=100),
                ],
                "categorical_feature": cat_features,
                "feature_name": feature_names,
            },
        ),
        # NOTE(review): the LightGBM folds are fixed at 10 while CatBoost below uses
        # config.N_SPLITS, and the inference cell loads the lgb1_* models with
        # range(config.N_SPLITS) -- confirm config.N_SPLITS == 10.
        features_df=add_kfold(
            features_df,
            n_splits=10,
            random_state=seed,
            fold_col=config.FOLD_COL,
        ),
        feature_cols=feature_names,
        target_col=config.EVENT_COL,
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        out_dir=config.OUTPUT_DIR,
        train_folds=None,
        eval_fn=ROCAUCMetric(),
        overwrite=False,
        use_eval_metric_extra_va_df=False,
    )

    # Keep only the columns needed for the ensemble before appending.
    all_val_result_df = pl.concat(
        [
            all_val_result_df,
            _va_result_df.select(
                [
                    pl.col(config.ID_COL),
                    pl.col("pred"),
                    pl.col("name"),
                    pl.col(config.FOLD_COL),
                ]
            ),
        ],
        how="diagonal_relaxed",
    )

    print(f"start catboost1 {seed}")
    _va_result_df, _, _ = single_train_fn(
        model=CatBoostClassifierWrapper(
            name=f"cat1_{seed}",
            model=CatBoostClassifier(
                loss_function="Logloss",
                # The learning rate also depends on the loop index (0.005, 0.010, ... per seed).
                learning_rate=0.005 + (i * 0.005),
                n_estimators=20000,
                early_stopping_rounds=300,
                verbose=100,
                # subsample=0.5,
                colsample_bylevel=0.2,
                random_state=seed,
            ),
            multi_output=False,
            feature_names=feature_names,
            cat_features=cat_features,
        ),
        features_df=add_kfold(
            features_df,
            n_splits=config.N_SPLITS,
            random_state=seed,
            fold_col=config.FOLD_COL,
        ),
        feature_cols=feature_names,
        target_col=config.EVENT_COL,
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        out_dir=config.OUTPUT_DIR,
        train_folds=None,
        eval_fn=ROCAUCMetric(),
        overwrite=False,
        use_eval_metric_extra_va_df=False,
    )
    # Same column subset as for LightGBM so both model families stack into one frame.
    all_val_result_df = pl.concat(
        [
            all_val_result_df,
            _va_result_df.select(
                [
                    pl.col(config.ID_COL),
                    pl.col("pred"),
                    pl.col("name"),
                    pl.col(config.FOLD_COL),
                ]
            ),
        ],
        how="diagonal_relaxed",
    )

# mean ensemble: average every model's prediction per ID, then re-attach the meta columns
# so the metric can be evaluated on the blended prediction.
mean_pred_by_id = all_val_result_df.group_by([config.ID_COL]).agg(pl.col("pred").mean().alias("pred"))
agg_va_result_df = mean_pred_by_id.join(
    features_df.select(config.META_COLS),
    on=config.ID_COL,
    how="left",
)
logger.info(f"{ROCAUCMetric()._name}: {ROCAUCMetric()(agg_va_result_df)}")


def make_new_targets(
    df: pl.DataFrame,
    base_target_names: tuple[str, ...] = ("t_kmf", "t_bfhf"),
    lower_bound: float = 0.0,
) -> pl.DataFrame:
    """Add `<base>_event_scaled` columns that shift each base target by the log event prediction.

    For every base target the shift is `log(pred) / scaling_factor`, where
    `scaling_factor = min(log(pred)) / (lower_bound - min(base))`. The largest
    downward shift therefore equals `lower_bound - min(base)`.

    Args:
        df: Frame containing `config.ID_COL`, a `pred` column and every column
            named in `base_target_names`.
        base_target_names: Names of the target columns to adjust.
        lower_bound: Value used in the scaling-factor denominator. It was
            previously overwritten with 0.0 inside the function, so a caller's
            value was silently ignored; it is now honoured. The default keeps
            the old behaviour.

    Returns:
        `df` with one extra `<base>_event_scaled` column per base target.
    """
    for base_target_name in base_target_names:
        new_target_name = f"{base_target_name}_event_scaled"

        # The expression's output column is named after its left-most root column ("pred").
        scaling_factor = df.select(pl.col("pred").log().min() / (lower_bound - pl.col(base_target_name).min()))[
            "pred"
        ].to_numpy()[0]
        print(scaling_factor)

        new_df = df.select(
            pl.col(config.ID_COL),
            (pl.col("pred").log() / scaling_factor + pl.col(base_target_name)).alias(new_target_name),
        )
        df = df.join(new_df, on=config.ID_COL, how="left")
    return df


# Derive the event-scaled targets from the ensembled validation predictions.
target_df = make_new_targets(df=agg_va_result_df, base_target_names=("t_kmf", "t_bfhf"))

target_cols = [name for name in target_df.columns if name.startswith("t_")]
meta_cols = [name for name in config.META_COLS if name not in target_cols]

# Keep the non-target meta columns, expose the ensembled prediction as `t_event_pred`
# (replacing any existing column of that name), and carry over the remaining "t_*" columns.
target_df = target_df.select(
    [
        pl.col(meta_cols),
        pl.col("pred").alias("t_event_pred"),
        *(pl.col(name) for name in target_cols if name != "t_event_pred"),
    ]
)


In [None]:
# Rebuild the modelling frame, this time attaching the in-memory `target_df` instead of the CSV.
train_test_df = load_data(config=config, valid_ratio=config.VALID_RATIO)
# NOTE(review): `target_df` is replaced by a scaled copy of itself, so re-running this cell
# without re-running the cell that created `target_df` multiplies `t_event_pred` by 2 again.
target_df = target_df.with_columns(pl.col("t_event_pred") * 2)
target_cols = [name for name in target_df.columns if name.startswith("t_")]
train_test_df = train_test_df.join(
    target_df.select([config.ID_COL, *target_cols]),
    on=config.ID_COL,
    how="left",
)
# NOTE(review): this rebinds config.META_COLS to a set, so every later
# `select(config.META_COLS)` yields columns in set-iteration order.
config.META_COLS = set(config.META_COLS).union(target_cols)

# Recompute features; feature columns are found by prefix, categorical ones by the extra "c_".
features_df = fe(config=config, train_test_df=train_test_df)
feature_names = sorted(col for col in features_df.columns if col.startswith(config.FEATURE_PREFIX))
cat_features = [col for col in feature_names if col.startswith(f"{config.FEATURE_PREFIX}c_")]


def make_new_targets(
    df: pl.DataFrame,
    base_target_names: tuple[str] = ("t_kmf", "t_bfhf"),
    lower_bound_pos: float = 0.0,
    lower_bound_neg: float = 0.0,
    pred_col: str = "t_event_pred",
) -> pl.DataFrame:
    """Add `<base>_event_scaled2` columns, scaling the log-prediction shift per event group.

    NOTE: this definition shadows an earlier `make_new_targets` in the same notebook.

    For each base target, a scaling factor
    `min(log(pred_col)) / (lower_bound - min(base))` is computed separately on the
    rows with `config.EVENT_COL == 1` (using `lower_bound_pos`) and on the rows with
    `config.EVENT_COL == 0` (using `lower_bound_neg`). The new column is
    `log(pred_col) / factor + base`, using the positive factor where the event
    column equals 1 and the negative factor everywhere else.

    Args:
        df: Frame with `config.ID_COL`, `config.EVENT_COL`, `pred_col` and the base targets.
        base_target_names: Target columns to adjust.
        lower_bound_pos: Lower bound used for the event == 1 scaling factor.
        lower_bound_neg: Lower bound used for the event == 0 scaling factor.
        pred_col: Column holding the event prediction.

    Returns:
        `df` with one extra `<base>_event_scaled2` column per base target.
    """
    log_pred = pl.col(pred_col).log()
    event_flag = pl.col(config.EVENT_COL)

    for base_target_name in base_target_names:
        base = pl.col(base_target_name)

        def _group_factor(event_value: int, lower_bound: float):
            # Scaling factor computed only on the rows of one event group.
            ratio = log_pred.min() / (lower_bound - base.min())
            return df.filter(event_flag == event_value).select(ratio)[pred_col].to_numpy()[0]

        scaling_factor_pos = _group_factor(1, lower_bound_pos)
        scaling_factor_neg = _group_factor(0, lower_bound_neg)

        print(scaling_factor_pos, scaling_factor_neg)

        shifted = (
            pl.when(event_flag == 1)
            .then(log_pred / scaling_factor_pos + base)
            .otherwise(log_pred / scaling_factor_neg + base)
        )
        new_df = df.select(
            pl.col(config.ID_COL),
            shifted.alias(f"{base_target_name}_event_scaled2"),
        )
        df = df.join(new_df, on=config.ID_COL, how="left")
    return df


# Compute the per-event-group scaled targets, then attach the exponentiated
# `t_kmf_event_scaled2` column to the feature frame as the regression target.
# NOTE(review): `features_df` is joined onto itself-by-name, so re-running this cell alone
# joins the target column a second time.
scaled_targets_df = make_new_targets(target_df, lower_bound_neg=0.0, lower_bound_pos=0.0)
features_df = features_df.join(
    scaled_targets_df.select(pl.col(config.ID_COL), pl.col("t_kmf_event_scaled2").exp()),
    on=config.ID_COL,
    how="left",
)


def add_weight(
    features_df: pl.DataFrame,
    survival_time_col: str,
    event_col: str,
    neg_min: float = 0.01,  # lower bound of the weight when event == 0
    neg_max: float = 0.5,  # upper bound of the weight when event == 0
    pos_min: float = 1.0,  # lower bound of the weight when event == 1
    pos_max: float = 1.5,  # upper bound of the weight when event == 1
) -> pl.DataFrame:
    """Add a `weight` column derived from the survival time, min-max scaled within each event group.

    Rows with `event_col == 0` get an inverted scale (the smallest time in the group maps
    to `neg_max`, the largest to `neg_min`); all other rows get a direct scale (the smallest
    time maps to `pos_min`, the largest to `pos_max`).

    The helper columns `tmp_time`, `group_min` and `group_max` are left in the returned frame.

    Args:
        features_df: Input frame containing `survival_time_col` and `event_col`.
        survival_time_col: Column holding the survival time.
        event_col: Column holding the event indicator.
        neg_min: Lower weight bound for event == 0.
        neg_max: Upper weight bound for event == 0.
        pos_min: Lower weight bound for the other rows.
        pos_max: Upper weight bound for the other rows.

    Returns:
        The input frame with `tmp_time`, `group_min`, `group_max` and `weight` added.
    """
    time = pl.col("tmp_time")
    group_lo = pl.col("group_min")
    group_hi = pl.col("group_max")

    features_df = features_df.with_columns(pl.col(survival_time_col).alias("tmp_time"))

    # Minimum and maximum of tmp_time within each event group.
    features_df = features_df.with_columns(
        [
            time.min().over(event_col).alias("group_min"),
            time.max().over(event_col).alias("group_max"),
        ]
    )

    # event == 0: inverted scaling, 1 - (tmp_time - group_min) / (group_max - group_min).
    neg_weight = (1 - ((time - group_lo) / (group_hi - group_lo))) * (neg_max - neg_min) + neg_min
    # event == 1: direct scaling, (tmp_time - group_min) / (group_max - group_min).
    pos_weight = ((time - group_lo) / (group_hi - group_lo)) * (pos_max - pos_min) + pos_min

    return features_df.with_columns(
        pl.when(pl.col(event_col) == 0).then(neg_weight).otherwise(pos_weight).alias("weight")
    )


In [None]:
# Train one CatBoost Tweedie regressor per seed in config.SEEDS on the exponentiated
# `t_kmf_event_scaled2` target, using the sample weights produced by add_weight.
# Each run's first two return values are collected in `va_result_df` and `va_scores`
# (presumably validation predictions and scores -- confirm against single_train_fn).
va_result_df, va_scores = pl.DataFrame(), {}
# NOTE(review): the loop index `i` and `trained_models` are not used in this cell.
for i, seed in enumerate(config.SEEDS):
    name = f"cat_{seed}"
    _va_result_df, _va_scores, trained_models = single_train_fn(
        model=CatBoostRegressorWrapper(
            name=name,
            model=CatBoostRegressor(
                loss_function="Tweedie:variance_power=1.5",
                grow_policy="SymmetricTree",
                learning_rate=0.05,
                n_estimators=100000,
                early_stopping_rounds=3000,
                eval_metric=CatBoostMetric(),
                verbose=100,
                random_state=seed,
                # subsample=0.9,
                colsample_bylevel=0.2,
            ),
            multi_output=False,
            feature_names=feature_names,
            cat_features=cat_features,
        ),
        # Weights are added first, then the fold column, so the folds are re-drawn per seed.
        features_df=add_kfold(
            add_weight(
                features_df,
                survival_time_col=config.SURVIVAL_TIME_COL,
                event_col=config.EVENT_COL,
                neg_min=0.01,
                neg_max=0.5,
                pos_min=1.0,
                pos_max=1.5,
            ),
            n_splits=config.N_SPLITS,
            random_state=seed,
            fold_col=config.FOLD_COL,
        ),
        feature_cols=feature_names,
        target_col="t_kmf_event_scaled2",
        fold_col=config.FOLD_COL,
        meta_cols=config.META_COLS,
        weight_col="weight",
        out_dir=config.OUTPUT_DIR,
        train_folds=None,
        eval_fn=Metric(),
        overwrite=False,
        use_eval_metric_extra_va_df=True,
    )
    # Stack this seed's result frame under the previous ones and keep its scores by model name.
    va_result_df = pl.concat([va_result_df, _va_result_df], how="diagonal_relaxed")
    va_scores[name] = _va_scores

# ------------------------------
# final score
# ------------------------------
# Average the per-seed predictions for each ID and attach the meta columns.
# The sort is applied AFTER the join: polars does not guarantee the row order of a join
# result unless `maintain_order` is set, so sorting beforehand could be undone.
# `config.ID_COL` replaces the hard-coded "ID"; the aggregated frame only has the columns
# config.ID_COL and "pred", so the two names must already be the same.
va_result_agg_df = (
    va_result_df.group_by(config.ID_COL)
    .agg(pl.col("pred").mean())
    .join(train_test_df.select(config.META_COLS), on=config.ID_COL, how="left")
    .sort(config.ID_COL)
)
final_score = Metric()(input_df=va_result_agg_df)
logger.info(f"✅ final score: {final_score}")
va_scores["final"] = final_score

# save the blended validation predictions and all scores
va_result_agg_df.write_csv(f"{config.OUTPUT_DIR}/va_result.csv")
with open(f"{config.OUTPUT_DIR}/va_scores.json", "w") as f:
    json.dump(va_scores, f, indent=4)


In [None]:
from itertools import product

import numpy as np
import polars as pl
from tqdm import tqdm

# Assuming va_result_agg_df is already defined in your environment
# If not, you would need to load it first


def calculate_score(threshold, adjustment, input_df=None):
    """
    Score the predictions after a threshold-based additive adjustment.

    Rows whose `t_event_pred` is below `threshold` keep their `pred`; every
    other row gets `pred + adjustment`. The adjusted frame is then passed to
    `Metric()`.

    Args:
        threshold: Cut-off applied to the `t_event_pred` column
        adjustment: Value added to `pred` for rows not below the threshold
        input_df: Frame with `pred` and `t_event_pred` columns. When omitted,
            the notebook-level `va_result_agg_df` is looked up at call time.
            It used to be bound as the default at definition time, which kept
            scoring a stale frame after `va_result_agg_df` was recomputed.

    Returns:
        score: The calculated metric score
    """
    if input_df is None:
        # Resolve the global lazily so the current validation frame is always used.
        input_df = va_result_agg_df

    adjusted_df = input_df.with_columns(
        (
            (pl.when(pl.col("t_event_pred") < threshold)).then(pl.col("pred")).otherwise(pl.col("pred") + adjustment)
        ).alias("pred")
    )

    score = Metric()(input_df=adjusted_df)

    return score


# Grid search for optimal parameters
def grid_search(thresholds, adjustments):
    """
    Exhaustively evaluate every (threshold, adjustment) pair with `calculate_score`.

    A pair whose evaluation raises is reported with `print` and skipped, so the
    search always runs to completion.

    Args:
        thresholds: Sized sequence of threshold values to try
        adjustments: Sized sequence of adjustment values to try

    Returns:
        best_params: Dictionary with the best threshold, adjustment, and score
            (all None when no pair scored above negative infinity)
        results_df: polars DataFrame with one row per successfully scored pair
    """
    top_score = float("-inf")  # any real score beats this
    best_params = {"threshold": None, "adjustment": None, "score": None}
    records = []

    n_combinations = len(thresholds) * len(adjustments)
    for threshold, adjustment in tqdm(product(thresholds, adjustments), total=n_combinations):
        try:
            score = calculate_score(threshold, adjustment)
            record = {"threshold": threshold, "adjustment": adjustment, "score": score}
            records.append(record)

            if score > top_score:
                top_score = score
                # Copy so the returned dict is independent of the recorded row.
                best_params = dict(record)
                print(f"New best: threshold={threshold}, adjustment={adjustment}, score={score}")
        except Exception as e:
            print(f"Error with threshold={threshold}, adjustment={adjustment}: {e}")

    # One row per evaluated pair, for later analysis.
    return best_params, pl.DataFrame(records)


# Define parameter ranges to search
thresholds = np.linspace(0.5, 1, 50)  # Try values from 0.5 to 0.9
adjustments = np.linspace(0.05, 0.5, 50)  # Try values from 0.05 to 0.2

best_params, results_df = grid_search(thresholds, adjustments)

print("\nBest parameters:")
print(f"Threshold: {best_params['threshold']}")
print(f"Adjustment: {best_params['adjustment']}")
print(f"Score: {best_params['score']}")

adjusted_df = va_result_agg_df.with_columns(
    (
        (pl.when(pl.col("t_event_pred") < best_params["threshold"]))
        .then(pl.col("pred"))
        .otherwise(pl.col("pred") + best_params["adjustment"])
    ).alias("pred")
)

final_score = Metric()(input_df=adjusted_df)
print(f"Final score with best parameters: {final_score}")

# save best params
with open(f"{config.OUTPUT_DIR}/pp_best_params.json", "w") as f:
    json.dump(best_params, f, indent=4)


In [None]:
# debug test
# Build the TEST feature frame and run every saved `cat_<seed>` regressor on it.
test_features_df = fe(config=config, train_test_df=train_test_df, output_dataset="TEST")

te_result_df = pl.DataFrame()
for seed in config.SEEDS:
    name = f"cat_{seed}"
    regressor = CatBoostRegressorWrapper(
        name=name,
        feature_names=feature_names,
        cat_features=cat_features,
    )
    _te_result_df = single_inference_fn(
        model=regressor,
        features_df=test_features_df,
        feature_names=feature_names,
        model_dir=config.ARTIFACT_EXP_DIR(),
        inference_folds=list(range(config.N_SPLITS)),
        out_dir=config.OUTPUT_DIR,
    )
    te_result_df = pl.concat([te_result_df, _te_result_df], how="diagonal_relaxed")

# Average the predictions per ID, then attach the meta columns.
# NOTE(review): the sort happens before the join; polars does not promise that a join keeps
# the left frame's row order unless `maintain_order` is set.
mean_te_pred_df = te_result_df.group_by(config.ID_COL).agg(pl.col("pred").mean()).sort("ID")
te_result_agg_df = mean_te_pred_df.join(
    train_test_df.select(config.META_COLS),
    on=config.ID_COL,
    how="left",
)

def _event_models(seed):
    """Lazily yield the LightGBM and then the CatBoost event classifier wrapper for `seed`."""
    yield LightGBMWapper(name=f"lgb1_{seed}")
    yield CatBoostClassifierWrapper(
        name=f"cat1_{seed}",
        feature_names=feature_names,
        cat_features=cat_features,
    )


# Run both event classifiers of every seed on the test features and stack id/pred rows.
# NOTE(review): the lgb1_* models were trained with n_splits=10, but their folds are loaded
# here with range(config.N_SPLITS) -- confirm the two agree.
all_te_result_df = pl.DataFrame()
for seed in [0, 1, 2, 3, 4]:
    for event_model in _event_models(seed):
        _te_result_df = single_inference_fn(
            model=event_model,
            features_df=test_features_df,
            feature_names=feature_names,
            model_dir=config.ARTIFACT_EXP_DIR(),
            inference_folds=list(range(config.N_SPLITS)),
            out_dir=config.OUTPUT_DIR,
        )
        all_te_result_df = pl.concat(
            [
                all_te_result_df,
                _te_result_df.select([pl.col(config.ID_COL), pl.col("pred")]),
            ],
            how="diagonal_relaxed",
        )

# The threshold in `best_params` was tuned on the validation frame, whose `t_event_pred`
# is the ensembled event prediction multiplied by 2 (see where `target_df` is joined into
# `train_test_df`). The test-side column must be on the same scale, otherwise the threshold
# is compared against values half as large and the adjustment hits the wrong rows.
EVENT_PRED_SCALE = 2

agg_te_result_df = (
    all_te_result_df.group_by(config.ID_COL)
    .agg((pl.col("pred").mean() * EVENT_PRED_SCALE).alias("t_event_pred"))
    .sort(config.ID_COL)
)
# Replace any `t_event_pred` carried over from the meta columns with the fresh test value.
# Sort AFTER the join: polars does not guarantee a join's row order unless `maintain_order`
# is set, and the prediction list printed below depends on it. `config.ID_COL` replaces the
# hard-coded "ID" (they must already be equal for the original sort to work).
te_result_agg_df = (
    te_result_agg_df.select(pl.exclude("t_event_pred"))
    .join(
        agg_te_result_df,
        on=config.ID_COL,
        how="left",
    )
    .sort(config.ID_COL)
)
print(te_result_agg_df)

# post process: rows not below the tuned threshold get the tuned additive adjustment
te_result_agg_df = te_result_agg_df.with_columns(
    (
        (pl.when(pl.col("t_event_pred") < best_params["threshold"]))
        .then(pl.col("pred"))
        .otherwise(pl.col("pred") + best_params["adjustment"])
    ).alias("pred")
)
print(te_result_agg_df)

print(te_result_agg_df["pred"].to_list())


In [None]:
# Display the post-processing parameters selected by the grid search.
best_params