In [None]:
import os
import hydra
import logging
import pandas as pd
import warnings
import rootutils
import joblib
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig

from sklearn.base import clone
from sklearn.utils.class_weight import compute_sample_weight

rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import assign_fold_index, make_uid, visualize_feature_importance
from src.experiment.runner import run_extractors
from src.experiment.metrics import macro_f1_from_proba

In [None]:
# Hydra overrides are read from the comma-separated OVERRIDES environment
# variable; when it is absent the default experiment override is used.
OVERRIDES: list[str] = os.environ.get("OVERRIDES", "experiment=002-tabular_v2").split(",")

In [None]:
# Fail fast on unusable overrides. OVERRIDES is produced by splitting a string,
# so it is never None in practice; the realistic "not set" case is a blank
# entry (e.g. OVERRIDES="" or a trailing comma), which is rejected here too.
if OVERRIDES is None or any(not override.strip() for override in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config inside the notebook (no @hydra.main entry point here).
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG so messages from every module are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Attach a stream handler only once so re-running the cell does not duplicate output.
if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# NOTE(review): this silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")
INPUT_DIR = Path(CFG.paths.input_dir)

### Load Data


In [None]:
def assign_meta(df: pd.DataFrame, data="train"):
    """Tag a frame with its origin and a placeholder fold index.

    Args:
        df: Frame to annotate. It is modified in place.
        data: Label written to every row of the ``data`` column.

    Returns:
        The same ``df`` object, now carrying ``data`` and ``fold`` columns
        (``fold`` is set to ``-1`` for every row).
    """
    meta_values = {"data": data, "fold": -1}
    for column_name, value in meta_values.items():
        df[column_name] = value
    return df


# Read each split, expose pandas' auto-named first column ("Unnamed: 0") as
# "uid", and tag the rows with their origin in one step per split.
train_df = assign_meta(
    pd.read_csv(INPUT_DIR / "train.csv").rename(columns={"Unnamed: 0": "uid"}),
    data="train",
)
test_df = assign_meta(
    pd.read_csv(INPUT_DIR / "test.csv").rename(columns={"Unnamed: 0": "uid"}),
    data="test",
)
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")

### CV Split


In [None]:
# Instantiate the splitter described by the `cv` config node and pass it to the
# project helper together with the "health" target column.
# Presumably the helper fills the `fold` column of train_df — confirm in src.experiment.utils.
kfold = hydra.utils.instantiate(CFG.cv)
train_df = assign_fold_index(train_df=train_df, kfold=kfold, y_col="health")

### Feature Engineering


In [None]:
# Base extractors come straight from the config.
feature_extractors = hydra.utils.instantiate(CFG.feature_extractors)

# One extractor instance per (aggregation extractor config, group-key set) pair.
for extractor in CFG.agg_feature_extractors:
    for group_keys in CFG.group_keys_for_agg:
        _extractor = hydra.utils.instantiate(extractor, group_keys=group_keys)
        feature_extractors.append(_extractor)

# Same expansion for the "te" extractors (presumably target encoding — confirm).
for extractor in CFG.te_feature_extractors:
    for group_keys in CFG.group_keys_for_te:
        _extractor = hydra.utils.instantiate(extractor, group_keys=group_keys)
        feature_extractors.append(_extractor)

# Extractors run on train and test rows stacked together.
# NOTE(review): fit=True is applied to the combined frame, so test rows take part
# in fitting — confirm this is intended for any target-dependent extractor.
raw_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
raw_feature_df = run_extractors(
    input_df=raw_df,
    extractors=feature_extractors,
    dirpath=Path(CFG.paths.feature_store_dir),
    fit=True,
    cache=False,
)
# The column-wise concat below relies on a one-to-one row correspondence.
assert len(raw_df) == len(raw_feature_df)

# Attach the features to the raw columns, then split back by the `data` tag.
raw_feature_df = pd.concat([raw_df, raw_feature_df], axis=1)
train_feature_df = raw_feature_df.query("data == 'train'").reset_index(drop=True)
test_feature_df = raw_feature_df.query("data == 'test'").reset_index(drop=True)

# Only columns prefixed with "f_" are used as model inputs.
feature_columns = [col for col in train_feature_df.columns if col.startswith("f_")]

In [None]:
def train_cv_tabular_v1(
    df: pd.DataFrame,
    estimator,
    feature_columns: list[str],
    target_columns: str | list[str],
    fit_params: dict,
    output_dir: Path,
    train_folds: list[int] | None = None,
    overwrite: bool = False,
    use_xgb_class_weight: bool = False,
):
    """Fit one clone of ``estimator`` per cross-validation fold.

    For every fold the rows with ``fold != i_fold`` are used for fitting and the
    rows with ``fold == i_fold`` are passed as ``eval_set``. Each fitted
    estimator is pickled to ``output_dir / f"fold{i_fold}" / "<ClassName>.pkl"``.

    Args:
        df: Frame holding the feature columns, the target column(s) and a
            ``fold`` column.
        estimator: Unfitted estimator; it is cloned for every fold and must
            accept ``fit(X=..., y=..., eval_set=..., **fit_params)``.
        feature_columns: Columns used as model inputs.
        target_columns: Column name, or list of column names, used as target.
        fit_params: Extra keyword arguments forwarded to ``fit``. The dict is
            copied per fold and never modified.
        output_dir: Root directory for the per-fold model files.
        train_folds: Folds to train; defaults to every distinct value of
            ``df["fold"]`` in sorted order.
        overwrite: When False, an existing model file is loaded instead of
            refitting.
        use_xgb_class_weight: When True and the estimator class is named
            ``XGBModel``, balanced sample weights are passed for both the
            training and the evaluation set.

    Returns:
        List of fitted (or loaded) estimators, one per entry of ``train_folds``.
    """
    estimators = []

    if train_folds is None:
        train_folds = sorted(df["fold"].unique())

    for i_fold in train_folds:
        logger.info(f"start training fold={i_fold} 🚀 ")
        fit_estimator = clone(estimator)

        fold_dir = output_dir / f"fold{i_fold}"
        fold_dir.mkdir(exist_ok=True, parents=True)

        estimator_uid = make_uid(fit_estimator.__dict__)
        estimator_name = fit_estimator.__class__.__name__
        estimator_name_with_uid = f"{estimator_name}_{estimator_uid}"
        # NOTE(review): the file name does not include the uid, so with
        # overwrite=False a model pickled under different hyperparameters is
        # loaded as-is — confirm whether the uid should be part of the path.
        estimator_path = fold_dir / f"{estimator_name}.pkl"

        if estimator_path.exists() and (not overwrite):
            logger.info(f"skip fitting in fold{i_fold}")
            # joblib.load unpickles; only files written by this function are expected here.
            estimators.append(joblib.load(estimator_path))
            continue

        # split train and valid
        fold_train_df = df.query(f"fold != {i_fold}").reset_index(drop=True)
        fold_valid_df = df.query(f"fold == {i_fold}").reset_index(drop=True)
        tr_x, tr_y = fold_train_df[feature_columns], fold_train_df[target_columns]
        va_x, va_y = fold_valid_df[feature_columns], fold_valid_df[target_columns]

        logger.info(f"estimator : {estimator_name_with_uid}")

        # Work on a per-fold copy so fold-specific sample weights never leak
        # into the caller's dict or into the next fold.
        fold_fit_params = dict(fit_params or {})
        if use_xgb_class_weight and estimator_name == "XGBModel":
            fold_fit_params["sample_weight"] = compute_sample_weight(class_weight="balanced", y=tr_y)
            fold_fit_params["sample_weight_eval_set"] = [compute_sample_weight(class_weight="balanced", y=va_y)]

        fit_estimator.fit(X=tr_x, y=tr_y, eval_set=[(va_x, va_y)], **fold_fit_params)
        estimators.append(fit_estimator)

        joblib.dump(fit_estimator, estimator_path)

    return estimators


def predict_cv_tabular_v1(
    df: pd.DataFrame,
    estimators: list,
    feature_columns: list[str],
    train_folds: list[int] | None = None,
    test: bool = False,
    result_columns: list[str] | None = None,
):
    """Run every fold estimator and stack the per-fold predictions.

    Args:
        df: Frame with the feature columns (and a ``fold`` column unless
            ``test`` is True).
        estimators: Fitted estimators, paired positionally with ``train_folds``.
        feature_columns: Columns passed to ``estimator.predict``.
        train_folds: Fold ids matching ``estimators``; defaults to
            ``range(len(estimators))``.
        test: When False each estimator predicts only the rows of its own fold.
            When True each estimator predicts all rows and the result carries
            an ``est_fold`` column with the fold id.
        result_columns: Columns copied into the result; defaults to every
            column of ``df`` that is not a feature column.

    Returns:
        Concatenation of the per-fold results with a ``pred`` column holding
        ``estimator.predict(...).tolist()``.
    """
    if result_columns is None:
        result_columns = [col for col in df.columns if col not in feature_columns]

    folds = range(len(estimators)) if train_folds is None else train_folds

    fold_results = []
    for i_fold, estimator in zip(folds, estimators):
        logger.info(f"fold{i_fold} predict : test={test}")
        if test:
            fold_df = df
        else:
            fold_df = df.query(f"fold == {i_fold}").reset_index(drop=True)

        predictions = estimator.predict(fold_df[feature_columns])
        fold_result = fold_df[result_columns].assign(pred=predictions.tolist())
        if test:
            fold_result = fold_result.assign(est_fold=i_fold)
        fold_results.append(fold_result)

    return pd.concat(fold_results, axis=0, ignore_index=True)

In [None]:
from lightgbm import LGBMModel
import lightgbm
from sklearn.metrics import roc_auc_score


def lgb_macro_auc(y_true, y_pred):
    """Custom evaluation metric: one-vs-rest ROC AUC.

    Returns:
        Tuple ``("macro_auc", score, True)``. The trailing ``True`` presumably
        marks the metric as higher-is-better for LightGBM — confirm.
    """
    return (
        "macro_auc",
        roc_auc_score(y_true=y_true, y_score=y_pred, multi_class="ovr"),
        True,
    )


def lgb_macro_f1(y_true, y_pred):
    """Custom evaluation metric built on the project's ``macro_f1_from_proba``.

    Returns:
        Tuple ``("macro_f1", score, True)``. The trailing ``True`` presumably
        marks the metric as higher-is-better for LightGBM — confirm.
    """
    return (
        "macro_f1",
        macro_f1_from_proba(y_true=y_true, y_pred=y_pred),
        True,
    )


# LightGBM hyperparameters for the 3-class "health" target.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` > 0; none is set here — confirm whether bagging is intended.
model_params = {
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMModel.html#lightgbm.LGBMModel
    "n_estimators": 10000,
    "num_leaves": 11,
    "learning_rate": 0.1,
    "objective": "multiclass",
    "colsample_bytree": 0.2,
    "subsample": 0.2,
    "random_state": 8823,
    # "class_weight": "balanced",
    "importance_type": "gain",
    "num_class": 3,
}
# Stop after 500 rounds without improvement and log the metrics every 100 rounds.
# NOTE(review): with first_metric_only=False, early stopping can also react to
# metrics other than macro_f1 if LightGBM evaluates any — confirm.
eary_stopping = lightgbm.early_stopping(500, first_metric_only=False, verbose=True)
log_evaluation = lightgbm.log_evaluation(period=100, show_stdv=True)

# Passed unchanged to every fold's fit() call.
fit_params = {"callbacks": [eary_stopping, log_evaluation], "eval_metric": [lgb_macro_f1]}

estimator = LGBMModel(**model_params)
model_output_dir = Path(CFG.paths.output_dir) / "models"

# overwrite=True forces a refit even when a pickled model already exists.
trained_estimators = train_cv_tabular_v1(
    df=train_feature_df,
    estimator=estimator,
    feature_columns=feature_columns,
    target_columns=["health"],
    fit_params=fit_params,
    output_dir=model_output_dir,
    overwrite=True,
)

# Out-of-fold predictions: each estimator predicts only its own validation fold.
valid_result_df = predict_cv_tabular_v1(
    df=train_feature_df,
    estimators=trained_estimators,
    feature_columns=feature_columns,
)

# Score the stacked out-of-fold predictions with the project's macro-F1 helper.
val_score = macro_f1_from_proba(y_true=valid_result_df["health"], y_pred=valid_result_df["pred"].tolist())
logger.info(f"macro f1 score: {val_score}")

# Project helper; called with the fitted estimators and top_n=50.
fig, importance_df = visualize_feature_importance(
    estimators=trained_estimators,
    feature_columns=feature_columns,
    top_n=50,
)

In [None]:
def make_submission(test_result_df):
    """Average the per-fold class scores of each uid and pick the best class.

    The result is built only from ``test_result_df``: predictions are keyed by
    uid rather than written positionally into a global frame, so the outcome
    does not depend on any other frame being sorted by uid.

    Args:
        test_result_df: Frame with a ``uid`` column and a ``pred`` column whose
            entries are equal-length sequences of per-class scores. A uid may
            appear several times (once per fold estimator).

    Returns:
        DataFrame with columns ``uid`` and ``pred`` (index of the class with the
        highest mean score), one row per uid in order of first appearance.
    """
    # Expand the list-valued `pred` column into one column per class, keyed by uid.
    score_df = pd.DataFrame(
        test_result_df["pred"].tolist(),
        index=test_result_df["uid"].to_numpy(),
    )
    # sort=False keeps uids in the order they first appear in the input.
    mean_score_df = score_df.groupby(level=0, sort=False).mean()
    return pd.DataFrame(
        {
            "uid": mean_score_df.index.to_numpy(),
            "pred": mean_score_df.to_numpy().argmax(axis=1),
        }
    )


# test=True: every fold estimator predicts all test rows, so each uid appears
# once per estimator in the stacked result.
test_result_df = predict_cv_tabular_v1(
    df=test_feature_df,
    estimators=trained_estimators,
    feature_columns=feature_columns,
    test=True,
)

# Collapse the per-estimator rows into the final (uid, pred) table.
submission_df = make_submission(test_result_df)

In [None]:
# Write the submission without index or header row.
submission_filepath = Path(CFG.paths.output_dir) / f"submissions_{CFG.experiment_name}.csv"
submission_df.to_csv(submission_filepath, index=False, header=False)

# Sanity check: read the file back and compare its row count with the sample submission.
# NOTE(review): the file is written headerless but read back with the default
# header handling, so its first data row is consumed as a header. The comparison
# is only consistent if sample_submission_df was parsed the same way from a
# headerless file — confirm.
submission_df_ = pd.read_csv(submission_filepath)
assert len(submission_df_) == len(sample_submission_df)