In [None]:
import os
import hydra
import logging
import json
import pandas as pd
import joblib
import warnings
import rootutils
import numpy as np
from pathlib import Path
from hydra.core.hydra_config import HydraConfig
import omegaconf

rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import (
    assign_fold_index,
    plot_confusion_matrix,
    visualize_feature_importance,
    plot_label_distributions,
    plot_venn_diagrams,
)
from src.experiment.feature.runner import run_extractors
from src.experiment.model.runner import train_cv_tabular_v1, predict_cv_tabular_v1
from src.utils.log_utils import get_consol_handler, get_file_handler
from src.experiment.optimization import opt_macro_f1_score
from src.experiment.model.custom_metrics import lgb_macro_f1, xgb_macro_f1

In [None]:
# Hydra overrides for this run, read from the OVERRIDES environment variable as a
# comma-separated list; falls back to "experiment=003-v01" when the variable is unset.
# Whitespace around each entry is stripped and empty entries are dropped, so values
# such as "a=1, b=2" or a trailing comma do not turn into malformed overrides.
# NOTE: because entries are split on ",", an override whose value itself contains a
# comma cannot be expressed through this variable.
OVERRIDES: list[str] = [
    override.strip()
    for override in os.getenv("OVERRIDES", "experiment=003-v01").split(",")
    if override.strip()
]

In [None]:
# OVERRIDES is built with os.getenv(..., default).split(","), so it is always a list and
# can never be None. Reject an empty list or a list of only blank entries instead.
if not any(OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config in-notebook (no @hydra.main entry point here).
with hydra.initialize(version_base=None, config_path="../configs"):
    CFG = hydra.compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# set directories as global variables
INPUT_DIR = Path(CFG.paths.input_dir)

# Debug runs write to a separate "<output_dir>_debug" directory.
if CFG.debug:
    CFG.paths.output_dir = f"{CFG.paths.output_dir}_debug"

OUTPUT_DIR = Path(CFG.paths.output_dir)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Root under which other experiments' outputs are looked up (used when loading ensemble inputs).
BASE_OUTPUT_DIR = Path(CFG.paths.resource_dir) / "outputs"


# set logger
# The root logger's handlers are replaced (not appended), so re-running this cell
# does not stack duplicate handlers.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [
    get_file_handler(OUTPUT_DIR / "notebook.log"),
    get_consol_handler(),
]

# Silences every warning category for the rest of the notebook.
warnings.filterwarnings("ignore")

### Load Data


In [None]:
def assign_meta(df: pd.DataFrame, data="train"):
    """Return a copy of ``df`` with the bookkeeping columns used by the pipeline.

    Added columns:
        data:  the ``data`` argument, repeated on every row (e.g. "train" / "test").
        fold:  initialised to -1 on every row.
        group: ``State`` and ``FranchiseCode`` joined as "<State>_<FranchiseCode>".

    The input frame is not modified (consistent with ``transform_money``); callers
    must use the returned frame.

    Args:
        df: frame containing at least the ``State`` and ``FranchiseCode`` columns.
        data: value written to the ``data`` column.

    Returns:
        A new DataFrame with the three columns added (or overwritten if present).

    Raises:
        KeyError: if ``State`` or ``FranchiseCode`` is missing.
    """
    out = df.copy()
    out["data"] = data
    out["fold"] = -1
    out["group"] = out["State"] + "_" + out["FranchiseCode"].astype(str)
    return out


def money_to_numeric(x: str) -> int:
    """Convert a currency string such as "$1,234.56" to its whole-number part (1234).

    "$" and "," characters are removed and everything from the first "." onwards is
    discarded (truncation, not rounding) before the remainder is parsed with ``int``.

    Raises:
        ValueError: if the remaining text is not a valid integer literal.
    """
    without_symbols = x.replace("$", "").replace(",", "")
    whole_part, _, _ = without_symbols.partition(".")
    return int(whole_part)


def transform_money(input_df: pd.DataFrame):
    """Return a copy of ``input_df`` with the currency-string columns parsed to numbers.

    ``DisbursementGross``, ``GrAppv`` and ``SBA_Appv`` are each passed through
    ``money_to_numeric``; missing values are skipped (``na_action="ignore"``) and stay
    missing. The input frame itself is left untouched.
    """
    converted_df = input_df.copy()
    for money_column in ("DisbursementGross", "GrAppv", "SBA_Appv"):
        converted_df[money_column] = converted_df[money_column].map(money_to_numeric, na_action="ignore")
    return converted_df


# pandas names a header-less first CSV column "Unnamed: 0"; expose it as "uid".
uid_rename = {"Unnamed: 0": "uid"}

train_df = pd.read_csv(INPUT_DIR / "train.csv").rename(columns=uid_rename)
test_df = pd.read_csv(INPUT_DIR / "test.csv").rename(columns=uid_rename)
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")

# Tag each frame with its origin / fold / group columns, then parse the money columns.
train_df = transform_money(assign_meta(train_df, data="train"))
test_df = transform_money(assign_meta(test_df, data="test"))

### CV Split


In [None]:
# Compare train vs. test on the categorical columns below
# (presumably one Venn diagram of shared / exclusive values per column; see plot_venn_diagrams).
venn_cat_cols = [
    "State",
    "BankState",
    "Sector",
    "FranchiseCode",
    "UrbanRural",
    "group",
]
plot_venn_diagrams(train_df, test_df, cat_cols=venn_cat_cols)

In [None]:
# Build the CV splitter from config and let assign_fold_index populate train_df's fold column,
# using MIS_Status as the target and "group" as the grouping key.
kfold = hydra.utils.instantiate(CFG.cv)
train_df = assign_fold_index(train_df=train_df, kfold=kfold, y_col="MIS_Status", group_col="group")

if CFG.debug:
    # Debug runs keep 1000 rows per class (negatives first, then positives).
    debug_samples = [
        train_df.query(f"MIS_Status == {label}").sample(1000, random_state=CFG.seed) for label in (0, 1)
    ]
    train_df = pd.concat(debug_samples).reset_index(drop=True)

# Per-fold row count and positive/negative balance; left as the last expression so the
# notebook renders the table.
fold_summary = train_df.groupby("fold")["MIS_Status"].agg(["count", "mean", "sum"])
fold_summary.assign(
    negative=fold_summary["count"] - fold_summary["sum"],
    ratio=fold_summary["sum"] / fold_summary["count"],
)

### Assign Ensemble


In [None]:
# Optionally add other experiments' predictions as features.
# For every experiment listed under `ensemble_exps`, its saved validation / test results are
# loaded and their `pred` / `pred_label` columns are left-joined on uid as
# `f_<exp>_pred` / `f_<exp>_pred_label`.
ensemble_exps = CFG.get("ensemble_exps")
ensemble_pred_columns = ["uid", "pred", "pred_label"]
for exp in [] if ensemble_exps is None else ensemble_exps:
    logger.info(f"Load {exp} ...")
    exp_output_dir = BASE_OUTPUT_DIR / exp
    feature_prefix = f"f_{exp}_"

    train_filepath = exp_output_dir / "valid_result_df.pkl"
    test_filepath = exp_output_dir / "test_result_df.pkl"

    train_result_df = joblib.load(train_filepath)[ensemble_pred_columns]
    train_result_df = train_result_df.set_index("uid").add_prefix(feature_prefix)

    test_result_df = joblib.load(test_filepath)[ensemble_pred_columns]
    test_result_df = test_result_df.set_index("uid").add_prefix(feature_prefix)

    # `on="uid"` matches the left frame's uid column against the right frame's uid index level.
    train_df = train_df.merge(train_result_df, on="uid", how="left")
    test_df = test_df.merge(test_result_df, on="uid", how="left")

### Feature Engineering


In [None]:
def get_agg_feature_extractors(feature_extractors, all_group_keys):
    """Instantiate every extractor config once per group-key set.

    Args:
        feature_extractors: iterable of Hydra-instantiable extractor configs, or None.
        all_group_keys: iterable of group-key sets, or None.

    Returns:
        A list of extractors ordered extractor-major (all group-key sets for the first
        config, then the second, ...). Empty when either argument is None.
    """
    if feature_extractors is None or all_group_keys is None:
        return []

    return [
        hydra.utils.instantiate(extractor_cfg, group_keys=keys)
        for extractor_cfg in feature_extractors
        for keys in all_group_keys
    ]


def _build_feature_extractors():
    """Instantiate a fresh extractor list from CFG.

    Starts from `CFG.feature_extractors` and appends, in order, the aggregation,
    target-encoding ("te") and rolling-aggregation extractor families, each expanded
    over its own list of group keys. A family is skipped when its config key is absent.
    """
    extractors = hydra.utils.instantiate(CFG.feature_extractors)
    grouped_families = (
        ("agg_feature_extractors", "group_keys_for_agg"),
        ("te_feature_extractors", "group_keys_for_te"),
        ("rolling_agg_feature_extractors", "group_keys_for_rolling_agg"),
    )
    for extractors_key, group_keys_key in grouped_families:
        extractors.extend(
            get_agg_feature_extractors(
                feature_extractors=CFG.get(extractors_key),
                all_group_keys=CFG.get(group_keys_key),
            )
        )
    return extractors


# train features: built from the train data only (fit=True)
feature_extractors = _build_feature_extractors()
train_feature_df = run_extractors(
    input_df=train_df,
    extractors=feature_extractors,
    dirpath=Path(CFG.paths.feature_store_dir),
    fit=True,
    cache=CFG.cache_feature_extractors,
)
train_feature_df = pd.concat([train_df, train_feature_df], axis=1)

# test features: built from the test data with freshly instantiated extractors and fit=False
# (presumably reusing state fitted on train via the feature store; confirm in run_extractors)
feature_extractors = _build_feature_extractors()
test_feature_df = run_extractors(
    input_df=test_df,
    extractors=feature_extractors,
    dirpath=Path(CFG.paths.feature_store_dir),
    fit=False,
    cache=CFG.cache_feature_extractors,
)
test_feature_df = pd.concat([test_df, test_feature_df], axis=1)

# Model inputs are exactly the columns whose name starts with "f_".
feature_columns = [name for name in train_feature_df.columns if name.startswith("f_")]
logger.info(f"train_feature_df.shape: {train_feature_df.shape}")
logger.info(f"test_feature_df.shape: {test_feature_df.shape}")

### Training


In [None]:
def assign_seed_average_pred(result_df: pd.DataFrame) -> pd.DataFrame:
    """Average all ``pred_*`` columns into a new ``pred`` column.

    Every column whose name starts with "pred_" is converted to an array (cells may be
    scalars or equal-length sequences), the arrays are summed element-wise and divided
    by the number of such columns. The result is written to ``result_df["pred"]`` in
    place, and the frame is returned with its index moved into regular columns.

    Raises:
        ZeroDivisionError: if the frame has no ``pred_*`` column.
    """
    seed_columns = [name for name in result_df.columns if name.startswith("pred_")]

    running_total = 0
    for name in seed_columns:
        running_total += np.array(result_df[name].tolist())

    result_df["pred"] = list(running_total / len(seed_columns))
    return result_df.reset_index()


def to_python_type(value):
    """Convert an OmegaConf container into plain Python containers.

    ``DictConfig`` / ``ListConfig`` values are converted recursively with
    ``OmegaConf.to_container(resolve=True)``, so nested configs (e.g. a dict inside a
    list) also become plain ``dict`` / ``list`` objects with interpolations resolved.
    Any other value is returned unchanged.

    Args:
        value: an OmegaConf container or an arbitrary Python object.

    Returns:
        A ``dict`` for a ``DictConfig``, a ``list`` for a ``ListConfig``, otherwise ``value``.
    """
    if isinstance(value, (omegaconf.DictConfig, omegaconf.ListConfig)):
        return omegaconf.OmegaConf.to_container(value, resolve=True)
    return value


# Seed averaging: train one CV model set per seed, score each seed's out-of-fold
# predictions, then average the per-seed predictions and tune the decision threshold
# on the averaged result.
seed_valid_preds = []  # one frame per seed: column pred_<seed>, indexed by (uid, MIS_Status)
all_trained_estimators = []
scores = {}
for seed in CFG.seed_average_seeds:
    logger.info(f"\n\nstart training seed={seed} 🚀")

    fit_params = dict(hydra.utils.instantiate(CFG.model.fit_params))
    params = {k: to_python_type(v) for k, v in CFG.model.get("params").items()}
    if CFG.model.estimator._target_.startswith("lightgbm.LGBM"):
        params["random_state"] = seed
        # NOTE(review): the seed value is also used as num_leaves. This looks like a
        # deliberate way to diversify the per-seed models, but it ties model capacity to
        # the seed list (and a seed below 2 is not a valid num_leaves) - confirm intent.
        params["num_leaves"] = seed  # lgbm
        fit_params["eval_metric"] = [lgb_macro_f1]

    if CFG.model.estimator._target_.startswith("xgboost.XGB"):
        params["eval_metric"] = xgb_macro_f1
        # NOTE(review): the seed value is used as max_depth and no random_state is set in
        # this branch, so for XGBoost the seed only changes the tree depth - confirm intent.
        params["max_depth"] = seed

    # NOTE: values such as class_weight raise an error when passed as omegaconf.DictConfig,
    # so the (converted) params are applied through set_params instead of instantiate.
    estimator = hydra.utils.instantiate(CFG.model.estimator).set_params(**params)

    model_output_dir = OUTPUT_DIR / "models" / f"seed{seed}"
    trained_estimators = train_cv_tabular_v1(
        df=train_feature_df,
        estimator=estimator,
        feature_columns=feature_columns,
        target_columns=["MIS_Status"],
        fit_params=fit_params,
        output_dir=model_output_dir,
        overwrite=CFG.overwrite_training,
        use_xgb_class_weight=CFG.model.get("use_xgb_class_weight"),
        use_eval_set=CFG.model.get("use_eval_set"),
    )

    i_valid_result_df = predict_cv_tabular_v1(
        df=train_feature_df.query("data == 'train'").reset_index(drop=True),
        estimators=trained_estimators,
        feature_columns=feature_columns,
        predict_proba=CFG.model.get("predict_proba"),
    )
    i_opt_result: dict = opt_macro_f1_score(
        y_true=i_valid_result_df["MIS_Status"].to_numpy(),
        y_pred=np.array(i_valid_result_df["pred"].tolist()),
    )

    logger.info(f"macro f1 score [seed={seed}]: {i_opt_result}")
    scores[f"seed{seed}"] = i_opt_result

    # Collect per-seed predictions and concatenate once after the loop, instead of
    # growing a DataFrame (starting from an empty one) on every iteration.
    seed_valid_preds.append(
        i_valid_result_df[["uid", "pred", "MIS_Status"]]
        .set_index(["uid", "MIS_Status"])
        .rename(columns={"pred": f"pred_{seed}"})
    )
    all_trained_estimators.extend(trained_estimators)


# Align the per-seed columns on (uid, MIS_Status) and average them into "pred".
valid_result_df = assign_seed_average_pred(pd.concat(seed_valid_preds, axis=1))
val_proba = np.array(valid_result_df["pred"].tolist()).reshape(-1)
opt_result: dict = opt_macro_f1_score(
    y_true=valid_result_df["MIS_Status"].to_numpy(),
    y_pred=val_proba,
)
# opt_result["th"] is the threshold later applied to the test predictions as well.
val_pred_label = val_proba >= opt_result["th"]
scores["all_opt"] = opt_result
logger.info(f"score: {scores}")

joblib.dump(
    valid_result_df[["uid", "MIS_Status", "pred"]].assign(pred_label=val_pred_label * 1),
    OUTPUT_DIR / "valid_result_df.pkl",
)
# Use a context manager so the scores file is flushed and closed deterministically.
with open(OUTPUT_DIR / "scores.json", "w") as scores_file:
    json.dump(scores, scores_file)

### Visualization


In [None]:
# Feature importance is best-effort: an AttributeError raised anywhere in this step is
# logged as a warning and the rest of the notebook continues.
try:
    fig, importance_df = visualize_feature_importance(
        estimators=all_trained_estimators,
        feature_columns=feature_columns,
        top_n=50,
    )
    fig.savefig(OUTPUT_DIR / "feature_importance.png", dpi=300)
    importance_df.to_csv(OUTPUT_DIR / "feature_importance.csv", index=False)
except AttributeError as e:
    logger.warning(f"feature_importance plot failed: {e}")


# Distribution of the seed-averaged validation predictions, passed as a single-column matrix.
valid_proba_matrix = np.array(valid_result_df["pred"].tolist()).reshape(-1, 1)
fig = plot_label_distributions(proba_matrix=valid_proba_matrix)
fig.show()
fig.savefig(OUTPUT_DIR / "label_distributions.png", dpi=300)


# Confusion matrices at the tuned threshold: first with the function's default settings,
# then with normalize=True.
confusion_matrix_variants = (
    ({}, "confusion_matrix.png"),
    ({"normalize": True}, "confusion_matrix_normalized.png"),
)
for extra_kwargs, filename in confusion_matrix_variants:
    fig = plot_confusion_matrix(y_true=valid_result_df["MIS_Status"], y_pred=val_pred_label, **extra_kwargs)
    fig.savefig(OUTPUT_DIR / filename, dpi=300)

### Make submission


In [None]:
# Predict the test set with every estimator from every seed.
# `predict_proba` is read with .get(...) exactly as in the training cell, so a config
# without that key behaves the same here as it did during validation.
test_result_df = predict_cv_tabular_v1(
    df=test_feature_df,
    estimators=all_trained_estimators,
    feature_columns=feature_columns,
    test=True,
    predict_proba=CFG.model.get("predict_proba"),
)

# Average the predictions per uid, then apply the threshold tuned on the validation data.
test_proba = np.array(test_result_df["pred"].tolist()).reshape(-1)
test_pred_df = test_result_df[["uid"]].assign(pred=test_proba).groupby("uid").mean().reset_index()
test_pred_df["pred_label"] = (test_pred_df["pred"] >= opt_result["th"]) * 1

# Left-join onto test_df so the submission keeps test_df's row order.
submission_df = test_df[["uid"]].merge(test_pred_df[["uid", "pred_label"]], on="uid", how="left")
# Write next to the other artifacts: OUTPUT_DIR is the directory created at setup time.
submission_filepath = OUTPUT_DIR / f"submissions_{CFG.experiment_name}_{scores['all_opt']['score']:.3f}.csv"
# The submission file is written without a header row or index column.
submission_df.to_csv(submission_filepath, index=False, header=False)
joblib.dump(test_pred_df, OUTPUT_DIR / "test_result_df.pkl")