In [None]:
import os
import hydra
import logging
import json
import pandas as pd
import joblib
import warnings
import rootutils
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
from sklearn.metrics import f1_score

# Resolve the project root by looking for a ".project-root" marker file, starting
# the search from the parent directory of this notebook. pythonpath=True is
# expected to put that root on sys.path (see rootutils docs), which the `src.*`
# imports further down rely on -- so this call must stay ahead of them.
rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import (
    assign_fold_index,
    plot_confusion_matrix,
    visualize_feature_importance,
    plot_label_distributions,
)
from src.experiment.feature.runner import run_extractors
from src.experiment.metrics import macro_f1_from_proba
from src.experiment.model.runner import train_cv_tabular_v1, predict_cv_tabular_v1
from src.experiment.optimization import find_optimal_threshold_for_label, decode_label
from src.experiment.model.custom_metrics import lgb_macro_auc, lgb_macro_f1

In [None]:
# Comma-separated Hydra overrides taken from the OVERRIDES environment variable.
# An unset *or blank* variable falls back to the default experiment, and blank
# tokens (e.g. from a trailing comma or spaces around a comma) are dropped so
# Hydra never receives an empty or padded override string.
# NOTE: because the value is split on ",", override values that themselves
# contain commas (e.g. list values) cannot be passed through this variable.
OVERRIDES: list[str] = [
    token.strip()
    for token in (os.getenv("OVERRIDES") or "experiment=089-ensemble_v1").split(",")
    if token.strip()
]

In [None]:
# Fail fast when there is no usable override. OVERRIDES is produced by
# `str.split`, so it is never None in practice; the real "not set" situation is
# a list that is empty or holds only blank strings.
if OVERRIDES is None or not any(str(override).strip() for override in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config by hand (there is no @hydra.main entry point in a
# notebook), applying the overrides selected above.
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Attach a stream handler only when none exists yet, so re-running this cell
# does not duplicate every log line.
if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# Silences *all* warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Directories resolved from the composed config.
INPUT_DIR = Path(CFG.paths.input_dir)
OUTPUT_DIR = Path(CFG.paths.output_dir)
BASE_OUTPUT_DIR = Path(CFG.paths.resource_dir) / "outputs"  # per-experiment result folders are read from here

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

### Load Data


In [None]:
def assign_meta(df: pd.DataFrame, data: str = "train") -> pd.DataFrame:
    """Return a copy of ``df`` tagged with its split name and a placeholder fold.

    The input frame is left untouched, so re-running a cell that calls this
    function never stacks side effects onto an already-tagged frame.

    Args:
        df: Frame to tag.
        data: Value written to the ``data`` column (e.g. ``"train"`` or ``"test"``).

    Returns:
        A new DataFrame with a ``data`` column set to ``data`` and a ``fold``
        column set to ``-1``. Existing columns with those names are overwritten.
    """
    return df.assign(data=data, fold=-1)


# pandas labels a header-less first CSV column "Unnamed: 0"; expose it as "uid"
# and tag every row with the split it came from.
train_df = (
    pd.read_csv(INPUT_DIR / "train.csv")
    .rename(columns={"Unnamed: 0": "uid"})
    .pipe(assign_meta, data="train")
)
test_df = (
    pd.read_csv(INPUT_DIR / "test.csv")
    .rename(columns={"Unnamed: 0": "uid"})
    .pipe(assign_meta, data="test")
)
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")

# Train rows first, then test rows, under a fresh 0..n-1 index.
raw_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

### Ensemble


In [None]:
def make_pred_label_df(ensemble_exps: list[str], label_indices: tuple[int, ...] = (2, 0, 1)) -> pd.DataFrame:
    """Collect the thresholded label predictions of several experiments.

    For every experiment, ``valid_result_df.pkl`` and ``test_result_df.pkl`` are
    loaded from ``BASE_OUTPUT_DIR / exp``. Thresholds are obtained from
    ``find_optimal_threshold_for_label`` on the validation probabilities and
    then applied with ``decode_label`` to the test probabilities, which are
    first averaged per ``uid`` over columns ``0``, ``1`` and ``2``.

    Args:
        ensemble_exps: Names of the experiment output folders to combine.
        label_indices: Forwarded (as a list) to
            ``find_optimal_threshold_for_label``. Defaults to the order that
            was previously hard-coded here.

    Returns:
        A DataFrame indexed by ``uid`` with one ``"{exp}_pred_label"`` column per
        experiment; within each experiment the validation rows come first,
        followed by the test rows. An empty DataFrame is returned when
        ``ensemble_exps`` is empty.
    """
    per_exp_frames = []
    for exp in ensemble_exps:
        logger.info(f"Load {exp} ...")
        exp_dir = BASE_OUTPUT_DIR / exp

        # NOTE(review): joblib.load unpickles arbitrary objects -- only point
        # BASE_OUTPUT_DIR at outputs produced by this project.
        train_result_df = joblib.load(exp_dir / "valid_result_df.pkl")
        test_result_df = joblib.load(exp_dir / "test_result_df.pkl")

        # The "pred" column holds one probability list per row; stack the lists
        # into a 2-D matrix before handing them to the threshold search.
        opt_results, val_pred_label = find_optimal_threshold_for_label(
            proba_matrix=np.array(train_result_df["pred"].to_numpy().tolist()),
            true_labels=train_result_df["health"],
            label_indices=list(label_indices),
        )

        # Scored on the very rows the thresholds were chosen on, so this number
        # is an optimistic estimate.
        best_f1_score = f1_score(
            y_true=train_result_df["health"],
            y_pred=val_pred_label,
            average="macro",
        )
        logger.info(f"best f1 score: {best_f1_score:.4f}")

        label_col = f"{exp}_pred_label"
        train_labels = train_result_df[["uid"]].assign(**{label_col: val_pred_label}).set_index("uid")

        # Average the per-class test probabilities of each uid, then decode them
        # with the thresholds found on the validation data.
        test_proba = test_result_df[["uid", 0, 1, 2]].groupby("uid").mean()
        test_pred_label = decode_label(
            proba_matrix=test_proba.to_numpy(dtype=float),
            thresholds=opt_results,
        )
        test_labels = test_proba.reset_index()[["uid"]].assign(**{label_col: test_pred_label}).set_index("uid")

        per_exp_frames.append(pd.concat([train_labels, test_labels], axis=0))

    if not per_exp_frames:
        return pd.DataFrame()

    # A single column-wise concat aligns all experiments on uid at once instead
    # of re-copying the accumulated frame on every loop iteration.
    return pd.concat(per_exp_frames, axis=1)


def voting(candidates: list[int], priorities: list[int]) -> int:
    """Return the majority label among ``candidates``, breaking ties by priority.

    Args:
        candidates: One predicted label per voter.
        priorities: Labels in descending tie-break preference; the first entry
            that is among the tied leaders wins.

    Returns:
        The label with the most votes. On a tie, the first tied label found in
        ``priorities``; if none of the tied labels is listed there, the tied
        label that appears first in ``candidates`` (instead of an implicit
        ``None``).

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    if len(candidates) == 0:
        raise ValueError("voting() requires at least one candidate")

    # Single pass; dict insertion order remembers which label was seen first.
    vote_counts: dict = {}
    for candidate in candidates:
        vote_counts[candidate] = vote_counts.get(candidate, 0) + 1

    max_votes = max(vote_counts.values())
    potential_winners = [candidate for candidate, votes in vote_counts.items() if votes == max_votes]

    if len(potential_winners) == 1:
        return potential_winners[0]
    for priority in priorities:
        if priority in potential_winners:
            return priority

    # The tie is not covered by `priorities`: fall back deterministically to the
    # earliest-seen leader rather than silently returning None.
    return potential_winners[0]


# One column per experiment, indexed by uid.
ensemble_pred_df = make_pred_label_df(ensemble_exps=CFG.ensemble_exps)

# Hard vote across the experiment columns of each row; ties go to 2, then 0, then 1.
voted_labels = ensemble_pred_df.apply(
    lambda votes: voting(votes.tolist(), priorities=[2, 0, 1]),
    axis=1,
)
ensemble_pred_df["pred_label"] = voted_labels.tolist()

# Attach the per-experiment and voted labels to the raw rows via uid.
raw_feature_df = raw_df.merge(ensemble_pred_df, how="left", on="uid")

train_feature_df = (
    raw_feature_df.loc[raw_feature_df["data"] == "train"]
    .reset_index(drop=True)
    .astype({"health": int})
)
test_feature_df = raw_feature_df.loc[raw_feature_df["data"] == "test"].reset_index(drop=True)

# Macro F1 of the voted labels on the train rows.
score = f1_score(
    y_true=train_feature_df["health"],
    y_pred=train_feature_df["pred_label"],
    average="macro",
)
logger.info(f"score: {score:.4f}")

In [None]:
# Save the confusion matrix of the voted labels twice: once with the helper's
# default settings and once with normalize=True.
for cm_kwargs, cm_filename in (
    ({}, "confusion_matrix.png"),
    ({"normalize": True}, "confusion_matrix_normalized.png"),
):
    fig = plot_confusion_matrix(
        y_true=train_feature_df["health"],
        y_pred=train_feature_df["pred_label"],
        **cm_kwargs,
    )
    fig.savefig(OUTPUT_DIR / cm_filename, dpi=300)

### Make submission


In [None]:
# uid + voted label for the test rows, ordered by uid.
submission_df = test_feature_df.loc[:, ["uid", "pred_label"]].sort_values(by="uid")

# The train macro-F1 is baked into the file name so submissions can be told apart.
submission_name = f"submissions_{CFG.experiment_name}_{score:.3f}.csv"
submission_filepath = Path(CFG.paths.output_dir) / submission_name

# Written without a header row and without the index column.
submission_df.to_csv(submission_filepath, header=False, index=False)

In [None]:
# Display the submission frame (uid and voted label) as a visual sanity check.
submission_df

In [None]:
# Share of submission rows assigned to each predicted label.
submission_df["pred_label"].value_counts().div(len(submission_df))