In [None]:
import os
import hydra
import logging
import re
import json
from multiprocessing import cpu_count

import pandas as pd
import joblib
import warnings
import rootutils
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
from sklearn.metrics import f1_score
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    EvalPrediction,
    Trainer,
)
from torch import nn
import torch
from transformers import Trainer

rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import (
    assign_fold_index,
    plot_confusion_matrix,
    visualize_feature_importance,
    plot_label_distributions,
)
from src.experiment.feature.runner import run_extractors
from src.experiment.metrics import macro_f1_from_proba
from src.experiment.model.runner import train_cv_tabular_v1, predict_cv_tabular_v1
from src.experiment.optimization import find_optimal_threshold_for_label, decode_label
from src.experiment.model.custom_metrics import lgb_macro_auc, lgb_macro_f1

In [None]:
# Hydra overrides for this run, read from the comma-separated OVERRIDES environment
# variable (falls back to the 067-text_v1 experiment when the variable is unset).
# Whitespace around each entry is stripped and empty entries (e.g. from a trailing
# comma) are dropped so they are never handed to Hydra as overrides.
# Note: the value is split on every comma, so an override whose value itself
# contains a comma cannot be expressed through this variable.
OVERRIDES: list[str] = [
    override.strip()
    for override in os.getenv("OVERRIDES", "experiment=067-text_v1").split(",")
    if override.strip()
]

In [None]:
# Fail fast when the override list carries no usable entry (None, an empty list, or
# only empty strings such as the result of OVERRIDES="" in the environment).
# A plain `is None` test can never fire on the list produced by `str.split`.
if not OVERRIDES or not any(OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config inside the notebook (there is no @hydra.main entry point here).
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG; a stream handler is attached only when none is configured yet,
# so re-running this cell does not duplicate log lines.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# Silences every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")
INPUT_DIR = Path(CFG.paths.input_dir)
OUTPUT_DIR = Path(CFG.paths.output_dir)

### Load Data


In [None]:
def assign_meta(df: pd.DataFrame, data="train"):
    """
    Tag a DataFrame in place with its split name and a placeholder fold index.

    :param df: DataFrame to annotate; it is modified in place.
    :param data: Value written to the ``data`` column of every row.
    :return: The same DataFrame object, now carrying ``data`` and ``fold`` (-1) columns.
    """
    for column, value in (("data", data), ("fold", -1)):
        df[column] = value
    return df


def align_train_test_unique(train, test, ignore_columns=("uid", "data", "fold", "created_at"), fill_value=np.nan):
    """
    Return copies of 'train' and 'test' in which, for every shared column that is not
    ignored, each value that does not occur in BOTH frames is replaced by 'fill_value'.

    The input DataFrames are not modified. Columns listed in 'ignore_columns' and
    columns of 'train' that are missing from 'test' are copied through unchanged;
    columns that exist only in 'test' are never visited and stay unchanged as well.

    :param train: DataFrame used for training.
    :param test: DataFrame used for testing.
    :param ignore_columns: Column names to leave untouched (any container supporting ``in``).
    :param fill_value: Replacement for values that are not common to both frames (NaN by default).
    :return: Tuple of modified (train, test) DataFrame copies.
    """
    aligned_train = train.copy()
    aligned_test = test.copy()

    for column in train.columns:
        if column in ignore_columns or column not in test.columns:
            continue

        # Values that appear in this column of both frames; everything else is replaced.
        common_values = set(train[column]).intersection(set(test[column]))

        def _keep_common(value, allowed=common_values):
            return value if value in allowed else fill_value

        aligned_train[column] = train[column].apply(_keep_common)
        aligned_test[column] = test[column].apply(_keep_common)

    return aligned_train, aligned_test


def replace_rare_values(df, cols, threshold, replacement_value):
    """
    Collapse infrequent values of the selected columns into a single replacement value.

    For every column in 'cols', values whose occurrence count is strictly below
    'threshold' are overwritten with 'replacement_value'. The DataFrame is edited
    in place and also returned.

    :param df: DataFrame to process (modified in place).
    :param cols: Names of the columns to process.
    :param threshold: Frequency threshold. Values appearing less than this will be replaced.
    :param replacement_value: The value to replace rare values with.
    :return: The same DataFrame object, after modification.
    """
    for column in cols:
        counts = df[column].value_counts()
        rare_values = counts.loc[counts < threshold].index

        def _swap_if_rare(value, rare=rare_values):
            # Membership is tested against the pandas Index of rare values.
            if value in rare:
                return replacement_value
            return value

        df[column] = df[column].apply(_swap_if_rare)
    return df


# Read the raw tables; the unnamed leading CSV column is exposed as "uid".
uid_rename = {"Unnamed: 0": "uid"}
train_df = pd.read_csv(INPUT_DIR / "train.csv").rename(columns=uid_rename)
test_df = pd.read_csv(INPUT_DIR / "test.csv").rename(columns=uid_rename)
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")

if CFG.debug:
    # Debug runs keep 10 randomly sampled rows per split (train is sampled first).
    train_df, test_df = (frame.sample(10).reset_index(drop=True) for frame in (train_df, test_df))

# Tag every row with its split name and a placeholder fold of -1.
train_df, test_df = assign_meta(train_df, data="train"), assign_meta(test_df, data="test")

if CFG.align_train_test:
    # Values that do not occur in both splits are replaced with NaN, except in the
    # columns listed here. spc_common / spc_latin are currently NOT ignored
    # (their entries were commented out), so they are aligned too.
    align_ignore_columns = ["uid", "data", "fold", "created_at", "tree_dbh"]
    train_df, test_df = align_train_test_unique(
        train=train_df,
        test=test_df,
        ignore_columns=align_ignore_columns,
    )

# Log the number of distinct values per non-float column, train vs. test.
for col in test_df.columns:
    if test_df[col].dtype != "float":
        logger.info(f"{col}: {train_df[col].nunique()}, {test_df[col].nunique()}")

### CV Split


In [None]:
# Build the CV splitter from the Hydra config and let the project helper
# assign_fold_index label the training rows (presumably it fills the "fold"
# column, stratifying on "health" — see src.experiment.utils to confirm).
kfold = hydra.utils.instantiate(CFG.cv)
train_df = assign_fold_index(train_df=train_df, kfold=kfold, y_col="health")

# NOTE(review): raw_df is not read again in the cells visible here, so the rare-value
# replacement below does not appear to reach the text model inputs — confirm intent.
raw_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

if CFG.replace_rare_values_threshold is not None:
    # Categorical columns whose infrequent values are collapsed to NaN.
    rare_value_columns = [
        "spc_common", "spc_latin", "boro_ct", "cb_num",
        "st_assem", "nta", "nta_name", "zip_city",
        "borocode", "boroname", "st_senate", "cncldist",
    ]
    raw_df = replace_rare_values(
        df=raw_df,
        cols=rare_value_columns,
        threshold=CFG.replace_rare_values_threshold,
        replacement_value=np.nan,
    )

    # Log the cardinality of every column after the replacement.
    for col in raw_df.columns:
        logger.info(f"{col}: {raw_df[col].nunique()}")

### Datasets


In [None]:
def make_text(row):
    """
    Render one record as a single space-separated sentence for the text model.

    The output is, in order: ``created at <created_at>``, ``tree diameter <tree_dbh>``,
    the de-camel-cased ``curb_loc``, the spelled-out ``steward``, ``guards``,
    the de-camel-cased ``sidewalk``, ``user_type``, the de-camel-cased ``problems``,
    followed by the raw values of spc_common, spc_latin, nta_name, borocode, boroname,
    zip_city, boro_ct, cb_num, st_senate, st_assem and cncldist.

    :param row: Mapping-like record (e.g. a DataFrame row) indexable by column name.
        The camel-case columns must hold strings; missing values are expected to be
        the literal string "None".
    :return: The assembled text.
    :raises KeyError: If one of the required columns is absent from ``row``.
    """

    def _parse(text):
        """Split a CamelCase token into lower-case words ("OnCurb" -> "on curb")."""
        if text == "None":
            return text
        # Splitting before every capital yields an empty first piece when the value
        # starts with a capital; drop empty pieces instead of blindly discarding the
        # first one, so a value with a lower-case start keeps its leading word.
        words = [word for word in re.split("(?=[A-Z])", text) if word]
        return " ".join(words).lower()

    # Readable phrases for the known steward codes; anything else passes through.
    steward_phrases = {
        "None": "no steward",
        "1or2": "1 or 2",
        "3or4": "3 or 4",
        "4orMore": "4 or more",
    }

    def _parse_steward(steward):
        """Translate a steward code into its phrase, leaving unknown codes unchanged."""
        return steward_phrases.get(steward, steward)

    parts = [
        f'created at {row["created_at"]}',
        f'tree diameter {row["tree_dbh"]}',
        _parse(row["curb_loc"]),
        _parse_steward(row["steward"]),
        row["guards"],
        _parse(row["sidewalk"]),
        row["user_type"],
        _parse(row["problems"]),
    ]
    # Remaining columns are appended verbatim, without any label.
    passthrough_columns = (
        "spc_common",
        "spc_latin",
        "nta_name",
        "borocode",
        "boroname",
        "zip_city",
        "boro_ct",
        "cb_num",
        "st_senate",
        "st_assem",
        "cncldist",
    )
    parts.extend(row[column] for column in passthrough_columns)

    # f"{value}" stringifies non-string values (ints, floats) exactly like the f-strings above.
    return " ".join(f"{part}" for part in parts)


def get_datasets(df: pd.DataFrame, tokenizer, max_len: int):
    """
    Tokenize the ``text`` column of ``df`` and return it as a ``datasets.Dataset``.

    Every literal "[SEP]" inside a text is first substituted by the tokenizer's own
    separator token (skipped when the tokenizer defines none). The substitution is
    done on a copy, so the caller's DataFrame is left untouched.

    :param df: DataFrame with a string ``text`` column; all columns are carried into the dataset.
    :param tokenizer: Callable tokenizer exposing ``sep_token``.
    :param max_len: Maximum sequence length; longer texts are truncated.
    :return: Dataset with the tokenizer outputs added; sequences are not padded here.
    """

    def text_to_input_ids(examples):
        return tokenizer(examples["text"], padding=False, truncation=True, max_length=max_len)

    # convert [SEP] to tokenizer.sep_token
    # `Series.replace` only matches whole cell values, so a "[SEP]" embedded in a
    # longer text would never be touched; `.str.replace(..., regex=False)` performs
    # the intended literal substring substitution.
    sep_token = tokenizer.sep_token
    if sep_token is not None:
        df = df.assign(text=df["text"].str.replace("[SEP]", sep_token, regex=False))

    ds = Dataset.from_pandas(df)
    # Leave one core free, but always use at least one worker process.
    return ds.map(text_to_input_ids, batched=True, num_proc=max([1, cpu_count() - 1]))


def compute_metrics(p: EvalPrediction):
    """
    Macro-averaged F1 for one evaluation pass.

    The predicted class of each row is the argmax over axis 1 of ``p.predictions``;
    it is scored against ``p.label_ids``.

    :param p: Evaluation output carrying ``predictions`` and ``label_ids``.
    :return: Dict with the single key ``"f1_score"``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    return {"f1_score": f1_score(p.label_ids, predicted_classes, average="macro")}


# Build the model input text for both splits; missing values are rendered as the
# literal string "None" before each row is turned into a sentence.
for frame in (train_df, test_df):
    frame["text"] = frame.fillna("None").apply(make_text, axis=1)

### Train & Inference


In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the model logits."""

    # Hard-coded per-class loss weights, indexed by label id (0, 1, 2).
    CLASS_WEIGHTS = (1.88439415, 0.42291495, 9.5434575)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the weighted cross-entropy loss for one batch.

        :param model: Model to run; must return an output exposing ``"logits"``.
        :param inputs: Batch dict; its ``"labels"`` entry is popped (the dict is mutated)
            and the remaining entries are passed to the model as keyword arguments.
        :param return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
        :param num_items_in_batch: Accepted only so that transformers releases which pass
            this argument to ``compute_loss`` do not fail with a TypeError; it is not used.
        :return: The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Place the weights on the logits' device so the loss works regardless of
        # whether the (possibly wrapped) model object exposes a `.device` attribute.
        class_weights = torch.tensor(self.CLASS_WEIGHTS, dtype=torch.float, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
scores = {}

tokenizer = AutoTokenizer.from_pretrained(CFG.transformer_model)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The tokenized test set does not depend on the fold, so build it once
# instead of re-tokenizing it in every iteration.
test_ds = get_datasets(
    df=test_df[["uid", "text"]],
    tokenizer=tokenizer,
    max_len=CFG.max_length,
)

# Per-fold result frames are collected here and concatenated once after the loop.
fold_valid_results = []
fold_test_results = []

for i_fold in np.sort(train_df["fold"].unique()):
    logger.info(f"fold{i_fold} start!!! 🚀")

    # A fresh 3-class model is loaded for every fold.
    model = AutoModelForSequenceClassification.from_pretrained(CFG.transformer_model, num_labels=3)
    i_train_df = train_df[train_df["fold"] != i_fold]
    i_valid_df = train_df[train_df["fold"] == i_fold]

    # The target column is renamed to "labels" for the Trainer.
    train_ds = get_datasets(
        df=i_train_df[["uid", "health", "text"]].rename(columns={"health": "labels"}),
        tokenizer=tokenizer,
        max_len=CFG.max_length,
    )
    valid_ds = get_datasets(
        df=i_valid_df[["uid", "health", "text"]].rename(columns={"health": "labels"}),
        tokenizer=tokenizer,
        max_len=CFG.max_length,
    )

    train_args = hydra.utils.instantiate(CFG.train_args, output_dir=OUTPUT_DIR / f"fold{i_fold}")
    trainer = CustomTrainer(
        model=model,
        args=train_args,
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # predict on valid and test by trained model
    val_pred_result = trainer.predict(valid_ds)

    # calc val score
    tmp_val_pred_label = np.argmax(val_pred_result.predictions, axis=1)
    val_score = f1_score(val_pred_result.label_ids, tmp_val_pred_label, average="macro")
    logger.info(f"fold{i_fold} val score: {val_score}")

    # store valid and test prediction (one list of per-class model outputs per row)
    i_valid_result_df = i_valid_df.assign(pred=val_pred_result.predictions.tolist())
    fold_valid_results.append(i_valid_result_df)

    test_pred = trainer.predict(test_ds).predictions
    i_test_result_df = test_df.assign(pred=test_pred.tolist())
    fold_test_results.append(i_test_result_df)

# Combine the folds and reindex; test_result_df holds one row per (uid, fold).
valid_result_df = pd.concat(fold_valid_results, axis=0).sort_values("uid").reset_index(drop=True)
test_result_df = pd.concat(fold_test_results, axis=0).sort_values("uid").reset_index(drop=True)

# Out-of-fold score over all validation rows.
val_proba = np.array(valid_result_df["pred"].tolist())
val_score = macro_f1_from_proba(y_true=valid_result_df["health"], y_pred=val_proba)
# Key spelling ("nomal") is kept as-is in case scores.json is read by other tooling.
scores["all_nomal"] = val_score

# Project helper: searches per-label thresholds and returns them together with the
# resulting validation labels (see src.experiment.optimization for the exact procedure).
opt_results, val_pred_label = find_optimal_threshold_for_label(
    proba_matrix=val_proba,
    true_labels=valid_result_df["health"],
    label_indices=[2, 0, 1],
)
best_f1_score = f1_score(
    y_true=valid_result_df["health"],
    y_pred=val_pred_label,
    average="macro",
)
scores["all_opt"] = best_f1_score

logger.info(f"macro f1 score: {val_score}")
logger.info(f"optimized thresholds: {opt_results}")
logger.info(f"best f1 score: {best_f1_score}")

joblib.dump(valid_result_df[["uid", "health", "pred"]], OUTPUT_DIR / "valid_result_df.pkl")
# Use a context manager so the file handle is flushed and closed deterministically.
with open(OUTPUT_DIR / "scores.json", "w") as scores_file:
    json.dump(scores, scores_file)

### Visualization


In [None]:
# Distribution plot built from the stacked out-of-fold prediction vectors.
val_pred_matrix = np.array(valid_result_df["pred"].tolist())
fig = plot_label_distributions(proba_matrix=val_pred_matrix)
fig.show()
fig.savefig(OUTPUT_DIR / "label_distributions.png", dpi=300)


# Confusion matrix of the threshold-optimized validation labels: first with the
# helper's default settings, then with normalize=True.
confusion_variants = (
    ({}, "confusion_matrix.png"),
    ({"normalize": True}, "confusion_matrix_normalized.png"),
)
for extra_kwargs, filename in confusion_variants:
    fig = plot_confusion_matrix(y_true=valid_result_df["health"], y_pred=val_pred_label, **extra_kwargs)
    fig.savefig(OUTPUT_DIR / filename, dpi=300)

### Make Submission


In [None]:
# One row per (uid, fold): the uid next to the per-class model outputs as columns 0..k-1.
test_pred_df = pd.concat([test_result_df[["uid"]], pd.DataFrame(test_result_df["pred"].tolist())], axis=1)

# Average the folds per uid and take the highest-scoring class.
mean_pred_df = test_pred_df.groupby("uid").mean()
pred_by_uid = pd.Series(mean_pred_df.to_numpy().argmax(axis=1), index=mean_pred_df.index)

# Join on uid rather than by row position: the grouped frame is ordered by uid,
# while test_df keeps its own row order (shuffled in debug mode, where it is a sample).
test_df["pred"] = test_df["uid"].map(pred_by_uid)
submission_df = test_df[["uid", "pred"]]
submission_filepath = Path(CFG.paths.output_dir) / f"submissions_{CFG.experiment_name}.csv"
submission_df.to_csv(submission_filepath, index=False, header=False)

In [None]:
# One row per (uid, fold): the uid next to the per-class model outputs as columns 0..k-1.
test_pred_df = pd.concat([test_result_df[["uid"]], pd.DataFrame(test_result_df["pred"].tolist())], axis=1)

# Fold-averaged outputs per uid, decoded with the thresholds tuned on validation data.
mean_pred_df = test_pred_df.groupby("uid").mean()
opt_labels = decode_label(proba_matrix=mean_pred_df.to_numpy(), thresholds=opt_results)
# Assumes decode_label returns one label per input row, in row order — TODO confirm
# against src.experiment.optimization.
opt_pred_by_uid = pd.Series(np.asarray(opt_labels), index=mean_pred_df.index)

# Join on uid rather than by row position: the grouped frame is ordered by uid,
# while test_df keeps its own row order (shuffled in debug mode, where it is a sample).
test_df["opt_pred"] = test_df["uid"].map(opt_pred_by_uid)

submission_filepath = Path(CFG.paths.output_dir) / f"submissions_{CFG.experiment_name}_opt_{best_f1_score:.3f}.csv"
test_df[["uid", "opt_pred"]].to_csv(submission_filepath, index=False, header=False)

joblib.dump(test_pred_df, OUTPUT_DIR / "test_result_df.pkl")