In [None]:
import os
import hydra
import logging
import json
import pandas as pd
import joblib
import warnings
import rootutils
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
from sklearn.metrics import f1_score

rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import (
    assign_fold_index,
    plot_confusion_matrix,
    visualize_feature_importance,
    plot_label_distributions,
)
from src.experiment.feature.runner import run_extractors
from src.experiment.metrics import macro_f1_from_proba
from src.experiment.model.runner import train_cv_tabular_v1, predict_cv_tabular_v1
from src.experiment.optimization import find_optimal_threshold_for_label, decode_label
from src.experiment.model.custom_metrics import lgb_macro_auc, lgb_macro_f1

In [None]:
OVERRIDES: list[str] = os.getenv("OVERRIDES", "experiment=047-tabular_v3").split(",")

In [None]:
# OVERRIDES is always a list (never None), so a missing value shows up as an
# empty list or as blank entries; validate that instead of comparing with None.
if not OVERRIDES or not all(override.strip() for override in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config inside the notebook and register it with HydraConfig
# so that code relying on the Hydra job configuration keeps working.
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG level; attach a stream handler only once so that
# re-running the cell does not duplicate log lines.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# NOTE: this silences *all* warnings for the rest of the session.
warnings.filterwarnings("ignore")
INPUT_DIR = Path(CFG.paths.input_dir)
OUTPUT_DIR = Path(CFG.paths.output_dir)

### Load Data


In [None]:
def assign_meta(df: pd.DataFrame, data="train"):
    """Tag a frame with bookkeeping columns and return it.

    Adds (or overwrites) two columns on ``df`` itself:

    * ``data`` - the given split label (``data`` argument),
    * ``fold`` - initialised to ``-1``.

    The same DataFrame object that was passed in is returned (it is modified
    in place, not copied).
    """
    for column_name, column_value in (("data", data), ("fold", -1)):
        df[column_name] = column_value
    return df


def align_train_test_unique(train, test, ignore_columns=("uid", "data", "fold", "created_at"), fill_value=np.nan):
    """
    Mask values that do not occur in both the train and the test DataFrame.

    For every column present in both frames (and not listed in
    ``ignore_columns``), values that appear in only one of the two frames are
    replaced with ``fill_value``. Columns that exist only in ``train`` are
    left untouched. The inputs are not modified; copies are returned.

    Membership is evaluated with ``Series.isin``, so missing values are treated
    like any other value: a NaN is kept when both frames contain NaN in that
    column and replaced with ``fill_value`` otherwise. (A Python ``set`` lookup
    would make the NaN outcome depend on object identity.)

    :param train: DataFrame used for training.
    :param test: DataFrame used for testing.
    :param ignore_columns: Names of columns that must not be altered.
    :param fill_value: Replacement for values not shared by both frames (default NaN).
    :return: Tuple of modified copies ``(aligned_train, aligned_test)``.
    """
    aligned_train = train.copy()
    aligned_test = test.copy()

    for column in train.columns:
        if column in ignore_columns or column not in test.columns:
            continue

        train_values = train[column]
        test_values = test[column]

        # Vectorised "keep if shared, otherwise fill" for both sides.
        aligned_train[column] = train_values.where(train_values.isin(test_values), fill_value)
        aligned_test[column] = test_values.where(test_values.isin(train_values), fill_value)

    return aligned_train, aligned_test


def replace_rare_values(df, cols, threshold, replacement_value):
    """
    Replace infrequent values in the selected columns.

    For each column in ``cols``, every value whose occurrence count is strictly
    below ``threshold`` is replaced with ``replacement_value``. Missing values
    are not counted by ``value_counts`` and are therefore left as they are.

    The DataFrame is modified in place and the same object is returned.

    :param df: DataFrame to process (mutated in place).
    :param cols: Iterable of column names to process.
    :param threshold: Frequency threshold. Values appearing less than this will be replaced.
    :param replacement_value: The value to replace rare values with.
    :return: The modified DataFrame (same object as ``df``).
    """
    for column in cols:
        value_counts = df[column].value_counts()
        rare_values = value_counts[value_counts < threshold].index

        # Vectorised replacement instead of a per-element Python lambda.
        df[column] = df[column].mask(df[column].isin(rare_values), replacement_value)
    return df


# Read the raw CSVs; the unnamed leading column is exposed as the "uid" column,
# and each frame is tagged with its split name and a placeholder fold.
train_df = assign_meta(
    pd.read_csv(INPUT_DIR / "train.csv").rename(columns={"Unnamed: 0": "uid"}),
    data="train",
)
test_df = assign_meta(
    pd.read_csv(INPUT_DIR / "test.csv").rename(columns={"Unnamed: 0": "uid"}),
    data="test",
)
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")

if CFG.align_train_test:
    # Values that do not appear in both train and test are replaced with NaN.
    train_df, test_df = align_train_test_unique(
        train=train_df,
        test=test_df,
        ignore_columns=[
            "uid",
            "data",
            "fold",
            "created_at",
            "tree_dbh",
            # "spc_common",
            # "spc_latin",
        ],
    )

# Log the cardinality (train, test) of every non-float test column.
for col in test_df.columns:
    if test_df[col].dtype != "float":
        logger.info(f"{col}: {train_df[col].nunique()}, {test_df[col].nunique()}")

### CV Split


In [None]:
# Build the CV splitter from the config and write the fold index onto the training rows.
kfold = hydra.utils.instantiate(CFG.cv)
train_df = assign_fold_index(y_col="health", kfold=kfold, train_df=train_df)

# Stack train and test rows into a single frame with a fresh 0..n-1 index.
raw_df = pd.concat((train_df, test_df), ignore_index=True, axis=0)

if CFG.replace_rare_values_threshold is not None:
    # Collapse infrequent category values (counted over train + test) into NaN.
    raw_df = replace_rare_values(
        df=raw_df,
        threshold=CFG.replace_rare_values_threshold,
        replacement_value=np.nan,
        cols=[
            "spc_common",
            "spc_latin",
            "boro_ct",
            "cb_num",
            "st_assem",
            "nta",
            "nta_name",
            "zip_city",
            "borocode",
            "boroname",
            "st_senate",
            "cncldist",
        ],
    )

    # Report the resulting number of distinct values per column.
    for col in raw_df.columns:
        logger.info(f"{col}: {raw_df[col].nunique()}")

### Feature Engineering


In [None]:
feature_extractors = hydra.utils.instantiate(CFG.feature_extractors)

# Rolling-aggregation extractors: one instance per (extractor config, group-key set).
for extractor in CFG.rolling_agg_feature_extractors:
    if CFG.group_keys_for_rolling_agg is None:
        continue
    for group_keys in CFG.group_keys_for_rolling_agg:
        feature_extractors.append(hydra.utils.instantiate(extractor, group_keys=group_keys))

# Plain aggregation extractors: same expansion over their own group-key sets.
for extractor in CFG.agg_feature_extractors:
    if CFG.group_keys_for_agg is None:
        continue
    for group_keys in CFG.group_keys_for_agg:
        feature_extractors.append(hydra.utils.instantiate(extractor, group_keys=group_keys))


# Run every extractor over the combined train + test frame.
raw_feature_df = run_extractors(
    input_df=raw_df,
    extractors=feature_extractors,
    dirpath=Path(CFG.paths.feature_store_dir),
    fit=True,
    cache=CFG.cache_feature_extractors,
)
assert len(raw_df) == len(raw_feature_df)

# Put the extracted features next to the raw columns, then split back by origin.
raw_feature_df = pd.concat([raw_df, raw_feature_df], axis=1)
train_feature_df = raw_feature_df.query("data == 'train'").reset_index(drop=True).astype({"health": int})
test_feature_df = raw_feature_df.query("data == 'test'").reset_index(drop=True)

# Model inputs are the columns carrying the "f_" prefix.
feature_columns = [name for name in train_feature_df.columns if name.startswith("f_")]
logger.info(f"num features: {len(feature_columns)}")

### Train


In [None]:
def assign_seed_average_pred(result_df: pd.DataFrame) -> pd.DataFrame:
    """Average the per-seed prediction columns into a single ``pred`` column.

    Every column whose name starts with ``"pred_"`` is converted to a numpy
    array, the arrays are summed and divided by the number of such columns.
    The result is stored on ``result_df`` as ``pred`` (one entry per row) and
    the frame is returned with its index moved back into regular columns.
    """
    seed_columns = list(filter(lambda name: name.startswith("pred_"), result_df.columns))

    running_total = 0
    for stacked in (np.array(result_df[name].tolist()) for name in seed_columns):
        running_total += stacked

    result_df["pred"] = list(running_total / len(seed_columns))
    return result_df.reset_index()


# Seed-averaged training: for every configured seed, train the CV models,
# predict on the training frame, score the result, and collect the per-seed
# predictions side by side (one "pred_<seed>" column per seed).
valid_result_df = pd.DataFrame()
all_trained_estimators = []
scores = {}
for seed in CFG.seed_average_seeds:
    logger.info(f"\n\nstart training seed={seed} 🚀")
    # The shared config object is mutated so that the estimator instantiated
    # below picks up the per-seed settings.
    CFG.model.estimator.random_state = seed
    # NOTE(review): num_leaves is set to the seed value itself. Confirm this is
    # intentional (e.g. to diversify the seed ensemble) and that every
    # configured estimator/config accepts a `num_leaves` field.
    CFG.model.estimator.num_leaves = seed  # lgbm

    if CFG.use_cat_features:
        # Columns prefixed "f_oe_" are handed to the estimator as categorical features.
        cat_features = [x for x in feature_columns if x.startswith("f_oe_")]
        estimator = hydra.utils.instantiate(CFG.model.estimator, cat_features=cat_features)
    else:
        estimator = hydra.utils.instantiate(CFG.model.estimator)

    fit_params = dict(hydra.utils.instantiate(CFG.model.fit_params))
    # Estimators whose class name starts with "LGBM" additionally get the
    # custom evaluation metric(s) below.
    if estimator.__class__.__name__.startswith("LGBM"):
        fit_params["eval_metric"] = [
            # lgb_macro_auc,
            lgb_macro_f1,
        ]

    # Each seed writes its models into its own sub-directory.
    model_output_dir = OUTPUT_DIR / "models" / f"seed{seed}"
    trained_estimators = train_cv_tabular_v1(
        df=train_feature_df,
        estimator=estimator,
        feature_columns=feature_columns,
        target_columns=["health"],
        fit_params=fit_params,
        output_dir=model_output_dir,
        overwrite=CFG.overwrite_training,
    )

    # Predictions on the training frame with this seed's estimators
    # (presumably out-of-fold - confirm against predict_cv_tabular_v1).
    i_valid_result_df = predict_cv_tabular_v1(
        df=train_feature_df,
        estimators=trained_estimators,
        feature_columns=feature_columns,
        predict_proba=CFG.model.predict_proba,
    )
    val_score = macro_f1_from_proba(
        y_true=i_valid_result_df["health"],
        y_pred=i_valid_result_df["pred"].tolist(),
    )
    logger.info(f"macro f1 score [seed={seed}]: {val_score}")
    scores[f"seed{seed}"] = val_score

    # Append this seed's predictions as a new column, aligned on (uid, health).
    valid_result_df = pd.concat(
        [
            valid_result_df,
            i_valid_result_df[["uid", "pred", "health"]]
            .set_index(["uid", "health"])
            .rename(columns={"pred": f"pred_{seed}"}),
        ],
        axis=1,
    )
    # Keep every estimator from every seed for test-time prediction.
    all_trained_estimators.extend(trained_estimators)


# Average the per-seed predictions and score the ensemble.
valid_result_df = assign_seed_average_pred(valid_result_df)
val_proba = np.array(valid_result_df["pred"].tolist())
val_score = macro_f1_from_proba(y_true=valid_result_df["health"], y_pred=val_proba)
# The key spelling "all_nomal" is kept as-is because it is written to scores.json.
scores["all_nomal"] = val_score

# Search decision thresholds on the averaged probabilities.
# NOTE(review): label_indices=[2, 0, 1] presumably fixes the order in which the
# labels are optimised - confirm against find_optimal_threshold_for_label.
opt_results, val_pred_label = find_optimal_threshold_for_label(
    proba_matrix=val_proba,
    true_labels=valid_result_df["health"],
    label_indices=[2, 0, 1],
)
best_f1_score = f1_score(
    y_true=valid_result_df["health"],
    y_pred=val_pred_label,
    average="macro",
)
scores["all_opt"] = best_f1_score

logger.info(f"macro f1 score: {val_score}")
logger.info(f"optimized thresholds: {opt_results}")
logger.info(f"best f1 score: {best_f1_score}")

# Persist the averaged validation predictions and all collected scores.
joblib.dump(valid_result_df[["uid", "health", "pred"]], OUTPUT_DIR / "valid_result_df.pkl")
# Use a context manager so the file handle is flushed and closed deterministically.
with open(OUTPUT_DIR / "scores.json", "w") as scores_file:
    json.dump(scores, scores_file)

### Visualization


In [None]:
# Feature importance over every trained estimator (top 50), saved as image and CSV.
fig, importance_df = visualize_feature_importance(
    feature_columns=feature_columns,
    estimators=all_trained_estimators,
    top_n=50,
)
fig.savefig(OUTPUT_DIR / "feature_importance.png", dpi=300)
importance_df.to_csv(OUTPUT_DIR / "feature_importance.csv", index=False)


# Distribution of the averaged validation probabilities.
fig = plot_label_distributions(proba_matrix=np.array(valid_result_df["pred"].tolist()))
fig.show()
fig.savefig(OUTPUT_DIR / "label_distributions.png", dpi=300)


# Confusion matrices for the threshold-optimised validation labels:
# first with the helper's default settings, then with normalize=True.
for extra_kwargs, filename in (
    ({}, "confusion_matrix.png"),
    ({"normalize": True}, "confusion_matrix_normalized.png"),
):
    fig = plot_confusion_matrix(y_true=valid_result_df["health"], y_pred=val_pred_label, **extra_kwargs)
    fig.savefig(OUTPUT_DIR / filename, dpi=300)

### Make submission


In [None]:
# Predict the test rows with every estimator of every seed.
test_result_df = predict_cv_tabular_v1(
    df=test_feature_df,
    estimators=all_trained_estimators,
    feature_columns=feature_columns,
    test=True,
    predict_proba=CFG.model.predict_proba,
)

# Expand each "pred" entry into one column per position. The expanded frame
# reuses test_result_df's index so the column-wise concat cannot misalign rows.
test_pred_df = pd.concat(
    [
        test_result_df[["uid"]],
        pd.DataFrame(test_result_df["pred"].tolist(), index=test_result_df.index),
    ],
    axis=1,
)

# Average the predictions per uid and take the arg-max position as the label.
test_mean_proba = test_pred_df.groupby("uid").mean()
pred_by_uid = pd.Series(test_mean_proba.to_numpy().argmax(axis=1), index=test_mean_proba.index)

# Align on uid explicitly instead of relying on test_df happening to be in the
# same (sorted-by-uid) order as the groupby result.
if not test_df["uid"].isin(pred_by_uid.index).all():
    raise ValueError("some uids in test_df have no prediction")
test_df["pred"] = test_df["uid"].map(pred_by_uid)

submission_df = test_df[["uid", "pred"]]
submission_filepath = Path(CFG.paths.output_dir) / f"submissions_{CFG.experiment_name}.csv"
# The submission file is written without a header row and without the index.
submission_df.to_csv(submission_filepath, index=False, header=False)

In [None]:
# Expand each "pred" entry into one column per position; reuse the source index
# so the column-wise concat cannot misalign rows.
test_pred_df = pd.concat(
    [
        test_result_df[["uid"]],
        pd.DataFrame(test_result_df["pred"].tolist(), index=test_result_df.index),
    ],
    axis=1,
)

# Decode labels from the per-uid mean predictions with the thresholds found on
# the validation predictions.
# NOTE(review): assumes decode_label returns one label per input row - confirm.
test_mean_proba = test_pred_df.groupby("uid").mean()
opt_pred_by_uid = pd.Series(
    np.asarray(decode_label(proba_matrix=test_mean_proba.to_numpy(), thresholds=opt_results)).reshape(-1),
    index=test_mean_proba.index,
)

# Align on uid explicitly instead of relying on test_df being sorted by uid.
if not test_df["uid"].isin(opt_pred_by_uid.index).all():
    raise ValueError("some uids in test_df have no prediction")
test_df["opt_pred"] = test_df["uid"].map(opt_pred_by_uid)

submission_filepath = Path(CFG.paths.output_dir) / f"submissions_{CFG.experiment_name}_opt_{best_f1_score:.3f}.csv"
test_df[["uid", "opt_pred"]].to_csv(submission_filepath, index=False, header=False)

# Keep the raw (un-averaged) test predictions for later inspection.
joblib.dump(test_pred_df, OUTPUT_DIR / "test_result_df.pkl")

In [None]:
# Share of test rows per threshold-optimised label.
print(test_df["opt_pred"].value_counts().div(len(test_df)))

In [None]:
# Share of test rows per arg-max label.
test_df["pred"].value_counts().div(len(test_df))

In [None]:
# Share of training rows per "health" value, for comparison with the predictions.
train_df["health"].value_counts().div(len(train_df))

In [None]:
def plot_dist_by_color(df, value_col, color_col):
    """Plot one density curve of ``value_col`` per distinct value of ``color_col``.

    :param df: DataFrame holding both columns.
    :param value_col: Name of the numeric column whose distribution is drawn.
    :param color_col: Name of the column whose distinct values define the curves.
    :return: None. The figure is shown with ``plt.show()``.

    ``sns.distplot`` is deprecated in seaborn; ``sns.kdeplot`` is the
    replacement for the ``hist=False`` usage. Rows with a missing
    ``color_col`` are skipped because they cannot be matched by equality.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig, ax = plt.subplots(figsize=(10, 6))
    for color in df[color_col].dropna().unique():
        values = df.loc[df[color_col] == color, value_col]
        sns.kdeplot(x=values, label=color, ax=ax)

    ax.set_title("Distribution of Values Color-Coded by Color")
    ax.legend()
    plt.show()