In [None]:
import os
import hydra
import logging
import pandas as pd
import warnings
import rootutils
import joblib
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig

from sklearn.utils.class_weight import compute_sample_weight

rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import assign_fold_index, make_uid, visualize_feature_importance
from src.experiment.runner import run_extractors
from src.experiment.metrics import macro_f1_from_proba
from src.experiment.model.runner import train_cv_tabular_v1, predict_cv_tabular_v1

In [None]:
# Comma-separated Hydra overrides taken from the OVERRIDES environment variable.
# An unset, empty, or whitespace-only variable falls back to the default experiment,
# and blank or whitespace-padded tokens (e.g. "a=1, b=2" or a trailing comma) are
# normalised so that no malformed override string is handed to Hydra.
# NOTE: the value is split on every comma, so an override whose own value contains
# a comma cannot be expressed through this variable.
OVERRIDES: list[str] = [
    token.strip()
    for token in (os.getenv("OVERRIDES", "").strip() or "experiment=002-tabular_v2").split(",")
    if token.strip()
]

In [None]:
# Fail fast when no usable override was supplied. OVERRIDES is built by splitting a
# string, so it is a list rather than None; the guard therefore inspects its contents
# (missing, empty, or containing blank entries) instead of comparing against None.
if not OVERRIDES or any(not str(override).strip() for override in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the configuration in-process (no @hydra.main entry point in a notebook).
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    # Register the composed config with HydraConfig so that code which reads the
    # Hydra job config through HydraConfig also works from the notebook.
    HydraConfig.instance().set_config(CFG)

# Root logger at DEBUG level; a stream handler is attached only when none exists yet,
# so re-running this cell does not stack duplicate handlers.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")
INPUT_DIR = Path(CFG.paths.input_dir)
OUTPUT_DIR = Path(CFG.paths.output_dir)

### Load Data


In [None]:
def assign_meta(df: pd.DataFrame, data="train"):
    """Tag a frame with its split name and a placeholder fold index.

    The frame is modified in place: a ``data`` column holding ``data`` and a
    ``fold`` column holding ``-1`` are written onto ``df``.

    Args:
        df: Frame to annotate (mutated).
        data: Label stored in the ``data`` column for every row.

    Returns:
        The same ``df`` object, now carrying the two extra columns.
    """
    for column, value in (("data", data), ("fold", -1)):
        df[column] = value
    return df


# The unnamed first CSV column is exposed under the name "uid".
uid_rename = {"Unnamed: 0": "uid"}

# Train / test tables: read, rename the id column, then tag each row with its
# split name and a placeholder fold of -1.
train_df = assign_meta(
    pd.read_csv(INPUT_DIR / "train.csv").rename(columns=uid_rename),
    data="train",
)
test_df = assign_meta(
    pd.read_csv(INPUT_DIR / "test.csv").rename(columns=uid_rename),
    data="test",
)

# The sample submission is read with pandas' default header handling.
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")

### CV Split


In [None]:
# Build the cross-validation splitter described by the `cv` config node.
kfold = hydra.utils.instantiate(CFG.cv)
# Assign a fold index to the training rows, passing "health" as the y column
# (presumably used for stratification, depending on the configured splitter — confirm
# in assign_fold_index).
train_df = assign_fold_index(train_df=train_df, kfold=kfold, y_col="health")

### Feature Engineering


In [None]:
# Base extractors come straight from the config.
feature_extractors = hydra.utils.instantiate(CFG.feature_extractors)

# Group-wise extractors are instantiated once per (extractor config, group-key set)
# pair: first the "agg" family, then the "te" family (presumably target encoding).
grouped_extractor_specs = (
    (CFG.agg_feature_extractors, CFG.group_keys_for_agg),
    (CFG.te_feature_extractors, CFG.group_keys_for_te),
)
for extractor_cfgs, group_key_sets in grouped_extractor_specs:
    for extractor_cfg in extractor_cfgs:
        for keys in group_key_sets:
            feature_extractors.append(hydra.utils.instantiate(extractor_cfg, group_keys=keys))

# Extractors are run (with fit=True, cache disabled) on train and test stacked together.
raw_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
feature_store_dir = Path(CFG.paths.feature_store_dir)
raw_feature_df = run_extractors(
    input_df=raw_df,
    extractors=feature_extractors,
    dirpath=feature_store_dir,
    fit=True,
    cache=False,
)
assert len(raw_feature_df) == len(raw_df)

# Put the original columns and the extracted features side by side, then split the
# result back into train / test parts using the "data" marker column.
raw_feature_df = pd.concat([raw_df, raw_feature_df], axis=1)
train_feature_df = raw_feature_df.query("data == 'train'").reset_index(drop=True)
test_feature_df = raw_feature_df.query("data == 'test'").reset_index(drop=True)

# Model inputs are the columns whose name carries the "f_" prefix.
feature_columns = [name for name in train_feature_df.columns if name.startswith("f_")]

### Train


In [None]:
# Model and fit-time arguments are both built from the config.
estimator = hydra.utils.instantiate(CFG.estimator)
fit_params = hydra.utils.instantiate(CFG.fit_params)
model_output_dir = OUTPUT_DIR / "models"

# Cross-validated training on the train features with "health" as the target;
# overwrite=True is passed to the trainer.
train_kwargs = dict(
    df=train_feature_df,
    estimator=estimator,
    feature_columns=feature_columns,
    target_columns=["health"],
    fit_params=fit_params,
    output_dir=model_output_dir,
    overwrite=True,
)
trained_estimators = train_cv_tabular_v1(**train_kwargs)

# Predictions on the training frame from the trained estimators.
valid_predict_kwargs = dict(
    df=train_feature_df,
    estimators=trained_estimators,
    feature_columns=feature_columns,
)
valid_result_df = predict_cv_tabular_v1(**valid_predict_kwargs)

# Macro F1 computed from the "health" labels and the "pred" column.
valid_labels = valid_result_df["health"]
valid_preds = valid_result_df["pred"].tolist()
val_score = macro_f1_from_proba(y_true=valid_labels, y_pred=valid_preds)
logger.info(f"macro f1 score: {val_score}")

In [None]:
# Feature-importance figure and table for the trained estimators (top_n=50).
importance_kwargs = dict(
    estimators=trained_estimators,
    feature_columns=feature_columns,
    top_n=50,
)
fig, importance_df = visualize_feature_importance(**importance_kwargs)

# Persist both artefacts next to the other experiment outputs.
importance_png_path = OUTPUT_DIR / "feature_importance.png"
importance_csv_path = OUTPUT_DIR / "feature_importance.csv"
fig.savefig(importance_png_path, dpi=300)
importance_df.to_csv(importance_csv_path, index=False)

### Make submission


In [None]:
def make_submission(test_result_df):
    """Average the predicted probabilities per ``uid`` and pick the arg-max class.

    The function is self-contained: it reads only ``test_result_df`` and does not
    touch any notebook-level frame, so every ``uid`` is paired with the class
    derived from its own averaged probabilities regardless of how other frames
    are ordered or indexed.

    Args:
        test_result_df: Frame with a ``uid`` column and a ``pred`` column whose
            cells are per-class probability sequences. A ``uid`` may occur on
            several rows; those rows are averaged.

    Returns:
        A new frame with columns ``uid`` and ``pred`` (integer class position),
        one row per distinct ``uid``, ordered by ascending ``uid``.
    """
    # Expand the probability sequences into one column per class. The uid values
    # are attached positionally (as a plain array) so that the index of the
    # incoming frame — which may contain duplicates — cannot misalign the rows.
    proba_df = pd.DataFrame(test_result_df["pred"].tolist())
    proba_df.insert(0, "uid", test_result_df["uid"].to_numpy())

    # One row of mean class probabilities per uid; the index holds the sorted uids.
    mean_proba_df = proba_df.groupby("uid").mean()

    return pd.DataFrame(
        {
            "uid": mean_proba_df.index.to_numpy(),
            "pred": mean_proba_df.to_numpy().argmax(axis=1),
        }
    )


# Predict the test rows with the trained estimators (test=True is passed through).
test_predict_kwargs = dict(
    df=test_feature_df,
    estimators=trained_estimators,
    feature_columns=feature_columns,
    test=True,
)
test_result_df = predict_cv_tabular_v1(**test_predict_kwargs)

# Turn the predictions into the submission table and write it without header or index.
submission_df = make_submission(test_result_df)
submission_filename = f"submissions_{CFG.experiment_name}.csv"
submission_filepath = Path(CFG.paths.output_dir) / submission_filename
submission_df.to_csv(submission_filepath, header=False, index=False)

# Sanity check: re-read the written file and compare its length with the sample submission.
# NOTE(review): the file is written header-less but re-read with pandas' default header
# handling, so its first record is consumed as column names. The comparison is only
# meaningful if sample_submission.csv is header-less too and was read the same way — confirm.
submission_df_ = pd.read_csv(submission_filepath)

assert len(sample_submission_df) == len(submission_df_)