In [None]:
import os
import hydra
import logging
import json
import pandas as pd
import warnings
import rootutils
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig

import optuna.integration.lightgbm as lgb
from sklearn.utils.class_weight import compute_sample_weight


rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import assign_fold_index
from src.experiment.feature.runner import run_extractors
from src.experiment.model.custom_metrics import lgb_py_minus_macro_f1

In [None]:
# Hydra overrides: comma-separated text read from the OVERRIDES environment
# variable, falling back to a single default experiment override.
OVERRIDES: list[str] = os.environ.get("OVERRIDES", "experiment=043-tabular_v3").split(sep=",")
# Passed as `time_budget` to the tuner; 3 * 60 * 60 = 10800 (three hours if the unit is seconds).
TIME_BUDGET = 3 * 60 * 60

In [None]:
# `str.split` never returns None or an empty list, so a plain None check can
# never fire. Validate the entries themselves: a blank one (OVERRIDES="" or a
# trailing comma) would otherwise be handed to Hydra as an empty override.
if not OVERRIDES or any(not override.strip() for override in OVERRIDES):
    raise ValueError(f"OVERRIDES is not set or contains an empty entry: {OVERRIDES!r}")

# Compose the config inside the notebook (there is no @hydra.main entry point here).
with initialize(version_base=None, config_path="../../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger, emitting records at INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Attach a stream handler only when the root logger has none yet, so that
# re-running this cell does not stack duplicate handlers.
if len(logger.handlers) == 0:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# Suppress every warning category for the rest of the notebook.
warnings.filterwarnings(action="ignore")

# Directory locations resolved from the `paths` section of the composed config.
INPUT_DIR, OUTPUT_DIR = Path(CFG.paths.input_dir), Path(CFG.paths.output_dir)
BASE_OUTPUT_DIR = Path(CFG.paths.resource_dir).joinpath("outputs")

### Load Data


In [None]:
def assign_meta(df: pd.DataFrame, data: str = "train") -> pd.DataFrame:
    """Return a copy of ``df`` tagged with bookkeeping columns.

    Args:
        df: Input frame. It is left unmodified.
        data: Label written to the ``data`` column on every row.

    Returns:
        A new DataFrame holding the original columns plus ``data`` (the given
        label) and ``fold`` (initialised to -1 on every row).
    """
    # `assign` builds a new frame instead of writing into the caller's object,
    # so the raw input stays intact no matter how often the cell is re-run.
    return df.assign(data=data, fold=-1)


# Read the train/test tables, rename the "Unnamed: 0" column (presumably the
# unnamed row-id column of the CSV) to "uid", and tag each frame with its origin.
train_df = assign_meta(
    pd.read_csv(INPUT_DIR / "train.csv").rename(columns={"Unnamed: 0": "uid"}),
    data="train",
)
test_df = assign_meta(
    pd.read_csv(INPUT_DIR / "test.csv").rename(columns={"Unnamed: 0": "uid"}),
    data="test",
)
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")

In [None]:
# Build the CV splitter from the `cv` config node and let the project helper
# write fold indices onto the training rows, using "health" as `y_col`.
kfold = hydra.utils.instantiate(CFG.cv)
train_df = assign_fold_index(kfold=kfold, train_df=train_df, y_col="health")

# Stack train on top of test with a fresh 0..n-1 index.
raw_df = pd.concat((train_df, test_df), ignore_index=True)

### Feature Engineering


In [None]:
# Instantiate the base feature extractors declared under `feature_extractors` in the config.
feature_extractors = hydra.utils.instantiate(CFG.feature_extractors)


def get_agg_feature_extractors(feature_extractors, all_group_keys):
    """Instantiate each extractor config once per group-key set.

    Args:
        feature_extractors: Iterable of extractor configs, or None.
        all_group_keys: Iterable of group-key sets, or None.

    Returns:
        A list of instantiated extractors, ordered extractor-major (every
        group-key set for the first config, then the second, ...). Empty when
        either argument is None.
    """
    if feature_extractors is None or all_group_keys is None:
        return []

    return [
        hydra.utils.instantiate(extractor_cfg, group_keys=keys)
        for extractor_cfg in feature_extractors
        for keys in all_group_keys
    ]


# Each optional extractor family is configured by a pair of config keys:
# (extractor configs, group-key sets). Missing keys come back as None from
# `CFG.get`, in which case the helper contributes nothing.
for _extractors_key, _group_keys_key in (
    ("agg_feature_extractors", "group_keys_for_agg"),
    ("te_feature_extractors", "group_keys_for_te"),
    ("rolling_agg_feature_extractors", "group_keys_for_rolling_agg"),
):
    feature_extractors.extend(
        get_agg_feature_extractors(CFG.get(_extractors_key), CFG.get(_group_keys_key))
    )


# Run every extractor over the combined train+test frame.
raw_feature_df = run_extractors(
    input_df=raw_df,
    extractors=feature_extractors,
    dirpath=Path(CFG.paths.feature_store_dir),
    fit=True,
    cache=CFG.cache_feature_extractors,
)
# The extracted features must line up row-for-row with the input.
assert len(raw_feature_df) == len(raw_df)

# Place the original columns next to the extracted ones.
raw_feature_df = pd.concat([raw_df, raw_feature_df], axis="columns")

# Split back into train / test; only the train rows get an integer target.
train_feature_df = raw_feature_df.query("data == 'train'")
train_feature_df = train_feature_df.reset_index(drop=True).astype({"health": int})
test_feature_df = raw_feature_df.query("data == 'test'").reset_index(drop=True)

# Model inputs are the columns carrying the "f_" prefix.
feature_columns = [name for name in train_feature_df.columns if name.startswith("f_")]
logger.info(f"num features: {len(feature_columns)}")

# Fold 0 is held out for validation; every other fold is used for fitting.
valid_feature_df_0 = train_feature_df.query("fold == 0").reset_index(drop=True)
train_feature_df_0 = train_feature_df.query("fold != 0").reset_index(drop=True)

In [None]:
# Shallow dict copy of the estimator config; `_target_` names the model class.
default_params = dict(CFG.model.estimator)
target_model_name = default_params.pop("_target_")
logger.info(f"target model: {target_model_name}")

if not target_model_name.startswith("lightgbm"):
    # Only the LightGBM path is implemented. Without this guard `model` would
    # stay unbound and the notebook would fail later with an opaque NameError.
    raise ValueError(f"unsupported target model for tuning: {target_model_name}")

# "custom" registers no built-in metric, so evaluation relies on `feval` below.
default_params["metric"] = "custom"
num_boost_round = default_params.pop("n_estimators")

# Strip the keys that are not passed to the native training call.
unknown_params = ["class_weight", "importance_type"]
default_params = {k: v for k, v in default_params.items() if k not in unknown_params}

# Training rows get "balanced" sample weights; validation rows are weighted uniformly.
train_dataset = lgb.Dataset(
    train_feature_df_0[feature_columns],
    label=train_feature_df_0["health"],
    weight=compute_sample_weight(class_weight="balanced", y=train_feature_df_0["health"]),
)
valid_dataset = lgb.Dataset(
    valid_feature_df_0[feature_columns], label=valid_feature_df_0["health"], weight=np.ones(len(valid_feature_df_0))
)

# `lgb` is Optuna's LightGBM integration: this call runs the tuner, bounded by TIME_BUDGET.
model = lgb.train(
    params=default_params,
    train_set=train_dataset,
    valid_sets=[train_dataset, valid_dataset],
    num_boost_round=num_boost_round,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(100),
    ],
    feval=[lgb_py_minus_macro_f1],
    show_progress_bar=False,
    time_budget=TIME_BUDGET,
)


# Start from a copy of the parameters stored on the trained model.
opt_params = model.params.copy()

# Restore the two estimator-level keys that were stripped before training.
# The key must be spelled "class_weight" so it matches the name that was removed.
opt_params["class_weight"] = "balanced"
opt_params["importance_type"] = "gain"

print(opt_params)
# The context manager guarantees the JSON file is flushed and closed.
with open(OUTPUT_DIR / "opt_params_lgb.json", "w") as fp:
    json.dump(opt_params, fp, indent=4)

In [None]:
# Display the parameters stored on the trained model as the cell output.
model.params