In [None]:
import logging
import os
import warnings
from pathlib import Path

import hydra
import pandas as pd
import rootutils
from hydra.core.hydra_config import HydraConfig
from kaggle import KaggleApi

from kuma_utils.training import CrossValidator

# Resolve the project root starting from "../" using the ".project-root" marker.
# NOTE(review): `pythonpath=True` presumably adds that root to `sys.path`, which
# the `src.*` imports below would rely on — keep this call ahead of them.
rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.kaggle.dataset import download_kaggle_competition_dataset
from src.utils.log_utils import get_consol_handler, get_file_handler

In [None]:
def parse_overrides(raw: str) -> list[str]:
    """Split a comma-separated override string into individual override tokens.

    Whitespace around each token is stripped, and empty tokens (from an empty
    string, a doubled comma, or a trailing comma) are dropped so they are never
    forwarded as overrides.

    NOTE: every comma is treated as a separator, so an override whose value
    itself contains a comma cannot be expressed through this string.

    Args:
        raw: Comma-separated overrides, e.g. ``"experiment=default,debug=true"``.

    Returns:
        The non-empty, stripped tokens in their original order.
    """
    return [token.strip() for token in raw.split(",") if token.strip()]


# Config overrides for this run, taken from the OVERRIDES environment variable
# (falls back to the default experiment when the variable is unset).
OVERRIDES: list[str] = parse_overrides(os.getenv("OVERRIDES", "experiment=default"))

In [None]:
# OVERRIDES is always a list (it comes from `str.split`), so a `None` check can
# never fire. Reject the case that actually signals "nothing was provided":
# no entry with any non-whitespace content.
if not any(item.strip() for item in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config inside the notebook (no @hydra.main entry point here).
with hydra.initialize(version_base=None, config_path="../configs"):
    CFG = hydra.compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    # use HydraConfig for notebook to use hydra job
    HydraConfig.instance().set_config(CFG)


# Debug runs write to a separate "<output_dir>_debug" directory.
if CFG.debug:
    CFG.paths.output_dir = f"{CFG.paths.output_dir}_debug"

INPUT_DIR = Path(CFG.paths.input_dir)  # input directory (r/w)
OUTPUT_DIR = Path(CFG.paths.output_dir)  # experiment output directory (r/w)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_ROOT_DIR = Path(CFG.paths.data_dir) / "output"  # output directory (r/w)
COMPETITION_DATA_DIR = INPUT_DIR / CFG.meta.competition_name

# clients
KAGGLE_CLIENT = KaggleApi()
KAGGLE_CLIENT.authenticate()

# set logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Re-running this cell replaces the root handlers; close the old ones first so a
# previously attached file handler does not keep its log file open.
for stale_handler in logger.handlers:
    stale_handler.close()
logger.handlers = [
    get_file_handler(OUTPUT_DIR / "notebook.log"),
    get_consol_handler(),
]

# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

### Load Data


In [None]:
# Fetch the competition files via the project helper, targeting the input
# directory; the CSVs are then read from COMPETITION_DATA_DIR.
download_kaggle_competition_dataset(
    client=KAGGLE_CLIENT,
    competition=CFG.meta.competition_name,
    out_dir=INPUT_DIR,
)

# Read the training table and the sample submission in one pass.
train_df, sample_submission_df = (
    pd.read_csv(COMPETITION_DATA_DIR / csv_name)
    for csv_name in ("train.csv", "sample_submission.csv")
)

### Feature Engineering (skipped — configured columns are used as-is)


In [None]:
# Column selections come from the config; the copies keep later edits to the
# feature frame from touching `train_df`.
feature_columns = CFG.feature_cols
label_column = CFG.label_col

train_feature_df = train_df[feature_columns].copy()
train_label = train_df[label_column].copy()

logger.info(f"train_feature_df.shape: {train_feature_df.shape}")

### Train with cross-validation


In [None]:
# Build the model object described by the estimator config, then wrap it in the
# cross-validation trainer.
cv_model = hydra.utils.instantiate(CFG.estimator.model)
trainer = CrossValidator(model=cv_model)

In [None]:
estimator_cfg = CFG.estimator

# Categorical columns are optional in the estimator config. When given, their
# missing values are replaced with the "__NA__" sentinel before training.
cat_features = estimator_cfg.get("cat_features")
if cat_features:
    train_feature_df.loc[:, cat_features] = train_feature_df[cat_features].fillna("__NA__").to_numpy()

# Fold splitter and log destination for this run.
cv_folds = hydra.utils.instantiate(CFG.cv)
trainer_log_path = OUTPUT_DIR / "trainer.log"

trainer.train(
    data=(train_feature_df, train_label),
    params=estimator_cfg.params,
    fit_params=estimator_cfg.fit_params,
    folds=cv_folds,
    cat_features=cat_features,
    logger=trainer_log_path,
    # tune_model=True,
    # timeout=60,
)

# Persist the fitted trainer next to the other experiment outputs and log its summary.
trainer.save(path=OUTPUT_DIR / "trainer.pkl")
logger.info(f"\ntrainer: \n{trainer}")

In [None]:
# Save diagnostic figures for the trained cross-validator into the experiment
# output directory; the calibration curve is drawn from the training data.
train_data = (train_feature_df, train_label)

trainer.plot_feature_importance(save_to=OUTPUT_DIR / "feature_importance.png")
trainer.plot_calibration_curve(data=train_data, save_to=OUTPUT_DIR / "calibration_curve.png")