In [None]:
import os
import hydra
import logging
import pandas as pd
import sklearn

from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig

from src.experiment.utils import assign_fold_index

In [None]:
def parse_overrides(raw: str) -> list[str]:
    """Split a comma-separated string into a list of Hydra override strings.

    Whitespace around each entry is stripped and empty entries (produced by
    an empty string, a trailing comma, or a doubled comma) are dropped, so
    the result never contains blank overrides.

    Args:
        raw: Comma-separated overrides, e.g. ``"experiment=foo,seed=1"``.

    Returns:
        The non-empty, stripped entries in their original order.
    """
    return [item.strip() for item in raw.split(",") if item.strip()]


# Overrides are read from the OVERRIDES environment variable; an unset or
# empty variable falls back to the default experiment.
OVERRIDES: list[str] = parse_overrides(os.getenv("OVERRIDES") or "experiment=001-tabular_v01")

In [None]:
# OVERRIDES is always a list (never None), so validate its contents instead:
# reject an empty list or blank entries before they are handed to Hydra.
if not OVERRIDES or any(not str(item).strip() for item in OVERRIDES):
    raise ValueError(f"OVERRIDES is not set or contains an empty entry: {OVERRIDES!r}")

# Compose the Hydra config in-process using the Compose API.
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    # Register the composed config with HydraConfig so the notebook can use the hydra job.
    HydraConfig.instance().set_config(CFG)

# Configure the root logger to emit everything from DEBUG upwards.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Attach a stream handler only when none exists, so re-running this cell
# does not add a second handler and duplicate every log line.
if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)

sklearn.set_config(skip_parameter_validation=True)  # work around an unexplained pipeline error
INPUT_DIR = Path(CFG.paths.input_dir)

### Load Data


In [None]:
# Read the train and test CSV files from the configured input directory.
train_df = pd.read_csv(INPUT_DIR.joinpath("train.csv"))
test_df = pd.read_csv(INPUT_DIR.joinpath("test.csv"))

### CV Split


In [None]:
# Build the cross-validation splitter from the `cv` node of the Hydra config.
kfold = hydra.utils.instantiate(CFG.cv)

# Hand the frame and splitter to the project helper, using the "health" column
# as the target. Presumably this attaches a fold index to each row; confirm in
# src.experiment.utils. Note that `train_df` is rebound to the helper's result.
train_df = assign_fold_index(
    train_df=train_df,
    kfold=kfold,
    y_col="health",
)

In [None]:
# Instantiate the pipeline object described by the `pipeline` node of the
# Hydra config; its concrete class is determined by the config, not this cell.
pipeline = hydra.utils.instantiate(CFG.pipeline)

In [None]:
# Fit the pipeline on the training frame, then transform that same frame.
# The transform result is the cell's last expression, so the notebook displays
# it; it is not stored in a variable.
pipeline.fit(train_df)
pipeline.transform(train_df)