In [8]:
import os
import re
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from pathlib import Path

import ipynbname
import polars as pl

### Configuration

In [2]:
# Derive this notebook's feature-set ID from the trailing digits of its file name.
stem = ipynbname.path().stem
m = re.search(r"(\d+)$", stem)
if m is None:
    # Fail with an actionable message instead of an AttributeError on m.group().
    raise ValueError(
        f"Notebook name {stem!r} must end with digits; they are used as the feature ID."
    )

ID = m.group(1)
SEED = 42
level = "l1"

# Source feature set and the run directory whose keep_cols.json drives the selection.
resource_id = "045"
runs_id = "runs/xgb-045-trl5-5fold-s42"

threshold = "0.99"  # choose one of: 0.90, 0.95, 0.99

# Output directory for the feature set produced by this notebook.
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")

os.makedirs(FEATURE_DIR, exist_ok=True)

# Raise the polars display limits to 500 rows / 500 columns.
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/047


### Utils

In [3]:
def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries for the train/test frames.

    Memory is the summed ``nbytes`` of every column after ``to_numpy()``,
    divided by ``1024**3`` (the printed label says "GB").

    Args:
        train: Training frame; its dtypes are the ones summarised.
        test: Test frame; only its shape and memory are reported.

    Returns:
        ``(train_mem, test_mem, n_cat)``: the two memory figures, and the
        number of Categorical columns in ``train`` or ``None`` when there
        are none.
    """
    train_mem = sum(train[col].to_numpy().nbytes for col in train.columns) / 1024**3
    test_mem = sum(test[col].to_numpy().nbytes for col in test.columns) / 1024**3

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter([str(dt) for dt in train.dtypes])

    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")

    # Compare dtype objects instead of their string form: the text rendering of a
    # Categorical dtype may carry parameters depending on the polars version, in
    # which case an exact match against "Categorical" would never succeed.
    n_cat = sum(1 for dt in train.dtypes if dt == pl.Categorical) or None
    return train_mem, test_mem, n_cat

### Feature Engineering
- Select columns from feature set 045 using the 99% threshold (`threshold = "0.99"`).

In [4]:
# === Load Data ===
resource_dir = Path(f"../../artifacts/features/{resource_id}")

# This notebook writes its own meta.json as UTF-8; read the source one the same way
# (presumably it was produced by the same convention).
with open(resource_dir / "meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

train_paths = meta["train_paths"]
test_paths = meta["test_paths"]

# Columns kept at the configured threshold, as recorded by the referenced run.
with open(f"../../{runs_id}/keep_cols.json", "r", encoding="utf-8") as f:
    keep_cols = json.load(f)[threshold]["cols"]

# Zero-row read: only the column names are needed here.
all_cols = pl.read_parquet(train_paths, n_rows=0).columns

train_cols = keep_cols + ["target", "5fold-s42", "row_id"]

# Validate the selection against the schema before the full read so that a stale
# keep_cols.json produces a readable error listing the offending names.
missing_cols = sorted(set(train_cols) - set(all_cols))
if missing_cols:
    raise ValueError(
        f"{len(missing_cols)} requested column(s) not found in feature set "
        f"{resource_id}: {missing_cols[:10]}"
    )

train = pl.read_parquet(train_paths, columns=train_cols)
test = pl.read_parquet(test_paths, columns=keep_cols)

In [5]:
# === Info after feature engineering ===
# Prints the summaries and keeps the returned values, which are stored in meta.json below.
train_mem, test_mem, n_cat = check_info(train, test)

=== Shape & Memory ===
Train Shape: (750000, 566), Test Shape: (250000, 563)
Train Memory: 1.58 GB, Test Memory: 0.53 GB

=== DTypes ===
Float32: 533
Int32: 29
Int64: 1
Int8: 2
UInt32: 1


In [6]:
# === Save Overall Data ===
tr_path, test_path = FEATURE_DIR / "train.parquet", FEATURE_DIR / "test.parquet"

# Persist both frames first, then report where each one went.
for frame, destination in ((train, tr_path), (test, test_path)):
    frame.write_parquet(destination)

print(f"tr_df saved successfully to {tr_path}")
print(f"test_df saved successfully to {test_path}")

tr_df saved successfully to ../../artifacts/features/047/train.parquet
test_df saved successfully to ../../artifacts/features/047/test.parquet


### Save Metadata

In [7]:
# Timestamps are recorded in UTC+9.
JST = timezone(timedelta(hours=9))
pairs = [("skf/k=5/s=42@train", "5fold-s42")]

created_at = datetime.now(JST).isoformat()
memory_sizes = {"train": train_mem, "test": test_mem}

# Description of the feature set written above; stored next to the parquet files.
meta = {
    "data_id": ID,
    "train_paths": [str(tr_path)],
    "test_paths": [str(test_path)],
    "level": level,
    "created_at": created_at,
    "train_shape": list(train.shape),
    "test_shape": list(test.shape),
    "memory": memory_sizes,
    "fold_column": pairs,
    "cat_cols": n_cat or None,
}

meta_path = FEATURE_DIR / "meta.json"
with meta_path.open("w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

# Echo what was written, one "key: value" line per entry.
for k, v in meta.items():
    print(f"{k}: {v}")

data_id: 047
train_paths: ['../../artifacts/features/047/train.parquet']
test_paths: ['../../artifacts/features/047/test.parquet']
level: l1
created_at: 2025-10-03T17:08:40.126776+09:00
train_shape: [750000, 566]
test_shape: [250000, 563]
memory: {'train': 1.5799887478351593, 'test': 0.5252659320831299}
fold_column: [('skf/k=5/s=42@train', '5fold-s42')]
cat_cols: None


In [9]:
# Dump every selected column name, one per line.
for col_name in keep_cols:
    print(col_name)

target_mean_by_contact_duration2
target_mean_by_housing_duration2
target_mean_by_month_duration2
target_mean_by_duration2_previous2
target_mean_by_poutcome_duration2
target_mean_by_month_day2
target_median_by_poutcome_duration2
target_std_by_contact_duration2
target_mean_by_education_duration2
target_std_by_month_day2
target_mean_by_duration2_pdays2
target_mean_by_marital_duration2
target_mean_by_job_duration2
target_mean_by_poutcome_balance2
target_mean_by_contact_balance2
target_std_by_housing_duration2
target_mean_by_loan_balance2
month_day2_ce
target_mean_by_job_balance2
target_std_by_month_duration2
duration
target_median_by_housing_duration2
target_mean_by_marital_balance2
target_mean_by_loan_duration2
target_mean_by_month_pdays2
target_std_by_contact_balance2
target_mean_by_housing_balance2
target_mean_by_contact_month
duration_sq
target_mean_by_education_balance2
target_mean_by_month_poutcome
age_div_duration
target_std_by_poutcome_duration2
target_median_by_month_day2
target_m

In [12]:
# Show the position and name of each selected column whose name contains "target".
target_named = ((pos, name) for pos, name in enumerate(keep_cols) if "target" in name)
for pos, name in target_named:
    print(pos, name)

0 target_mean_by_contact_duration2
1 target_mean_by_housing_duration2
2 target_mean_by_month_duration2
3 target_mean_by_duration2_previous2
4 target_mean_by_poutcome_duration2
5 target_mean_by_month_day2
6 target_median_by_poutcome_duration2
7 target_std_by_contact_duration2
8 target_mean_by_education_duration2
9 target_std_by_month_day2
10 target_mean_by_duration2_pdays2
11 target_mean_by_marital_duration2
12 target_mean_by_job_duration2
13 target_mean_by_poutcome_balance2
14 target_mean_by_contact_balance2
15 target_std_by_housing_duration2
16 target_mean_by_loan_balance2
18 target_mean_by_job_balance2
19 target_std_by_month_duration2
21 target_median_by_housing_duration2
22 target_mean_by_marital_balance2
23 target_mean_by_loan_duration2
24 target_mean_by_month_pdays2
25 target_std_by_contact_balance2
26 target_mean_by_housing_balance2
27 target_mean_by_contact_month
29 target_mean_by_education_balance2
30 target_mean_by_month_poutcome
32 target_std_by_poutcome_duration2
33 target_m

In [None]:
# src/tuning/optuna_objectives.py
"""
Unified Optuna objective factory and model registry.

- One-stop place to define search spaces and bind them to Trainer classes.
- Add a model by registering: ("module.path:TrainerClass", search_space_fn, tags)
"""

from __future__ import annotations
from typing import Callable, Any, Type
import importlib
import json
from pathlib import Path

import optuna
import wandb
from optuna.exceptions import TrialPruned

from src.utils.snapshot_study import snapshot_study
from src.utils.telegram import send_message
from src.utils.loggers import WandbLogger

# ---- Types ----
# Objective: what Optuna optimises — takes a trial and returns a float score.
Objective          = Callable[[optuna.trial.Trial], float]
# ObjectiveFactory: any callable that builds an Objective (e.g. create_objective).
ObjectiveFactory   = Callable[..., Objective]
# SearchSpace: maps a trial to the hyperparameter dict handed to a Trainer.
SearchSpace        = Callable[[optuna.trial.Trial], dict[str, Any]]
# TrainerSpec: import path of a Trainer class, resolved by _resolve().
TrainerSpec        = str  # "module.path:ClassName"

def _resolve(spec: TrainerSpec) -> Type:
    mod, name = spec.split(":")
    return getattr(importlib.import_module(mod), name)

# ---- Search spaces per model (the only per-model differences live here) ----


def space_lgbm(t: optuna.trial.Trial) -> dict[str, Any]:
    """Build the LightGBM hyperparameter dict for one trial.

    ``learning_rate`` is fixed at 0.02. Every other entry is drawn from the
    trial using the ranges below, with ``lambda_l1`` and ``lambda_l2`` sampled
    on a log scale.
    """
    params: dict[str, Any] = {"learning_rate": 0.02}

    # Integer-valued parameters.
    params["num_leaves"] = t.suggest_int("num_leaves", 31, 511)
    params["max_depth"] = t.suggest_int("max_depth", -1, 12)
    params["min_data_in_leaf"] = t.suggest_int("min_data_in_leaf", 10, 200)

    # Column / row subsampling.
    params["feature_fraction"] = t.suggest_float("feature_fraction", 0.6, 0.95)
    params["bagging_fraction"] = t.suggest_float("bagging_fraction", 0.6, 0.95)
    params["bagging_freq"] = t.suggest_int("bagging_freq", 0, 10)

    # Log-scaled penalties.
    params["lambda_l1"] = t.suggest_float("lambda_l1", 1e-4, 20.0, log=True)
    params["lambda_l2"] = t.suggest_float("lambda_l2", 1e-4, 20.0, log=True)
    return params

# Add more as needed: space_cb, space_tabnet, ...

# ---- Registry: model_name -> (Trainer class spec, search space fn, tags) ----
# create_objective() looks a model up here: the spec string is imported through
# _resolve(), the search-space function is called once per trial to build the
# params dict, and the tags are appended to the W&B run tags.
# NOTE(review): `space_xgb` is not defined anywhere in the visible source, so this
# assignment raises NameError unless it is defined or imported elsewhere — confirm.
_REGISTRY: dict[str, tuple[TrainerSpec, SearchSpace, list[str]]] = {
    "xgb":  ("src.models.xgb.xgb_cv_trainer:XGBCVTrainer",   space_xgb,  ["xgb"]),
    "lgbm": ("src.models.lgbm.lgbm_cv_trainer:LGBMCVTrainer", space_lgbm, ["lgbm"]),
    # "cb":   ("src.models.cb.cb_cv_trainer:CBCVTrainer", space_cb, ["cb"]),
    # ...
}

def available_models() -> list[str]:
    """Return the registered model names in ascending order."""
    names = list(_REGISTRY)
    names.sort()
    return names

# ---- Unified objective factory ----
def create_objective(
    model_type: str,
    data_id: int | str,
    *,
    seed: int = 42,
    n_folds: int = 5,
    fold_idx: int = 0,
    wandb_project: str = "project",
    study_name: str = "study",
    opts: dict | None = None,
    snapshot_every: int = 10,
) -> Objective:
    """
    Return Optuna objective(trial)->float for a given `model_type`.

    - Uses `_REGISTRY` to lazily import the corresponding Trainer class
      and to build its hyperparameter dict from `search_space(trial)`.
    - Common behaviors (W&B logging, manifest dump, pruning on OOM, snapshots)
      are centralized here.

    Args:
        model_type: Key into `_REGISTRY` (see `available_models()`).
        data_id: Feature-set identifier. It is interpolated verbatim into
            ``../../artifacts/features/{data_id}/meta.json``, so zero-padded
            IDs such as ``"047"`` must be passed as strings.
        seed: Forwarded to the Trainer and recorded in the manifest.
        n_folds: Forwarded to the Trainer and recorded in the W&B config.
        fold_idx: The single fold evaluated per trial (``fit_one_fold``).
        wandb_project: W&B project name.
        study_name: Used as the W&B group and as the manifest sub-directory.
        opts: Extra options forwarded to the Trainer and merged into the
            W&B config.
        snapshot_every: Call `snapshot_study` every N trials; a falsy value
            disables snapshots.

    Returns:
        A callable suitable for ``study.optimize``.

    Raises:
        ValueError: If `model_type` is not registered.
    """
    import logging  # local import keeps this block self-contained

    logger = logging.getLogger(__name__)

    try:
        trainer_spec, space_fn, tags = _REGISTRY[model_type]
    except KeyError:
        # The KeyError adds nothing beyond the message below; suppress the chain.
        raise ValueError(
            f"Unknown model type: {model_type}. Available: {', '.join(available_models())}"
        ) from None

    Trainer = _resolve(trainer_spec)
    optuna_root = Path("../../artifacts/optuna")

    # Features meta (shared by all models; adjust the path if needed).
    with open(f"../../artifacts/features/{data_id}/meta.json", encoding="utf-8") as f:
        meta = json.load(f)
    train_paths = meta["train_paths"]
    level = meta.get("level", "unknown")

    def objective(trial: optuna.trial.Trial) -> float:
        params = space_fn(trial)

        run = wandb.init(
            project=wandb_project,
            group=study_name,
            name=f"trl{trial.number}",
            job_type="optuna-search",
            config={"data_id": data_id, "n_folds": n_folds, "level": level,
                    "model": model_type, **params, **(opts or {})},
            tags=[model_type, level, *tags],
            reinit=True,
            dir="../../artifacts",
        )

        try:
            trainer = Trainer(
                data_id,
                train_paths,
                n_folds=n_folds,
                params=params,
                seed=seed,
                opts=opts,
            )
            score = trainer.fit_one_fold(fold_idx, loggers=[WandbLogger(run=run)])

            # One manifest per trial: <optuna_root>/<study_name>/trl<N>.json
            study_dir = optuna_root / study_name
            study_dir.mkdir(parents=True, exist_ok=True)
            path = study_dir / f"trl{trial.number}.json"
            manifest = {
                "params": params, "n_folds": n_folds, "seed": seed, "fold_idx": fold_idx,
                "wandb_id": run.id, "wandb_url": run.url, "opts": opts, "score": score,
            }
            with open(path, "w", encoding="utf-8") as f:
                json.dump(manifest, f, indent=4)

            if snapshot_every and (trial.number + 1) % snapshot_every == 0:
                try:
                    snapshot_study(
                        study=trial.study,
                        study_name=study_name,
                        trial_num=trial.number,
                        out_root=optuna_root,
                        send_telegram=True,
                    )
                except Exception:
                    # Snapshots stay best-effort (a failure must not fail the
                    # trial), but the failure is logged instead of vanishing.
                    logger.warning(
                        "snapshot_study failed (study=%s, trial=%s)",
                        study_name, trial.number, exc_info=True,
                    )

            return score

        except RuntimeError as e:
            msg = str(e)
            if "CUDA out of memory" in msg:
                send_message(f"[OOM] study={study_name} tr={trial.number} params={trial.params}")
                # Chain the original error so the OOM traceback stays attached.
                raise TrialPruned("OOM -> pruned") from e
            else:
                send_message(f"[ERROR] study={study_name} tr={trial.number} {type(e).__name__}: {msg}")
                raise
        finally:
            # Always close the W&B run, whether the trial succeeded or not.
            wandb.finish()

    return objective
