In [12]:
import os
import re
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from pathlib import Path

import ipynbname
import numpy as np
import polars as pl

### Configuration

In [13]:
# Derive the dataset ID from the trailing digits of this notebook's file stem.
stem = ipynbname.path().stem
m = re.search(r"(\d+)$", stem)
if m is None:
    # re.search returns None on no match; fail with a clear message instead of
    # an AttributeError raised by `None.group(1)`.
    raise ValueError(
        f"Notebook name {stem!r} must end with digits to derive the feature ID"
    )

ID = m.group(1)
SEED = 42
N_SPLITS = [5]
LEVEL = "l2"

# All artifacts produced by this notebook are written below this directory.
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# Show up to 500 rows/columns when polars renders a table.
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/058


### Utils

In [14]:
def check_info(
    train: "pl.DataFrame",
    test: "pl.DataFrame"
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries for the train/test frames.

    Memory is the sum of every column's NumPy buffer size
    (``to_numpy().nbytes``) divided by 1024**3, i.e. expressed in GiB.

    Args:
        train: Training frame; its dtypes are the ones tallied and printed.
        test: Test frame; only its shape and memory are reported.

    Returns:
        ``(train_mem, test_mem, n_cat)``. The first two are sizes in GiB.
        ``n_cat`` is the number of train columns whose dtype string is exactly
        ``"Categorical"``, or ``None`` when no such column exists.
    """
    def _gib(frame) -> float:
        # Size of all column buffers of `frame`, in GiB.
        return sum(frame[col].to_numpy().nbytes for col in frame.columns) / 1024**3

    train_mem = _gib(train)
    test_mem = _gib(test)

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    # Tally dtypes by their string form, in first-seen order.
    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")

    # .get() yields None when no column is reported as "Categorical".
    n_cat = dtype_counts.get("Categorical")
    return train_mem, test_mem, n_cat

### Feature Engineering

In [15]:
# Level-1 runs whose predictions become the level-2 feature columns.
runs_root = Path("../../runs")
choosed_runs = [
    "xgb-057-trl5-5fold-s42",
    "xgb-057-trl0-5fold-s42",
    "xgb-056-trl0-5fold-s42",
    "xgb-055-trl0-5fold-s42",
    "xgb-052-trl0-5fold-s42",
    "xgb-050-trl0-5fold-s42",
    "mlp-057-trl0-5fold-s42",
    "lgbm-057-trl21-5fold-s42"
]
run_paths = [runs_root / r for r in choosed_runs]

# Row caps applied to every loaded prediction array.
# NOTE(review): presumably the row counts of the base train/test sets — confirm.
N_TRAIN_ROWS = 750_000
N_TEST_ROWS = 250_000

train_cols = []
test_cols = []
train_arrays = []
test_arrays = []

for rd in run_paths:
    train_path = rd / "oof.npy"
    test_path = rd / "test.npy"
    missing = [p for p in (train_path, test_path) if not p.exists()]
    if missing:
        # Name the file(s) actually absent. The run is skipped as a whole so
        # train and test end up with the same set of columns.
        for p in missing:
            print(f"{p} does not exist")
        continue

    name = rd.name  # e.g. xgb-031-trl6-5fold-s42
    train = np.load(train_path)[:N_TRAIN_ROWS]
    test = np.load(test_path)[:N_TEST_ROWS]

    train_arrays.append(train)
    test_arrays.append(test)
    train_cols.append(name)
    test_cols.append(name)

if not train_cols:
    # Fail here with a clear message rather than later with a shape error.
    raise FileNotFoundError(f"No usable runs (oof.npy + test.npy) found under {runs_root}")

# === Concatenate horizontally into DataFrames (one column per run) ===
train_df = pl.DataFrame(dict(zip(train_cols, train_arrays)))
test_df = pl.DataFrame(dict(zip(test_cols, test_arrays)))


# === Add row_id (train only) ===
train_df = train_df.with_row_index("row_id")

# === Add target ===
# The target is attached positionally, not via a key join.
# NOTE(review): assumes the parquet rows are in the same order as the OOF arrays — confirm.
lf = pl.scan_parquet("../../artifacts/features/033/tr_df.parquet")
y = lf.select("target").collect()
tr_df = train_df.with_columns(y)

### Add Fold col

In [16]:
folds_path = "../../artifacts/folds/folds.parquet"
# (cfg value stored in the folds file, column name to use in tr_df)
pairs = [
    ("skf/k=5/s=42@train", "5fold-s42")
]
cfgs = [c for c, _ in pairs]
rename_map = dict(pairs)
fold_cols = list(rename_map.values())

# Read the requested fold configs, pivot to wide form (one column per cfg),
# then rename each cfg column to its fold name.
folds_wide = (
    pl.scan_parquet(folds_path)
      .filter(pl.col("cfg").is_in(cfgs))
      .unique(subset=["row_id", "cfg"], keep="last")
      .select(["row_id", "cfg", "fold"])
      .collect(engine="streaming")
      .pivot(values="fold", index="row_id", on="cfg", aggregate_function="first")
      .rename(rename_map)
      # Match the join key dtype of tr_df exactly instead of hard-coding Int32.
      .with_columns(pl.col("row_id").cast(tr_df.schema["row_id"]))
      .with_columns([pl.all().exclude("row_id").cast(pl.Int8)])  # keep fold columns small
)

# Drop fold columns left over from a previous execution of this cell so that
# re-running it does not add duplicate "<name>_right" columns.
tr_df = tr_df.select(pl.exclude(fold_cols)).join(folds_wide, on="row_id", how="left")

In [17]:
# === Summary after feature engineering ===
# Prints shapes, memory and dtype counts; the returned values are kept for the metadata file.
train_mem, test_mem, n_cat = check_info(tr_df, test_df)

=== Shape & Memory ===
Train Shape: (750000, 11), Test Shape: (250000, 8)
Train Memory: 0.03 GB, Test Memory: 0.01 GB

=== DTypes ===
UInt32: 1
Float32: 8
Int8: 2


In [18]:
# === Save Data ===
# Resolve both output locations once and reuse them for writing and logging.
tr_path = FEATURE_DIR.joinpath("train.parquet")
test_path = FEATURE_DIR.joinpath("test.parquet")

tr_df.write_parquet(tr_path)
test_df.write_parquet(test_path)

print(f"tr_df saved successfully to {tr_path}")
print(f"test_df saved successfully to {test_path}")

tr_df saved successfully to ../../artifacts/features/058/train.parquet
test_df saved successfully to ../../artifacts/features/058/test.parquet


### Save Meta Data

In [19]:
# Fixed UTC+9 offset used for the creation timestamp.
JST = timezone(timedelta(hours=9))
created_at = datetime.now(JST).isoformat()
memory_gib = {"train": train_mem, "test": test_mem}

# Description of the saved feature set; key order is the order written to disk.
meta = {
    "data_id": ID,
    "train_paths": [str(tr_path)],
    "test_paths": [str(test_path)],
    "level": LEVEL,
    "created_at": created_at,
    "train_shape": list(tr_df.shape),
    "test_shape": list(test_df.shape),
    "memory": memory_gib,
    "n_splits": N_SPLITS,
    # A missing or zero categorical count is stored as null.
    "cat_cols": n_cat or None,
    "run_ids": train_cols,
}

meta_file = Path(f"{FEATURE_DIR}/meta.json")
meta_file.write_text(
    json.dumps(meta, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

# Echo every entry, one "key: value" line each.
print("\n".join(f"{k}: {v}" for k, v in meta.items()))

data_id: 058
train_paths: ['../../artifacts/features/058/train.parquet']
test_paths: ['../../artifacts/features/058/test.parquet']
level: l2
created_at: 2025-10-06T20:11:07.012577+09:00
train_shape: [750000, 11]
test_shape: [250000, 8]
memory: {'train': 0.026542693376541138, 'test': 0.007450580596923828}
n_splits: [5]
cat_cols: None
run_ids: ['xgb-057-trl5-5fold-s42', 'xgb-057-trl0-5fold-s42', 'xgb-056-trl0-5fold-s42', 'xgb-055-trl0-5fold-s42', 'xgb-052-trl0-5fold-s42', 'xgb-050-trl0-5fold-s42', 'mlp-057-trl0-5fold-s42', 'lgbm-057-trl21-5fold-s42']
