In [1]:
import os
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

### Configuration

In [2]:
# --- Experiment identity -----------------------------------------------
ID = "034"
SEED = 42
N_SPLITS = [5]
LEVEL = "l2"

# --- Output location ---------------------------------------------------
# All artifacts written by this notebook go under a per-ID feature folder.
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# --- Display limits ----------------------------------------------------
# Raise the row/column display caps of both pandas and polars to 500.
for _opt in ("display.max_rows", "display.max_columns"):
    pd.set_option(_opt, 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/034


### Utils

In [3]:
def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries of the train/test frames.

    Args:
        train: Training frame. Its dtypes drive the dtype summary and the
            categorical-column count.
        test: Test frame. Only its shape and memory usage are reported.

    Returns:
        A ``(train_mem, test_mem, n_cat)`` tuple where the two memory values
        are the summed NumPy buffer sizes of every column in GiB
        (bytes / 1024**3) and ``n_cat`` is the number of Categorical columns
        in ``train``, or ``None`` when there are none.
    """
    def _size_gib(frame: pl.DataFrame) -> float:
        # Size is measured on the NumPy view of each column.
        return sum(frame[col].to_numpy().nbytes for col in frame.columns) / 1024**3

    train_mem = _size_gib(train)
    test_mem = _size_gib(test)

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")

    # Compare dtype objects instead of their string form: the repr of a
    # Categorical dtype may include parameters, so a comparison against the
    # literal "Categorical" can silently miss every categorical column.
    n_cat = sum(1 for dt in train.dtypes if dt == pl.Categorical) or None
    return train_mem, test_mem, n_cat

### Feature Engineering

In [4]:
# Expected number of rows per split. Prediction arrays longer than this are
# truncated to these lengths (presumably some runs saved extra rows -- TODO confirm).
N_TRAIN_ROWS = 750_000
N_TEST_ROWS = 250_000

runs_root = Path("../../runs")
run_dirs = sorted(p for p in runs_root.iterdir() if p.is_dir())

oof_cols = []
test_cols = []
oof_arrays = []
test_arrays = []

for rd in run_dirs:
    oof_path = rd / "oof.npy"
    test_path = rd / "test.npy"
    # Only runs that saved both OOF and test predictions can be stacked.
    if not (oof_path.exists() and test_path.exists()):
        continue

    name = rd.name  # e.g. xgb-031-trl6-5fold-s42
    oof = np.load(oof_path)[:N_TRAIN_ROWS]
    test = np.load(test_path)[:N_TEST_ROWS]

    # Slicing only guards against arrays that are too long; fail loudly on
    # arrays that are too short instead of hitting an obscure shape error later.
    if oof.shape[0] != N_TRAIN_ROWS or test.shape[0] != N_TEST_ROWS:
        raise ValueError(
            f"Run {name!r} has unexpected prediction lengths: "
            f"oof={oof.shape[0]} (expected {N_TRAIN_ROWS}), "
            f"test={test.shape[0]} (expected {N_TEST_ROWS})"
        )

    # Store the predictions under the run name, which becomes the column name.
    oof_arrays.append(oof)
    test_arrays.append(test)
    oof_cols.append(name)
    test_cols.append(name)

if not oof_cols:
    raise FileNotFoundError(
        f"No run directory under {runs_root} contains both oof.npy and test.npy"
    )

# === Stack the per-run predictions side by side into DataFrames ===
oof_df = pl.DataFrame({c: a for c, a in zip(oof_cols, oof_arrays)})
test_df = pl.DataFrame({c: a for c, a in zip(test_cols, test_arrays)})


# === Add row_id ===
oof_df = oof_df.with_row_index("row_id")

# === Add target ===
# The target is attached positionally.
# NOTE(review): assumes tr_df.parquet rows are in the same order as the OOF arrays -- confirm.
lf = pl.scan_parquet("../../artifacts/features/033/tr_df.parquet")
y = lf.select("target").collect()
oof_df = oof_df.with_columns(y.get_column("target"))

### Add Fold Column

In [5]:
folds_path = "../../artifacts/folds/folds.parquet"
# (cfg value stored in the folds file, column name to expose in the output)
pairs = [
    ("skf/k=5/s=42@train", "5fold-s42")
]
cfgs = [c for c, _ in pairs]
rename_map = {c: n for c, n in pairs}

# Read all requested fold configs at once -> pivot to wide format (one column
# per cfg value) -> rename each cfg column to its fold name.
folds_wide = (
    pl.scan_parquet(folds_path)
      .filter(pl.col("cfg").is_in(cfgs))
      .unique(subset=["row_id", "cfg"], keep="last")
      .select(["row_id", "cfg", "fold"])
      .collect(engine="streaming")
      # `on=` replaces the deprecated `columns=` keyword of DataFrame.pivot.
      .pivot(on="cfg", index="row_id", values="fold", aggregate_function="first")
      .rename(rename_map)
      .with_columns(pl.col("row_id").cast(pl.Int32))
      .with_columns([pl.all().exclude("row_id").cast(pl.Int8)])  # use a smaller dtype
)

# Attach the fold assignment to every OOF row; rows without a match get null.
# NOTE(review): oof_df.row_id comes from with_row_index while folds_wide.row_id
# is Int32 -- the join relies on polars reconciling the key dtypes; confirm.
# NOTE(review): this reassigns oof_df, so re-running only this cell joins the
# fold column a second time; re-run the feature-engineering cell first.
oof_df = oof_df.join(folds_wide, on="row_id", how="left")

  pl.scan_parquet(folds_path)


In [6]:
# === Dataset info after feature engineering ===
# Prints the shape / memory / dtype summary and keeps the returned values,
# which are written into the metadata file later in the notebook.
train_mem, test_mem, n_cat = check_info(oof_df, test_df)

=== Shape & Memory ===
Train Shape: (750000, 42), Test Shape: (250000, 39)
Train Memory: 0.11 GB, Test Memory: 0.04 GB

=== DTypes ===
UInt32: 1
Float32: 39
Int8: 2


In [7]:
# === Save Data ===
# Write the stacked train (OOF) and test frames as parquet files in FEATURE_DIR.
for _frame, _filename in ((oof_df, "train.parquet"), (test_df, "test.parquet")):
    _frame.write_parquet(FEATURE_DIR / _filename)

### Save Metadata

In [8]:
# Timestamps in the metadata are recorded in UTC+9.
JST = timezone(timedelta(hours=9))

# Description of the dataset produced by this notebook: where the files live,
# their shapes and memory footprint, and which runs supplied the columns.
meta = {
    "data_id": ID,
    "train_paths": [str(FEATURE_DIR.joinpath("train.parquet"))],
    "test_paths": [str(FEATURE_DIR.joinpath("test.parquet"))],
    "level": LEVEL,
    "created_at": datetime.now(JST).isoformat(),
    "train_shape": list(oof_df.shape),
    "test_shape": list(test_df.shape),
    "memory": dict(train=train_mem, test=test_mem),
    "n_splits": N_SPLITS,
    # Falsy counts (None or 0) are stored as null.
    "cat_cols": n_cat or None,
    "run_ids": oof_cols,
}

# ensure_ascii=False keeps any non-ASCII text readable in the file.
with open(f"{FEATURE_DIR}/meta.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(meta, ensure_ascii=False, indent=2))