In [1]:
import os
import re
import json
import math
from collections import Counter
from datetime import datetime, timezone, timedelta
from pathlib import Path

import ipynbname
import polars as pl

### Configuration

In [2]:
# The numeric suffix of this notebook's file name is used as the ID of the
# feature set it produces (and therefore as the output directory name).
stem = ipynbname.path().stem
m = re.search(r"(\d+)$", stem)
if m is None:
    # Fail with an actionable message instead of an AttributeError on None.
    raise ValueError(
        f"Cannot derive a feature ID: notebook name {stem!r} does not end with digits."
    )

ID = m.group(1)
SEED = 42
level = "l1"

# Upstream feature set and the run directory that holds the column selection.
resource_id = "045"
runs_id = "runs/xgb-045-trl5-5fold-s42"

threshold = "0.95"  # choose one of 0.90, 0.95, 0.99

FEATURE_DIR = Path(f"../../artifacts/features/{ID}")

# Same semantics as os.makedirs(..., exist_ok=True): create parents, ignore if present.
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# Raise the polars table display limits to 500 rows / 500 columns.
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/046


### Utils

In [3]:
def _frame_nbytes(df: pl.DataFrame) -> int:
    """Return the summed ``nbytes`` of every column of ``df`` after ``to_numpy()``."""
    return sum(df[col].to_numpy().nbytes for col in df.columns)


def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> "tuple[float, float, int | None]":
    """Print a shape / memory / dtype summary and return the key figures.

    Memory is measured as the total ``nbytes`` of each column converted to a
    NumPy array, divided by 1024**3. The dtype breakdown is computed on
    ``train`` only.

    Args:
        train: Training frame.
        test: Test frame.

    Returns:
        ``(train_mem, test_mem, n_cat)`` where the first two are the memory
        figures described above and ``n_cat`` is the number of categorical
        columns in ``train``, or ``None`` when there are none.
    """
    bytes_per_unit = 1024**3
    train_mem = _frame_nbytes(train) / bytes_per_unit
    test_mem = _frame_nbytes(test) / bytes_per_unit

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")

    # A categorical dtype may be rendered with parameters, e.g.
    # "Categorical(ordering='physical')", so match on the prefix and add up
    # every variant rather than requiring the exact string "Categorical".
    n_cat = sum(
        cnt for dtype, cnt in dtype_counts.items() if dtype.startswith("Categorical")
    )
    return train_mem, test_mem, n_cat or None

### Feature Engineering
- 045を閾値95%でcolを選別

In [7]:
# === Load Data ===
resource_dir = Path(f"../../artifacts/features/{resource_id}")

# The upstream feature set's meta.json lists the parquet files to read.
with open(resource_dir / "meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

train_paths = meta["train_paths"]
test_paths = meta["test_paths"]

# Columns selected for the chosen threshold, as recorded by the referenced run.
with open(f"../../{runs_id}/keep_cols.json", "r", encoding="utf-8") as f:
    keep_cols = json.load(f)[threshold]["cols"]

# Train additionally carries three non-feature columns
# (presumably label, fold assignment and row identifier - judging by their names).
train_cols = keep_cols + ["target", "5fold-s42", "row_id"]

# Read only the schema (zero rows) so a bad column selection fails fast,
# before the full data is loaded.
all_cols = pl.read_parquet(train_paths, n_rows=0).columns
missing_cols = sorted(set(train_cols) - set(all_cols))
if missing_cols:
    raise KeyError(
        f"Columns not found in train data of feature set {resource_id}: {missing_cols}"
    )

train = pl.read_parquet(train_paths, columns=train_cols)
test = pl.read_parquet(test_paths, columns=keep_cols)

In [8]:
# === Dataset summary after feature engineering ===
train_mem, test_mem, n_cat = check_info(train=train, test=test)

=== Shape & Memory ===
Train Shape: (750000, 345), Test Shape: (250000, 342)
Train Memory: 0.96 GB, Test Memory: 0.32 GB

=== DTypes ===
Float32: 320
Int32: 21
Int64: 1
Int8: 2
UInt32: 1


In [9]:
# === Save Overall Data ===
# Output locations inside this notebook's feature directory.
tr_path = FEATURE_DIR.joinpath("train.parquet")
test_path = FEATURE_DIR.joinpath("test.parquet")

# Write both frames first; the confirmations are printed only after both succeed.
for _frame, _dest in ((train, tr_path), (test, test_path)):
    _frame.write_parquet(_dest)

print(f"tr_df saved successfully to {tr_path}")
print(f"test_df saved successfully to {test_path}")

tr_df saved successfully to ../../artifacts/features/046/train.parquet
test_df saved successfully to ../../artifacts/features/046/test.parquet


### Save Metadata

In [10]:
# Timestamps are recorded in UTC+9.
JST = timezone(timedelta(hours=9))

# (label, column name) pairs written under "fold_column" in the metadata.
pairs = [("skf/k=5/s=42@train", "5fold-s42")]

# Description of the feature set written above; key order is the order in the file.
meta = dict(
    data_id=ID,
    train_paths=[str(tr_path)],
    test_paths=[str(test_path)],
    level=level,
    created_at=datetime.now(JST).isoformat(),
    train_shape=[train.height, train.width],
    test_shape=[test.height, test.width],
    memory=dict(train=train_mem, test=test_mem),
    fold_column=pairs,
    # Holds the categorical-column count from check_info; falsy values become None.
    cat_cols=n_cat or None,
)

with open(f"{FEATURE_DIR}/meta.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(meta, ensure_ascii=False, indent=2))

# Echo every entry as "key: value", one per line.
print(*(f"{key}: {value}" for key, value in meta.items()), sep="\n")

data_id: 046
train_paths: ['../../artifacts/features/046/train.parquet']
test_paths: ['../../artifacts/features/046/test.parquet']
level: l1
created_at: 2025-10-03T15:42:24.673118+09:00
train_shape: [750000, 345]
test_shape: [250000, 342]
memory: {'train': 0.962521880865097, 'test': 0.31944364309310913}
fold_column: [('skf/k=5/s=42@train', '5fold-s42')]
cat_cols: None
