In [3]:
import os
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import pyarrow.parquet as pq
from sklearn.model_selection import StratifiedKFold

### Configuration

In [4]:
# Configuration
# Identifiers and hyper-parameters for this feature set.
ID = "032"
SEED = 42
N_SPLITS = [7, 10, 15, 20]
LEVEL = "l1"

# Inputs: the upstream feature set and the list of columns to discard.
source_path = Path("../../artifacts/features/023")
drop_cols_path = "../../runs/xgb-023-trl57-s42/drop_cols_95.json"

# Output directory for everything this notebook writes.
FEATURE_DIR = Path("../../artifacts/features/base") / ID
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# Let wide/long frames render without truncation in both libraries.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/base/032


### Utils

In [None]:
def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries for the train/test frames.

    Memory is measured by converting every column to a NumPy array and
    summing the arrays' ``nbytes``; the totals are reported in GiB
    (bytes / 1024**3).

    Args:
        train: Training frame. Its dtypes drive the dtype summary.
        test: Test frame. Only its shape and memory are reported.

    Returns:
        ``(train_mem, test_mem, n_cat)`` where the first two values are the
        memory totals in GiB and ``n_cat`` is the number of Categorical
        columns in ``train``, or ``None`` when there are none.
    """
    train_mem = sum(train[col].to_numpy().nbytes for col in train.columns) / 1024**3
    test_mem = sum(test[col].to_numpy().nbytes for col in test.columns) / 1024**3

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")

    # Compare dtypes directly instead of matching on their string form: the
    # repr of a Categorical dtype may carry parameters, in which case a
    # comparison against the literal "Categorical" would never match.
    n_cat = sum(1 for dt in train.dtypes if dt == pl.Categorical)

    # Keep the original contract: None (not 0) when no Categorical column exists.
    return train_mem, test_mem, n_cat or None


def downcast(df: pl.DataFrame) -> pl.DataFrame:
    """Shrink numeric columns to 32-bit types where possible.

    Every Float64 column is cast to Float32 (this reduces precision).
    An Int64 column is cast to Int32 only when all of its values fit in the
    Int32 range; other Int64 columns are left untouched.

    Args:
        df: Frame to downcast. It is not modified; a new frame is returned.

    Returns:
        A frame with the same columns, using narrower dtypes where safe.
    """
    INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

    df = df.with_columns(pl.col(pl.Float64).cast(pl.Float32))

    # Select only the Int64 columns that can be narrowed without overflow.
    int64_cols = [c for c, dt in df.schema.items() if dt == pl.Int64]
    safe_cols = []
    for c in int64_cols:
        mn, mx = df[c].min(), df[c].max()
        if mn is None or mx is None:
            # min()/max() yield None for an empty or all-null column. There is
            # no value that could overflow, so the cast is safe; comparing
            # None with an int would otherwise raise TypeError.
            safe_cols.append(c)
        elif mn >= INT32_MIN and mx <= INT32_MAX:
            safe_cols.append(c)

    # Cast only the safe columns to Int32.
    if safe_cols:
        df = df.with_columns(pl.col(safe_cols).cast(pl.Int32))
    return df

### Feature Engineering
- 023のデータから、特徴量重要度が低い特徴量（`drop_cols_path` のJSONに列挙された列）を除外

In [5]:
# === Load Data ===
# Names of the columns that must not be loaded from the source feature set.
with open(drop_cols_path, "r") as f:
    drop_cols = json.load(f)

train_parquet = source_path / "tr_df.parquet"
test_parquet = source_path / "test_df.parquet"

# Inspect the schema lazily first so dropped columns are never read from disk.
tr_cols = [
    name
    for name in pl.scan_parquet(train_parquet).collect_schema().names()
    if name not in drop_cols
]
tr_df = pl.read_parquet(train_parquet, columns=tr_cols)

test_cols = [
    name
    for name in pl.scan_parquet(test_parquet).collect_schema().names()
    if name not in drop_cols
]
test_df = pl.read_parquet(test_parquet, columns=test_cols)

In [6]:
# Read only the target column from the source file; it is the
# stratification label for every split configuration below.
pqfile = pq.ParquetFile(source_path / "tr_df.parquet")
table = pqfile.read(columns=["target"])
y = table["target"].combine_chunks().to_numpy()

n_rows = len(y)

# Add one fold-id column to the training frame per requested number of splits.
for n_splits in N_SPLITS:
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # Each row gets the index of the fold in which it is a validation row.
    fold_ids = np.zeros(n_rows, dtype=int)
    splits = skf.split(range(n_rows), y)
    for fold_idx, (_, val_idx) in enumerate(splits):
        fold_ids[val_idx] = fold_idx

    fold_name = f"{n_splits}fold-s{SEED}"
    fold_expr = pl.lit(fold_ids).alias(fold_name).cast(pl.Int8)
    tr_df = tr_df.with_columns(fold_expr)

In [7]:
# === Info after feature engineering ===
# Prints the shape/memory/dtype summary; the returned values are reused
# when the metadata file is written.
train_mem, test_mem, n_cat = check_info(tr_df, test_df)

=== Shape & Memory ===
Train Shape: (750000, 206), Test Shape: (250000, 200)
Train Memory: 0.56 GB, Test Memory: 0.19 GB

=== DTypes ===
UInt32: 1
Int32: 4
Float32: 195
Int8: 6


In [8]:
# === Save Data ===
# Persist both frames as parquet files inside the feature directory.
train_out = FEATURE_DIR / "train.parquet"
test_out = FEATURE_DIR / "test.parquet"

tr_df.write_parquet(train_out)
test_df.write_parquet(test_out)

### Save Meta Data

In [9]:
# Fixed UTC+9 offset used for the creation timestamp.
JST = timezone(timedelta(hours=9))

train_file = FEATURE_DIR / "train.parquet"
test_file = FEATURE_DIR / "test.parquet"
meta_file = FEATURE_DIR / "meta.json"

# Description of the saved feature set: locations, shapes, memory
# footprint, split settings and the source of the dropped-column list.
meta = {
    "data_id": ID,
    "train_paths": [str(train_file)],
    "test_paths": [str(test_file)],
    "level": LEVEL,
    "created_at": datetime.now(JST).isoformat(),
    "train_shape": list(tr_df.shape),
    "test_shape": list(test_df.shape),
    "memory": {"tr_df": train_mem, "test_df": test_mem},
    "n_splits": N_SPLITS,
    # Falsy values (None or 0) are stored as null.
    "cat_cols": n_cat or None,
    "drop_cols": drop_cols_path,
}

with meta_file.open("w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)