In [9]:
import os
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from pathlib import Path

import pandas as pd
import polars as pl

### Configuration

In [10]:
# Identifier of this feature set; used for the output directory and in meta.json.
ID = "029"
SEED = 42
# Recorded as "level" in meta.json.
LEVEL = "l1"
# Directory that receives the parquet files and meta.json for this feature set.
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")

# Raise the pandas / polars display limits to 500 rows and 500 columns.
for _option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_option, 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

# Create the output directory; an already existing directory is not an error.
os.makedirs(FEATURE_DIR, exist_ok=True)
print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/029


### Utils

In [11]:
def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries for the train and test frames.

    Memory is measured by converting every column to a NumPy array and summing
    ``nbytes``; the total is divided by ``1024**3``. The dtype breakdown is
    computed on ``train`` only.

    Args:
        train: Training frame.
        test: Test frame.

    Returns:
        ``(train_mem, test_mem, n_cat)``. The first two are the measured sizes
        of ``train`` and ``test``; ``n_cat`` is the number of Categorical
        columns in ``train``, or ``None`` when there are none.
    """
    def _size_of(frame):
        # Sum of per-column NumPy buffer sizes, scaled by 1024**3.
        return sum(frame[col].to_numpy().nbytes for col in frame.columns) / 1024**3

    train_mem = _size_of(train)
    test_mem = _size_of(test)

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")

    # Compare dtype objects rather than their string form: a Categorical
    # instance may stringify with parameters (e.g. an ordering), which an exact
    # match against "Categorical" would miss.
    n_cat = sum(1 for dt in train.dtypes if dt == pl.Categorical) or None
    return train_mem, test_mem, n_cat


def downcast(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with 64-bit numeric columns narrowed to 32 bits where possible.

    Every Float64 column is cast to Float32 unconditionally (no range check is
    performed for floats). An Int64 column is cast to Int32 only when both its
    minimum and maximum fit in the Int32 range, or when it has no non-null
    values at all. Other columns are left untouched.

    Args:
        df: Frame to downcast.

    Returns:
        A frame with the same columns and the narrowed dtypes.
    """
    INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

    df = df.with_columns(pl.col(pl.Float64).cast(pl.Float32))

    # Pick only the Int64 columns that can be narrowed safely.
    int64_cols = [c for c, dt in df.schema.items() if dt == pl.Int64]
    safe_cols = []
    for c in int64_cols:
        mn, mx = df[c].min(), df[c].max()
        # min()/max() yield None for an empty or all-null column; comparing
        # None with an int would raise TypeError. With no values present there
        # is nothing that could overflow, so the column is safe to narrow.
        if mn is None or mx is None:
            safe_cols.append(c)
        elif mn >= INT32_MIN and mx <= INT32_MAX:
            safe_cols.append(c)

    # Cast only the safe columns to Int32.
    if safe_cols:
        df = df.with_columns(pl.col(safe_cols).cast(pl.Int32))
    return df

### Feature Engineering
- 027 の特徴量データから、特徴量重要度が低い列（`xgb-027-trl22-s42` の `drop_cols_99.json` に列挙されたもの）を除外する

In [12]:
# === Load Data ===
source_path = Path("../../artifacts/features/027")

# Column names to exclude, read from the drop list stored with the xgb-027 run.
with open("../../runs/xgb-027-trl22-s42/drop_cols_99.json", "r") as f:
    drop_cols = json.load(f)


def _kept_columns(parquet_path: Path) -> list[str]:
    """Return the file's column names, minus any that appear in ``drop_cols``."""
    # Only the schema is collected here, so the data itself is not loaded.
    all_names = pl.scan_parquet(parquet_path).collect_schema().names()
    return [name for name in all_names if name not in drop_cols]


# Read each frame with just the surviving columns.
tr_cols = _kept_columns(source_path / "tr_df.parquet")
tr_df = pl.read_parquet(source_path / "tr_df.parquet", columns=tr_cols)

test_cols = _kept_columns(source_path / "test_df.parquet")
test_df = pl.read_parquet(source_path / "test_df.parquet", columns=test_cols)

In [4]:
# === Save Data ===
# Write the filtered train/test frames into this feature set's directory.
for _frame, _filename in ((tr_df, "train.parquet"), (test_df, "test.parquet")):
    _frame.write_parquet(FEATURE_DIR / _filename)

In [7]:
# === Info after feature engineering ===
# The returned values are stored in meta.json by the metadata cell.
train_mem, test_mem, n_cat = check_info(train=tr_df, test=test_df)

=== Shape & Memory ===
Train Shape: (750000, 1818), Test Shape: (250000, 1816)
Train Memory: 5.09 GB, Test Memory: 1.70 GB

=== DTypes ===
UInt32: 1
Int32: 7
Float32: 1808
Int8: 2


### Save Meta Data

In [13]:
fold_name = "5fold-s42"
# Fixed UTC+9 offset used for the creation timestamp.
JST = timezone(timedelta(hours=9))

# Metadata record for this feature set: where the files are, their shapes and
# measured sizes, and which drop list was applied.
meta = dict(
    data_id=ID,
    train_paths=[str(FEATURE_DIR / "train.parquet")],
    test_paths=[str(FEATURE_DIR / "test.parquet")],
    level=LEVEL,
    created_at=datetime.now(JST).isoformat(),
    train_shape=[tr_df.height, tr_df.width],
    test_shape=[test_df.height, test_df.width],
    memory=dict(train=train_mem, test=test_mem),
    fold_column=fold_name,
    # Categorical column count from check_info; null when there are none.
    cat_cols=n_cat or None,
    drop_cols="../../runs/xgb-027-trl22-s42/drop_cols_99.json",
)

# ensure_ascii=False keeps any non-ASCII text readable in the written file.
with open(f"{FEATURE_DIR}/meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)