In [10]:
import os
import json
import math
from collections import Counter
from itertools import combinations
from datetime import datetime, timezone, timedelta
from pathlib import Path

import numpy as np
import polars as pl

### Configuration

In [11]:
# --- Experiment identity ---
ID = "002"       # feature-set id; also the name of the output sub-directory
SEED = 42
level = "l1"     # stored in meta.json under "level"

# --- Output location ---
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# --- Display: let polars print up to 500 rows and 500 columns ---
pl.Config.set_tbl_cols(500)
pl.Config.set_tbl_rows(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/002


### Utils

In [12]:
def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries for the train/test frames.

    Args:
        train: Training frame; its dtypes are the ones summarised.
        test: Test frame; only its shape and memory are reported.

    Returns:
        ``(train_mem, test_mem, n_cat)`` where the memory figures are in GiB
        (bytes / 1024**3) and ``n_cat`` is the number of Categorical columns
        in ``train``, or ``None`` when there are none.
    """
    def _mem_gb(df) -> float:
        # Sum of the numpy buffer sizes of every column, expressed in GiB.
        # NOTE(review): for non-numeric columns the numpy view may not reflect
        # the real in-memory size; polars' estimated_size() might be more
        # accurate -- confirm before relying on these numbers.
        return sum(df[col].to_numpy().nbytes for col in df.columns) / 1024**3

    train_mem = _mem_gb(train)
    test_mem = _mem_gb(test)

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    n_cat = 0
    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")
        # The dtype repr carries parameters, e.g. "Categorical(ordering='physical')",
        # so an exact comparison with "Categorical" never matches; use the prefix
        # and accumulate in case several Categorical variants are present.
        if dtype.startswith("Categorical"):
            n_cat += cnt
    # Keep the original contract: None (not 0) when there is no Categorical column.
    return train_mem, test_mem, (n_cat or None)


def downcast(df: pl.DataFrame) -> pl.DataFrame:
    """Shrink numeric columns to 32-bit types where this is lossless for ints.

    Every Float64 column is cast to Float32. An Int64 column is cast to Int32
    only when its observed min and max both fit in the Int32 range; other
    Int64 columns are left untouched.

    Args:
        df: Frame to downcast.

    Returns:
        A new frame with the narrower dtypes applied.
    """
    INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

    df = df.with_columns(pl.col(pl.Float64).cast(pl.Float32))

    # Pick only the Int64 columns that can be narrowed without overflow.
    int64_cols = [c for c, dt in df.schema.items() if dt == pl.Int64]
    safe_cols = []
    for c in int64_cols:
        mn, mx = df[c].min(), df[c].max()
        # min()/max() yield None for an empty or all-null column; such a
        # column holds no value that could overflow, so it is safe to narrow
        # (and comparing None with an int would raise TypeError).
        if mn is None or mx is None or (mn >= INT32_MIN and mx <= INT32_MAX):
            safe_cols.append(c)

    # Cast only the safe columns to Int32.
    if safe_cols:
        df = df.with_columns(pl.col(safe_cols).cast(pl.Int32))
    return df

### Feature Engineering
- Month, DayをSin, Cosで変換
- Balance, DurationのDigitsを追加
- 数値列の全ペアについて積と商（log1p ベースの符号付き対数差）を追加し、各数値列の二乗も追加

In [13]:
# === Load Data ===
# Both CSVs lose their "id" column immediately after being read.
test = pl.read_csv("../../input/test.csv").drop("id")
train = pl.read_csv("../../input/train.csv").drop("id")

# Separate the label from the training features.
y_tr = train.get_column("y").cast(pl.Int32)
train = train.drop("y")

# String columns are treated as categorical, everything else as numeric.
CATS = [name for name, dtype in train.schema.items() if dtype == pl.Utf8]
NUMS = [name for name in train.columns if name not in CATS]
print(f"NUMS: {len(NUMS)}\n{NUMS}")
print(f"\nCATS: {len(CATS)}\n{CATS}")

NUMS: 7
['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

CATS: 9
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


In [14]:
# === Stack train and test so both share one categorical encoding ===
all_data = pl.concat([train, test], how="vertical")

# Per categorical column: string -> Categorical -> physical index -> dense rank -> Int32.
cat_exprs = [
    pl.col(name).cast(pl.Categorical).to_physical().rank("dense").cast(pl.Int32).alias(name)
    for name in CATS
]
all_data = all_data.with_columns(cat_exprs)

# Numeric block, taken as-is.
num_df = all_data.select(NUMS)

# Categorical block: the integer codes are converted back to Categorical through
# their string form, so the categories are the code strings, not the original labels.
cat_df = all_data.select(pl.col(CATS).cast(pl.Utf8).cast(pl.Categorical))

In [15]:
# Decimal digits (10**0 .. 10**3 place) of duration and balance.
exprs = []
digits_cols = ["duration", "balance"]
for k in (0, 1, 2, 3):
    for c in digits_cols:
        exprs.append(
            # abs() first: floor-division / modulo on a negative value does not
            # return the digits of the number (presumably balance can be
            # negative -- the arithmetic features already treat it as signed).
            ((pl.col(c).abs() // (10**k)) % 10)
            .cast(pl.Int32)
            .alias(f"{c}_digitL{k}")
        )

digits_df = num_df.select(exprs)
print(f"New cols: {len(digits_df.columns)}")

New cols: 8


In [16]:
# Cyclical (sin/cos) encodings of month and day.
TAU = 2 * math.pi

# `month` inside all_data is an integer code derived from the Categorical
# physical index, which does not follow calendar order. The calendar month is
# therefore rebuilt from the raw labels still held in train/test.
# Assumes English month names or 3-letter abbreviations; replace_strict raises
# on any label that is not covered, so a wrong assumption fails loudly.
MONTH_TO_NUM = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}
month_num = (
    pl.concat([train["month"], test["month"]])
    .str.to_lowercase()
    .str.slice(0, 3)
    .replace_strict(MONTH_TO_NUM, return_dtype=pl.Int32)
)

# Same row order as all_data: train rows first, then test rows.
cyc_src = pl.DataFrame({"month": month_num, "day": num_df["day"]})

cyc_exprs = [
    (pl.col("month") * (TAU / 12)).sin().alias("month_sin"),
    (pl.col("month") * (TAU / 12)).cos().alias("month_cos"),
    (pl.col("day") * (TAU / 31)).sin().alias("day_sin"),
    (pl.col("day") * (TAU / 31)).cos().alias("day_cos"),
]
cyc_df = cyc_src.select(cyc_exprs)
print(f"New cols: {len(cyc_df.columns)}")

New cols: 4


In [17]:
# Pairwise products / log-ratios and per-column squares of the numeric columns.
def _signed_log1p(name: str) -> pl.Expr:
    """Return sign(x) * log1p(|x|) for column `name`."""
    col = pl.col(name)
    return np.log1p(col.abs()) * col.sign()


exprs = []
pairs = list(combinations(NUMS, 2))
for c1, c2 in pairs:
    # "Quotient" expressed as a difference of signed logs. Each term carries the
    # sign of its own column (the second term previously reused c1's sign, which
    # zeroed the feature whenever c1 == 0 and ignored c2's sign).
    exprs.append(
        (_signed_log1p(c1) - _signed_log1p(c2)).alias(f"{c1}_div_{c2}")
    )
    exprs.append(
        (pl.col(c1) * pl.col(c2)).alias(f"{c1}_mul_{c2}")
    )
for c in NUMS:
    exprs.append((pl.col(c) ** 2).alias(f"{c}_sq"))

arith_df = num_df.select(exprs)
print(f"New cols: {len(arith_df.columns)}")

New cols: 49


In [18]:
# Assemble the feature table column-wise, in this order:
# numeric, categorical, digit, cyclic, arithmetic.
feature_blocks = [num_df, cat_df, digits_df, cyc_df, arith_df]
all_data = pl.concat(feature_blocks, how="horizontal")

In [19]:
# === Add row_id, then downcast ===
all_data = downcast(all_data.with_row_index("row_id"))

# === Split back into train / test (train rows come first) ===
n_train = len(train)
assert all_data.height == n_train + len(test), "row count differs from train + test"
tr_df = all_data[:n_train]
test_df = all_data[n_train:]

# === Attach the target to the training rows ===
tr_df = tr_df.with_columns(y_tr.alias("target"))

### Add Fold Col

In [20]:
folds_path = "../../artifacts/folds/folds.parquet"

# Each entry: (cfg value stored in the folds file, column name to expose on tr_df).
pairs = [
    ("skf/k=5/s=42@train", "5fold-s42")
]
rename_map = dict(pairs)
cfgs = list(rename_map)

# Step 1 (lazy): keep only the requested cfgs, one row per (row_id, cfg).
folds_long = (
    pl.scan_parquet(folds_path)
      .filter(pl.col("cfg").is_in(cfgs))
      .unique(subset=["row_id", "cfg"], keep="last")
      .select(["row_id", "cfg", "fold"])
      .collect(engine="streaming")
)

# Step 2: one column per cfg, renamed to its fold name, with compact dtypes.
folds_wide = (
    folds_long
      .pivot(values="fold", index="row_id", on="cfg", aggregate_function="first")
      .rename(rename_map)
      .with_columns(
          pl.col("row_id").cast(pl.Int32),
          pl.all().exclude("row_id").cast(pl.Int8),
      )
)

# Left join keeps every training row; rows without a fold entry get nulls.
tr_df = tr_df.join(folds_wide, how="left", on="row_id")

In [33]:
# === Summary after feature engineering ===
# Prints shape / memory / dtype information and keeps the returned values.
train_mem, test_mem, n_cat = check_info(tr_df, test_df)

=== Shape & Memory ===
Train Shape: (750000, 80), Test Shape: (250000, 78)
Train Memory: 0.25 GB, Test Memory: 0.08 GB

=== DTypes ===
UInt32: 1
Int32: 43
Categorical(ordering='physical'): 9
Float32: 25
Int64: 1
Int8: 1


In [34]:
# === Save Data ===
# Both splits are written as parquet files inside FEATURE_DIR.
tr_path, test_path = FEATURE_DIR / "train.parquet", FEATURE_DIR / "test.parquet"

tr_df.write_parquet(tr_path)
test_df.write_parquet(test_path)

# Report only after both writes have completed.
print(
    f"tr_df saved successfully to {tr_path}",
    f"test_df saved successfully to {test_path}",
    sep="\n",
)

tr_df saved successfully to ../../artifacts/features/002/train.parquet
test_df saved successfully to ../../artifacts/features/002/test.parquet


### Save Meta Data

In [35]:
# Timestamps are recorded with a fixed UTC+9 offset.
JST = timezone(timedelta(hours=9))

# Description of this feature set, written next to the parquet files.
meta = {
    "data_id": ID,
    "train_paths": [str(FEATURE_DIR / "train.parquet")],
    "test_paths": [str(FEATURE_DIR / "test.parquet")],
    "level": level,
    "created_at": datetime.now(tz=JST).isoformat(),
    "train_shape": list(tr_df.shape),
    "test_shape": list(test_df.shape),
    "memory": {"train": train_mem, "test": test_mem},
    "fold_column": pairs,
    # Falsy counts (None or 0) are stored as null.
    "cat_cols": n_cat or None,
}

meta_path = FEATURE_DIR / "meta.json"
with meta_path.open("w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)