In [1]:
import os
import sys
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from itertools import combinations
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import pyarrow.parquet as pq
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

sys.path.append(os.path.abspath("../.."))

from src.utils.target_encoding import target_encoding

In [2]:
# === Configuration ===
# ID names this feature set and the directory its artifacts are written to.
# SEED and N_SPLITS parameterise the StratifiedKFold split built further down.
ID = "013"
SEED = 42
N_SPLITS = 5
FEATURE_DIR = Path(f"../../artifacts/features/base/{ID}")

# Create the output directory (and any missing parents) up front.
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# Raise the display limits of both dataframe libraries so wide/long frames
# are shown in full.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

polars.config.Config

## Feature Engineering

In [3]:
# === Load Data ===
# train/test come from CSV; their "id" column is dropped immediately.
train = pl.read_csv("../../input/train.csv").drop("id")
test = pl.read_csv("../../input/test.csv").drop("id")

# In the original dataset the label "y" is compared against the strings
# "yes"/"no": map them to 1/0, and anything else to null.
orig = pl.read_parquet("../../input/original.parquet").with_columns(
    pl.when(pl.col("y") == "yes")
    .then(1)
    .when(pl.col("y") == "no")
    .then(0)
    .otherwise(None)
    .alias("y")
)

# Labels as Int8, stacked in the same order the feature rows are stacked
# later on (train first, then orig).
y_tr = train.get_column("y").cast(pl.Int8)
y_orig = orig.get_column("y").cast(pl.Int8)
y_merged = pl.concat([y_tr, y_orig], how="vertical")

# From here on the frames hold features only.
train, orig = train.drop("y"), orig.drop("y")

# String columns are treated as categorical, everything else as numeric.
CATS = [name for name, dtype in train.schema.items() if dtype == pl.Utf8]
NUMS = [name for name, dtype in train.schema.items() if dtype != pl.Utf8]

In [4]:
# === Stack all rows into one frame ===
# Row order is train, orig, test; later cells recover the labelled/test split
# purely by row position, so this order must not change.
frames = [train, orig, test]
all_data = pl.concat(frames, how="vertical")

In [5]:
# === NUM → CAT ===
# Every column (numeric ones via their string representation) is converted to
# integer category codes. Numeric columns get a new "<name>2" column; string
# columns are replaced in place by their codes.
NUMS2CATS = [f"{c}2" for c in NUMS]


def _dense_codes(expr: pl.Expr) -> pl.Expr:
    """Encode a string expression as dense Int32 codes in [0, number of categories).

    The physical codes of a Categorical only identify categories; they are not
    guaranteed to be contiguous per column (e.g. when a global string cache or a
    category mapping shared between columns is active). The pairwise combo keys
    built from these codes use SIZES[col] as a radix, which is only collision-free
    if every code is < SIZES[col]. Dense-ranking the physical codes enforces that,
    and leaves the codes unchanged whenever they were already 0..n-1.
    Nulls stay null.
    """
    return (
        expr.cast(pl.Categorical)
        .to_physical()
        .rank(method="dense")  # 1..n, preserving the relative order of the codes
        .cast(pl.Int32)
        - 1
    )


num2cat_exprs = [
    _dense_codes(pl.col(c).cast(pl.Utf8)).alias(f"{c}2")
    for c in NUMS
]
cat_exprs = [
    _dense_codes(pl.col(c)).alias(c)
    for c in CATS
]

all_data = all_data.with_columns(
    num2cat_exprs + cat_exprs
)

# Cardinality of every encoded column (a null, if present, counts as one value).
SIZES = all_data.select(
    [pl.col(col)
     .n_unique()
     .alias(col) for col in CATS + NUMS2CATS]
).to_dicts()[0]

print(f"Created {len(NUMS2CATS)} new columns\n")
print(SIZES)

Created 7 new columns

{'job': 12, 'marital': 3, 'education': 4, 'default': 2, 'housing': 2, 'loan': 2, 'contact': 3, 'month': 12, 'poutcome': 4, 'age2': 78, 'balance2': 8590, 'day2': 31, 'duration2': 1824, 'campaign2': 52, 'pdays2': 628, 'previous2': 54}


In [6]:
# === Build all 2-way combination features ===
# Each pair of encoded columns (c1, c2) is fused into a single integer key
# c1 * SIZES[c2] + c2, i.e. c2's cardinality is used as the radix.
pairs = list(combinations(CATS + NUMS2CATS, 2))

COMBO = ["_".join(pair) for pair in pairs]

combo_exprs = [
    (pl.col(left) * SIZES[right] + pl.col(right)).alias(combo_name)
    for (left, right), combo_name in zip(pairs, COMBO)
]

all_data = all_data.with_columns(combo_exprs)

print(f"Created {len(combo_exprs)} new Combo columns")

Created 120 new Combo columns


In [7]:
# === Target Encoding ===
# The first len(train) + len(orig) rows are the labelled rows (same order as
# y_merged); everything after them is the test set.
n_labeled = len(train) + len(orig)
tr_df = all_data.slice(0, n_labeled).with_columns(y_merged.alias("target"))
test_df = all_data.slice(n_labeled)

# Encode every integer-coded column: numeric-as-category, categorical, combos.
te_cols = [*NUMS2CATS, *CATS, *COMBO]

# NOTE(review): target_encoding is a project helper not shown here. Its result
# is later concatenated horizontally with all_data, so it presumably returns
# one row per labelled+test row — confirm against src/utils/target_encoding.
te_df = target_encoding(tr_df, test_df, cat_cols=te_cols)

print(f"Created {len(te_df.columns)} new columns")

0it [00:00, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Created 136 new columns


In [8]:
# === Assemble the final feature table ===
# Keep only the raw numeric columns and the target-encoded columns; the
# integer-coded categorical and combo columns are dropped at this point.
numeric_part = all_data.select(NUMS)
all_data = pl.concat([numeric_part, te_df], how="horizontal").with_row_index("row_id")

In [9]:
# === Downcast ===
# Shrink the frame: Float64 -> Float32 unconditionally, Int64 -> Int32 only
# for columns whose observed values fit the Int32 range.
INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

all_data = all_data.with_columns(pl.col(pl.Float64).cast(pl.Float32))

# Select only the Int64 columns that can be narrowed without overflow.
int64_cols = [c for c, dt in all_data.schema.items() if dt == pl.Int64]
safe_cols = []
for c in int64_cols:
    mn, mx = all_data[c].min(), all_data[c].max()
    # min()/max() return None for an empty or all-null column. Such a column
    # has no value that could overflow, so it is safe to narrow; without this
    # guard the comparison below would raise TypeError.
    if mn is None or (mn >= INT32_MIN and mx <= INT32_MAX):
        safe_cols.append(c)

# Cast only the safe columns to Int32.
if safe_cols:
    all_data = all_data.with_columns(pl.col(safe_cols).cast(pl.Int32))


# === Split back into labelled rows and test rows ===
# Rows were stacked as train, orig, test, so the boundary is purely positional.
n_labeled = len(train) + len(orig)
test_df = all_data.slice(n_labeled)

# === Attach the target to the labelled rows ===
tr_df = all_data.slice(0, n_labeled).with_columns(y_merged.alias("target"))

# === Add the fold column ===
# Stratified K-fold on the merged target; each labelled row is tagged with the
# index of the fold in which it belongs to the validation part.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

n_rows = len(y_merged)
fold_ids = np.zeros(n_rows, dtype=int)

# Only the number of samples matters for the first argument, so a plain
# range stands in for the feature matrix.
splits = skf.split(range(n_rows), y_merged)
for fold_idx, (_, val_idx) in enumerate(splits):
    fold_ids[val_idx] = fold_idx

# The column name records the split configuration, e.g. "5fold-s42".
fold_name = f"{N_SPLITS}fold-s{SEED}"
tr_df = tr_df.with_columns(
    pl.Series(fold_name, fold_ids).cast(pl.Int8)
)

In [10]:
# === Summary after feature engineering ===
def _numpy_megabytes(frame):
    """Total size in MiB of all columns of `frame` when converted to NumPy arrays."""
    total_bytes = sum(column.to_numpy().nbytes for column in frame.get_columns())
    return total_bytes / 1024**2


tr_memory = _numpy_megabytes(tr_df)
test_memory = _numpy_megabytes(test_df)

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} MB, Test Memory: {test_memory:.2f} MB\n")

# How many columns of each dtype the training frame has.
dtype_counts = Counter(map(str, tr_df.dtypes))

# Number of Categorical columns, or None when there are none
# (Counter.get returns None for a missing key, unlike Counter[...]).
n_cat = dtype_counts.get("Categorical")

print("=== DTypes ===")
for dtype, cnt in dtype_counts.items():
    print(f"{dtype}: {cnt}")

=== Shape & Memory ===
Train Shape: (795211, 146), Test Shape: (250000, 144)
Train Memory: 438.34 MB, Test Memory: 137.33 MB

=== DTypes ===
UInt32: 1
Int32: 7
Float32: 136
Int8: 2


In [11]:
# === Save Data ===
# Both frames are written as parquet files under FEATURE_DIR; the success
# messages are printed only after both writes have completed.
tr_path, test_path = (
    FEATURE_DIR / file_name for file_name in ("tr_df.parquet", "test_df.parquet")
)

for frame, destination in ((tr_df, tr_path), (test_df, test_path)):
    frame.write_parquet(destination)

print(f"tr_df saved successfully to {tr_path}")
print(f"test_df saved successfully to {test_path}")

tr_df saved successfully to ../../artifacts/features/base/013/tr_df.parquet
test_df saved successfully to ../../artifacts/features/base/013/test_df.parquet


## Meta dataを保存

In [12]:
# Timestamps in the metadata are recorded in a fixed UTC+9 offset.
JST = timezone(timedelta(hours=9))

# Description of the saved feature set: identity, creation time, shapes,
# NumPy-based memory footprint (MiB), fold column name and the number of
# Categorical columns (None when there are none).
meta = dict(
    data_id=ID,
    created_at=datetime.now(JST).isoformat(),
    train_shape=[tr_df.height, tr_df.width],
    test_shape=[test_df.height, test_df.width],
    memory=dict(tr_df=tr_memory, test_df=test_memory),
    fold_column=fold_name,
    cat_cols=n_cat or None,
)

# Written next to the parquet files; ensure_ascii=False keeps any non-ASCII
# text readable in the file.
meta_path = FEATURE_DIR / "meta.json"
meta_path.write_text(
    json.dumps(meta, ensure_ascii=False, indent=2),
    encoding="utf-8",
)