In [1]:
import os
import sys
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from itertools import combinations
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm

sys.path.append(os.path.abspath("../.."))

from src.utils.target_encoding import target_encoding

### Configuration

In [2]:
ID = "027"
SEED = 42
level = "l1"
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")

os.makedirs(FEATURE_DIR, exist_ok=True)

# === Chunked processing: how many chunks exist and which one this run builds ===
# NOTE: despite its name, CHUNK_SIZE is the NUMBER of chunks the feature columns
# are split into, not the number of columns per chunk.
CHUNK_SIZE = 10
CHUNK_N = 9  # index of the chunk built by this run: 0 .. CHUNK_SIZE - 1

# Fail fast: an out-of-range index would otherwise only surface after the
# expensive combination cells, and a negative index would silently select a
# chunk from the end of the list and write a file with a negative chunk number.
if not 0 <= CHUNK_N < CHUNK_SIZE:
    raise ValueError(
        f"CHUNK_N must be in [0, {CHUNK_SIZE - 1}], got {CHUNK_N}"
    )

# Show wide / long frames in full when inspecting them.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/027


### Utils

In [3]:
def check_info(
    train: "pl.DataFrame",
    test: "pl.DataFrame"
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries for the train/test frames.

    Memory is measured by converting every column to a NumPy array and
    summing ``nbytes``; the dtype table is computed from ``train`` only.

    Args:
        train: Training frame.
        test: Test frame.

    Returns:
        ``(train_mem, test_mem, n_cat)`` where the memory figures are in GiB
        and ``n_cat`` is the number of Categorical columns in ``train``
        (``None`` when there are none).
    """
    def _size_gib(df) -> float:
        # Sum of the NumPy buffer sizes of all columns, expressed in GiB.
        return sum(df[col].to_numpy().nbytes for col in df.columns) / 1024**3

    train_mem = _size_gib(train)
    test_mem = _size_gib(test)

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    n_cat = 0
    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")
        # Match by prefix: depending on the polars version the dtype may be
        # rendered with parameters, e.g. "Categorical(ordering='physical')",
        # and several such variants must be added up rather than overwritten.
        if dtype.startswith("Categorical"):
            n_cat += cnt
    return train_mem, test_mem, (n_cat or None)


def downcast(df: pl.DataFrame) -> pl.DataFrame:
    """Shrink numeric columns: Float64 -> Float32, Int64 -> Int32 where safe.

    An Int64 column is only narrowed when both its minimum and maximum fit in
    the Int32 range. Columns with no non-null values (empty or all-null) have
    nothing that could overflow and are narrowed as well.

    Args:
        df: Frame to downcast.

    Returns:
        A new frame with the narrowed dtypes; other columns are untouched.
    """
    INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

    df = df.with_columns(pl.col(pl.Float64).cast(pl.Float32))

    # Keep only the Int64 columns whose value range fits into Int32.
    int64_cols = [c for c, dt in df.schema.items() if dt == pl.Int64]
    safe_cols = []
    for c in int64_cols:
        mn, mx = df[c].min(), df[c].max()
        # min()/max() are None for an empty or all-null column; comparing None
        # with an int would raise TypeError, so treat that case as safe.
        if mn is None or mx is None or (mn >= INT32_MIN and mx <= INT32_MAX):
            safe_cols.append(c)

    # Cast only the safe columns to Int32.
    if safe_cols:
        df = df.with_columns(pl.col(safe_cols).cast(pl.Int32))
    return df

### Feature Engineering (Save per Chunk)
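- Up-to-3-gram TE (mean): single columns, pairs and triples, using the train target
- The same up-to-3-gram TE (mean) computed from the original dataset's target
- Up-to-3-gram CE (count encoding), computed without the original dataset's rows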

In [3]:
# === Load Data ===
train = pl.read_csv("../../input/train.csv").drop("id")
test = pl.read_csv("../../input/test.csv").drop("id")

# The original dataset stores the label as "yes"/"no"; map it to 1/0 and
# leave anything else as null.
orig = pl.read_parquet("../../input/original.parquet").with_columns(
    pl.when(pl.col("y") == "yes")
    .then(1)
    .when(pl.col("y") == "no")
    .then(0)
    .otherwise(None)
    .alias("y")
)

# Pull the targets out as Int8 before removing them from the feature frames.
y_tr, y_orig = train["y"].cast(pl.Int8), orig["y"].cast(pl.Int8)
y_merged = pl.concat([y_tr, y_orig], how="vertical")
train, orig = train.drop("y"), orig.drop("y")

# String columns are treated as categorical, every other column as numeric.
CATS = [name for name, dtype in train.schema.items() if dtype == pl.Utf8]
NUMS = [name for name in train.columns if name not in CATS]

In [4]:
# === Concatenate all rows: train first, then test, then original ===
# Later cells recover each part by slicing on len(train) / len(test), so this order matters.
all_data = pl.concat([train, test, orig], how="vertical")

In [5]:
# === NUM -> CAT ===
# Every numeric column gets an integer-coded twin named "<col>2"; the string
# columns are replaced in place by their integer codes.
NUMS2CATS = [f"{c}2" for c in NUMS]

num2cat_exprs = [
    pl.col(src).cast(pl.Utf8).cast(pl.Categorical).to_physical().cast(pl.Int32).alias(dst)
    for src, dst in zip(NUMS, NUMS2CATS)
]
cat_exprs = [
    pl.col(name).cast(pl.Categorical).to_physical().cast(pl.Int32).alias(name)
    for name in CATS
]
all_data = all_data.with_columns(num2cat_exprs + cat_exprs)

# Number of distinct codes per encoded column; used as the radix when
# combining columns into a single key.
SIZES = all_data.select(pl.col(CATS + NUMS2CATS).n_unique()).to_dicts()[0]

print(f"Created {len(NUMS2CATS)} new columns\n")
print(SIZES)

Created 7 new columns

{'job': 12, 'marital': 3, 'education': 4, 'default': 2, 'housing': 2, 'loan': 2, 'contact': 3, 'month': 12, 'poutcome': 4, 'age2': 78, 'balance2': 8590, 'day2': 31, 'duration2': 1824, 'campaign2': 52, 'pdays2': 628, 'previous2': 54}


In [6]:
# === Build every 2-column combination key ===
pairs = list(combinations(CATS + NUMS2CATS, 2))
COMBO2 = ["_".join(pair) for pair in pairs]

# Mixed-radix key: first * |second| + second, named "<first>_<second>".
combo_exprs = [
    (pl.col(first) * SIZES[second] + pl.col(second)).alias(name)
    for (first, second), name in zip(pairs, COMBO2)
]
all_data = all_data.with_columns(combo_exprs)

print(f"Created {len(combo_exprs)} new columns")

Created 120 new columns


In [7]:
# === Build every 3-column combination key ===
pairs = list(combinations(CATS + NUMS2CATS, 3))

INT32_MAX = 2**31 - 1


def _combo3_expr(c1: str, c2: str, c3: str) -> pl.Expr:
    """Mixed-radix key (c1 * |c2| + c2) * |c3| + c3 for one column triple.

    The encoded columns are Int32, so the arithmetic would otherwise be done
    in Int32 and wrap around silently once the key space exceeds 2**31 - 1
    (with the printed SIZES, balance2 x duration2 x pdays2 is about 9.8e9),
    which can merge distinct triples into the same key. Such triples are
    computed in Int64; all others keep the narrower Int32 result.
    """
    lead = pl.col(c1)
    if SIZES[c1] * SIZES[c2] * SIZES[c3] > INT32_MAX:
        lead = lead.cast(pl.Int64)
    return ((lead * SIZES[c2] + pl.col(c2)) * SIZES[c3] + pl.col(c3)).alias(
        f"{c1}_{c2}_{c3}"
    )


combo_exprs = [_combo3_expr(c1, c2, c3) for c1, c2, c3 in pairs]

COMBO3 = [f"{c1}_{c2}_{c3}" for c1, c2, c3 in pairs]

all_data = all_data.with_columns(combo_exprs)

print(f"Created {len(combo_exprs)} new columns")

Created 560 new columns


In [8]:
# === Target encoding driven by the ORIGINAL dataset's target ===
n_train, n_test = len(train), len(test)
tr_df = all_data[:n_train]
test_df = all_data[n_train:n_train + n_test]
orig_df = all_data[n_train + n_test:].with_columns(y_orig.alias("target"))

te_cols = CATS + NUMS2CATS + COMBO2 + COMBO3

# Split the column list into CHUNK_SIZE contiguous, near-equal chunks (the
# first `m` chunks get one extra column) and keep only this run's chunk.
n = CHUNK_SIZE
k, m = divmod(len(te_cols), n)
te_col_chunks = [
    te_cols[i * k + min(i, m): (i + 1) * k + min(i + 1, m)] for i in range(n)
]
te_cols = te_col_chunks[CHUNK_N]

te_tr_dict, te_test_dict = {}, {}
for col in tqdm(te_cols):
    feature_name = f"{col}_te2"

    # 1. Mean target per category, computed on the original rows only.
    means_df = orig_df.group_by(col).agg(
        pl.col("target").mean().alias("mean_target")
    )

    # 2. Fallback for categories that never occur in the original data.
    # NOTE(review): this is the unweighted mean of the per-category means,
    # not the row-weighted mean of the target itself - confirm this is intended.
    overall_mean = means_df["mean_target"].mean()

    # 3. Map onto train and test with a left join. The joined column is used
    # positionally, so this relies on the left join keeping the left frame's
    # row order.
    te_tr_dict[feature_name] = (
        tr_df.join(means_df, on=col, how="left")["mean_target"]
        .fill_null(overall_mean)
        .to_numpy()
    )
    te_test_dict[feature_name] = (
        test_df.join(means_df, on=col, how="left")["mean_target"]
        .fill_null(overall_mean)
        .to_numpy()
    )

te_tr = pl.DataFrame(te_tr_dict).with_columns(pl.all().cast(pl.Float32))
te_test = pl.DataFrame(te_test_dict).with_columns(pl.all().cast(pl.Float32))
te_orig = pl.concat([te_tr, te_test], how="vertical")

print(f"Created {len(te_orig.columns)} new columns")

  0%|          | 0/69 [00:00<?, ?it/s]

Created 69 new columns


In [9]:
# === Target Encoding ===
# Re-slice train/test from the combined frame; only the train part gets the "target" column.
tr_df = all_data[:len(train)].with_columns(y_tr.alias("target"))
test_df = all_data[len(train):len(train)+len(test)]

# target_encoding is a project helper (src.utils.target_encoding); its fold scheme and
# output layout are not visible here. Presumably it returns the train rows followed by
# the test rows, because the result is later concatenated horizontally with te_orig
# and then split by len(train) - TODO confirm.
te_df = target_encoding(tr_df, test_df, cat_cols=te_cols)

print(f"Created {len(te_df.columns)} new columns")

0it [00:00, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

Created 69 new columns


In [10]:
# === Count Encoding (train + test rows only; the original rows are dropped) ===
# This must be a slice: a bare integer index would select a single row instead
# of removing the trailing original rows, and every count would be computed on
# that one row.
all_data = all_data[:len(train) + len(test)]

# The "<col>2" twins are skipped; the raw numeric columns they were derived
# from are count-encoded instead.
ce_cols = [c for c in all_data.columns if c not in NUMS2CATS]

# Same chunking scheme as the target-encoding cell: CHUNK_SIZE contiguous,
# near-equal chunks, of which only chunk CHUNK_N is processed in this run.
k, m = divmod(len(ce_cols), CHUNK_SIZE)
ce_col_chunks = [
    ce_cols[i * k + min(i, m): (i + 1) * k + min(i + 1, m)]
    for i in range(CHUNK_SIZE)
]
ce_cols = ce_col_chunks[CHUNK_N]

ce_dict = {}
for col in tqdm(ce_cols):
    # Frequency of each value, mapped back onto every row with a left join.
    counts = all_data.group_by(col).agg(pl.len().alias(f"{col}_ce"))
    joined_df = all_data.join(counts, on=col, how="left")
    ce_dict[f"{col}_ce"] = joined_df[f"{col}_ce"]

ce_df = pl.DataFrame(ce_dict).with_columns(pl.all().cast(pl.Float32))

print(f"Created {len(ce_df.columns)} new columns")

  0%|          | 0/69 [00:00<?, ?it/s]

Created 69 new columns


In [11]:
# === Assemble this chunk; the first chunk also carries the raw numeric columns and a row id ===
feature_blocks = [te_df, te_orig, ce_df]
if CHUNK_N != 0:
    all_data = pl.concat(feature_blocks, how="horizontal")
else:
    all_data = pl.concat(
        [all_data.select(NUMS), *feature_blocks], how="horizontal"
    ).with_row_index("row_id")

In [12]:
# === Downcast ===
# Float64 -> Float32, and Int64 -> Int32 where the value range allows (see downcast()).
all_data = downcast(all_data)

# === Split back into train / test by row position ===
tr_df = all_data[:len(train)]
test_df = all_data[len(train):len(train)+len(test)]

# === Attach the target (first chunk only, so the merged frame contains it exactly once) ===
if CHUNK_N == 0:
    tr_df = tr_df.with_columns(y_tr.alias("target"))

In [13]:
# === Shape / memory / dtype summary after feature engineering (this chunk only) ===
check_info(tr_df, test_df)

=== Shape & Memory ===
Train Shape: (750000, 207), Test Shape: (250000, 207)
Train Memory: 0.58 GB, Test Memory: 0.19 GB

=== DTypes ===
Float32: 207


In [14]:
# === Save Chunk Data ===
# One parquet file per split and chunk; the merge cell reads them back by these names.
tr_path = os.path.join(FEATURE_DIR, f"tr_df-ch{CHUNK_N}.parquet")
test_path = os.path.join(FEATURE_DIR, f"test_df-ch{CHUNK_N}.parquet")

tr_df.write_parquet(tr_path)
test_df.write_parquet(test_path)
print(f"\nFEATURE ENGINEERING CHUNK_NO.{CHUNK_N} COMPLETE!")


FEATURE ENGINEERING CHUNK_NO.9 COMPLETE!


### Merge Chunks and Add Fold Col

In [4]:
# Merge Chunks
# Each chunk file holds a disjoint set of feature columns for the same rows,
# so the chunks are stitched together side by side.
train_list = [
    pl.read_parquet(FEATURE_DIR / f"tr_df-ch{i}.parquet") for i in range(CHUNK_SIZE)
]
test_list = [
    pl.read_parquet(FEATURE_DIR / f"test_df-ch{i}.parquet") for i in range(CHUNK_SIZE)
]

tr_df = pl.concat(train_list, how="horizontal")
test_df = pl.concat(test_list, how="horizontal")

# Add Fold Col
folds_path = "../../artifacts/folds/folds.parquet"
# (cfg value in the folds file, name of the fold column to add to tr_df)
pairs = [
    ("skf/k=5/s=42@train", "5fold-s42")
]

# The join key must have the same dtype on both sides; tr_df's row_id comes
# from with_row_index (unsigned), so cast the folds' row_id to match it
# instead of hard-coding a signed type.
row_id_dtype = tr_df.schema["row_id"]

# One left join per fold configuration. This replaces the former
# LazyFrame.pivot call: pivot is an eager-only operation, and with the cfg
# values known up front a filter per cfg yields the same wide layout.
for cfg, fold_name in pairs:
    fold_col = (
        pl.scan_parquet(folds_path)
        .filter(pl.col("cfg") == cfg)
        # keep="last" is only well defined when the input order is preserved
        .unique(subset=["row_id"], keep="last", maintain_order=True)
        .select(
            pl.col("row_id").cast(row_id_dtype),
            pl.col("fold").cast(pl.Int8).alias(fold_name),  # keep the dtype small
        )
        .collect()
    )
    tr_df = tr_df.join(fold_col, on="row_id", how="left")

In [5]:
# === Summary of the merged frames; the returned values are written to meta.json later ===
train_mem, test_mem, n_cat = check_info(tr_df, test_df)

=== Shape & Memory ===
Train Shape: (750000, 2098), Test Shape: (250000, 2096)
Train Memory: 5.88 GB, Test Memory: 1.96 GB

=== DTypes ===
UInt32: 1
Int32: 7
Float32: 2088
Int8: 2


In [7]:
# === Save Overall Data ===
# File names must match the train_paths / test_paths recorded in meta.json.
tr_path = FEATURE_DIR / "train.parquet"
test_path = FEATURE_DIR / "test.parquet"

tr_df.write_parquet(tr_path)
test_df.write_parquet(test_path)

print(f"tr_df saved successfully to {tr_path}")
print(f"test_df saved successfully to {test_path}")

tr_df saved successfully to ../../artifacts/features/027/tr_df.parquet
test_df saved successfully to ../../artifacts/features/027/test_df.parquet


### Save Meta data

In [6]:
# Timestamps are recorded in Japan Standard Time (UTC+9).
JST = timezone(timedelta(hours=9))

# Description of the saved feature set: where it lives, its shape and memory
# footprint, and which fold configuration / column was attached.
meta = {
    "data_id": ID,
    "train_paths": [str(FEATURE_DIR / "train.parquet")],
    "test_paths": [str(FEATURE_DIR / "test.parquet")],
    "level": level,
    "created_at": datetime.now(JST).isoformat(),
    "train_shape": [tr_df.height, tr_df.width],
    "test_shape": [test_df.height, test_df.width],
    "memory": {"train": train_mem, "test": test_mem},
    "fold_column": pairs,
    "cat_cols": n_cat or None,
}

meta_path = FEATURE_DIR / "meta.json"
meta_path.write_text(
    json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
)