In [2]:
import os
import re
import sys
import json
from collections import Counter
from datetime import datetime, timezone, timedelta
from itertools import combinations
from pathlib import Path

import ipynbname
import numpy as np
import polars as pl
from tqdm.notebook import tqdm
sys.path.append(os.path.abspath("../.."))

from src.utils.target_encoding import target_encoding

### Configuration

In [3]:
# The dataset ID is the run of digits at the end of this notebook's file name
# (the recorded run resolved it to "055").
stem = ipynbname.path().stem
m = re.search(r"(\d+)$", stem)
if m is None:
    # Fail with an actionable message instead of an AttributeError on None.
    raise ValueError(
        f"Notebook name {stem!r} must end with a numeric ID to derive FEATURE_DIR."
    )

ID = m.group(1)
SEED = 42
LEVEL = "l1"
ALPHA = 0  # smoothing strength handed to the target-encoding calls
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")

os.makedirs(FEATURE_DIR, exist_ok=True)

# Allow wide/long frames to be displayed without truncation.
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

polars.config.Config

### Utils

In [4]:
def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> "tuple[float, float, int | None]":
    """Print shape, memory and dtype summaries for the train/test frames.

    Memory is measured as the summed ``nbytes`` of every column after
    conversion to NumPy and is reported as bytes / 1024**3.

    Parameters
    ----------
    train, test : pl.DataFrame
        Frames to summarise. The dtype breakdown is computed on ``train`` only.

    Returns
    -------
    tuple
        ``(train_mem, test_mem, n_cat)`` where the memories are floats in
        bytes / 1024**3 and ``n_cat`` is the number of Categorical columns in
        ``train`` (``None`` when there are none).
    """
    bytes_per_unit = 1024**3
    train_mem = sum(train[col].to_numpy().nbytes for col in train.columns) / bytes_per_unit
    test_mem = sum(test[col].to_numpy().nbytes for col in test.columns) / bytes_per_unit

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter(str(dt) for dt in train.dtypes)

    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")

    # Compare dtype objects instead of their string form: the textual repr of
    # Categorical is not guaranteed to be exactly "Categorical".
    n_cat = sum(1 for dt in train.dtypes if dt == pl.Categorical)
    return train_mem, test_mem, (n_cat or None)


def downcast(df: pl.DataFrame) -> pl.DataFrame:
    """Shrink numeric columns: Float64 -> Float32 and, where lossless, Int64 -> Int32.

    An Int64 column is narrowed only when both its minimum and maximum fit in
    the Int32 range. Columns that are empty or entirely null have no values
    that could overflow, so they are narrowed as well.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to downcast.

    Returns
    -------
    pl.DataFrame
        A new frame with the narrowed dtypes; other dtypes are left untouched.
    """
    INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

    df = df.with_columns(pl.col(pl.Float64).cast(pl.Float32))

    # Pick only the Int64 columns that can be narrowed without overflow.
    int64_cols = [c for c, dt in df.schema.items() if dt == pl.Int64]
    safe_cols = []
    for c in int64_cols:
        mn, mx = df[c].min(), df[c].max()
        # min()/max() are None for empty or all-null columns; comparing None
        # with an int would raise TypeError, and such columns are safe anyway.
        if mn is None or mx is None or (mn >= INT32_MIN and mx <= INT32_MAX):
            safe_cols.append(c)

    # Cast only the safe columns to Int32.
    if safe_cols:
        df = df.with_columns(pl.col(safe_cols).cast(pl.Int32))
    return df


def compute_group_stats(
    df: pl.DataFrame,
    key_cols: list[str],
    value_cols: list[str],
    *,
    stats: tuple[str, ...] = ("mean", "std", "min", "max", "median"),
) -> pl.DataFrame:
    """Broadcast per-group statistics of ``value_cols`` back onto every row.

    For each (key, value) pair the requested statistics of ``value`` are
    computed per distinct ``key`` and joined back to ``df`` row by row.
    Missing results (null statistics or rows whose key did not match) are
    replaced by the global statistic of the value column, computed on its
    finite values only. The global fallback itself defaults to 0.0 when it is
    unavailable, and for "std" to 1.0 when unavailable or below 1e-6.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing all ``key_cols`` and ``value_cols``.
    key_cols : list[str]
        Grouping columns.
    value_cols : list[str]
        Numeric columns to aggregate.
    stats : tuple[str, ...]
        Any of "mean", "std", "min", "max", "median". Other names are ignored.

    Returns
    -------
    pl.DataFrame
        One Float32 column per produced statistic, named
        ``{value}_{stat}_by_{key}``, with the same row order as ``df``. Within
        a (key, value) pair columns are ordered mean, std, min, max, median
        regardless of the order given in ``stats``.
    """
    # Stat name -> aggregation builder. Insertion order fixes the column order.
    stat_builders = {
        "mean": lambda e: e.mean(),
        "std": lambda e: e.std(ddof=1),
        "min": lambda e: e.min(),
        "max": lambda e: e.max(),
        "median": lambda e: e.median(),
    }
    active = [s for s in stat_builders if s in stats]

    def _fallback(stat: str, val) -> float:
        """Turn a global statistic into a safe fill value."""
        usable = val is not None and np.isfinite(val)
        if stat == "std":
            return float(val) if (usable and val >= 1e-6) else 1.0
        return float(val) if usable else 0.0

    # Global statistics per value column (non-finite values excluded).
    base_map = {}
    for v in value_cols:
        v_clean = pl.when(pl.col(v).is_finite()).then(pl.col(v)).otherwise(None)
        base_map[v] = (
            df.select([build(v_clean).alias(s) for s, build in stat_builders.items()])
            .to_dicts()[0]
        )

    stats_dict = {}
    for k in tqdm(key_cols):
        for v in value_cols:
            # Skip keys derived from the value column itself (e.g. "age" vs "age2").
            # NOTE(review): this is a substring test, so it also skips unrelated
            # pairs such as "day" vs "pdays2" -- confirm whether that is intended.
            if v in k:
                continue

            col_names = [f"{v}_{s}_by_{k}" for s in active]
            fill_map = {
                name: _fallback(s, base_map[v].get(s))
                for s, name in zip(active, col_names)
            }
            aggs = [
                stat_builders[s](pl.col(v)).alias(name)
                for s, name in zip(active, col_names)
            ]

            grouped_df = (
                df.select([k, v])
                .group_by(k)
                .agg(aggs)
            )
            stats_array = (
                # Only the key column is needed on the left side. The row order
                # must match df because results are stored positionally, so
                # the join is told to keep the left order explicitly.
                df.select(k)
                .join(
                    grouped_df.select(col_names + [k]),
                    on=k,
                    how="left",
                    maintain_order="left",
                )
                .select(col_names)
                .with_columns(
                    [
                        pl.col(c).fill_null(fill_map[c]).alias(c)
                        for c in col_names
                    ]
                )
                .to_numpy()
                .astype(dtype=np.float32, copy=False)
            )
            for i, c in enumerate(col_names):
                stats_dict[c] = stats_array[:, i]

            del grouped_df, stats_array

    return pl.DataFrame(stats_dict)


def target_encoding_orig(
    tr_df: pl.DataFrame,
    test_df: pl.DataFrame,
    orig_df: pl.DataFrame,
    key_cols: list[str],
    target: str = "target",
    stats: tuple[str, ...] = ("mean", "std", "min", "max", "median", "count"),
    alpha: int = 20,
    n_splits: int = 5,
    seed: int = 42
) -> pl.DataFrame:
    """
    Target encoding fitted on an external labelled frame (`orig_df`).

    For each column in `key_cols`, per-category statistics of `target` are
    computed on `orig_df` only and then looked up for every row of `tr_df` and
    `test_df`. No folds are involved: the target of `tr_df` is never read, so
    the encoded train features cannot leak the train labels.

    For "mean", the smoothed estimate is
        (sum_of_target + alpha * global_mean) / (n_rows + alpha)
    where `global_mean` is the mean of `target` over all of `orig_df`.
    `alpha=0` gives the plain category mean.

    Parameters
    ----------
    tr_df : pl.DataFrame
        Training rows to encode. Must contain all `key_cols`.
    test_df : pl.DataFrame
        Test rows to encode. Must contain all `key_cols`.
    orig_df : pl.DataFrame
        Labelled frame the statistics are fitted on. Must contain `target`
        and all `key_cols`.
    key_cols : list[str]
        Grouping keys.
    target : str, default "target"
        Target column name in `orig_df`.
    stats : tuple of {"mean","std","min","max","median","count"}
        Statistics to output. Only "mean" is smoothed by `alpha`. "count" is
        the number of rows with `target == 1` in the category.
    alpha : int, default 20
        Smoothing strength for "mean".
    n_splits : int, default 5
        Unused; kept so existing call sites keep working.
    seed : int, default 42
        Unused; kept so existing call sites keep working.

    Returns
    -------
    pl.DataFrame
        Float32 features for `tr_df` followed by `test_df` (stacked
        vertically). Columns are named `orig_{target}_{stat}_by_{col}`, in the
        order of `key_cols`; within a key the order is mean, std, min, max,
        median, count.

    Raises
    ------
    KeyError
        If `stats` contains a name for which no global statistic exists.

    Notes
    -----
    - Categories absent from `orig_df`, and null statistics (e.g. the std of a
      single-row category), are filled with the global statistic of
      `orig_df`; "count" is filled with 0.
    """
    global_mean = orig_df.select(pl.col(target).mean()).to_series()[0]

    # Stat name -> aggregation over one category. Insertion order fixes the
    # output column order.
    stat_exprs = {
        "mean": (
            (pl.col(target).sum() + pl.lit(alpha) * pl.lit(global_mean))
            / (pl.len() + pl.lit(alpha))
        ),
        "std": pl.col(target).std(ddof=1),
        "min": pl.col(target).min(),
        "max": pl.col(target).max(),
        "median": pl.col(target).median(),
        "count": (pl.col(target) == 1).sum(),  # number of positives
    }
    active = [s for s in stat_exprs if s in stats]

    # Global statistics of orig_df, used as fill values.
    base = orig_df.select([
        pl.col(target).mean().alias("mean"),
        pl.col(target).std(ddof=1).alias("std"),
        pl.col(target).min().alias("min"),
        pl.col(target).max().alias("max"),
        pl.col(target).median().alias("median"),
        (pl.col(target) == 1).sum().alias("cnt"),
    ]).to_dicts()[0]
    fill_values = {s: (0.0 if s == "count" else float(base[s])) for s in stats}

    te_train: dict = {}
    te_test: dict = {}

    for col in tqdm(key_cols):
        col_names = [f"orig_{target}_{s}_by_{col}" for s in active]

        grouped_df = (
            orig_df.select([col, target])
            .group_by(col)
            .agg([stat_exprs[s].alias(name) for s, name in zip(active, col_names)])
        )
        filled = [
            pl.col(name).fill_null(fill_values[s]).alias(name)
            for s, name in zip(active, col_names)
        ]

        # Look the fitted statistics up for the train rows, then the test rows.
        for frame, store in ((tr_df, te_train), (test_df, te_test)):
            mat = (
                # Only the key is needed on the left side. Results are stored
                # positionally, so the left row order is preserved explicitly.
                frame.select(col)
                .join(
                    grouped_df.select(col_names + [col]),
                    on=col,
                    how="left",
                    maintain_order="left",
                )
                .select(col_names)
                .with_columns(filled)
                .to_numpy()
                .astype(dtype=np.float32, copy=False)
            )
            for j, name in enumerate(col_names):
                store[name] = mat[:, j]
            del mat

        del grouped_df

    return pl.concat(
        [pl.DataFrame(te_train), pl.DataFrame(te_test)],
        how="vertical",
    )

### Feature Engineering
- num_df2とcat_dfで1-gram stats_df(mean)
- 2-gram TE(mean)
- orig Targetでも2-gram TE(mean)
- 2-gram CE(without orig)

In [5]:
# === Load Data ===
# Competition train/test (without the "id" column) and the original dataset.
train = pl.read_csv("../../input/train.csv").drop("id")
test = pl.read_csv("../../input/test.csv").drop("id")

# The original dataset stores its label as "yes"/"no": map to 1/0, anything else to null.
orig = pl.read_parquet("../../input/original.parquet").with_columns(
    pl.when(pl.col("y") == "yes")
    .then(1)
    .when(pl.col("y") == "no")
    .then(0)
    .otherwise(None)
    .alias("y")
)

# Pull the targets out before they are dropped from the feature frames.
y_tr = train.get_column("y").cast(pl.Int8)
y_orig = orig.get_column("y").cast(pl.Int8)
y_merged = pl.concat([y_tr, y_orig], how="vertical")

train, orig = train.drop("y"), orig.drop("y")

# String columns are treated as categorical, everything else as numeric.
CATS = [name for name, dtype in train.schema.items() if dtype == pl.Utf8]
NUMS = [name for name, dtype in train.schema.items() if dtype != pl.Utf8]

In [6]:
# === Combine all data (train, test, original) ===
all_data = pl.concat([train, test, orig], how="vertical")

# Raw numeric columns and string columns as Categorical, kept as separate frames.
num_df = all_data.select(NUMS)
cat_df = all_data.select(CATS).with_columns(
    pl.all().cast(pl.Utf8).cast(pl.Categorical)
)

# Expressions that replace each string column by a dense Int32 code.
cat_exprs = [
    pl.col(c).cast(pl.Categorical).to_physical().rank("dense").cast(pl.Int32).alias(c)
    for c in CATS
]

In [7]:
# === NUM -> CAT ===
# Treat every numeric column as a categorical key "<name>2".
# rank("dense") mirrors cat_exprs: raw physical codes are not guaranteed to be
# dense per column (e.g. when categories are shared across columns), and the
# pair key `c1 * SIZES[c2] + c2` built later is only collision-free when each
# code lies within a range of width n_unique. Only the grouping induced by the
# codes is used downstream, so the exact code values do not matter.
num2cat_exprs = [
    pl.col(c)
    .cast(pl.Utf8)
    .cast(pl.Categorical)
    .to_physical()
    .rank("dense")
    .cast(pl.Int32)
    .alias(f"{c}2")
    for c in NUMS
]

num_df2 = all_data.select(num2cat_exprs)
NUMS2 = num_df2.columns

all_data = all_data.with_columns(cat_exprs + num2cat_exprs)

# Number of distinct codes per key column.
SIZES = all_data.select(
    [pl.col(col)
     .n_unique()
     .alias(col) for col in CATS + NUMS2]
).to_dicts()[0]

print(SIZES)

{'job': 12, 'marital': 3, 'education': 4, 'default': 2, 'housing': 2, 'loan': 2, 'contact': 3, 'month': 12, 'poutcome': 4, 'age2': 78, 'balance2': 8590, 'day2': 31, 'duration2': 1824, 'campaign2': 52, 'pdays2': 628, 'previous2': 54}


In [8]:
# Build the group-statistics frame: mean of each numeric column per key column.
stats_df = compute_group_stats(
    df=pl.concat([cat_df, num_df2, num_df], how="horizontal"),
    key_cols=CATS + NUMS2,
    value_cols=NUMS,
    stats=("mean",),
)
print(f"Created {stats_df.width} new columns")

  0%|          | 0/16 [00:00<?, ?it/s]

Created 104 new columns


In [9]:
# === Build every 2-column combination of key columns ===
pairs = list(combinations(CATS + NUMS2, 2))
COMBO = [f"{c1}_{c2}" for c1, c2 in pairs]

# Pack each pair of codes into a single integer key.
combo_exprs = [
    (pl.col(c1) * SIZES[c2] + pl.col(c2)).alias(name)
    for (c1, c2), name in zip(pairs, COMBO)
]

combo2_df = all_data.with_columns(combo_exprs)

print(f"Created {len(combo_exprs)} new columns")

Created 120 new columns


In [10]:
# === Target encoding using the original dataset's target ===
# combo2_df is stacked as train, test, original: slice it back apart.
tr_df = combo2_df.slice(0, len(train))
test_df = combo2_df.slice(len(train), len(test))

orig_df = combo2_df.slice(len(train) + len(test)).with_columns(
    y_orig.alias("target")
)

te_cols = CATS + NUMS2 + COMBO

te_orig = target_encoding_orig(
    tr_df=tr_df,
    test_df=test_df,
    orig_df=orig_df,
    key_cols=te_cols,
    stats=("mean",),
    alpha=ALPHA,
)

print(f"Created {te_orig.width} new columns")

  0%|          | 0/136 [00:00<?, ?it/s]

Created 136 new columns


In [11]:
# === Target Encoding ===
# Attach the train target, then encode every key column with its mean.
tr_df = tr_df.with_columns(y_tr.alias("target"))

te_df = target_encoding(
    tr_df, test_df, key_cols=te_cols, alpha=ALPHA, stats=("mean", )
)

print(f"Created {te_df.width} new columns")

0it [00:00, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Created 136 new columns


In [12]:
# === Count Encoding ===
# Counts are taken over train + test rows only (the original dataset is cut off).
combo2_df = combo2_df[:len(train) + len(test)]
ce_cols = te_cols

# Filled in key order. No placeholder arrays are allocated, since every entry
# is produced by the loop below.
ce_dict = {}
for col in tqdm(ce_cols):
    ce_name = f"{col}_ce"
    counts = combo2_df.group_by(col).agg(pl.len().alias(ce_name))
    ce_dict[ce_name] = (
        # Join only the key column. The left row order is kept explicitly
        # because the result is stored positionally.
        combo2_df.select(col)
        .join(counts, on=col, how="left", maintain_order="left")
        .get_column(ce_name)
        .cast(pl.Float32)
    )

ce_df = pl.DataFrame(ce_dict)

print(f"Created {len(ce_df.columns)} new columns")

  0%|          | 0/136 [00:00<?, ?it/s]

Created 136 new columns


In [13]:
# === Merge all feature blocks ===
# num_df and stats_df still carry the rows of the original dataset, while
# te_orig and ce_df cover train + test only (te_df presumably as well -- verify
# against target_encoding). Only train + test rows are used afterwards, so
# every block is trimmed to that height. This keeps the horizontal concat from
# relying on implicit null padding of the shorter blocks.
n_model_rows = len(train) + len(test)
all_data = pl.concat(
    [
        block.head(n_model_rows)
        for block in (num_df, te_df, te_orig, ce_df, stats_df)
    ],
    how="horizontal",
)

In [14]:
# === Add row_id, then downcast ===
all_data = downcast(all_data.with_row_index("row_id"))

# === Split back into train / test; only train gets the target ===
tr_df = all_data.slice(0, len(train)).with_columns(y_tr.alias("target"))
test_df = all_data.slice(len(train), len(test))

In [15]:
# Add Fold Col
folds_path = "../../artifacts/folds/folds.parquet"
# (cfg stored in the folds file, column name to use in the feature frame)
pairs = [
    ("skf/k=5/s=42@train", "5fold-s42")
]
cfgs = [c for c, _ in pairs]
rename_map = {c: n for c, n in pairs}

# Read the requested fold configs, pivot to one column per cfg, then rename
# each cfg column to its fold name.
folds_wide = (
    pl.scan_parquet(folds_path)
      .filter(pl.col("cfg").is_in(cfgs))
      .unique(subset=["row_id", "cfg"], keep="last")
      .select(["row_id", "cfg", "fold"])
      .collect(engine="streaming")
      .pivot(values="fold", index="row_id", on="cfg", aggregate_function="first")
      .rename(rename_map)
      # Use the same dtype as tr_df's row_id so the join keys match exactly
      # instead of depending on implicit key coercion.
      .with_columns(pl.col("row_id").cast(tr_df.schema["row_id"]))
      .with_columns([pl.all().exclude("row_id").cast(pl.Int8)])  # keep fold ids small
)

# Attach the fold assignment to every train row (null where no fold is recorded).
tr_df = tr_df.join(folds_wide, on="row_id", how="left")

In [16]:
# === Summary after feature engineering ===
train_mem, test_mem, n_cat = check_info(train=tr_df, test=test_df)

=== Shape & Memory ===
Train Shape: (750000, 522), Test Shape: (250000, 520)
Train Memory: 1.45 GB, Test Memory: 0.48 GB

=== DTypes ===
UInt32: 1
Int32: 7
Float32: 512
Int8: 2


In [17]:
# === Save Overall Data ===
# Both frames are written as parquet under FEATURE_DIR.
tr_path = FEATURE_DIR.joinpath("train.parquet")
test_path = FEATURE_DIR.joinpath("test.parquet")

tr_df.write_parquet(tr_path)
test_df.write_parquet(test_path)

print(
    f"tr_df saved successfully to {tr_path}",
    f"test_df saved successfully to {test_path}",
    sep="\n",
)

tr_df saved successfully to ../../artifacts/features/055/train.parquet
test_df saved successfully to ../../artifacts/features/055/test.parquet


## Meta dataを保存

In [18]:
# Describe the saved feature set and store it next to the parquet files.
JST = timezone(timedelta(hours=9))
meta = {
    "data_id": ID,
    "created_at": datetime.now(JST).isoformat(),
    "train_paths": [str(tr_path)],
    "test_paths": [str(test_path)],
    "level": LEVEL,
    "train_shape": list(tr_df.shape),
    "test_shape": list(test_df.shape),
    "memory": dict(train=train_mem, test=test_mem),
    "fold_column": pairs,
    # A count of zero or None is stored as null.
    "cat_cols": n_cat or None,
}

with open(f"{FEATURE_DIR}/meta.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(meta, ensure_ascii=False, indent=2))

print("\n".join(f"{k}: {v}" for k, v in meta.items()))

data_id: 055
created_at: 2025-10-05T17:30:13.682676+09:00
train_paths: ['../../artifacts/features/055/train.parquet']
test_paths: ['../../artifacts/features/055/test.parquet']
level: l1
train_shape: [750000, 522]
test_shape: [250000, 520]
memory: {'train': 1.4542602002620697, 'test': 0.48428773880004883}
fold_column: [('skf/k=5/s=42@train', '5fold-s42')]
cat_cols: None
