In [1]:
import os
import re
import sys
import json
import math
from collections import Counter
from datetime import datetime, timezone, timedelta
from itertools import combinations
from pathlib import Path

import ipynbname
import numpy as np
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm

sys.path.append(os.path.abspath("../.."))

from src.utils.target_encoding import target_encoding

### Configuration

In [2]:
stem = ipynbname.path().stem
m = re.search(r"(\d+)$", stem)

ID = m.group(1)
SEED = 42
level = "l1"
FEATURE_DIR = Path(f"../../artifacts/features/{ID}")

os.makedirs(FEATURE_DIR, exist_ok=True)

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

print(f"Feature dir created successfully in \n{FEATURE_DIR}")

Feature dir created successfully in 
../../artifacts/features/045


### Utils

In [3]:
def check_info(
    train: pl.DataFrame,
    test: pl.DataFrame
) -> tuple[float, float, float]:
    train_mem = sum(train[col].to_numpy().nbytes for col in train.columns) / 1024**3
    test_mem = sum(test[col].to_numpy().nbytes for col in test.columns) / 1024**3

    print("=== Shape & Memory ===")
    print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")
    print(f"Train Memory: {train_mem:.2f} GB, Test Memory: {test_mem:.2f} GB\n")

    dtype_counts = Counter([str(dt) for dt in train.dtypes])

    n_cat = None
    print("=== DTypes ===")
    for dtype, cnt in dtype_counts.items():
        print(f"{dtype}: {cnt}")
        if dtype == "Categorical":
            n_cat = cnt
    return train_mem, test_mem, n_cat


def downcast(df: pl.DataFrame) -> pl.DataFrame:
    INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

    df = df.with_columns(pl.col(pl.Float64).cast(pl.Float32))

    # Int64で安全に落とせる列だけ選別
    int64_cols = [c for c, dt in df.schema.items() if dt == pl.Int64]
    safe_cols = []
    for c in int64_cols:
        mn, mx = df[c].min(), df[c].max()
        if mn >= INT32_MIN and mx <= INT32_MAX:
            safe_cols.append(c)

    # 安全な列だけ Int32 に
    if safe_cols:
        df = df.with_columns(pl.col(safe_cols).cast(pl.Int32))
    return df

def add_bin_columns(
    df: pl.DataFrame,
    cols: list[str],
    n_bins: int,
    *,
    strategy: str = "quantile",
    quantile_method: str = "nearest",
    suffix: str = "_bin",
    drop_source: bool = False,
) -> pl.DataFrame:
    edges: dict[str, np.ndarray] = {}

    for c in cols:
        vals = df[c].to_numpy()
        vals = vals[np.isfinite(vals)]
        if vals.size == 0:
            continue

        if strategy == "quantile":
            qs = np.linspace(0.0, 1.0, n_bins + 1)
            qv = np.quantile(vals, qs, method=quantile_method)
            e = np.unique(qv)
        else:
            vmin = float(np.nanmin(vals))
            vmax = float(np.nanmax(vals))
            e = np.linspace(vmin, vmax, n_bins + 1)

        if e.size < 3:
            vmin = np.nanmin(vals)
            vmax = np.nanmax(vals)
            if not np.isfinite(vmin):
                vmin = 0.0
            if not np.isfinite(vmax):
                vmax = vmin + 1.0
            e = np.array([vmin - 1, vmax + 1], dtype="float64")

        edges[c] = e.astype("float64")

    out = df
    for c, e in edges.items():
        bname = f"{c}{suffix}"
        out = out.with_columns(
            pl.col(c)
              .cut(list(e))
              .cast(pl.Categorical)
              .to_physical()
              .rank("dense")
              .alias(bname)
        )
        if drop_source:
            out = out.drop(c)
    return out


def compute_group_stats(
    df: pl.DataFrame,
    key_cols: list[str],
    value_cols: list[str],
    *,
    stats: tuple[str, ...] = ("mean", "std", "min", "max", "median")
    ,
) -> pl.DataFrame:
    stats_dict = {}

    base_map = {}
    for v in value_cols:
        v_clean = pl.when(pl.col(v).is_finite()).then(pl.col(v)).otherwise(None)
        base_map[v] = (
            df.select([
                v_clean.mean().alias("mean"),
                v_clean.std(ddof=1).alias("std"),
                v_clean.min().alias("min"),
                v_clean.max().alias("max"),
                v_clean.median().alias("median"),
            ])
            .to_dicts()[0]
        )

    for k in tqdm(key_cols):
        for v in value_cols:
            if v in k:
                continue

            fill_map = {}
            for s in stats:
                name = f"{v}_{s}_by_{k}"
                val = base_map[v].get(s)
                if s == "std":
                    fill_map[name] = (
                        float(val)
                        if (val is not None and np.isfinite(val) and val >= 1e-6)
                        else 1.0
                    )
                else:
                    fill_map[name] = (
                        float(val)
                        if (val is not None and np.isfinite(val)) 
                        else 0.0
                    )

            aggs = []
            col_names = []
            if "mean" in stats:
                aggs.append(pl.col(v).mean().alias(f"{v}_mean_by_{k}"))
                col_names.append(f"{v}_mean_by_{k}")
            if "std" in stats:
                aggs.append(pl.col(v).std(ddof=1).alias(f"{v}_std_by_{k}"))
                col_names.append(f"{v}_std_by_{k}")
            if "min" in stats:
                aggs.append(pl.col(v).min().alias(f"{v}_min_by_{k}"))
                col_names.append(f"{v}_min_by_{k}")
            if "max" in stats:
                aggs.append(pl.col(v).max().alias(f"{v}_max_by_{k}"))
                col_names.append(f"{v}_max_by_{k}")
            if "median" in stats:
                aggs.append(pl.col(v).median().alias(f"{v}_median_by_{k}"))
                col_names.append(f"{v}_median_by_{k}")

            grouped_df = (
                df.select([k, v])
                .group_by(k)
                .agg(aggs)
            )
            stats_array = (
                df.join(
                    grouped_df.select(col_names + [k]),
                    on=k,
                    how="left"
                )
                .select(col_names)
                .with_columns(
                    [
                        pl.col(c).fill_null(fill_map[c]).alias(c)
                        for c in col_names
                    ]
                )
                .to_numpy()
                .astype(dtype=np.float32, copy=False)
            )
            for i, c in enumerate(col_names):
                stats_dict[c] = stats_array[:, i]

            del grouped_df, stats_array

    stats_df = pl.DataFrame(stats_dict)

    return stats_df


### Feature Engineering
- 2-gram TE(mean, count, median, std)
- num_df2とcat_dfで1-gram stats_df(mean, std, median, max)
- 2-gram CE(without orig)
- Month, DayをSin, Cosで変換
- Balance, DurationのDigitsを追加
- Balance, Duration関連の積や商を追加

In [4]:
# === Load Data ===
train = pl.read_csv("../../input/train.csv").drop("id")
test = pl.read_csv("../../input/test.csv").drop("id")
orig = pl.read_parquet("../../input/original.parquet")
orig = orig.with_columns(
    pl.when(pl.col("y") == "yes").then(1)
      .when(pl.col("y") == "no").then(0)
      .otherwise(None)
      .alias("y")
)

y_tr = train["y"].cast(pl.Int8)
y_orig = orig["y"].cast(pl.Int8)
y_merged = pl.concat([y_tr, y_orig], how="vertical")

train = train.drop("y")
orig = orig.drop("y")

CATS = [col for col in train.columns if train[col].dtype == pl.Utf8]
NUMS = [col for col in train.columns if train[col].dtype != pl.Utf8]
print(f"NUMS: {len(NUMS)}\n{NUMS}")
print(f"\nCATS: {len(CATS)}\n{CATS}")

NUMS: 7
['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

CATS: 9
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


In [5]:
# === 全データを結合 ===
all_data = pl.concat([train, test], how="vertical")
cat_exprs = [
    pl.col(c)
    .cast(pl.Categorical)
    .to_physical()
    .rank("dense")
    .cast(pl.Int32).alias(c)
    for c in CATS
]
all_data = all_data.with_columns(cat_exprs)
num_df = all_data.select(NUMS)
cat_df = all_data.select(CATS)

In [6]:
NUMS2CATS = [f"{c}2" for c in NUMS]
num2cat_exprs = [
    pl.col(c)
    .cast(pl.Utf8)
    .cast(pl.Categorical)
    .to_physical()
    .rank("dense")
    .cast(pl.Int32).alias(f"{c}2")
    for c in NUMS
]
num_df2 = num_df.select(num2cat_exprs)
print(f"num_df2: {len(num_df2.columns)}\n{num_df2.columns}")

num_df2: 7
['age2', 'balance2', 'day2', 'duration2', 'campaign2', 'pdays2', 'previous2']


In [7]:
# Grouped Dfを作成
stats_df = compute_group_stats(
    pl.concat([cat_df, num_df2, num_df], how="horizontal"),
    CATS + NUMS2CATS,
    NUMS,
    stats=("mean", "std", "max", "median")
)
print(f"Created {len(stats_df.columns)} new columns")

  0%|          | 0/16 [00:00<?, ?it/s]

Created 416 new columns


In [8]:
# === 2Comboのペアを作成 ===
SIZES = pl.concat(
    [cat_df, num_df2], how="horizontal"
).select(
    [pl.col(col)
     .n_unique()
     .alias(col) for col in CATS + NUMS2CATS]
).to_dicts()[0]

pairs = list(combinations(CATS + NUMS2CATS, 2))

combo_exprs = [(pl.col(c1) * SIZES[c2] + pl.col(c2))
               .alias(f"{c1}_{c2}") for c1, c2 in pairs]

COMBO2 = [f"{c1}_{c2}" for c1, c2 in pairs]

combo2_df = pl.concat(
    [cat_df, num_df2], how="horizontal"
).select(combo_exprs)

print(f"Created {len(combo_exprs)} new columns")

Created 120 new columns


In [9]:
# all_dataをtarget encoding
tr_df = combo2_df[:len(train)].with_columns(y_tr.alias("target"))
test_df = combo2_df[len(train):]

te_df = target_encoding(
    tr_df,
    test_df,
    key_cols=test_df.columns,
    stats=("mean", "count", "std", "median")
)
print(f"New cols: {len(te_df.columns)}")

0it [00:00, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

New cols: 480


In [10]:
# === Count Encoding
ce_cols = test_df.columns
ce_dict = {f"{col}_ce": np.zeros(all_data.height) for col in ce_cols}

for col in tqdm(ce_cols):
    counts = combo2_df.group_by(col).agg(pl.len().alias(f"{col}_ce"))
    joined_df = combo2_df.join(counts, on=col, how="left")
    ce_dict[f"{col}_ce"] = joined_df[f"{col}_ce"]

ce_df = pl.DataFrame(ce_dict).with_columns([
        pl.col(col).cast(pl.Float32) for col in ce_dict.keys()
])

print(f"Created {len(ce_df.columns)} new columns")

  0%|          | 0/120 [00:00<?, ?it/s]

Created 120 new columns


In [11]:
# Durationとbalanceのそれぞれの桁の数
exprs = []
digits_cols = ["duration", "balance"]
for k in (0, 1, 2, 3):
    for c in digits_cols:
        exprs.append(
            ((pl.col(c) // (10**k)) % 10)
            .cast(pl.Int32)
            .alias(f"{c}_digitL{k}")
        )

digits_df = num_df.select(exprs)
print(f"New cols: {len(digits_df.columns)}")

New cols: 8


In [12]:
# 周期のdf
TAU = 2 * math.pi

cyc_exprs = [
    np.sin(pl.col("month") * (TAU/12))
    .alias("month_sin"),
    np.cos(pl.col("month") * (TAU/12))
    .alias("month_cos"),
    np.sin(pl.col("day") * (TAU/31))
    .alias("day_sin"),
    np.cos(pl.col("day") * (TAU/31))
    .alias("day_cos"),
]
cyc_df = all_data.select(cyc_exprs)
print(f"New cols: {len(cyc_df.columns)}")

New cols: 4


In [13]:
# 積と商のdf
exprs = []
pairs = list(combinations(NUMS, 2))
for c1, c2 in pairs:
    exprs.append(
        (
            np.log1p(pl.col(c1).abs()) * pl.col(c1).sign()
            - np.log1p(pl.col(c2).abs()) * pl.col(c1).sign()
        ).alias(f"{c1}_div_{c2}"))
    exprs.append(
        (pl.col(c1) * pl.col(c2)).alias(f"{c1}_mul_{c2}")
    )
for c in NUMS:
    exprs.append((pl.col(c) ** 2).alias(f"{c}_sq"))

arith_df = num_df.select(exprs)
print(f"New cols: {len(arith_df.columns)}")

New cols: 49


In [14]:
# Merge Data
all_data = pl.concat(
    [
        num_df,
        stats_df,
        te_df,
        ce_df,
        digits_df,
        cyc_df,
        arith_df
    ], how="horizontal"
)

In [15]:
# === row_id を追加 ===
all_data = all_data.with_row_index("row_id")

# === Downcast ===
all_data = downcast(all_data)

# === データを分割 ===
tr_df = all_data[:len(train)]
test_df = all_data[len(train):len(train)+len(test)]

# === targetを追加 ===
tr_df = tr_df.with_columns(y_tr.alias("target"))

### Add Fold Col

In [16]:
# Add Fold Col
folds_path = "../../artifacts/folds/folds.parquet"
pairs = [
    ("skf/k=5/s=42@train", "5fold-s42")
]
cfgs = [c for c, _ in pairs]
rename_map = {c: n for c, n in pairs}

# folds をまとめて読み → ワイド化（cfg列を列見出しに）→ 列名をfold_nameにリネーム
folds_wide = (
    pl.scan_parquet(folds_path)
      .filter(pl.col("cfg").is_in(cfgs))
      .unique(subset=["row_id", "cfg"], keep="last")
      .select(["row_id", "cfg", "fold"])
      .collect()
      .pivot(values="fold", index="row_id", on="cfg", aggregate_function="first")
      .rename(rename_map)
      .with_columns(pl.col("row_id").cast(pl.Int32))
      .with_columns([pl.all().exclude("row_id").cast(pl.Int8)])  # 型を軽く
)

# tr_df が DataFrame の場合
tr_df = tr_df.join(folds_wide, on="row_id", how="left")

In [17]:
# === 特徴量エンジニアリング後の情報 ===
train_mem, test_mem, n_cat = check_info(tr_df, test_df)

=== Shape & Memory ===
Train Shape: (750000, 1087), Test Shape: (250000, 1085)
Train Memory: 3.04 GB, Test Memory: 1.01 GB

=== DTypes ===
UInt32: 1
Int32: 42
Float32: 1041
Int64: 1
Int8: 2


In [18]:
# === Save Data ===
tr_path = FEATURE_DIR / "train.parquet"
test_path = FEATURE_DIR / "test.parquet"

tr_df.write_parquet(tr_path)
test_df.write_parquet(test_path)

print(f"tr_df saved successfully to {tr_path}")
print(f"test_df saved successfully to {test_path}")

tr_df saved successfully to ../../artifacts/features/045/train.parquet
test_df saved successfully to ../../artifacts/features/045/test.parquet


### Save Meta data

In [19]:
JST = timezone(timedelta(hours=9))
meta = {
    "data_id": ID,
    "train_paths": [str(tr_path)],
    "test_paths": [str(test_path)],
    "level": level,
    "created_at": datetime.now(JST).isoformat(),
    "train_shape": [tr_df.height, tr_df.width],
    "test_shape": [test_df.height, test_df.width],
    "memory": {
        "train": train_mem,
        "test": test_mem
    },
    "fold_column": pairs,
    "cat_cols": n_cat if n_cat else None
}

with open(f"{FEATURE_DIR}/meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

for k, v in meta.items():
    print(f"{k}: {v}")

data_id: 045
train_paths: ['../../artifacts/features/045/train.parquet']
test_paths: ['../../artifacts/features/045/test.parquet']
level: l1
created_at: 2025-10-03T13:19:16.610524+09:00
train_shape: [750000, 1087]
test_shape: [250000, 1085]
memory: {'train': 3.035645931959152, 'test': 1.0114163160324097}
fold_column: [('skf/k=5/s=42@train', '5fold-s42')]
cat_cols: None
