In [1]:
import os
import sys
import numpy as np
import pandas as pd
import polars as pl
from collections import Counter
from itertools import combinations

sys.path.append(os.path.abspath("../.."))

from src.utils.target_encoding import target_encoding

In [2]:
# Let both pandas and polars render up to 500 rows / 500 columns of a frame.
for option in ("display.max_rows", "display.max_columns"):
    pd.set_option(option, 500)

pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

polars.config.Config

In [3]:
# === Chunked processing: number of chunks and which chunk this run handles ===
CHUNK_SIZE = 5  # how many chunks the list of columns to encode is split into
CHUNK_N = 4  # zero-based index of the chunk processed by this run (0 .. CHUNK_SIZE - 1)

# Fail fast on a bad index: a negative CHUNK_N would otherwise silently pick a
# chunk counted from the end of the chunk list instead of raising.
if not 0 <= CHUNK_N < CHUNK_SIZE:
    raise ValueError(f"CHUNK_N must be in [0, {CHUNK_SIZE - 1}], got {CHUNK_N}")

In [4]:
# === Load Data ===
# Train / test come from CSV; their "id" column is dropped right away.
train = pl.read_csv("../../input/train.csv").drop("id")
test = pl.read_csv("../../input/test.csv").drop("id")

# Recode the "y" column of the original data: "yes" -> 1, "no" -> 0,
# anything else -> null.
label_as_int = (
    pl.when(pl.col("y") == "yes").then(1)
    .when(pl.col("y") == "no").then(0)
    .otherwise(None)
)
orig = pl.read_parquet("../../input/original.parquet").with_columns(
    label_as_int.alias("y")
)

# Extract the labels as Int8 and stack them train-first, original-second.
y_tr = train["y"].cast(pl.Int8)
y_orig = orig["y"].cast(pl.Int8)
y_merged = pl.concat([y_tr, y_orig], how="vertical")

# The label must not remain among the feature columns.
train = train.drop("y")
orig = orig.drop("y")

# Partition the remaining columns of `train` by dtype: string vs. everything else.
CATS = [name for name, dtype in train.schema.items() if dtype == pl.Utf8]
NUMS = [name for name, dtype in train.schema.items() if dtype != pl.Utf8]

In [5]:
# === Concatenate all data ===
# Stack the rows of train, original and test (in exactly this order) into one
# frame. The order matters: later cells split the frame back by position using
# len(train) + len(orig) as the boundary between labelled rows and test rows.
# NOTE(review): a vertical concat presumably needs all three frames to share the
# same columns and dtypes - confirm this holds for original.parquet.
all_data = pl.concat([train, orig, test], how="vertical")

In [6]:
# === NUM -> CAT ===
# Label-encode every feature as an Int32 code:
#   * each numeric column gets an extra "<name>2" column holding the code of its
#     value rendered as a string (the raw numeric column is kept as-is);
#   * each string column is replaced in place by its code.
NUMS2CATS = [f"{c}2" for c in NUMS]

num2cat_exprs = [
    pl.col(c)
    .cast(pl.Utf8)
    .cast(pl.Categorical)
    .to_physical()
    .cast(pl.Int32).alias(f"{c}2")
    for c in NUMS
]
cat_exprs = [
    pl.col(c)
    .cast(pl.Categorical)
    .to_physical()
    .cast(pl.Int32).alias(c)
    for c in CATS
]

all_data = all_data.with_columns(
    num2cat_exprs + cat_exprs
)

# SIZES[col] is the radix used when several codes are packed into one combo key
# (key = c1 * SIZES[c2] + c2). For that packing to be collision-free every code
# of `col` must be strictly below SIZES[col]. "max code + 1" guarantees this
# unconditionally, whereas n_unique() only does so when the physical codes are
# dense 0..n-1 per column - which is not guaranteed, e.g. when the category
# mapping is shared between columns. For dense codes with no nulls both
# definitions give the same number.
SIZES = all_data.select(
    [(pl.col(col).max().cast(pl.Int64) + 1).alias(col) for col in CATS + NUMS2CATS]
).to_dicts()[0]

print(SIZES)

{'job': 12, 'marital': 3, 'education': 4, 'default': 2, 'housing': 2, 'loan': 2, 'contact': 3, 'month': 12, 'poutcome': 4, 'age2': 78, 'balance2': 8590, 'day2': 31, 'duration2': 1824, 'campaign2': 52, 'pdays2': 628, 'previous2': 54}


In [7]:
# === Build the 2-way combination features ===
# Every unordered pair of encoded columns is packed into a single integer key:
#     key = code(c1) * SIZES[c2] + code(c2)
pairs = list(combinations(CATS + NUMS2CATS, 2))

combo_exprs = []
for c1, c2 in pairs:
    # The codes are Int32 and polars integer arithmetic wraps silently on
    # overflow. Assuming every code is below its SIZES entry, the largest key is
    # SIZES[c1] * SIZES[c2] - 1, so widen to Int64 only when that bound does not
    # fit in Int32 (keeps memory low for all the other pairs).
    key_dtype = (
        pl.Int64 if SIZES[c1] * SIZES[c2] > np.iinfo(np.int32).max else pl.Int32
    )
    combo_exprs.append(
        (pl.col(c1).cast(key_dtype) * SIZES[c2] + pl.col(c2)).alias(f"{c1}_{c2}")
    )

COMBO2 = [f"{c1}_{c2}" for c1, c2 in pairs]

all_data = all_data.with_columns(combo_exprs)

print(f"Created {len(combo_exprs)} new Combo columns")

Created 120 new Combo columns


In [8]:
# === Build the 3-way combination features ===
# Every unordered triple of encoded columns is packed into a single integer key:
#     key = (code(c1) * SIZES[c2] + code(c2)) * SIZES[c3] + code(c3)
pairs = list(combinations(CATS + NUMS2CATS, 3))

combo_exprs = []
for c1, c2, c3 in pairs:
    # The codes are Int32 and polars integer arithmetic wraps silently on
    # overflow, which would make distinct triples share a key. Assuming every
    # code is below its SIZES entry, the largest key is
    # SIZES[c1] * SIZES[c2] * SIZES[c3] - 1; for high-cardinality triples that
    # exceeds the Int32 range, so those (and only those) are computed in Int64.
    # NOTE(review): confirm target_encoding accepts Int64 key columns.
    key_dtype = (
        pl.Int64
        if SIZES[c1] * SIZES[c2] * SIZES[c3] > np.iinfo(np.int32).max
        else pl.Int32
    )
    combo_exprs.append(
        ((pl.col(c1).cast(key_dtype) * SIZES[c2] + pl.col(c2)) * SIZES[c3] + pl.col(c3))
        .alias(f"{c1}_{c2}_{c3}")
    )

COMBO3 = [f"{c1}_{c2}_{c3}" for c1, c2, c3 in pairs]

all_data = all_data.with_columns(combo_exprs)

print(f"Created {len(combo_exprs)} new Combo columns")

Created 560 new Combo columns


In [9]:
# === Target Encoding ===
# Rows [0, labelled_rows) are train + original (they have a label); the rest is test.
labelled_rows = len(train) + len(orig)
tr_df = all_data.slice(0, labelled_rows).with_columns(y_merged.alias("target"))
test_df = all_data.slice(labelled_rows)

te_cols = NUMS2CATS + CATS + COMBO2 + COMBO3

# Cut the column list into CHUNK_SIZE consecutive chunks whose lengths differ by
# at most one: the first `m` chunks get one extra column.
n = CHUNK_SIZE
k, m = divmod(len(te_cols), n)
te_col_chunks = []
start = 0
for i in range(n):
    stop = start + k + (1 if i < m else 0)
    te_col_chunks.append(te_cols[start:stop])
    start = stop

# This run only encodes the chunk selected by CHUNK_N.
te_cols = te_col_chunks[CHUNK_N]

te_df = target_encoding(tr_df, test_df, cat_cols=te_cols)

# Only chunk 0 carries the raw numeric columns next to the encoded ones;
# every other chunk keeps just the output of target_encoding.
all_data = (
    pl.concat([all_data.select(NUMS), te_df], how="horizontal")
    if CHUNK_N == 0
    else te_df
)

0it [00:00, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

In [10]:
# === Downcast ===
INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

# Float64 -> Float32 for every float column.
all_data = all_data.with_columns(pl.col(pl.Float64).cast(pl.Float32))

# Pick only the Int64 columns whose values all fit into Int32.
int64_cols = [c for c, dt in all_data.schema.items() if dt == pl.Int64]
safe_cols = []
for c in int64_cols:
    mn, mx = all_data[c].min(), all_data[c].max()
    # min()/max() return None when the column has no non-null value; such a
    # column can be narrowed safely, and comparing None with an int would raise.
    if mn is None or mx is None or (mn >= INT32_MIN and mx <= INT32_MAX):
        safe_cols.append(c)

# Narrow only the safe columns to Int32.
if safe_cols:
    all_data = all_data.with_columns(pl.col(safe_cols).cast(pl.Int32))


# === Split the data ===
# Rows up to len(train) + len(orig) are the labelled rows, the rest is test.
tr_df = all_data[:len(train)+len(orig)]
test_df = all_data[len(train)+len(orig):]

# === Add the target ===
# The label column is attached only to the last chunk. Derive "last" from
# CHUNK_SIZE instead of hard-coding 4, so changing the number of chunks cannot
# leave every output file without a target.
if CHUNK_N == CHUNK_SIZE - 1:
    tr_df = tr_df.with_columns(y_merged.alias("target"))

In [11]:
# === Summary after feature engineering ===
# Memory is measured as the total size of each column's NumPy buffer, in MiB.
tr_bytes = 0
for name in tr_df.columns:
    tr_bytes += tr_df[name].to_numpy().nbytes
tr_memory = tr_bytes / 1024**2

test_bytes = 0
for name in test_df.columns:
    test_bytes += test_df[name].to_numpy().nbytes
test_memory = test_bytes / 1024**2

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} MB, Test Memory: {test_memory:.2f} MB\n")

# Number of train columns per dtype, in order of first appearance.
dtype_counts = Counter(str(dt) for dt in tr_df.dtypes)

print("=== DTypes ===")
for dtype in dtype_counts:
    cnt = dtype_counts[dtype]
    print(f"{dtype}: {cnt}")

=== Shape & Memory ===
Train Shape: (795211, 140), Test Shape: (250000, 139)
Train Memory: 422.41 MB, Test Memory: 132.56 MB

=== DTypes ===
Float32: 139
Int8: 1


In [12]:
# === Save Data ===
# One parquet file per split; the file name carries the chunk index so the
# outputs of different CHUNK_N runs do not overwrite each other.
train_out = f"../../artifacts/features/base/tr_df014-{CHUNK_N}.parquet"
test_out = f"../../artifacts/features/base/test_df014-{CHUNK_N}.parquet"

tr_df.write_parquet(train_out)
test_df.write_parquet(test_out)