In [1]:
import numpy as np
import pandas as pd
import polars as pl
from collections import Counter
from itertools import combinations
from sklearn.preprocessing import PowerTransformer

In [2]:
# Raise the pandas display limits so long/wide frames are printed in full.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, 500)

# Same limits for polars tables. The setters are chained and left as the
# cell's final expression, so the notebook echoes the returned Config class.
pl.Config.set_tbl_rows(500).set_tbl_cols(500)

polars.config.Config

In [3]:
# === Load Data ===
# Read both splits and discard the row identifier column.
train = pl.read_csv("../../input/train.csv").drop("id")
test = pl.read_csv("../../input/test.csv").drop("id")

# Detach the label (stored as Int8) so that `train` holds features only.
y = train.get_column("y").cast(pl.Int8)
train = train.drop("y")

# Partition the feature names by dtype: string columns go to CATS,
# every other column goes to NUMS.
CATS, NUMS = [], []
for name, dtype in train.schema.items():
    (CATS if dtype == pl.Utf8 else NUMS).append(name)

In [4]:
# === Stack train and test into one frame ===
# === Binarise sparse counters, Yeo-Johnson transform, clip outliers (1%, 99%) ===
# `previous` becomes 1 when > 0, `pdays` becomes 1 when != -1 (else 0).
all_data = pl.concat([train, test], how="vertical").with_columns(
    (pl.col("previous") > 0).cast(pl.UInt8).alias("previous"),
    (pl.col("pdays") != -1).cast(pl.UInt8).alias("pdays"),
)

pt = PowerTransformer(method="yeo-johnson")
yj_cols = ["balance", "campaign", "duration"]

for col_name in yj_cols:
    # Refit the transformer on this column alone (train + test rows together).
    raw_values = all_data.select(col_name).to_numpy()
    transformed = pl.Series(
        col_name, pt.fit_transform(raw_values)[:, 0]
    ).cast(pl.Float32)

    # Clip bounds are the 1st / 99th percentiles of the transformed values.
    lower = transformed.quantile(0.01)
    upper = transformed.quantile(0.99)

    # Write the clipped, transformed column back over the original one.
    all_data = all_data.with_columns(transformed.clip(lower, upper))

In [5]:
# === NUM → CAT ===
NUMS2CATS = [f"{c}2" for c in NUMS]
SIZES = {}


def _to_codes(expr):
    """Map a string expression to Int32 codes via polars' Categorical dtype."""
    return expr.cast(pl.Categorical).to_physical().cast(pl.Int32)


# Numeric columns: stringify first, then encode into a new "<name>2" column,
# leaving the original numeric column untouched.
num2cat_exprs = [
    _to_codes(pl.col(c).cast(pl.Utf8)).alias(f"{c}2") for c in NUMS
]

# String columns: encode in place under their original names.
cat_exprs = [_to_codes(pl.col(c)).alias(c) for c in CATS]

all_data = all_data.with_columns(num2cat_exprs + cat_exprs)

# Size of each code space, used downstream as the radix when two code columns
# are packed into one integer (code1 * SIZES[col2] + code2).
#
# The radix must be strictly greater than every code in the column, so it is
# derived from the largest code (+1) rather than from the number of distinct
# values. The two agree when the physical codes are dense 0..n-1, but physical
# Categorical codes are not guaranteed to be dense per column (e.g. when a
# global string cache / shared category mapping is active); using n_unique in
# that situation would let distinct pairs collide.
# The max is widened to Int64 before the +1 so the increment itself cannot wrap.
SIZES = all_data.select(
    [(pl.col(col).max().cast(pl.Int64) + 1).alias(col)
     for col in CATS + NUMS2CATS]
).to_dicts()[0]

print(SIZES)

{'job': 12, 'marital': 3, 'education': 4, 'default': 2, 'housing': 2, 'loan': 2, 'contact': 3, 'month': 12, 'poutcome': 4, 'age2': 78, 'balance2': 7086, 'day2': 31, 'duration2': 1195, 'campaign2': 14, 'pdays2': 2, 'previous2': 2}


In [6]:
# === Build the pairwise (2-combo) features ===
# Every unordered pair of code columns is packed into a single integer key:
#     key = code(c1) * SIZES[c2] + code(c2)
# which is unique per (c1, c2) value pair as long as every code of c2 is
# smaller than SIZES[c2].
pairs = list(combinations(CATS + NUMS2CATS, 2))

_INT32_LIMIT = np.iinfo(np.int32).max


def _combo_expr(c1, c2):
    """Return the polars expression that encodes the (c1, c2) pair as one key.

    The code columns were cast to Int32 when they were created, and polars
    does not raise when integer arithmetic overflows, so an oversized key
    space would silently corrupt the feature. When the largest possible key
    (SIZES[c1] * SIZES[c2] - 1) does not fit in Int32, the left operand is
    widened to Int64 first; pairs that do fit keep the Int32 arithmetic.
    """
    left = pl.col(c1)
    if SIZES[c1] * SIZES[c2] - 1 > _INT32_LIMIT:
        left = left.cast(pl.Int64)
    return (left * SIZES[c2] + pl.col(c2)).alias(f"{c1}_{c2}")


combo_exprs = [_combo_expr(c1, c2) for c1, c2 in pairs]

# Names of the generated columns, in the same order as `pairs`.
COMBO = [f"{c1}_{c2}" for c1, c2 in pairs]

all_data = all_data.with_columns(combo_exprs)

print(f"Created {len(combo_exprs)} new Combo columns")

Created 120 new Combo columns


In [7]:
# === Standardise the numeric variables ===
# z-score each NUMS column in place: (x - mean) / std, with mean and std
# computed over every row of all_data (train and test together).
standard_exprs = []
for num_col in NUMS:
    feature = pl.col(num_col)
    centred = feature - feature.mean()
    standard_exprs.append((centred / feature.std()).alias(num_col))

all_data = all_data.with_columns(standard_exprs)

In [8]:
# === Downcast ===
INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

# Every Float64 column is stored as Float32 from here on.
all_data = all_data.with_columns(pl.col(pl.Float64).cast(pl.Float32))


def _fits_int32(series):
    """True when the series' observed min and max both lie inside Int32."""
    return series.min() >= INT32_MIN and series.max() <= INT32_MAX


# Pick out the Int64 columns whose observed range can be narrowed safely.
int64_cols = [
    name for name, dtype in all_data.schema.items() if dtype == pl.Int64
]
safe_cols = [
    name for name in int64_cols if _fits_int32(all_data.get_column(name))
]

# Narrow only the safe columns to Int32; the rest stay Int64.
if safe_cols:
    all_data = all_data.with_columns(pl.col(safe_cols).cast(pl.Int32))


# === Split the stacked frame back into its two parts ===
# The first len(train) rows came from train; everything after is test.
n_train_rows = len(train)
test_df = all_data.slice(n_train_rows)

# === Attach the target ===
# Only the training part receives the label, under the name "target".
tr_df = all_data.slice(0, n_train_rows).with_columns(y.alias("target"))

In [9]:
# === Summary after feature engineering ===
def _numpy_mib(frame):
    """Total size in MiB of the frame's columns once converted to numpy."""
    total_bytes = sum(frame[name].to_numpy().nbytes for name in frame.columns)
    return total_bytes / 1024**2


tr_memory = _numpy_mib(tr_df)
test_memory = _numpy_mib(test_df)

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} MB, Test Memory: {test_memory:.2f} MB\n")

# Count how many training columns carry each dtype.
dtype_counts = Counter(map(str, tr_df.dtypes))

print("=== DTypes ===")
for dtype, cnt in dtype_counts.items():
    print(f"{dtype}: {cnt}")

=== Shape & Memory ===
Train Shape: (750000, 144), Test Shape: (250000, 143)
Train Memory: 409.84 MB, Test Memory: 136.38 MB

=== DTypes ===
Float32: 7
Int32: 136
Int8: 1


In [10]:
# === Save Data ===
# Persist the engineered train / test frames as parquet artifacts.
for frame, destination in (
    (tr_df, "../../artifacts/features/base/tr_df009.parquet"),
    (test_df, "../../artifacts/features/base/test_df009.parquet"),
):
    frame.write_parquet(destination)