In [1]:
import pandas as pd
import polars as pl
from collections import Counter

In [2]:
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

polars.config.Config

In [3]:
# === Load Data ===
train = pl.read_csv("../../input/train.csv").drop("id")
test = pl.read_csv("../../input/test.csv").drop("id")

y = train["y"].cast(pl.Int8)

train = train.drop("y")

CATS = [col for col in train.columns if train[col].dtype == pl.Utf8]
NUMS = [col for col in train.columns if train[col].dtype != pl.Utf8]

In [11]:
# === 全データを結合 ===
all_data = pl.concat([train, test])

# === Label Encoding ===
SIZES = {}

cat_exprs = [
    pl.col(c)
    .cast(pl.Categorical)
    .to_physical()
    .cast(pl.Int32).alias(c)
    for c in CATS
]

le_df = all_data.with_columns(cat_exprs).drop(NUMS)

SIZES = all_data.select(
    [pl.col(col)
     .n_unique()
     .alias(col) for col in CATS]
).to_dicts()[0]

print(SIZES)

{'job': 12, 'marital': 3, 'education': 4, 'default': 2, 'housing': 2, 'loan': 2, 'contact': 3, 'month': 12, 'poutcome': 4}


In [6]:
# === Bin分割 ===
all_data = all_data.to_pandas()
bins_df = pd.DataFrame(index=all_data.index)
bins = 100

bins_df["age"] = all_data["age"]
bins_df["day"] = all_data["day"]

balance_mask = (all_data["balance"] == 0)
campaign_mask = (all_data["campaign"] == 0)
pdays_mask = (all_data["pdays"] == -1)
previous_mask = (all_data["previous"] == 0)
duration_mask = (all_data["duration"] == 0)

bins_df.loc[~balance_mask, "balance"] = pd.qcut(
    all_data.loc[~balance_mask, "balance"],
    q=bins, duplicates="drop", labels=False)
bins_df.loc[balance_mask, "balance"] = -1

bins_df.loc[~campaign_mask, "campaign"] = pd.qcut(
    all_data.loc[~campaign_mask, "campaign"],
    q=bins, duplicates="drop", labels=False)
bins_df.loc[campaign_mask, "campaign"] = -1

bins_df.loc[~pdays_mask, "pdays"] = pd.qcut(
    all_data.loc[~pdays_mask, "pdays"],
    q=bins, duplicates="drop", labels=False)
bins_df.loc[pdays_mask, "pdays"] = -1

bins_df.loc[~previous_mask, "previous"] = pd.qcut(
    all_data.loc[~previous_mask, "previous"],
    q=bins, duplicates="drop", labels=False)
bins_df.loc[previous_mask, "previous"] = -1

bins_df.loc[~duration_mask, "duration"] = pd.qcut(
    all_data.loc[~duration_mask, "duration"],
    q=bins, duplicates="drop", labels=False)
bins_df.loc[duration_mask, "duration"] = -1

bins_df = bins_df.astype("str")

all_data = pl.from_pandas(all_data)
bins_df = pl.from_pandas(bins_df).cast(pl.Int32)

  bins_df.loc[~campaign_mask, "campaign"] = pd.qcut(
  bins_df.loc[~duration_mask, "duration"] = pd.qcut(


In [8]:
bins_df = bins_df.cast(pl.Int32)

In [9]:
# === 数値変数を標準化 ===
standard_exprs = [((pl.col(c) - pl.col(c).mean()) / pl.col(c).std()).alias(c)
                  for c in NUMS]
bins_df = bins_df.with_columns(standard_exprs)

In [12]:
# === Dataの統合 ===
all_data = pl.concat([le_df, bins_df], how="horizontal")

In [13]:
# === Downcast ===
INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

all_data = all_data.with_columns(pl.col(pl.Float64).cast(pl.Float32))

# Int64で安全に落とせる列だけ選別
int64_cols = [c for c, dt in all_data.schema.items() if dt == pl.Int64]
safe_cols = []
for c in int64_cols:
    mn, mx = all_data[c].min(), all_data[c].max()
    if mn >= INT32_MIN and mx <= INT32_MAX:
        safe_cols.append(c)

# 安全な列だけ Int32 に
if safe_cols:
    all_data = all_data.with_columns(pl.col(safe_cols).cast(pl.Int32))


# === データを分割 ===
tr_df = all_data[:len(train)]
test_df = all_data[len(train):]

# === targetを追加 ===
tr_df = tr_df.with_columns(y.alias("target"))

In [14]:
# === 特徴量エンジニアリング後の情報 ===
tr_memory = sum(tr_df[col].to_numpy().nbytes for col in tr_df.columns) / 1024**2
test_memory = sum(test_df[col].to_numpy().nbytes for col in test_df.columns) / 1024**2

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} MB, Test Memory: {test_memory:.2f} MB\n")

dtype_counts = Counter([str(dt) for dt in tr_df.dtypes])

print("=== DTypes ===")
for dtype, cnt in dtype_counts.items():
    print(f"{dtype}: {cnt}")

=== Shape & Memory ===
Train Shape: (750000, 17), Test Shape: (250000, 16)
Train Memory: 46.49 MB, Test Memory: 15.26 MB

=== DTypes ===
Int32: 9
Float32: 7
Int8: 1


In [15]:
# === Save Data ===
tr_df.write_parquet("../../artifacts/features/base/tr_df010.parquet")
test_df.write_parquet("../../artifacts/features/base/test_df010.parquet")