In [1]:
import os
import sys
import pandas as pd
import polars as pl
from collections import Counter

# Make the directory two levels above the working directory importable.
# Guarded with a membership check so that re-running this cell does not
# keep appending duplicate entries to sys.path.
_project_root = os.path.abspath("../..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
# Raise the pandas display limits to 500 rows / 500 columns.
for _opt in ("display.max_rows", "display.max_columns"):
    pd.set_option(_opt, 500)

# Same limits for polars tables. Each setter returns the Config class, so the
# calls can be chained; the final return value is what the cell echoes.
pl.Config.set_tbl_rows(500).set_tbl_cols(500)

polars.config.Config

In [3]:
# === Load Data ===
# Read the train and test feature tables (version 023) from parquet.
train = pl.read_parquet("../../artifacts/features/base/tr_df023.parquet")
test = pl.read_parquet("../../artifacts/features/base/test_df023.parquet")

# Keep the label aside as an Int8 series, then leave only feature columns in train.
y = train.get_column("target").cast(pl.Int8)
train = train.select(pl.exclude("target"))

# === Combine all data ===
# Stack train on top of test so that later transforms are applied to both at once.
all_data = pl.concat([train, test], how="vertical")

In [4]:
# === Standardization ===
# Z-score every column using the mean / std computed over train + test combined.
#
# A zero-variance column has std == 0, so dividing would turn the whole column
# into NaN (0/0) or +/-inf. Such columns are only mean-centred instead, which
# gives ~0.0 for every non-null value and keeps nulls as nulls.
col_stds = {c: all_data[c].std() for c in all_data.columns}

standard_exprs = []
for c in all_data.columns:
    centered = pl.col(c) - pl.col(c).mean()
    if col_stds[c] == 0:
        # Constant column: skip the division by zero.
        standard_exprs.append(centered.alias(c))
    else:
        # Regular column (also reached when std is None/NaN, where the result
        # is the same as before this guard was added).
        standard_exprs.append((centered / pl.col(c).std()).alias(c))
all_data = all_data.with_columns(standard_exprs)

In [5]:
# === Downcast ===
INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

# Cast every Float64 column to Float32.
all_data = all_data.with_columns(pl.col(pl.Float64).cast(pl.Float32))

# Select only the Int64 columns whose value range fits into Int32.
# NOTE(review): the standardization cell divides every column by its std, which
# yields floats, so no Int64 column is expected to reach this point -- confirm
# whether this branch is still needed.
int64_cols = [c for c, dt in all_data.schema.items() if dt == pl.Int64]
safe_cols = []
for c in int64_cols:
    mn, mx = all_data[c].min(), all_data[c].max()
    # min()/max() return None for an all-null (or empty) column; comparing None
    # with an int raises TypeError, and such a column has no value that could
    # overflow, so it is treated as safe to cast.
    if mn is None or (INT32_MIN <= mn and mx <= INT32_MAX):
        safe_cols.append(c)

# Cast only the safe columns to Int32.
if safe_cols:
    all_data = all_data.with_columns(pl.col(safe_cols).cast(pl.Int32))


# === Split the data ===
# all_data was built as [train, test], so the first len(train) rows are train
# and the remainder is test.
tr_df = all_data[:len(train)]
test_df = all_data[len(train):]

# === Add target ===
# Re-attach the label that was set aside when the data was loaded.
tr_df = tr_df.with_columns(y.alias("target"))

In [8]:
# === Summary after feature engineering ===
# Memory is the summed byte size of each column's NumPy representation,
# divided by 1024**3 (reported under the "GB" label).
tr_memory = sum(s.to_numpy().nbytes for s in tr_df.get_columns()) / 1024**3
test_memory = sum(s.to_numpy().nbytes for s in test_df.get_columns()) / 1024**3

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} GB, Test Memory: {test_memory:.2f} GB\n")

# Count how many columns of the train frame share each dtype.
dtype_counts = Counter(map(str, tr_df.dtypes))

print("=== DTypes ===")
for dtype, cnt in dtype_counts.items():
    print(f"{dtype}: {cnt}")

=== Shape & Memory ===
Train Shape: (750000, 559), Test Shape: (295211, 558)
Train Memory: 1.56 GB, Test Memory: 0.61 GB

=== DTypes ===
Float32: 558
Int8: 1


In [7]:
# === Save Data ===
# Write the processed train / test frames out as version 024.
for _frame, _out_path in (
    (tr_df, "../../artifacts/features/base/tr_df024.parquet"),
    (test_df, "../../artifacts/features/base/test_df024.parquet"),
):
    _frame.write_parquet(_out_path)