In [1]:
import os
import sys
import numpy as np
import pandas as pd
import polars as pl
from collections import Counter

# Make the directory two levels above the working directory importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
project_root = os.path.abspath("../..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Raise the pandas display limits so wide/long frames are not truncated.
for _pd_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_pd_option, 500)

# Same limits for polars tables. Each setter hands back the Config class,
# so the two calls can be chained; the cell still evaluates to Config.
pl.Config.set_tbl_rows(500).set_tbl_cols(500)

polars.config.Config

In [None]:
# === Number of chunks ===
# Despite the name, this is a count, not a size: the load cell iterates
# range(CHUNK_SIZE) and reads one train and one test parquet file per index.
CHUNK_SIZE = 5

In [5]:
# === Load Data ===
# Read CHUNK_SIZE train/test parquet chunks and join them side by side.
# The horizontal concat assumes every chunk holds different columns for the
# same rows, in the same row order -- verify against the code that wrote them.
train_list = []
test_list = []

for i in range(CHUNK_SIZE):
    train = pl.read_parquet(f"../../artifacts/features/base/tr_df014-{i}.parquet")
    train_list.append(train)
    test = pl.read_parquet(f"../../artifacts/features/base/test_df014-{i}.parquet")
    test_list.append(test)

tr_df = pl.concat(train_list, how="horizontal")
test_df = pl.concat(test_list, how="horizontal")

# Set the label aside and remove it from the training features. The test
# frame has no "target" column, and a vertical concat needs identical
# columns on both sides. The label is attached to the training frame again
# after the train/test split further down.
y = tr_df["target"]
tr_df = tr_df.drop("target")

# Fail early with a readable message instead of a low-level concat error.
assert tr_df.columns == test_df.columns, (
    "train/test feature columns differ; cannot stack them vertically"
)

# Stack train on top of test so later transforms see both at once.
all_data = pl.concat([tr_df, test_df], how="vertical")

In [7]:
# === Downcast ===
INT32_MIN, INT32_MAX = -2_147_483_648, 2_147_483_647

# Remember how many leading rows of all_data belong to the training set
# before `tr_df` is rebound below; the split must not depend on the rebound
# frame.
n_train = len(tr_df)

# Float64 -> Float32 halves the memory of every float column (at reduced
# precision).
all_data = all_data.with_columns(pl.col(pl.Float64).cast(pl.Float32))

# Pick only the Int64 columns whose values all fit into Int32.
int64_cols = [c for c, dt in all_data.schema.items() if dt == pl.Int64]
safe_cols = []
for c in int64_cols:
    mn, mx = all_data[c].min(), all_data[c].max()
    # min()/max() return None for a column with no non-null values; such a
    # column has nothing that could overflow, and comparing None with an int
    # would raise TypeError.
    if mn is None or mx is None or (mn >= INT32_MIN and mx <= INT32_MAX):
        safe_cols.append(c)

# Cast only the safe columns to Int32.
if safe_cols:
    all_data = all_data.with_columns(pl.col(safe_cols).cast(pl.Int32))


# === Split back into train / test ===
tr_df = all_data[:n_train]
test_df = all_data[n_train:]

# === Re-attach the target ===
tr_df = tr_df.with_columns(y.alias("target"))

In [8]:
# === Summary after feature engineering ===
def _numpy_megabytes(frame):
    """Return the total size in MiB of all columns of `frame` converted to NumPy arrays."""
    total_bytes = sum(frame[name].to_numpy().nbytes for name in frame.columns)
    return total_bytes / 1024**2


tr_memory = _numpy_megabytes(tr_df)
test_memory = _numpy_megabytes(test_df)

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} MB, Test Memory: {test_memory:.2f} MB\n")

# Count how many training columns there are of each dtype.
dtype_counts = Counter(map(str, tr_df.dtypes))

print("=== DTypes ===")
for dtype_name in dtype_counts:
    print(f"{dtype_name}: {dtype_counts[dtype_name]}")

=== Shape & Memory ===
Train Shape: (795211, 704), Test Shape: (250000, 703)
Train Memory: 2133.30 MB, Test Memory: 670.43 MB

=== DTypes ===
Int32: 7
Float32: 696
Int8: 1


In [9]:
# === Save Data ===
# Write the train frame first, then the test frame, to their parquet files.
for _frame, _out_path in (
    (tr_df, "../../artifacts/features/base/tr_df015.parquet"),
    (test_df, "../../artifacts/features/base/test_df015.parquet"),
):
    _frame.write_parquet(_out_path)