In [1]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import LabelEncoder

In [2]:
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

polars.config.Config

In [3]:
# === Load Data ===
train = pd.read_csv("../../input/train.csv").set_index("id")
test = pd.read_csv("../../input/test.csv").set_index("id")

orig = pd.read_parquet("../../input/original.parquet")
orig['y'] = orig.y.map({'yes': 1, 'no': 0})
orig['id'] = (np.arange(len(orig))+1e6).astype('int')
orig = orig.set_index('id')

y_tr = train["y"]
y_orig = orig["y"]
y_merged = pd.concat([y_tr, y_orig], axis=0)
train = train.drop("y", axis=1)

CATS = train.select_dtypes(include=["category", "object"]).columns.to_list()
NUMS = train.select_dtypes(include=np.number).columns.to_list()

In [4]:
# === 全データを結合 ===
all_data = pd.concat([train, test, orig], ignore_index=True, axis=0)

# === label Encoding ===
le_df = pd.DataFrame(index=all_data.index)

for c in CATS:
    le = LabelEncoder()
    le_df[c] = le.fit_transform(all_data[c])
le_df = le_df.astype("str")

# === dfを結合 ===
num_df = all_data[NUMS]
df_feat = pd.concat([num_df, le_df], axis=1)

# === データを分割 ===
tr_df = df_feat.iloc[:len(train)].copy()
test_df = df_feat.iloc[len(train):len(train)+len(test)]
orig_df = df_feat.iloc[len(train)+len(test):]

tr_df = pd.concat([tr_df, orig_df], axis=0, ignore_index=True)

# === targetを追加 ===
tr_df["target"] = y_merged

In [5]:
# === 量子化 ===
tr_df = tr_df.copy()
test_df = test_df.copy()

df_list = [tr_df, test_df]
for df in df_list:
    for c in df.columns:
        if df[c].dtype == "object":
            continue
        elif df[c].dtype == np.dtype("float64"):
            df[c] = df[c].astype(np.float32)
        elif df[c].dtype == np.dtype("int64"):
            df[c] = df[c].astype(np.int32)

In [6]:
# === 特徴量エンジニアリング後の情報 ===
tr_memory = tr_df.memory_usage(deep=True).sum() / 1024**2
test_memory = test_df.memory_usage(deep=True).sum() / 1024**2

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} MB, Test Memory: {test_memory:.2f} MB\n")

dtype_counts = tr_df.dtypes.astype(str).value_counts()

print("=== DTypes ===")
print(dtype_counts.to_string())

=== Shape & Memory ===
Train Shape: (795211, 17), Test Shape: (250000, 16)
Train Memory: 420.18 MB, Test Memory: 131.14 MB

=== DTypes ===
object     9
int32      7
float32    1


In [7]:
# === Save Data ===
tr_df.to_parquet("../../artifacts/features/base/tr_df003.parquet", index=False)
test_df.to_parquet("../../artifacts/features/base/test_df003.parquet", index=False)