In [1]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

In [2]:
# Raise the pandas display limits so wide/long frames are shown in full.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 500)

# Same limits for polars tables; the last call is left as a bare expression,
# so its return value is what the cell displays.
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

polars.config.Config

In [3]:
# === Load Data ===
# Read both splits with the "id" column as the row index.
train = pd.read_csv("../../input/train.csv", index_col="id")
test = pd.read_csv("../../input/test.csv", index_col="id")

# Split the label column "y" off the training features.
y = train["y"]
train = train.drop(columns="y")

# Categorical feature names: object/category columns, with "default" left out.
# Index.difference also returns the remaining names in sorted order.
categorical_columns = train.select_dtypes(include=["category", "object"]).columns
CATS = categorical_columns.difference(["default"]).to_list()

# Numeric feature names, in their original column order.
numeric_columns = train.select_dtypes(include=np.number).columns
NUMS = numeric_columns.to_list()

In [4]:
# === Combine train and test ===
# Train rows are stacked on top of test rows so that every transformer below is
# fitted on both splits together. ignore_index=True replaces the "id" index
# with a fresh 0..N-1 RangeIndex, so from here on rows are tracked by position:
# the first len(train) rows are train, the remainder are test.
all_data = pd.concat([train, test], ignore_index=True, axis=0)

# === Outlier handling and Yeo-Johnson transform ===
# Collapse "previous" and "pdays" into 0/1 indicator flags.
all_data["previous"] = (all_data["previous"] > 0).astype(int)
# NOTE(review): -1 is presumably the "no prior contact" sentinel for pdays;
# confirm against the data dictionary.
all_data["pdays"] = (all_data["pdays"] != -1).astype(int)

pt = PowerTransformer(method="yeo-johnson")

# The same transformer object is re-fitted for each column in turn, so after
# the loop `pt` only holds the parameters of the last column.
for col in ["balance", "campaign", "duration"]:
    all_data[col] = pt.fit_transform(all_data[[col]])

# === One-hot encoding ===
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_df = pd.DataFrame(
    encoder.fit_transform(all_data[CATS]),
    columns=encoder.get_feature_names_out(CATS),
    index=all_data.index)

# === Standardize the numeric features ===
num_df = all_data[NUMS]
scaler = StandardScaler()
scaled_array = scaler.fit_transform(num_df)
scaled_df = pd.DataFrame(
    scaled_array,
    columns=num_df.columns,
    index=all_data.index
)

# === Join the scaled numeric and one-hot frames ===
df_feat = pd.concat([scaled_df, ohe_df], axis=1)

# === Split back into train / test by position ===
tr_df = df_feat.iloc[:len(train)].copy()
test_df = df_feat.iloc[len(train):]

# === Attach the target ===
# tr_df carries the positional RangeIndex created above, while y is still
# indexed by "id". Assigning the Series directly would align on index labels
# and is only correct when the ids happen to be exactly 0..len(train)-1 in file
# order; any other id scheme would silently misalign or NaN-fill the target.
# Assigning the raw values keeps the pairing positional.
tr_df["target"] = y.to_numpy()

In [5]:
# === Quantization (downcast 64-bit columns to 32-bit) ===
# Work on private copies so the column assignments below never write into a
# slice of another frame.
tr_df = tr_df.copy()
test_df = test_df.copy()

# Source dtype -> narrower target dtype. Anything not listed here (object
# columns included) is left untouched.
narrow_dtype_for = {
    np.dtype("float64"): np.float32,
    np.dtype("int64"): np.int32,
}

df_list = [tr_df, test_df]
for frame in df_list:
    for column_name in frame.columns:
        narrow_dtype = narrow_dtype_for.get(frame[column_name].dtype)
        if narrow_dtype is not None:
            frame[column_name] = frame[column_name].astype(narrow_dtype)

In [6]:
# === Summary after feature engineering ===
# Deep memory footprint of each frame, converted from bytes to MB.
tr_memory, test_memory = (
    frame.memory_usage(deep=True).sum() / 1024**2
    for frame in (tr_df, test_df)
)

print("=== Shape & Memory ===")
print(f"Train Shape: {tr_df.shape}, Test Shape: {test_df.shape}")
print(f"Train Memory: {tr_memory:.2f} MB, Test Memory: {test_memory:.2f} MB\n")

# Number of training columns per dtype, keyed by the dtype's string name.
dtype_names = tr_df.dtypes.astype(str)
dtype_counts = dtype_names.value_counts()

print("=== DTypes ===")
print(dtype_counts.to_string())

=== Shape & Memory ===
Train Shape: (750000, 50), Test Shape: (250000, 49)
Train Memory: 143.05 MB, Test Memory: 46.73 MB

=== DTypes ===
float32    49
int32       1


In [7]:
# === Save Data ===
# Write both feature frames to parquet; the row index is not stored.
for frame, destination in (
    (tr_df, "../../artifacts/features/base/tr_df005.parquet"),
    (test_df, "../../artifacts/features/base/test_df005.parquet"),
):
    frame.to_parquet(destination, index=False)