In [1]:
import os
import sys
import json
from datetime import datetime, timezone, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import StratifiedKFold

# Put the directory two levels above the working directory on the import path
# (presumably so that project-local modules can be imported -- confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
_parent_root = os.path.abspath("../..")
if _parent_root not in sys.path:
    sys.path.append(_parent_root)

### Configuration

In [2]:
# Output directory for the fold table and its metadata.
FEATURE_DIR = Path("../../artifacts/folds")

# Create the directory together with any missing parents; no-op if it exists.
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# Raise the display limits of both dataframe libraries to 500 rows / columns.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 500)
pl.Config.set_tbl_rows(500)
pl.Config.set_tbl_cols(500)

polars.config.Config

### Create Folds

In [3]:
# === Load data ===
# Expression that rewrites column y: "yes" -> 1, "no" -> 0.
# Any other value is passed through unchanged, so it has to be castable to
# Int8 by the cast that is applied where this expression is used.
to01 = (
    pl.when(pl.col("y") == "yes")
      .then(1)
      .when(pl.col("y") == "no")
      .then(0)
      .otherwise(pl.col("y"))
      .alias("y")
)

# train: drop the id column, cast y directly to Int8 (the yes/no mapping is
# NOT applied here) and attach a global row_id that starts at 0.
train = (
    pl.read_csv("../../input/train.csv")
    .drop("id")
    .with_columns(pl.col("y").cast(pl.Int8))
    .with_row_index(name="row_id", offset=0)
)

# orig: map y through to01, cast to Int8, and continue the global row_id
# numbering right after the last train row.
orig = (
    pl.read_parquet("../../input/original.parquet")
    .with_columns(to01.cast(pl.Int8))
    .with_row_index(name="row_id", offset=train.height)
)

# Arrays for the "@train" stratification, which only looks at train.
rid_train = train["row_id"].to_numpy()
y_train = train["y"].to_numpy()

# Arrays for the original dataset.
rid_orig = orig["row_id"].to_numpy()
y_orig = orig["y"].to_numpy()

# train followed by orig, in row_id order.
rid_merged = np.concatenate((rid_train, rid_orig))
y_merged = np.concatenate((y_train, y_orig))

# Names of the fold configurations generated by the cells below.
cfg_list = []

In [4]:
# === Add fold assignments (train only) ===
N_SPLITS = 5
SEED = 42
cfg = f"skf/k={N_SPLITS}/s={SEED}@train"   # machine-readable config name
# Register the config only once: cfg_list ends up in meta.json, and an
# unconditional append would add a duplicate entry on every re-run of this cell.
if cfg not in cfg_list:
    cfg_list.append(cfg)

row_id = rid_train
y = y_train

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Start from -1 ("unassigned"), then stamp every row with the index of the
# split in which it appears on the validation side.
fold_arr = np.full(len(row_id), fill_value=-1, dtype=np.int8)
for f, (_, val_idx) in enumerate(skf.split(row_id, y)):
    fold_arr[val_idx] = f

# Long-format table with one row per (row_id, cfg); this cell (re)creates `folds`.
folds = pl.DataFrame({
    "row_id": row_id.astype(np.int32, copy=False),
    "cfg": [cfg] * len(row_id),
    "fold": fold_arr
})

In [5]:
# === Add fold assignments (train + orig) ===
N_SPLITS = 5
SEED = 42
cfg = f"skf/k={N_SPLITS}/s={SEED}@train+orig"   # machine-readable config name
# Register the config only once: cfg_list ends up in meta.json, and an
# unconditional append would add a duplicate entry on every re-run of this cell.
if cfg not in cfg_list:
    cfg_list.append(cfg)

row_id = rid_merged
y = y_merged

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Start from -1 ("unassigned"), then stamp every row with the index of the
# split in which it appears on the validation side.
fold_arr = np.full(len(row_id), fill_value=-1, dtype=np.int8)
for f, (_, val_idx) in enumerate(skf.split(row_id, y)):
    fold_arr[val_idx] = f

folds = pl.concat(
    [
        # Discard rows written for this cfg by an earlier run of this cell so
        # that re-running it replaces them instead of stacking a second copy.
        folds.filter(pl.col("cfg") != cfg),
        pl.DataFrame(
            {
                "row_id": row_id.astype(np.int32, copy=False),
                "cfg": [cfg] * len(row_id),
                "fold": fold_arr
            }
        )
    ], how="vertical"
)

In [6]:
# === Persist the fold table ===
# The whole `folds` frame is written to a single parquet file under FEATURE_DIR.
folds_path = FEATURE_DIR.joinpath("folds.parquet")
folds.write_parquet(folds_path)

# NOTE: the "tr_df" label in the message refers to the `folds` frame saved above.
print(f"tr_df saved successfully to {folds_path}")

tr_df saved successfully to ../../artifacts/folds/folds.parquet


### Save Meta Data

In [7]:
# Fixed UTC+9 offset used for the creation timestamp.
JST = timezone(timedelta(hours=9))

# Run metadata: creation time, [rows, columns] of the fold table, and the list
# of fold configurations it contains.
meta = {
    "created_at": datetime.now(JST).isoformat(),
    "folds_shape": list(folds.shape),
    "cfg_list": cfg_list,
}

# ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
with open(f"{FEATURE_DIR}/meta.json", "w", encoding="utf-8") as meta_file:
    json.dump(meta, meta_file, ensure_ascii=False, indent=2)

In [8]:
# Show the names of the fold configurations that were generated above.
print(cfg_list)

['skf/k=5/s=42@train', 'skf/k=5/s=42@train+orig']
