# V14 EDA → データセット作成 → 学習 → 評価

このノートブックは `scripts/` と `src/` のロジックを再利用して、EDA、データセット作成、学習、評価を行います。
重い処理を実行する前に、設定セルのフラグとパスを調整してください。

In [5]:
from pathlib import Path
import os
import sys

import pandas as pd
import numpy as np

# Put the workspace root on the import path so project packages (e.g. `src`)
# can be imported from this notebook.
# When the kernel's working directory is `notebooks/`, the root is its parent;
# otherwise the working directory itself is taken as the root.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd

# Guard the append: notebook cells get re-run, and an unconditional append
# would add one more duplicate entry to sys.path on every execution.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

print("ROOT:", ROOT)

ROOT: /workspace


In [6]:
import yaml

# Dataset spec that drives the rest of the notebook; fail fast if it is absent.
DATASET_SPEC_PATH = ROOT.joinpath("config", "datasets", "v14_gap_train_2016_2026.yaml")
if not DATASET_SPEC_PATH.exists():
    raise FileNotFoundError(DATASET_SPEC_PATH)

# Read the file as UTF-8 text and parse it with the safe loader.
spec = yaml.safe_load(DATASET_SPEC_PATH.read_text(encoding="utf-8"))

# Bare expression: the notebook renders the parsed spec as the cell output.
spec

{'dataset': {'id': 'v14_gap_train_2016_2026',
  'version': '2026-01-24',
  'description': 'Dataset for v14 gap model. Train 2016-2023, valid 2024, test 2025, holdout 2026.',
  'owner': 'masat',
  'tags': ['v14', 'gap', 'leakfree']},
 'source': {'loader': 'JraVanDataLoader',
  'params': {'history_start_date': '2016-01-01',
   'end_date': '2026-12-31',
   'jra_only': True,
   'skip_odds': False,
   'skip_training': False,
   'limit': None,
   'horse_ids': None}},
 'filters': {'date_range': {'start': '2016-01-01', 'end': '2026-12-31'},
  'venue_codes': ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10'],
  'drop_future_races': True},
 'cleansing': {'enabled': True,
  'rules': ['rank_filter', 'sentinel_values', 'type_normalize']},
 'derive': {'add_year': True, 'year_column': 'year', 'date_column': 'date'},
 'features': {'pipeline': 'FeaturePipeline',
  'cache_dir': 'data/features_v14/prod_cache',
  'blocks': ['base_attributes',
   'history_stats',
   'jockey_stats',
   'pace_stats

In [7]:
# --- Runtime settings ---
# Feature settings come from the dataset spec loaded earlier.
_features_spec = spec["features"]
OUTPUT_PATH = ROOT.joinpath("data", "processed", "preprocessed_data_v14_2016_2026.parquet")
CACHE_DIR = ROOT / _features_spec["cache_dir"]
FEATURE_BLOCKS = _features_spec["blocks"]

# Stage switches: each later cell checks its flag and only prints a hint when it is False.
RUN_FULL_BUILD = True  # when True, build the full dataset
RUN_TRAINING = True    # when True, train via scripts/train_gap_v14_config.py
RUN_EVAL = True        # when True, evaluate the saved model

# EDA sample settings (date window and the `limit` passed to the loader)
EDA_START, EDA_END = "2024-01-01", "2024-12-31"
EDA_LIMIT = 50_000

# Year-based split for training / evaluation
TRAIN_YEARS = [*range(2016, 2024)]  # 2016 through 2023 inclusive
VALID_YEAR = 2024
TEST_YEARS = [2025]
HOLDOUT_YEARS = [2026]

print("Output:", OUTPUT_PATH)
print("Cache:", CACHE_DIR)
print("Blocks:", len(FEATURE_BLOCKS))

Output: /workspace/data/processed/preprocessed_data_v14_2016_2026.parquet
Cache: /workspace/data/features_v14/prod_cache
Blocks: 57


In [8]:
# --- Load the EDA sample ---
from src.preprocessing.loader import JraVanDataLoader
from src.preprocessing.cleansing import DataCleanser

# Loader arguments for the sample: the EDA date window with training data skipped.
# NOTE(review): `limit` is presumably a row cap — confirm against the loader.
eda_load_kwargs = dict(
    history_start_date=EDA_START,
    end_date=EDA_END,
    jra_only=True,
    skip_odds=False,
    skip_training=True,
    limit=EDA_LIMIT,
)

loader = JraVanDataLoader()
df_raw = loader.load(**eda_load_kwargs)

# Run the project cleanser over the raw sample.
cleanser = DataCleanser()
df_clean = cleanser.cleanse(df_raw)

# Row counts before and after cleansing, then a preview of the cleansed frame.
print("Raw rows:", len(df_raw))
print("Clean rows:", len(df_clean))
df_clean.head()

DEBUG: skip_training=True selected.
DEBUG: Constructing query...
DEBUG: Executing pd.read_sql with chunksize...
DEBUG: Read SQL done. Rows: 47212
Raw rows: 47212
Clean rows: 46752


Unnamed: 0,race_id,date,start_time_str,venue,race_number,distance,surface,weather,weather_code,state,...,pass_2,pass_3,pass_4,odds_win_str,odds_umaren_str,rank,weight_ratio,time,weight_diff,passing_rank
0,202406010101,2024-01-06,1005,6,1,1200,ダート,晴,1,良,...,0,15,15,0125031502007804032745160400400205068610061594...,0102010207074010303742710601040072990660105037...,16,0.114894,76.8,2,15-15
1,202406010101,2024-01-06,1005,6,1,1200,ダート,晴,1,良,...,0,1,1,0125031502007804032745160400400205068610061594...,0102010207074010303742710601040072990660105037...,7,0.107955,73.7,4,01-01
2,202406010101,2024-01-06,1005,6,1,1200,ダート,晴,1,良,...,0,15,15,0125031502007804032745160400400205068610061594...,0102010207074010303742710601040072990660105037...,14,0.125,76.2,-4,15-15
3,202406010101,2024-01-06,1005,6,1,1200,ダート,晴,1,良,...,0,4,4,0125031502007804032745160400400205068610061594...,0102010207074010303742710601040072990660105037...,4,0.135714,73.2,2,04-04
4,202406010101,2024-01-06,1005,6,1,1200,ダート,晴,1,良,...,0,12,12,0125031502007804032745160400400205068610061594...,0102010207074010303742710601040072990660105037...,13,0.123913,75.3,-4,12-12


In [None]:
# --- EDA summary ---
# Dtypes of the first 20 columns.
print(df_clean.dtypes.head(20))

# Fraction of missing values per column, sorted so the worst columns come first.
null_ratio = df_clean.isna().mean()
missing = null_ratio.sort_values(ascending=False)

# Bare expression: the notebook renders the 20 columns with the most missing values.
missing.head(20)

race_id                        object
date                   datetime64[ns]
start_time_str                 object
venue                          object
race_number                     int64
distance                        int64
surface                        object
weather                        object
weather_code                    int64
state                          object
going_code                      int64
title                          object
grade_code                     object
kyoso_shubetsu_code            object
kyoso_joken_code               object
horse_id                       object
sire_id                        object
mare_id                        object
bms_id                         object
sex                            object
dtype: object


time_diff           1.000000
passing_rank        0.008235
odds_win_str        0.001690
odds_umaren_str     0.000898
yoso_soha_time      0.000000
weight              0.000000
weight_diff_val     0.000000
weight_diff_sign    0.000000
abnormal_code       0.000000
age                 0.000000
horse_name          0.000000
impost              0.000000
honshokin           0.000000
fukashokin          0.000000
mining_kubun        0.000000
yoso_gosa_minus     0.000000
yoso_gosa_plus      0.000000
odds                0.000000
yoso_juni           0.000000
blinker             0.000000
dtype: float64

: 

In [None]:
# --- Build the full dataset ---
from src.preprocessing.feature_pipeline import FeaturePipeline

if RUN_FULL_BUILD:
    # Load the raw data using the source parameters from the dataset spec.
    source_params = spec["source"]["params"]
    loader = JraVanDataLoader()
    df_raw = loader.load(
        history_start_date=source_params["history_start_date"],
        end_date=source_params["end_date"],
        jra_only=source_params["jra_only"],
        skip_odds=source_params["skip_odds"],
        skip_training=source_params["skip_training"],
        limit=source_params["limit"],
    )

    cleanser = DataCleanser()
    df_clean = cleanser.cleanse(df_raw)

    # Apply the spec's date-range filter; both bounds are inclusive.
    df_clean["date"] = pd.to_datetime(df_clean["date"])
    date_range = spec["filters"]["date_range"]
    start = pd.to_datetime(date_range["start"])
    end = pd.to_datetime(date_range["end"])
    df_clean = df_clean[df_clean["date"].between(start, end)].copy()

    # Optional: drop races dated after today (midnight of the current local day).
    if spec["filters"].get("drop_future_races", False):
        today = pd.Timestamp.now().normalize()
        df_clean = df_clean.loc[df_clean["date"] <= today].copy()

    # Add a `year` column used by the later train/valid/test split.
    df_clean["year"] = df_clean["date"].dt.year

    pipeline = FeaturePipeline(cache_dir=str(CACHE_DIR))
    df_features = pipeline.load_features(df_clean, FEATURE_BLOCKS)

    # From the feature frame keep the join keys plus every column that the
    # cleansed frame does not already have, so the merge adds no duplicate columns.
    keys = ["race_id", "horse_number", "horse_id"]
    base_columns = set(df_clean.columns)
    feature_cols = [c for c in df_features.columns if c in keys or c not in base_columns]
    df_dataset = df_clean.merge(df_features[feature_cols], on=keys, how="left")

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df_dataset.to_parquet(OUTPUT_PATH)
    print("Saved:", OUTPUT_PATH, "rows:", len(df_dataset))
else:
    print("RUN_FULL_BUILD=False. Set True to build the full dataset.")

DEBUG: Constructing query...
DEBUG: Executing pd.read_sql with chunksize...
DEBUG: Loaded 10 chunks...
DEBUG: Loaded 20 chunks...
DEBUG: Loaded 30 chunks...
DEBUG: Loaded 40 chunks...
DEBUG: Loaded 50 chunks...
DEBUG: Read SQL done. Rows: 490409


重複データを削除しました: 490409 -> 484726 件


In [8]:
# --- Training (runs scripts/train_gap_v14_config.py as a subprocess) ---
import shlex
import subprocess

if not RUN_TRAINING:
    print("RUN_TRAINING=False. Set True to train.")
else:
    # Year lists are passed to the script as comma-separated strings.
    cmd = [
        sys.executable,  # launch the script with the interpreter running this kernel
        str(ROOT / "scripts" / "train_gap_v14_config.py"),
        "--data-path", str(OUTPUT_PATH),
        "--train-years", ",".join(map(str, TRAIN_YEARS)),
        "--valid-year", str(VALID_YEAR),
        "--test-years", ",".join(map(str, TEST_YEARS)),
        "--holdout-years", ",".join(map(str, HOLDOUT_YEARS)),
    ]
    # shlex.join shell-quotes each argument, so the echoed command can be
    # copy-pasted into a terminal even when a path contains spaces.
    print(shlex.join(cmd))
    # Run from the workspace root rather than the kernel's working directory,
    # which may be `notebooks/`; every path in this notebook is anchored at ROOT.
    # NOTE(review): assumes the script resolves any relative paths against the
    # workspace root — confirm against scripts/train_gap_v14_config.py.
    # check=True raises CalledProcessError when the script exits non-zero.
    subprocess.run(cmd, check=True, cwd=str(ROOT))

RUN_TRAINING=False. Set True to train.


In [9]:
# --- Evaluation (quick check) ---
import joblib

if RUN_EVAL:
    MODEL_DIR = ROOT.joinpath("models", "experiments", "exp_gap_v14_production")
    MODEL_PATH = MODEL_DIR / "model_v14.pkl"
    FEATURES_PATH = MODEL_DIR / "features.csv"

    # Reload the saved dataset and re-derive `year` from `date` for the split.
    df = pd.read_parquet(OUTPUT_PATH)
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year

    df_test = df.loc[df["year"].isin(TEST_YEARS)].copy()
    if df_test.empty:
        raise ValueError("Test set empty. Check TEST_YEARS or dataset.")

    # The popularity rank is built from the `odds_10min` column, so it must exist.
    if "odds_10min" not in df_test.columns:
        raise ValueError("odds_10min missing in dataset.")

    # Rank odds within each race (lowest odds = rank 1, ties share the lowest rank).
    # The target is popularity rank minus finishing rank.
    odds_by_race = df_test.groupby("race_id")["odds_10min"]
    df_test["odds_rank_10min"] = odds_by_race.rank(method="min")
    df_test["gap_score"] = df_test["odds_rank_10min"] - df_test["rank"]

    # joblib.load unpickles the file; only load model artifacts you trust.
    model = joblib.load(MODEL_PATH)
    # The `feature` column of features.csv lists the model's input columns.
    feature_list = pd.read_csv(FEATURES_PATH)["feature"].tolist()

    preds = model.predict(df_test[feature_list])
    df_test["pred_gap"] = preds
    # Highest predicted gap gets rank 1 within each race.
    df_test["pred_rank"] = df_test.groupby("race_id")["pred_gap"].rank(ascending=False)

    # Simple metric: RMSE between the actual and predicted gap.
    residual = df_test["gap_score"] - df_test["pred_gap"]
    rmse = np.sqrt(np.mean(residual ** 2))
    print("RMSE:", rmse)

    # Hit rate of the top-3 predictions: a pick needs pred_rank <= 3 and odds in
    # [10.0, 50.0]; it counts as a hit when the horse finished in the top three.
    in_top3 = df_test["pred_rank"] <= 3
    in_odds_band = df_test["odds_10min"].between(10.0, 50.0)
    picks = df_test[in_top3 & in_odds_band]
    hits = picks[picks["rank"] <= 3]
    if len(picks) > 0:
        hit_rate = len(hits) / len(picks)
    else:
        hit_rate = 0
    print("Bets:", len(picks))
    print("Hit rate:", f"{hit_rate:.2%}")
else:
    print("RUN_EVAL=False. Set True to evaluate.")

RUN_EVAL=False. Set True to evaluate.
