# 地方競馬（NAR）アンサンブル戦略 (15モデル)

これまでに構築した強力な2つのモデルを組み合わせることで、さらなる予測精度の向上（特に複勝率と安定性）を目指します。

### 統合するモデル
1. **回帰モデル (Regression)**: 全体的な着順傾向を学習 (Spearman相関が高い)。`10_nar_class_features_model` 相当。
2. **ランク学習モデル (LambdaRank)**: 1位の特定に特化 (Top-1勝率が高い)。`13_nar_optuna_relative_model` 相当。

### 戦略
- **Weighted Blending**: 両モデルの予測スコア（またはランク）を加重平均します。

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import japanize_matplotlib

# プロジェクトのsrcディレクトリをパスに追加
src_path = os.path.abspath(os.path.join(os.getcwd(), '../../src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from nar.loader import NarDataLoader
from nar.features import NarFeatureGenerator

%matplotlib inline

  from optuna import progress_bar as pbar_module


In [2]:
loader = NarDataLoader()
# データ量は多めに確保
raw_df = loader.load(limit=150000, region='south_kanto')

generator = NarFeatureGenerator(history_windows=[1, 2, 3, 4, 5])
df = generator.generate_features(raw_df)

df = df.dropna(subset=['rank']).copy()
df['date'] = pd.to_datetime(df['date'])

# 特徴量定義 (Phase 9 base)
baseline_features = [
    'distance', 'venue', 'state', 'frame_number', 'horse_number', 'weight', 'impost',
    'jockey_win_rate', 'jockey_place_rate', 'trainer_win_rate', 'trainer_place_rate',
    'horse_run_count'
] + [col for col in df.columns if 'horse_prev' in col]

advanced_features = [
    'gender', 'age', 'days_since_prev_race', 'weight_diff',
    'horse_jockey_place_rate', 'is_consecutive_jockey',
    'distance_diff', 'horse_venue_place_rate',
    'trainer_30d_win_rate',
    'impost_diff', 'was_accident_prev1', 'weighted_si_momentum', 'weighted_rank_momentum',
    'class_rank', 'class_diff', 'is_promoted', 'is_demoted'
]

phase9_features = [
    'weighted_si_momentum_race_rank', 'weighted_si_momentum_diff_from_avg', 'weighted_si_momentum_zscore',
    'weighted_rank_momentum_race_rank', 'weighted_rank_momentum_diff_from_avg', 'weighted_rank_momentum_zscore',
    'class_rank_race_rank', 'class_rank_diff_from_avg', 'class_rank_zscore',
    'horse_state_place_rate', 'season', 'is_night_race', 'trainer_momentum_bias'
]

features = list(set(baseline_features + advanced_features + phase9_features))

# カテゴリ処理
categorical_cols = ['venue', 'state', 'gender', 'season']
for col in features:
    if col in df.columns:
        if col in categorical_cols:
            df[col] = df[col].astype(str).astype('category')
        else:
            df[col] = pd.to_numeric(df[col], errors='coerce')

features = [f for f in features if f in df.columns]

split_date = df['date'].quantile(0.8)
train_df = df[df['date'] < split_date].sort_values('race_id').copy()
test_df = df[df['date'] >= split_date].sort_values('race_id').copy()

print(f'Train: {len(train_df)}, Test: {len(test_df)}')

重複データを削除しました: 150000 -> 143895 件


Train: 29819, Test: 7527


In [3]:
# Model 1: Regression (LGBMRegressor)
print('Training Regression Model...')
reg_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=6,
    random_state=42
)
reg_model.fit(
    train_df[features], train_df['rank'],
    eval_set=[(test_df[features], test_df['rank'])],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

# Model 2: LambdaRank (LGBMRanker)
print('Training LambdaRank Model...')
train_groups = train_df.groupby('race_id').size().values
test_groups = test_df.groupby('race_id').size().values
train_label = 20 - train_df['rank']
test_label = 20 - test_df['rank']

rank_model = lgb.LGBMRanker(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=6,
    random_state=42,
    metric='ndcg',
    importance_type='gain'
)
rank_model.fit(
    train_df[features], train_label,
    group=train_groups,
    eval_set=[(test_df[features], test_label)],
    eval_group=[test_groups],
    eval_at=[3],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

Training Regression Model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7184
[LightGBM] [Info] Number of data points in the train set: 29819, number of used features: 58
[LightGBM] [Info] Start training from score 6.442134
Training LambdaRank Model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7184
[LightGBM] [Info] Number of data points in the train set: 29819, number of used features: 58


In [4]:
# 予測の実行
test_df['pred_reg'] = reg_model.predict(test_df[features])
test_df['pred_rank'] = rank_model.predict(test_df[features])

# スケーリング (0-1に正規化して合わせる)
# Regressionは「順位」なので小さい方が良い -> 反転させる
# LambdaRankは「スコア」なので大きい方が良い
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Reg: 1位=1.0, 最下位=0.0 になるように反転変換
test_df['score_reg_inv'] = -test_df['pred_reg']
test_df['norm_reg'] = scaler.fit_transform(test_df[['score_reg_inv']])

# Rank: そのまま正規化
test_df['norm_rank'] = scaler.fit_transform(test_df[['pred_rank']])

# アンサンブル (加重平均)
# Rankerの方が勝率が高いので比重を重くする (例: 0.3 : 0.7)
alpha = 0.3
test_df['ensemble_score'] = alpha * test_df['norm_reg'] + (1 - alpha) * test_df['norm_rank']

# レースごとの順位付け
test_df['final_rank'] = test_df.groupby('race_id')['ensemble_score'].rank(ascending=False, method='min')

In [5]:
eval_list = []
for r in range(1, 6):
    matches = test_df[test_df['final_rank'] == r]
    win_rate = (matches['rank'] == 1).mean()
    place_rate = (matches['rank'] <= 3).mean()
    eval_list.append({'predicted_rank': r, 'win_rate': win_rate, 'place_rate': place_rate})

eval_df = pd.DataFrame(eval_list)
print('アンサンブルモデル予測精度:')
print(eval_df)

corr, _ = spearmanr(test_df['ensemble_score'], 20-test_df['rank'])
print(f'Spearman相関係数 (Ensemble): {corr:.4f}')

アンサンブルモデル予測精度:
   predicted_rank  win_rate  place_rate
0               1  0.273927    0.625413
1               2  0.178218    0.481848
2               3  0.145215    0.412541
3               4  0.084158    0.321782
4               5  0.075908    0.268977
Spearman相関係数 (Ensemble): 0.4774
