# 地方競馬（NAR）最新モデルのハイパーパラメータ最適化 (13モデル)

フェーズ 9 で導入した高度な特徴量に対し、Optuna を用いて LightGBM (LGBMRanker) のパラメータを最適化します。

### 最適化対象
- **目的関数**: `lambdarank` (NDCG)
- **評価指標**: NDCG@3 (上位3着の順位付け精度)

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from scipy.stats import spearmanr
import logging

# プロジェクトのsrcディレクトリをパスに追加
src_path = os.path.abspath(os.path.join(os.getcwd(), '../../src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from nar.loader import NarDataLoader
from nar.features import NarFeatureGenerator

optuna.logging.set_verbosity(optuna.logging.WARNING)
logging.getLogger('lightgbm').setLevel(logging.ERROR)

  from optuna import progress_bar as pbar_module


In [2]:
loader = NarDataLoader()
raw_df = loader.load(limit=150000, region='south_kanto')

generator = NarFeatureGenerator(history_windows=[1, 2, 3, 4, 5])
df = generator.generate_features(raw_df)

df = df.dropna(subset=['rank']).copy()
df['date'] = pd.to_datetime(df['date'])

baseline_features = [
    'distance', 'venue', 'state', 'frame_number', 'horse_number', 'weight', 'impost',
    'jockey_win_rate', 'jockey_place_rate', 'trainer_win_rate', 'trainer_place_rate',
    'horse_run_count'
] + [col for col in df.columns if 'horse_prev' in col]

advanced_features = [
    'gender', 'age', 'days_since_prev_race', 'weight_diff',
    'horse_jockey_place_rate', 'is_consecutive_jockey',
    'distance_diff', 'horse_venue_place_rate',
    'trainer_30d_win_rate',
    'impost_diff', 'was_accident_prev1', 'weighted_si_momentum', 'weighted_rank_momentum',
    'class_rank', 'class_diff', 'is_promoted', 'is_demoted'
]

phase9_features = [
    'weighted_si_momentum_race_rank', 'weighted_si_momentum_diff_from_avg', 'weighted_si_momentum_zscore',
    'weighted_rank_momentum_race_rank', 'weighted_rank_momentum_diff_from_avg', 'weighted_rank_momentum_zscore',
    'class_rank_race_rank', 'class_rank_diff_from_avg', 'class_rank_zscore',
    'horse_state_place_rate', 'season', 'is_night_race', 'trainer_momentum_bias'
]

features = list(set(baseline_features + advanced_features + phase9_features))

categorical_cols = ['venue', 'state', 'gender', 'season']
for col in features:
    if col in df.columns:
        if col in categorical_cols:
            df[col] = df[col].astype(str).astype('category')
        else:
            df[col] = pd.to_numeric(df[col], errors='coerce')

features = [f for f in features if f in df.columns]

split_date = df['date'].quantile(0.8)
train_df = df[df['date'] < split_date].sort_values('race_id').copy()
test_df = df[df['date'] >= split_date].sort_values('race_id').copy()

train_groups = train_df.groupby('race_id').size().values
test_groups = test_df.groupby('race_id').size().values

train_label = 20 - train_df['rank']
test_label = 20 - test_df['rank']

print(f'訓練データ: {len(train_df)}')
print(f'テストデータ: {len(test_df)}')

重複データを削除しました: 150000 -> 143895 件


訓練データ: 29819
テストデータ: 7527


In [3]:
def objective(trial):
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_at': [3],
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    
    model = lgb.LGBMRanker(**params, n_estimators=500)
    
    model.fit(
        train_df[features], train_label,
        group=train_groups,
        eval_set=[(test_df[features], test_label)],
        eval_group=[test_groups],
        eval_at=[3],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    
    return model.best_score_['valid_0']['ndcg@3']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

print('Best trial:')
trial = study.best_trial
print(f'  Value: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

Best trial: 35. Best value: 0.536039: 100%|██████████| 50/50 [05:57<00:00,  7.14s/it]

Best trial:
  Value: 0.5360388969770876
  Params: 
    num_leaves: 76
    max_depth: 4
    learning_rate: 0.053533995871727275
    feature_fraction: 0.7191988174741123
    bagging_fraction: 0.8247330987710993
    bagging_freq: 4
    min_child_samples: 75





In [4]:
best_params = study.best_params
model = lgb.LGBMRanker(**best_params, n_estimators=1000, random_state=42)

model.fit(
    train_df[features], train_label,
    group=train_groups,
    eval_set=[(test_df[features], test_label)],
    eval_group=[test_groups],
    eval_at=[1, 3, 5],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

test_df['pred_score'] = model.predict(test_df[features])
test_df['pred_rank'] = test_df.groupby('race_id')['pred_score'].rank(method='min', ascending=False)

eval_list = []
for r in range(1, 6):
    matches = test_df[test_df['pred_rank'] == r]
    win_rate = (matches['rank'] == 1).mean()
    place_rate = (matches['rank'] <= 3).mean()
    eval_list.append({'predicted_rank': r, 'win_rate': win_rate, 'place_rate': place_rate})

eval_df = pd.DataFrame(eval_list)
print('\n予測順位別 的中率:')
print(eval_df)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[164]	valid_0's ndcg@1: 0.452324	valid_0's ndcg@3: 0.536039	valid_0's ndcg@5: 0.604984

予測順位別 的中率:
   predicted_rank  win_rate  place_rate
0               1  0.262376    0.623762
1               2  0.186469    0.470297
2               3  0.148515    0.432343
3               4  0.095710    0.311881
4               5  0.072607    0.264026
