<a href="https://colab.research.google.com/github/monda00/horse-race-notebook/blob/master/predict_show_simple_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ３着以内の馬を予測

lightGBMでシンプルなモデルを実装する。

- ライブラリ・データ読み込み
- データ前処理
- 学習
- 予測

# ライブラリ・データ読み込み

In [1]:
import numpy as np
import pandas as pd
import re
import collections

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
DATA_PATH = '/content/drive/My Drive/data/horse-race/'

In [3]:
train_df = pd.read_csv(DATA_PATH + 'train.csv')

日付でソートしておく。

後半をtestデータとするため。

In [4]:
train_df = train_df.sort_values(by=['race_date', 'race_id', 'rank'])

In [5]:
train_df.reset_index(inplace=True, drop=True)

In [6]:
train_df.head()

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather
0,牝7,3.0,3,藤本現暉,リコーアペルタ,2.0,2019/1/1,201945010102,C3七　八,1,54.0,3.6,1,左,1400,ダ,川崎,2R,11:50,晴
1,牡7,5.0,5,加藤和博,ミラクルツッキー,1.0,2019/1/1,201945010102,C3七　八,2,56.0,2.0,1,左,1400,ダ,川崎,2R,11:50,晴
2,牡7,2.0,2,瀧川寿希,ロジレガシー,3.0,2019/1/1,201945010102,C3七　八,3,56.0,5.9,1,左,1400,ダ,川崎,2R,11:50,晴
3,牝7,7.0,8,岡村健司,プチプチ,8.0,2019/1/1,201945010102,C3七　八,4,54.0,22.1,0,左,1400,ダ,川崎,2R,11:50,晴
4,牝4,8.0,10,伊藤裕人,スエヒロドラ,4.0,2019/1/1,201945010102,C3七　八,5,54.0,10.3,0,左,1400,ダ,川崎,2R,11:50,晴


In [7]:
len(train_df)

253636

# データ前処理

- レースが開催されるのは週末のため日付は利用しない
  - 季節に変換
- 時刻は時間帯に変換
- ageは性別（？）と年齢に分割
- 気温は関係ありそうなため後々追加
- レース名はレースの種類に変換した方が良さそうだが後回し
- rankはランキング学習のためにデータをソートしたら削除

開催日から季節を追加し、開催日を削除

In [8]:
season = []
for i in range(len(train_df)):
  race_date = train_df.iloc[i]['race_date']
  race_month = int(re.search(r'\/.+?\/', race_date).group().replace('/', ''))
  if 3 <= race_month <= 5:
    season.append('sprint')
  elif 6 <= race_month <= 8:
    season.append('summer')
  elif 9 <= race_month <= 11:
    season.append('autumn')
  else:
    season.append('winter')

In [9]:
train_df['season'] = season

時刻を時間帯に分ける。
まずは、開催時間の種類を確認。

In [10]:
train_df['start_time'].value_counts()

13:15    4516
16:10    4410
20:50    4264
12:45    3772
15:45    3764
         ... 
12:01      15
15:51      13
13:51      12
14:41      10
20:25      10
Name: start_time, Length: 149, dtype: int64

In [11]:
train_df.sort_values(by='start_time')

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather,season
49126,牡2,8.0,14,三浦皇成,ロードファビュラス,3.0,2019/12/22,201906050801,2歳未勝利,6,55.0,8.6,0,右,1200,ダ,中山,1R,09:35,曇,winter
49129,牡2,6.0,10,嶋田純次,トマティーナ,10.0,2019/12/22,201906050801,2歳未勝利,9,55.0,180.7,0,右,1200,ダ,中山,1R,09:35,曇,winter
49128,牡2,2.0,2,菅原明良,アースウルフ,9.0,2019/12/22,201906050801,2歳未勝利,8,52.0,171.8,0,右,1200,ダ,中山,1R,09:35,曇,winter
49127,牡2,1.0,1,武藤雅,キタノギャラクシー,6.0,2019/12/22,201906050801,2歳未勝利,7,55.0,20.9,0,右,1200,ダ,中山,1R,09:35,曇,winter
49132,牡2,7.0,13,宮崎北斗,ソレユケタケコ,11.0,2019/12/22,201906050801,2歳未勝利,12,55.0,305.7,0,右,1200,ダ,中山,1R,09:35,曇,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205545,牡6,1.0,1,倉兼育康,サクラゴール,8.0,2020/3/22,202054032210,ファイナルレース,6,56.0,14.4,0,右,1300,ダ,高知,10R,20:55,曇,sprint
205544,牡8,4.0,4,郷間勇太,ヴェスペルティリオ,10.0,2020/3/22,202054032210,ファイナルレース,5,56.0,21.4,0,右,1300,ダ,高知,10R,20:55,曇,sprint
205543,牝5,6.0,8,濱尚美,ノーブルジャーニー,1.0,2020/3/22,202054032210,ファイナルレース,4,50.0,3.7,0,右,1300,ダ,高知,10R,20:55,曇,sprint
131394,牡4,5.0,5,西川敏弘,ハタノガナール,7.0,2019/7/15,201954071512,ファイナルレース,1,56.0,11.6,1,右,1300,ダ,高知,12R,20:55,曇,summer


時間の値で置き換える

09:55->9  
20:55->20

In [12]:
time_hour = []
for i in range(len(train_df)):
  start_time = train_df.iloc[i]['start_time']
  time_hour.append(int(re.search(r'(.*):(.*)', start_time).group(1)))

In [13]:
train_df['time_hour'] = time_hour

ageを分ける

In [14]:
gen = []
age = []
for i in range(len(train_df)):
  age_v = train_df.iloc[i]['age']
  gen.append(re.search(r'(.*)(\d)', age_v).group(1))
  age.append(re.search(r'(.*)(\d)', age_v).group(2))

In [15]:
train_df['age'] = age
train_df['gen'] = gen

In [16]:
train_df

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather,season,time_hour,gen
0,7,3.0,3,藤本現暉,リコーアペルタ,2.0,2019/1/1,201945010102,C3七　八,1,54.0,3.6,1,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牝
1,7,5.0,5,加藤和博,ミラクルツッキー,1.0,2019/1/1,201945010102,C3七　八,2,56.0,2.0,1,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牡
2,7,2.0,2,瀧川寿希,ロジレガシー,3.0,2019/1/1,201945010102,C3七　八,3,56.0,5.9,1,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牡
3,7,7.0,8,岡村健司,プチプチ,8.0,2019/1/1,201945010102,C3七　八,4,54.0,22.1,0,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牝
4,4,8.0,10,伊藤裕人,スエヒロドラ,4.0,2019/1/1,201945010102,C3七　八,5,54.0,10.3,0,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牝
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253631,5,8.0,10,藤原良一,パートカラー,9.0,2020/6/9,202048060911,おおぐま座特別,6,54.0,66.4,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牝
253632,6,1.0,1,浅野皓大,フラワーイレブン,4.0,2020/6/9,202048060911,おおぐま座特別,7,51.0,8.2,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牝
253633,5,4.0,4,加藤聡一,テイエムヨハネス,3.0,2020/6/9,202048060911,おおぐま座特別,8,56.0,6.2,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牡
253634,4,8.0,9,友森翔太,メモリーバリケード,2.0,2020/6/9,202048060911,おおぐま座特別,9,55.0,3.2,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牝


In [17]:
for col, values in train_df.iteritems():
    num_uniques = values.nunique()
    print ('{name}: {num_unique}'.format(name=col, num_unique=num_uniques))

age: 10
frame_number: 8
horse_number: 18
jockey: 450
name: 23752
popular: 18
race_date: 538
race_id: 22630
race_name: 7082
rank: 18
weight: 26
win: 5403
show: 2
clockwise: 11
distance: 54
field_type: 3
place: 24
race_round: 12
start_time: 149
weather: 6
season: 4
time_hour: 12
gen: 6


## Label Encoding

一旦、単純なLabel Encodingをする。

In [18]:
categorical_cols = ['jockey', 'name', 'race_name', 'clockwise', 'field_type', 'place', 'race_round', 'weather', 'season', 'gen']

In [19]:
for c in categorical_cols:
  le = LabelEncoder()
  le.fit(train_df[c])
  train_df[c] = le.transform(train_df[c])

In [20]:
train_df

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather,season,time_hour,gen
0,7,3.0,3,380,21625,2.0,2019/1/1,201945010102,763,1,54.0,3.6,1,3,1400,0,10,4,11:50,2,3,11,2
1,7,5.0,5,66,19528,1.0,2019/1/1,201945010102,763,2,56.0,2.0,1,3,1400,0,10,4,11:50,2,3,11,4
2,7,2.0,2,307,22903,3.0,2019/1/1,201945010102,763,3,56.0,5.9,1,3,1400,0,10,4,11:50,2,3,11,4
3,7,7.0,8,179,17138,8.0,2019/1/1,201945010102,763,4,54.0,22.1,0,3,1400,0,10,4,11:50,2,3,11,2
4,4,8.0,10,51,9310,4.0,2019/1/1,201945010102,763,5,54.0,10.3,0,3,1400,0,10,4,11:50,2,3,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253631,5,8.0,10,376,15231,9.0,2020/6/9,202048060911,1119,6,54.0,66.4,0,0,1600,0,5,1,17:00,2,2,17,2
253632,6,1.0,1,297,16492,4.0,2020/6/9,202048060911,1119,7,51.0,8.2,0,0,1600,0,5,1,17:00,2,2,17,2
253633,5,4.0,4,68,11967,3.0,2020/6/9,202048060911,1119,8,56.0,6.2,0,0,1600,0,5,1,17:00,2,2,17,4
253634,4,8.0,9,81,20226,2.0,2020/6/9,202048060911,1119,9,55.0,3.2,0,0,1600,0,5,1,17:00,2,2,17,2


## trainデータとtestデータに分割

In [21]:
train_df, test_df = train_test_split(train_df, test_size=53624, shuffle=False)

ランキング学習用にクエリデータ作成。

クエリデータは、何行ごとにクエリがまとまっているかを表している。
クエリ（ここではレース）ごとにデータが連続していなければならない。

In [22]:
train_race_id_counter = collections.Counter(list(train_df['race_id'].values))
test_race_id_counter = collections.Counter(list(test_df['race_id'].values))

In [23]:
train_query = list(train_race_id_counter.values())
test_query = list(test_race_id_counter.values())

## 目的変数を分離

In [24]:
fea_cols = ['age', 'frame_number', 'horse_number', 'jockey', 'name', 'popular', 'race_id', 'race_name', 'weight',
            'win', 'clockwise', 'distance', 'field_type', 'place', 'race_round', 'weather', 'season', 'time_hour', 'gen']
target_col = ['rank']

In [25]:
X_train = train_df[fea_cols].values
y_train = train_df[target_col].values.reshape(-1)
X_test = test_df[fea_cols].values
y_test = test_df[target_col].values.reshape(-1)

# 学習

In [26]:
lgb_train = lgb.Dataset(X_train, y_train, group=train_query)
lgb_eval = lgb.Dataset(X_test, y_test, group=test_query)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3],
    'boosting_type': 'gbdt',
}

num_round = 100

In [27]:
model = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    num_boost_round=num_round,
)

[1]	valid_0's ndcg@1: 0.323769	valid_0's ndcg@3: 0.475051
[2]	valid_0's ndcg@1: 0.371914	valid_0's ndcg@3: 0.506285
[3]	valid_0's ndcg@1: 0.393932	valid_0's ndcg@3: 0.522159
[4]	valid_0's ndcg@1: 0.404985	valid_0's ndcg@3: 0.527143
[5]	valid_0's ndcg@1: 0.408321	valid_0's ndcg@3: 0.529566
[6]	valid_0's ndcg@1: 0.414805	valid_0's ndcg@3: 0.532403
[7]	valid_0's ndcg@1: 0.417291	valid_0's ndcg@3: 0.533489
[8]	valid_0's ndcg@1: 0.422089	valid_0's ndcg@3: 0.535665
[9]	valid_0's ndcg@1: 0.422347	valid_0's ndcg@3: 0.535545
[10]	valid_0's ndcg@1: 0.424124	valid_0's ndcg@3: 0.536821
[11]	valid_0's ndcg@1: 0.425687	valid_0's ndcg@3: 0.537458
[12]	valid_0's ndcg@1: 0.425297	valid_0's ndcg@3: 0.537725
[13]	valid_0's ndcg@1: 0.426677	valid_0's ndcg@3: 0.538517
[14]	valid_0's ndcg@1: 0.426524	valid_0's ndcg@3: 0.538462
[15]	valid_0's ndcg@1: 0.426532	valid_0's ndcg@3: 0.5383
[16]	valid_0's ndcg@1: 0.427083	valid_0's ndcg@3: 0.53933
[17]	valid_0's ndcg@1: 0.427683	valid_0's ndcg@3: 0.539783
[18]	vali

# 予測

In [28]:
model.predict(X_test)[:50]

array([-0.85559027, -0.47199565, -1.00579254, -0.80021847, -0.13258366,
       -0.16167441, -0.27023039, -0.85930273, -0.75482886, -0.64271283,
       -0.95534629, -0.63331581, -1.45354636, -0.70423876, -0.04755167,
        0.15812304, -0.37702882, -1.45354636, -0.4199507 , -0.42178056,
        0.12665557, -1.01764416, -0.70363388,  0.15512361, -0.20674983,
        0.52550528, -0.87976898, -0.27434424, -0.88118935, -0.88174971,
       -1.02147633, -0.69914322,  0.029383  , -0.42746929,  0.24343017,
       -0.9579222 , -1.14256133, -0.6135922 , -0.36309962, -0.64802168,
       -0.281923  ,  0.02589601, -1.02717471,  0.23783514, -1.28179643,
       -0.9579222 , -0.71747062, -0.3125239 ,  0.01863822, -0.22420382])

In [29]:
y_test[:50]

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5,
       6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9,
       1, 2, 3, 4, 5, 6])