<a href="https://colab.research.google.com/github/monda00/horse-race-notebook/blob/master/predict_show_simple_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ３着以内の馬を予測

lightGBMでシンプルなモデルを実装する。

- ライブラリ・データ読み込み
- データ前処理
- 学習
- 予測

# ライブラリ・データ読み込み

In [1]:
import numpy as np
import pandas as pd
import re
import collections

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
DATA_PATH = '/content/drive/My Drive/data/horse-race/'

In [3]:
df = pd.read_csv(DATA_PATH + 'train.csv')

日付でソートしておく。

後半をtestデータとするため。

In [4]:
df = df.sort_values(by=['race_date', 'race_id', 'rank'])

In [5]:
df.reset_index(inplace=True, drop=True)

In [6]:
df.head()

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather
0,牝7,3.0,3,藤本現暉,リコーアペルタ,2.0,2019/1/1,201945010102,C3七　八,1,54.0,3.6,1,左,1400,ダ,川崎,2R,11:50,晴
1,牡7,5.0,5,加藤和博,ミラクルツッキー,1.0,2019/1/1,201945010102,C3七　八,2,56.0,2.0,1,左,1400,ダ,川崎,2R,11:50,晴
2,牡7,2.0,2,瀧川寿希,ロジレガシー,3.0,2019/1/1,201945010102,C3七　八,3,56.0,5.9,1,左,1400,ダ,川崎,2R,11:50,晴
3,牝7,7.0,8,岡村健司,プチプチ,8.0,2019/1/1,201945010102,C3七　八,4,54.0,22.1,0,左,1400,ダ,川崎,2R,11:50,晴
4,牝4,8.0,10,伊藤裕人,スエヒロドラ,4.0,2019/1/1,201945010102,C3七　八,5,54.0,10.3,0,左,1400,ダ,川崎,2R,11:50,晴


In [7]:
len(df)

253636

# データ前処理

- レースが開催されるのは週末のため日付は利用しない
  - 季節に変換
- 時刻は時間帯に変換
- ageは性別（？）と年齢に分割
- 気温は関係ありそうなため後々追加
- レース名はレースの種類に変換した方が良さそうだが後回し
- rankはランキング学習のためにデータをソートしたら削除

開催日から季節を追加し、開催日を削除

In [8]:
season = []
for i in range(len(df)):
  race_date = df.iloc[i]['race_date']
  race_month = int(re.search(r'\/.+?\/', race_date).group().replace('/', ''))
  if 3 <= race_month <= 5:
    season.append('sprint')
  elif 6 <= race_month <= 8:
    season.append('summer')
  elif 9 <= race_month <= 11:
    season.append('autumn')
  else:
    season.append('winter')

In [9]:
df['season'] = season

時刻を時間帯に分ける。
まずは、開催時間の種類を確認。

In [10]:
df['start_time'].value_counts()

13:15    4516
16:10    4410
20:50    4264
12:45    3772
15:45    3764
         ... 
12:01      15
15:51      13
13:51      12
20:25      10
14:41      10
Name: start_time, Length: 149, dtype: int64

In [11]:
df.sort_values(by='start_time')

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather,season
49126,牡2,8.0,14,三浦皇成,ロードファビュラス,3.0,2019/12/22,201906050801,2歳未勝利,6,55.0,8.6,0,右,1200,ダ,中山,1R,09:35,曇,winter
49129,牡2,6.0,10,嶋田純次,トマティーナ,10.0,2019/12/22,201906050801,2歳未勝利,9,55.0,180.7,0,右,1200,ダ,中山,1R,09:35,曇,winter
49128,牡2,2.0,2,菅原明良,アースウルフ,9.0,2019/12/22,201906050801,2歳未勝利,8,52.0,171.8,0,右,1200,ダ,中山,1R,09:35,曇,winter
49127,牡2,1.0,1,武藤雅,キタノギャラクシー,6.0,2019/12/22,201906050801,2歳未勝利,7,55.0,20.9,0,右,1200,ダ,中山,1R,09:35,曇,winter
49132,牡2,7.0,13,宮崎北斗,ソレユケタケコ,11.0,2019/12/22,201906050801,2歳未勝利,12,55.0,305.7,0,右,1200,ダ,中山,1R,09:35,曇,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205545,牡6,1.0,1,倉兼育康,サクラゴール,8.0,2020/3/22,202054032210,ファイナルレース,6,56.0,14.4,0,右,1300,ダ,高知,10R,20:55,曇,sprint
205544,牡8,4.0,4,郷間勇太,ヴェスペルティリオ,10.0,2020/3/22,202054032210,ファイナルレース,5,56.0,21.4,0,右,1300,ダ,高知,10R,20:55,曇,sprint
205543,牝5,6.0,8,濱尚美,ノーブルジャーニー,1.0,2020/3/22,202054032210,ファイナルレース,4,50.0,3.7,0,右,1300,ダ,高知,10R,20:55,曇,sprint
131394,牡4,5.0,5,西川敏弘,ハタノガナール,7.0,2019/7/15,201954071512,ファイナルレース,1,56.0,11.6,1,右,1300,ダ,高知,12R,20:55,曇,summer


時間の値で置き換える

09:55->9  
20:55->20

In [12]:
time_hour = []
for i in range(len(df)):
  start_time = df.iloc[i]['start_time']
  time_hour.append(int(re.search(r'(.*):(.*)', start_time).group(1)))

In [13]:
df['time_hour'] = time_hour

ageを分ける

In [14]:
gen = []
age = []
for i in range(len(df)):
  age_v = df.iloc[i]['age']
  gen.append(re.search(r'(.*)(\d)', age_v).group(1))
  age.append(re.search(r'(.*)(\d)', age_v).group(2))

In [15]:
df['age'] = age
df['gen'] = gen

In [16]:
df

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather,season,time_hour,gen
0,7,3.0,3,藤本現暉,リコーアペルタ,2.0,2019/1/1,201945010102,C3七　八,1,54.0,3.6,1,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牝
1,7,5.0,5,加藤和博,ミラクルツッキー,1.0,2019/1/1,201945010102,C3七　八,2,56.0,2.0,1,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牡
2,7,2.0,2,瀧川寿希,ロジレガシー,3.0,2019/1/1,201945010102,C3七　八,3,56.0,5.9,1,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牡
3,7,7.0,8,岡村健司,プチプチ,8.0,2019/1/1,201945010102,C3七　八,4,54.0,22.1,0,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牝
4,4,8.0,10,伊藤裕人,スエヒロドラ,4.0,2019/1/1,201945010102,C3七　八,5,54.0,10.3,0,左,1400,ダ,川崎,2R,11:50,晴,winter,11,牝
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253631,5,8.0,10,藤原良一,パートカラー,9.0,2020/6/9,202048060911,おおぐま座特別,6,54.0,66.4,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牝
253632,6,1.0,1,浅野皓大,フラワーイレブン,4.0,2020/6/9,202048060911,おおぐま座特別,7,51.0,8.2,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牝
253633,5,4.0,4,加藤聡一,テイエムヨハネス,3.0,2020/6/9,202048060911,おおぐま座特別,8,56.0,6.2,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牡
253634,4,8.0,9,友森翔太,メモリーバリケード,2.0,2020/6/9,202048060911,おおぐま座特別,9,55.0,3.2,0,右,1600,ダ,名古屋,11R,17:00,晴,summer,17,牝


In [17]:
for col, values in df.iteritems():
    num_uniques = values.nunique()
    print ('{name}: {num_unique}'.format(name=col, num_unique=num_uniques))

age: 10
frame_number: 8
horse_number: 18
jockey: 450
name: 23752
popular: 18
race_date: 538
race_id: 22630
race_name: 7082
rank: 18
weight: 26
win: 5403
show: 2
clockwise: 11
distance: 54
field_type: 3
place: 24
race_round: 12
start_time: 149
weather: 6
season: 4
time_hour: 12
gen: 6


## Label Encoding

一旦、単純なLabel Encodingをする。

In [18]:
categorical_cols = ['jockey', 'name', 'race_name', 'clockwise', 'field_type', 'place', 'race_round', 'weather', 'season', 'gen']

In [19]:
for c in categorical_cols:
  le = LabelEncoder()
  le.fit(df[c])
  df[c] = le.transform(df[c])

In [20]:
df

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,rank,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather,season,time_hour,gen
0,7,3.0,3,380,21625,2.0,2019/1/1,201945010102,763,1,54.0,3.6,1,3,1400,0,10,4,11:50,2,3,11,2
1,7,5.0,5,66,19528,1.0,2019/1/1,201945010102,763,2,56.0,2.0,1,3,1400,0,10,4,11:50,2,3,11,4
2,7,2.0,2,307,22903,3.0,2019/1/1,201945010102,763,3,56.0,5.9,1,3,1400,0,10,4,11:50,2,3,11,4
3,7,7.0,8,179,17138,8.0,2019/1/1,201945010102,763,4,54.0,22.1,0,3,1400,0,10,4,11:50,2,3,11,2
4,4,8.0,10,51,9310,4.0,2019/1/1,201945010102,763,5,54.0,10.3,0,3,1400,0,10,4,11:50,2,3,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253631,5,8.0,10,376,15231,9.0,2020/6/9,202048060911,1119,6,54.0,66.4,0,0,1600,0,5,1,17:00,2,2,17,2
253632,6,1.0,1,297,16492,4.0,2020/6/9,202048060911,1119,7,51.0,8.2,0,0,1600,0,5,1,17:00,2,2,17,2
253633,5,4.0,4,68,11967,3.0,2020/6/9,202048060911,1119,8,56.0,6.2,0,0,1600,0,5,1,17:00,2,2,17,4
253634,4,8.0,9,81,20226,2.0,2020/6/9,202048060911,1119,9,55.0,3.2,0,0,1600,0,5,1,17:00,2,2,17,2


## trainデータとtestデータに分割

In [21]:
train_df, test_df = train_test_split(df, test_size=53624, shuffle=False)

ランキング学習用にクエリデータ作成。

クエリデータは、何行ごとにクエリがまとまっているかを表している。
クエリ（ここではレース）ごとにデータが連続していなければならない。

In [22]:
train_race_id_counter = collections.Counter(list(train_df['race_id'].values))
test_race_id_counter = collections.Counter(list(test_df['race_id'].values))

In [23]:
train_query = list(train_race_id_counter.values())
test_query = list(test_race_id_counter.values())

# ランキング学習

## 目的変数を分離

In [24]:
fea_cols = ['age', 'frame_number', 'horse_number', 'jockey', 'name', 'race_id', 'race_name', 'weight',
            'clockwise', 'distance', 'field_type', 'place', 'race_round', 'weather', 'season', 'time_hour', 'gen']
target_col = ['rank']

In [25]:
X_train = train_df[fea_cols].values
y_train = train_df[target_col].values.reshape(-1)
X_test = test_df[fea_cols].values
y_test = test_df[target_col].values.reshape(-1)

## 学習

In [26]:
lgb_train = lgb.Dataset(X_train, y_train, group=train_query)
lgb_eval = lgb.Dataset(X_test, y_test, group=test_query)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3],
    'boosting_type': 'gbdt',
}

num_round = 100

In [27]:
rank_model = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    num_boost_round=num_round,
)

[1]	valid_0's ndcg@1: 0.172986	valid_0's ndcg@3: 0.265123
[2]	valid_0's ndcg@1: 0.216764	valid_0's ndcg@3: 0.312394
[3]	valid_0's ndcg@1: 0.226735	valid_0's ndcg@3: 0.322829
[4]	valid_0's ndcg@1: 0.229314	valid_0's ndcg@3: 0.325274
[5]	valid_0's ndcg@1: 0.23347	valid_0's ndcg@3: 0.328412
[6]	valid_0's ndcg@1: 0.242415	valid_0's ndcg@3: 0.334507
[7]	valid_0's ndcg@1: 0.245532	valid_0's ndcg@3: 0.337413
[8]	valid_0's ndcg@1: 0.247752	valid_0's ndcg@3: 0.341613
[9]	valid_0's ndcg@1: 0.245957	valid_0's ndcg@3: 0.340229
[10]	valid_0's ndcg@1: 0.24487	valid_0's ndcg@3: 0.340432
[11]	valid_0's ndcg@1: 0.249034	valid_0's ndcg@3: 0.341582
[12]	valid_0's ndcg@1: 0.2486	valid_0's ndcg@3: 0.345112
[13]	valid_0's ndcg@1: 0.248574	valid_0's ndcg@3: 0.345604
[14]	valid_0's ndcg@1: 0.249878	valid_0's ndcg@3: 0.346662
[15]	valid_0's ndcg@1: 0.253015	valid_0's ndcg@3: 0.349429
[16]	valid_0's ndcg@1: 0.251451	valid_0's ndcg@3: 0.348931
[17]	valid_0's ndcg@1: 0.250735	valid_0's ndcg@3: 0.348372
[18]	valid

# 分類問題

## 目的変数を分離

In [28]:
fea_cols = ['age', 'frame_number', 'horse_number', 'jockey', 'name', 'race_id', 'race_name', 'weight',
            'clockwise', 'distance', 'field_type', 'place', 'race_round', 'weather', 'season', 'time_hour', 'gen']
target_col = ['show']

In [29]:
X_train = train_df[fea_cols].values
y_train = train_df[target_col].values.reshape(-1)
X_test = test_df[fea_cols].values
y_test = test_df[target_col].values.reshape(-1)

## 学習

In [30]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
}

num_round = 100

In [31]:
cls_model = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    num_boost_round=num_round,
)

[1]	valid_0's auc: 0.569522
[2]	valid_0's auc: 0.57231
[3]	valid_0's auc: 0.574479
[4]	valid_0's auc: 0.574129
[5]	valid_0's auc: 0.574943
[6]	valid_0's auc: 0.581808
[7]	valid_0's auc: 0.581696
[8]	valid_0's auc: 0.590203
[9]	valid_0's auc: 0.590906
[10]	valid_0's auc: 0.593226
[11]	valid_0's auc: 0.594134
[12]	valid_0's auc: 0.598578
[13]	valid_0's auc: 0.600474
[14]	valid_0's auc: 0.602313
[15]	valid_0's auc: 0.603133
[16]	valid_0's auc: 0.604793
[17]	valid_0's auc: 0.605892
[18]	valid_0's auc: 0.609844
[19]	valid_0's auc: 0.613288
[20]	valid_0's auc: 0.613623
[21]	valid_0's auc: 0.614203
[22]	valid_0's auc: 0.615071
[23]	valid_0's auc: 0.617281
[24]	valid_0's auc: 0.618414
[25]	valid_0's auc: 0.619767
[26]	valid_0's auc: 0.622318
[27]	valid_0's auc: 0.622723
[28]	valid_0's auc: 0.623364
[29]	valid_0's auc: 0.625403
[30]	valid_0's auc: 0.626446
[31]	valid_0's auc: 0.627353
[32]	valid_0's auc: 0.628444
[33]	valid_0's auc: 0.628678
[34]	valid_0's auc: 0.629077
[35]	valid_0's auc: 0.63

# モデル比較

レースごとに予測された確率が最も高い馬が３位以内に入っている確率を算出する。

In [32]:
def calc_prob(predict):
  stack_q = 0
  correct = 0
  for query in test_query:
    ind = np.argmax(predict[stack_q:stack_q+query])
    stack_q += query
    if test_df.iloc[ind]['show'] == 1:
      correct += 1

  print('score is', correct / len(test_query))

In [33]:
predict_rank = rank_model.predict(X_test)
predict_cls = cls_model.predict(X_test)

In [34]:
calc_prob(predict_rank)
calc_prob(predict_cls)

score is 0.3460570469798658
score is 0.5553691275167785


# 結果の考察

- 一番人気を予測しているだけでないか

In [35]:
def calc_prob_most_popular(predict):
  stack_q = 0
  predict_most_pop = 0
  for query in test_query:
    ind_predict = np.argmax(predict[stack_q:stack_q+query])
    ind_pop = np.argmin(test_df.iloc[stack_q:stack_q+query]['popular'])
    stack_q += query
    if ind_predict == ind_pop:
      predict_most_pop += 1

  print('predict most popular prob is', predict_most_pop / len(test_query))

In [36]:
calc_prob_most_popular(predict_rank)
calc_prob_most_popular(predict_cls)

predict most popular prob is 0.026635906040268456
predict most popular prob is 0.2787332214765101


８割が一番人気を予測している。