<a href="https://colab.research.google.com/github/monda00/horse-race-notebook/blob/master/predict_show_simple_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ３着以内の馬を予測

lightGBMでシンプルなモデルを実装する。

- ライブラリ・データ読み込み
- データ前処理
- 学習
- 予測

# ライブラリ・データ読み込み

In [1]:
import numpy as np
import pandas as pd
import re

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
DATA_PATH = '/content/drive/My Drive/data/horse-race/'

In [3]:
train_df = pd.read_csv(DATA_PATH + 'train.csv')

日付でソートしておく。

後半をtestデータとするため。

In [4]:
train_df = train_df.sort_values(by=['race_date', 'race_id'])

In [5]:
train_df.head()

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_date,race_id,race_name,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather
253626,牝7,3.0,3,藤本現暉,リコーアペルタ,2.0,2019/1/1,201945010102,C3七　八,54.0,3.6,1,左,1400,ダ,川崎,2R,11:50,晴
253627,牡7,5.0,5,加藤和博,ミラクルツッキー,1.0,2019/1/1,201945010102,C3七　八,56.0,2.0,1,左,1400,ダ,川崎,2R,11:50,晴
253628,牡7,2.0,2,瀧川寿希,ロジレガシー,3.0,2019/1/1,201945010102,C3七　八,56.0,5.9,1,左,1400,ダ,川崎,2R,11:50,晴
253629,牝7,7.0,8,岡村健司,プチプチ,8.0,2019/1/1,201945010102,C3七　八,54.0,22.1,0,左,1400,ダ,川崎,2R,11:50,晴
253630,牝4,8.0,10,伊藤裕人,スエヒロドラ,4.0,2019/1/1,201945010102,C3七　八,54.0,10.3,0,左,1400,ダ,川崎,2R,11:50,晴


In [6]:
len(train_df)

253636

# データ前処理

- レースが開催されるのは週末のため日付は利用しない
  - 季節に変換
- 時刻は時間帯に変換
- ageは性別（？）と年齢に分割
- 気温は関係ありそうなため後々追加
- レース名はレースの種類に変換した方が良さそうだが後回し

開催日から季節を追加し、開催日を削除

In [7]:
season = []
for i in range(len(train_df)):
  race_date = train_df.iloc[i]['race_date']
  race_month = int(re.search(r'\/.+?\/', race_date).group().replace('/', ''))
  if 3 <= race_month <= 5:
    season.append('sprint')
  elif 6 <= race_month <= 8:
    season.append('summer')
  elif 9 <= race_month <= 11:
    season.append('autumn')
  else:
    season.append('winter')

In [8]:
train_df['season'] = season

In [9]:
train_df = train_df.drop('race_date', axis=1)

時刻を時間帯に分ける。
まずは、開催時間の種類を確認。

In [10]:
train_df['start_time'].value_counts()

13:15    4516
16:10    4410
20:50    4264
12:45    3772
15:45    3764
         ... 
12:01      15
15:51      13
13:51      12
20:25      10
14:41      10
Name: start_time, Length: 149, dtype: int64

In [11]:
train_df.sort_values(by='start_time')

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_id,race_name,weight,win,show,clockwise,distance,field_type,place,race_round,start_time,weather,season
85638,牡2,8.0,14,三浦皇成,ロードファビュラス,3.0,201906050801,2歳未勝利,55.0,8.6,0,右,1200,ダ,中山,1R,09:35,曇,winter
85641,牡2,6.0,10,嶋田純次,トマティーナ,10.0,201906050801,2歳未勝利,55.0,180.7,0,右,1200,ダ,中山,1R,09:35,曇,winter
85640,牡2,2.0,2,菅原明良,アースウルフ,9.0,201906050801,2歳未勝利,52.0,171.8,0,右,1200,ダ,中山,1R,09:35,曇,winter
85639,牡2,1.0,1,武藤雅,キタノギャラクシー,6.0,201906050801,2歳未勝利,55.0,20.9,0,右,1200,ダ,中山,1R,09:35,曇,winter
85644,牡2,7.0,13,宮崎北斗,ソレユケタケコ,11.0,201906050801,2歳未勝利,55.0,305.7,0,右,1200,ダ,中山,1R,09:35,曇,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45793,牡6,1.0,1,倉兼育康,サクラゴール,8.0,202054032210,ファイナルレース,56.0,14.4,0,右,1300,ダ,高知,10R,20:55,曇,sprint
45792,牡8,4.0,4,郷間勇太,ヴェスペルティリオ,10.0,202054032210,ファイナルレース,56.0,21.4,0,右,1300,ダ,高知,10R,20:55,曇,sprint
45791,牝5,6.0,8,濱尚美,ノーブルジャーニー,1.0,202054032210,ファイナルレース,50.0,3.7,0,右,1300,ダ,高知,10R,20:55,曇,sprint
163124,牡4,5.0,5,西川敏弘,ハタノガナール,7.0,201954071512,ファイナルレース,56.0,11.6,1,右,1300,ダ,高知,12R,20:55,曇,summer


時間の値で置き換える

09:55->9  
20:55->20

In [12]:
time_hour = []
for i in range(len(train_df)):
  start_time = train_df.iloc[i]['start_time']
  time_hour.append(int(re.search(r'(.*):(.*)', start_time).group(1)))

In [13]:
train_df['time_hour'] = time_hour

In [14]:
train_df = train_df.drop('start_time', axis=1)

ageを分ける

In [15]:
gen = []
age = []
for i in range(len(train_df)):
  age_v = train_df.iloc[i]['age']
  gen.append(re.search(r'(.*)(\d)', age_v).group(1))
  age.append(re.search(r'(.*)(\d)', age_v).group(2))

In [16]:
train_df['age'] = age
train_df['gen'] = gen

In [17]:
train_df

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_id,race_name,weight,win,show,clockwise,distance,field_type,place,race_round,weather,season,time_hour,gen
253626,7,3.0,3,藤本現暉,リコーアペルタ,2.0,201945010102,C3七　八,54.0,3.6,1,左,1400,ダ,川崎,2R,晴,winter,11,牝
253627,7,5.0,5,加藤和博,ミラクルツッキー,1.0,201945010102,C3七　八,56.0,2.0,1,左,1400,ダ,川崎,2R,晴,winter,11,牡
253628,7,2.0,2,瀧川寿希,ロジレガシー,3.0,201945010102,C3七　八,56.0,5.9,1,左,1400,ダ,川崎,2R,晴,winter,11,牡
253629,7,7.0,8,岡村健司,プチプチ,8.0,201945010102,C3七　八,54.0,22.1,0,左,1400,ダ,川崎,2R,晴,winter,11,牝
253630,4,8.0,10,伊藤裕人,スエヒロドラ,4.0,201945010102,C3七　八,54.0,10.3,0,左,1400,ダ,川崎,2R,晴,winter,11,牝
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6709,5,8.0,10,藤原良一,パートカラー,9.0,202048060911,おおぐま座特別,54.0,66.4,0,右,1600,ダ,名古屋,11R,晴,summer,17,牝
6710,6,1.0,1,浅野皓大,フラワーイレブン,4.0,202048060911,おおぐま座特別,51.0,8.2,0,右,1600,ダ,名古屋,11R,晴,summer,17,牝
6711,5,4.0,4,加藤聡一,テイエムヨハネス,3.0,202048060911,おおぐま座特別,56.0,6.2,0,右,1600,ダ,名古屋,11R,晴,summer,17,牡
6712,4,8.0,9,友森翔太,メモリーバリケード,2.0,202048060911,おおぐま座特別,55.0,3.2,0,右,1600,ダ,名古屋,11R,晴,summer,17,牝


In [18]:
for col, values in train_df.iteritems():
    num_uniques = values.nunique()
    print ('{name}: {num_unique}'.format(name=col, num_unique=num_uniques))

age: 10
frame_number: 8
horse_number: 18
jockey: 450
name: 23752
popular: 18
race_id: 22630
race_name: 7082
weight: 26
win: 5403
show: 2
clockwise: 11
distance: 54
field_type: 3
place: 24
race_round: 12
weather: 6
season: 4
time_hour: 12
gen: 6


## Label Encoding

一旦、単純なLabel Encodingをする。

In [19]:
categorical_cols = ['jockey', 'name', 'race_name', 'clockwise', 'field_type', 'place', 'race_round', 'weather', 'season', 'gen']

In [20]:
for c in categorical_cols:
  le = LabelEncoder()
  le.fit(train_df[c])
  train_df[c] = le.transform(train_df[c])

In [21]:
train_df

Unnamed: 0,age,frame_number,horse_number,jockey,name,popular,race_id,race_name,weight,win,show,clockwise,distance,field_type,place,race_round,weather,season,time_hour,gen
253626,7,3.0,3,380,21625,2.0,201945010102,763,54.0,3.6,1,3,1400,0,10,4,2,3,11,2
253627,7,5.0,5,66,19528,1.0,201945010102,763,56.0,2.0,1,3,1400,0,10,4,2,3,11,4
253628,7,2.0,2,307,22903,3.0,201945010102,763,56.0,5.9,1,3,1400,0,10,4,2,3,11,4
253629,7,7.0,8,179,17138,8.0,201945010102,763,54.0,22.1,0,3,1400,0,10,4,2,3,11,2
253630,4,8.0,10,51,9310,4.0,201945010102,763,54.0,10.3,0,3,1400,0,10,4,2,3,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6709,5,8.0,10,376,15231,9.0,202048060911,1119,54.0,66.4,0,0,1600,0,5,1,2,2,17,2
6710,6,1.0,1,297,16492,4.0,202048060911,1119,51.0,8.2,0,0,1600,0,5,1,2,2,17,2
6711,5,4.0,4,68,11967,3.0,202048060911,1119,56.0,6.2,0,0,1600,0,5,1,2,2,17,4
6712,4,8.0,9,81,20226,2.0,202048060911,1119,55.0,3.2,0,0,1600,0,5,1,2,2,17,2


## trainデータとtestデータに分割

In [22]:
train_df, test_df = train_test_split(train_df, test_size=53624, shuffle=False)

## 目的変数を分離

In [23]:
fea_cols = ['age', 'frame_number', 'horse_number', 'jockey', 'name', 'popular', 'race_id', 'race_name', 'weight',
            'win', 'clockwise', 'distance', 'field_type', 'place', 'race_round', 'weather', 'season', 'time_hour', 'gen']
target_col = ['show']

In [24]:
X_train = train_df[fea_cols].values
y_train = train_df[target_col].values.reshape(-1)
X_test = test_df[fea_cols].values
y_test = test_df[target_col].values.reshape(-1)

# 学習

In [36]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test)

params = {
    'objective': 'binary'
}

num_round = 100

In [37]:
model = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    num_boost_round=num_round,
)

[1]	valid_0's binary_logloss: 0.555847
[2]	valid_0's binary_logloss: 0.536929
[3]	valid_0's binary_logloss: 0.521971
[4]	valid_0's binary_logloss: 0.509824
[5]	valid_0's binary_logloss: 0.499889
[6]	valid_0's binary_logloss: 0.49169
[7]	valid_0's binary_logloss: 0.484875
[8]	valid_0's binary_logloss: 0.479181
[9]	valid_0's binary_logloss: 0.474473
[10]	valid_0's binary_logloss: 0.470534
[11]	valid_0's binary_logloss: 0.467191
[12]	valid_0's binary_logloss: 0.464343
[13]	valid_0's binary_logloss: 0.461989
[14]	valid_0's binary_logloss: 0.459998
[15]	valid_0's binary_logloss: 0.458317
[16]	valid_0's binary_logloss: 0.456853
[17]	valid_0's binary_logloss: 0.455602
[18]	valid_0's binary_logloss: 0.45457
[19]	valid_0's binary_logloss: 0.453686
[20]	valid_0's binary_logloss: 0.452952
[21]	valid_0's binary_logloss: 0.452308
[22]	valid_0's binary_logloss: 0.451815
[23]	valid_0's binary_logloss: 0.451373
[24]	valid_0's binary_logloss: 0.451013
[25]	valid_0's binary_logloss: 0.450684
[26]	valid_

# 予測

In [38]:
model.predict(X_test)[:50]

array([0.48269271, 0.22870436, 0.62027229, 0.46297769, 0.11662011,
       0.13381989, 0.16981999, 0.50018671, 0.41577608, 0.38194714,
       0.55210821, 0.27859828, 0.85974418, 0.48969178, 0.06053294,
       0.05604194, 0.21437513, 0.87102017, 0.25329686, 0.27993838,
       0.05785994, 0.65947584, 0.4097252 , 0.04228331, 0.14235622,
       0.01390418, 0.5737584 , 0.17071261, 0.52345538, 0.56807732,
       0.60394902, 0.39303751, 0.06379051, 0.19771734, 0.0454317 ,
       0.59267901, 0.69822861, 0.35673175, 0.23038493, 0.39926073,
       0.17573341, 0.10232016, 0.69409941, 0.04142301, 0.79038596,
       0.60541031, 0.42797009, 0.16329495, 0.08715936, 0.14620668])

In [39]:
y_test[:50]

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0])