# 競馬予想モデル構築
オッズ利用

## データロード

In [301]:
import sqlite3
from google.colab import drive
# Connect Google Drive
drive.mount('/content/drive')
# Load Database File
conn = sqlite3.connect('/content/drive/My Drive/Colab Notebooks/keiba/horse_2021.db')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [302]:
import pandas as pd

sql = '''
SELECT
  t1.entry_id as entry_id,
  t1.race_id as race_id,
  t2.rank as rank,
  t1.bracket as bracket,
  t1.horse_number as horse_number,
  t1.gender as gender,
  t1.age as age,
  t1.burden as burden,
  t1.weight as weight,
  t1.weight_diff as weight_diff,
  t3.tan as tan,
  t3.fuku_min as fuku_min,
  t3.fuku_max as fuku_max
FROM
  entry t1
  LEFT JOIN result t2 on (t1.entry_id = t2.result_id) 
  LEFT JOIN odds t3 on (t1.entry_id = t3.odds_id) 
WHERE
  t1.race_id like "2021%"
'''

db = pd.read_sql(sql, conn, index_col="entry_id")
## 順位（Rank）が空白値をNaNに変換して削除した上で数値型に変更
db[db["rank"] == ""] = None
db = db.dropna(subset=["rank"])
db = db.astype({'rank': int})
db.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47476 entries, 20210601010101 to 20210906091216
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   race_id       47476 non-null  object 
 1   rank          47476 non-null  int64  
 2   bracket       47476 non-null  object 
 3   horse_number  47476 non-null  object 
 4   gender        47476 non-null  object 
 5   age           47476 non-null  float64
 6   burden        47476 non-null  float64
 7   weight        47476 non-null  float64
 8   weight_diff   47476 non-null  float64
 9   tan           47476 non-null  float64
 10  fuku_min      47476 non-null  float64
 11  fuku_max      47476 non-null  float64
dtypes: float64(7), int64(1), object(4)
memory usage: 4.7+ MB


## データの準備： 目的変数の生成

In [303]:
## 目的変数の作成
rk = db.copy()
result = rk["rank"].apply(lambda x: 0 if int(x)>=4 else 1)
result.head()
# result.value_counts(dropna=False)

entry_id
20210601010101    0
20210601010102    0
20210601010103    1
20210601010104    0
20210601010105    0
Name: rank, dtype: int64

## データの準備： ダミー変数の生成

In [304]:
## ダミー変数化
gender = pd.get_dummies(db, columns=["gender"])
gender = gender[["gender_セ", "gender_牝", "gender_牡"]]
gender = gender.rename(columns={"gender_セ": "gender_gelding", "gender_牝": "gender_female", "gender_牡": "gender_male"})

# df["gender"].value_counts()
# gender = df['gender'].map({'牡': 0, 'セ': 1, '牝': 2})
gender.head()

Unnamed: 0_level_0,gender_gelding,gender_female,gender_male
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20210601010101,0,1,0
20210601010102,0,0,1
20210601010103,0,0,1
20210601010104,0,0,1
20210601010105,0,1,0


## データの準備： 標準化

In [305]:
## 標準化変数の選択
std = db[["race_id", "age", "burden", "weight", "tan", "fuku_min", "fuku_max"]]
## データの標準化
standardization = lambda x: (x - x.mean()) / x.std()
std = std.groupby("race_id").transform(standardization)

# 分散がゼロ（馬齢がレース内ですべて同じ）の場合はNaNになるため、ゼロ埋め処理
std = std.fillna(0)
std.head()

# # データ確認用
# std.describe().apply(lambda x: format(x, 'f'))

Unnamed: 0_level_0,age,burden,weight,tan,fuku_min,fuku_max
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20210601010101,0.0,-0.344729,-1.427638,0.085046,-0.028044,-0.025133
20210601010102,0.0,0.268122,0.131326,0.127179,0.111209,0.113165
20210601010103,0.0,0.268122,1.08026,-0.612488,-0.585058,-0.58258
20210601010104,0.0,-0.95758,-1.359857,0.1935,0.008059,0.011037
20210601010105,0.0,-0.344729,-0.139798,-0.677248,-0.657264,-0.663431


## データの準備：データセットの作成

In [306]:
## データセットの結合
df = pd.merge(result, gender, left_index=True, right_index=True)
df = pd.merge(df, std, left_index=True, right_index=True)
df = pd.merge(df, db["weight_diff"], left_index=True, right_index=True)
df.head(10)

Unnamed: 0_level_0,rank,gender_gelding,gender_female,gender_male,age,burden,weight,tan,fuku_min,fuku_max,weight_diff
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20210601010101,0,0,1,0,0.0,-0.344729,-1.427638,0.085046,-0.028044,-0.025133,-14.0
20210601010102,0,0,0,1,0.0,0.268122,0.131326,0.127179,0.111209,0.113165,6.0
20210601010103,1,0,0,1,0.0,0.268122,1.08026,-0.612488,-0.585058,-0.58258,-4.0
20210601010104,0,0,0,1,0.0,-0.95758,-1.359857,0.1935,0.008059,0.011037,6.0
20210601010105,0,0,1,0,0.0,-0.344729,-0.139798,-0.677248,-0.657264,-0.663431,0.0
20210601010106,0,0,0,1,0.0,0.880974,0.538012,-0.394021,-0.430332,-0.429388,4.0
20210601010107,0,0,1,0,0.0,-1.570432,-0.614266,0.694414,1.034408,1.038697,-12.0
20210601010108,0,0,0,1,0.0,0.880974,-0.343141,-0.48843,-0.45612,-0.45492,-2.0
20210601010109,0,1,0,0,0.0,0.880974,0.944698,-0.578938,-0.528325,-0.525133,16.0
20210601010110,1,0,0,1,0.0,0.880974,0.334669,-0.704557,-0.667579,-0.684707,8.0


In [307]:
df.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,rank,gender_gelding,gender_female,gender_male,age,burden,weight,tan,fuku_min,fuku_max,weight_diff
count,47476.0,47476.0,47476.0,47476.0,47476.0,47476.0,47476.0,47476.0,47476.0,47476.0,47476.0
mean,0.21855,0.05556,0.41238,0.53206,0.0,0.0,0.0,0.0,-0.0,-0.0,0.39256
std,0.41327,0.22908,0.49227,0.49898,0.68219,0.94401,0.96293,0.96293,0.96293,0.96293,6.33797
min,0.0,0.0,0.0,0.0,-2.42838,-4.00694,-3.71568,-1.39677,-1.35024,-1.39681,-42.0
25%,0.0,0.0,0.0,0.0,-0.28569,-0.51903,-0.69542,-0.65867,-0.63574,-0.63673,-4.0
50%,0.0,0.0,0.0,1.0,0.0,0.2921,-0.01205,-0.4125,-0.40518,-0.40308,0.0
75%,0.0,0.0,1.0,1.0,0.0,0.64161,0.67806,0.35295,0.27959,0.2872,4.0
max,1.0,1.0,1.0,1.0,3.88057,4.00694,3.08449,3.75425,3.86215,3.85783,66.0


## モデルの作成と評価（１）
ロジスティック回帰、未チューニング

In [308]:
# ライブラリのインポート
from sklearn.model_selection import train_test_split

# 学習用と検証用に分割
X = df.drop("rank", axis=1)
y = df["rank"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [309]:
# ライブラリのインポート
from sklearn.linear_model import LogisticRegression

# モデル構築
model = LogisticRegression(max_iter=1500)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1500)

In [310]:
# ライブラリのインポート
from sklearn.metrics import precision_score, confusion_matrix

# 予測値算出
y_pred = model.predict(X_test)

# スコアの計算
score = precision_score(y_test,y_pred)

# 混合行列
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print({"モデル": "ロジスティクス回帰", "精度": score, "TN": tn, "FP": fp, "FN": fn, "TP": tp})

{'モデル': 'ロジスティクス回帰', '精度': 0.6559766763848397, 'TN': 10988, 'FP': 118, 'FN': 2912, 'TP': 225}


In [311]:
## ライブラリのインポート
import numpy as np

## 予測結果と元データを結合
pred = pd.DataFrame(y_pred, index=y_test.index, columns=["predict"])
pred = pd.merge(db, pred, left_index=True, right_index=True, sort=True)
## 掛け金と払戻金を付与
pred["bet"] = pred["predict"].mask(pred["predict"] == 1, 100)
pred["return"] = np.where(pred["rank"] <= 3, pred["bet"] * pred["fuku_min"], 0)
bet = pred["bet"].sum()
rtn = pred["return"].sum()

print({"モデル": "ロジスティクス回帰", "掛け金": bet, "払戻金": rtn, "収支": rtn - bet, "払戻率": '{:.2%}'.format((rtn/bet))})

{'モデル': 'ロジスティクス回帰', '掛け金': 34300, '払戻金': 27250.0, '収支': -7050.0, '払戻率': '79.45%'}


## モデルの作成と評価（２）
ロジスティック回帰、アンダーサンプリング

In [312]:
# ライブラリのインポート
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# アンダーサンプリング
rs = RandomUnderSampler(random_state=42)
X, y = rs.fit_resample(df.drop("rank", axis=1), df["rank"])
# インデックス付与
X.index = df.index[rs.sample_indices_]
y.index = df.index[rs.sample_indices_]

# 学習用と検証用に分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
# y.value_counts()

In [313]:
# ライブラリのインポート
from sklearn.linear_model import LogisticRegression

# モデル構築
model = LogisticRegression(max_iter=1500)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1500)

In [314]:
# ライブラリのインポート
from sklearn.metrics import precision_score, confusion_matrix

# 予測値算出
y_pred = model.predict(X_test)

# スコアの計算
score = precision_score(y_test,y_pred)

# 混合行列
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print({"モデル": "ロジスティクス回帰（アンダーサンプル）", "精度": score, "TN": tn, "FP": fp, "FN": fn, "TP": tp})

{'モデル': 'ロジスティクス回帰（アンダーサンプル）', '精度': 0.654833836858006, 'TN': 1771, 'FP': 1371, 'FN': 483, 'TP': 2601}


In [315]:
## ライブラリのインポート
import numpy as np

## 予測結果と元データを結合
pred = pd.DataFrame(y_pred, index=y_test.index, columns=["predict"])
pred = pd.merge(db, pred, left_index=True, right_index=True, sort=True)

## 掛け金と払戻金を付与
pred["bet"] = pred["predict"].mask(pred["predict"] == 1, 100)
pred["return"] = np.where(pred["rank"] <= 3, pred["bet"] * pred["fuku_min"], 0)
bet = pred["bet"].sum()
rtn = pred["return"].sum()

print({"モデル": "ロジスティクス回帰（アンダーサンプル）", "掛け金": bet, "払戻金": rtn, "収支": rtn - bet, "払戻率": '{:.2%}'.format((rtn/bet))})

{'モデル': 'ロジスティクス回帰（アンダーサンプル）', '掛け金': 397200, '払戻金': 504170.0, '収支': 106970.0, '払戻率': '126.93%'}


## モデルの作成と評価（３）
LightGBM、アンダーサンプリング

In [316]:
# ライブラリのインポート
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# アンダーサンプリング
rs = RandomUnderSampler(random_state=42)
X, y = rs.fit_resample(df.drop("rank", axis=1), df["rank"])
# インデックス付与
X.index = df.index[rs.sample_indices_]
y.index = df.index[rs.sample_indices_]

# 学習用と検証用に分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [317]:
# ライブラリのインポート
import lightgbm as lgb

# モデル構築
# max_depth=-1は無制限を意味する
model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=-1)
model.fit(X, y)

LGBMClassifier()

In [318]:
# ライブラリのインポート
from sklearn.metrics import precision_score, confusion_matrix

# 予測値算出
y_pred = model.predict(X_test)

# スコアの計算
score = precision_score(y_test,y_pred)

# 混合行列
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print({"モデル": "LightGBM（アンダーサンプル）", "精度": score, "TN": tn, "FP": fp, "FN": fn, "TP": tp})

{'モデル': 'LightGBM（アンダーサンプル）', '精度': 0.719645494830133, 'TN': 2193, 'FP': 949, 'FN': 648, 'TP': 2436}


In [319]:
## ライブラリのインポート
import numpy as np

## 予測結果と元データを結合
pred = pd.DataFrame(y_pred, index=y_test.index, columns=["predict"])
pred = pd.merge(db, pred, left_index=True, right_index=True, sort=True)
## 掛け金と払戻金を付与
pred["bet"] = pred["predict"].mask(pred["predict"] == 1, 100)
pred["return"] = np.where(pred["rank"] <= 3, pred["bet"] * pred["fuku_min"], 0)
bet = pred["bet"].sum()
rtn = pred["return"].sum()

print({"モデル": "LightGBM（アンダーサンプル）", "掛け金": bet, "払戻金": rtn, "収支": rtn - bet, "払戻率": '{:.2%}'.format((rtn/bet))})

{'モデル': 'LightGBM（アンダーサンプル）', '掛け金': 338500, '払戻金': 442380.0, '収支': 103880.0, '払戻率': '130.69%'}


# 実データ検証
----

## 実データ出力/データ加工

In [320]:
import pandas as pd

# 2022年6月のデータ抽出
sql = '''
SELECT
  t1.entry_id as entry_id,
  t1.race_id as race_id,
  t2.rank as rank,
  t1.bracket as bracket,
  t1.horse_number as horse_number,
  t1.gender as gender,
  t1.age as age,
  t1.burden as burden,
  t1.weight as weight,
  t1.weight_diff as weight_diff,
  t3.tan as tan,
  t3.fuku_min as fuku_min,
  t3.fuku_max as fuku_max
FROM
  entry t1
  LEFT JOIN result t2 on (t1.entry_id = t2.result_id) 
  LEFT JOIN odds t3 on (t1.entry_id = t3.odds_id) 
  LEFT JOIN race t4 on (t1.race_id = t4.race_id)
WHERE
  t4.race_date like "202206%"
'''

db = pd.read_sql(sql, conn, index_col="entry_id")
## 順位（Rank）が空白値をNaNに変換して削除した上で数値型に変更
db[db["rank"] == ""] = None
db = db.dropna(subset=["rank"])
db = db.astype({'rank': int})
db.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3520 entries, 20220503010101 to 20220903041216
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   race_id       3520 non-null   object 
 1   rank          3520 non-null   int64  
 2   bracket       3520 non-null   object 
 3   horse_number  3520 non-null   object 
 4   gender        3520 non-null   object 
 5   age           3520 non-null   float64
 6   burden        3520 non-null   float64
 7   weight        3520 non-null   float64
 8   weight_diff   3520 non-null   float64
 9   tan           3520 non-null   float64
 10  fuku_min      3520 non-null   float64
 11  fuku_max      3520 non-null   float64
dtypes: float64(7), int64(1), object(4)
memory usage: 357.5+ KB


In [321]:
## ダミー変数化
gender = pd.get_dummies(db, columns=["gender"])
gender = gender[["gender_セ", "gender_牝", "gender_牡"]]
gender = gender.rename(columns={"gender_セ": "gender_gelding", "gender_牝": "gender_female", "gender_牡": "gender_male"})

## 標準化変数の選択
std = db[["race_id", "age", "burden", "weight", "tan", "fuku_min", "fuku_max"]]
## データの標準化
standardization = lambda x: (x - x.mean()) / x.std()
std = std.groupby("race_id").transform(standardization)

# 分散がゼロ（馬齢がレース内ですべて同じ）の場合はNaNになるため、ゼロ埋め処理
std = std.fillna(0)

## データセットの結合
df = pd.merge(gender, std, left_index=True, right_index=True)
df = pd.merge(df, db["weight_diff"], left_index=True, right_index=True)
df.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,gender_gelding,gender_female,gender_male,age,burden,weight,tan,fuku_min,fuku_max,weight_diff
count,3520.0,3520.0,3520.0,3520.0,3520.0,3520.0,3520.0,3520.0,3520.0,3520.0
mean,0.05312,0.41108,0.5358,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.12273
std,0.22431,0.4921,0.49879,0.69033,0.94867,0.96191,0.96191,0.96191,0.96191,5.89732
min,0.0,0.0,0.0,-2.0702,-3.75,-3.70818,-1.65949,-1.33276,-1.50485,-30.0
25%,0.0,0.0,0.0,-0.3078,-0.47754,-0.68679,-0.67154,-0.65214,-0.65445,-4.0
50%,0.0,0.0,1.0,0.0,0.29509,-0.00918,-0.41446,-0.40405,-0.39973,0.0
75%,0.0,1.0,1.0,0.0,0.63057,0.68801,0.3948,0.34902,0.34965,4.0
max,1.0,1.0,1.0,3.14106,3.17543,2.76489,3.51614,3.54998,3.54761,34.0


## 結果予測

In [322]:
# 予測値算出
y_pred = model.predict(df)

## 評価

In [323]:
## ライブラリのインポート
import numpy as np

## 予測結果と元データを結合
pred = pd.DataFrame(y_pred, index=df.index, columns=["predict"])
pred = pd.merge(db, pred, left_index=True, right_index=True, sort=True)
## 掛け金と払戻金を付与
pred["bet"] = pred["predict"].mask(pred["predict"] == 1, 100)
pred["return"] = np.where(pred["rank"] <= 3, pred["bet"] * pred["fuku_min"], 0)
bet = pred["bet"].sum()
rtn = pred["return"].sum()

print({"モデル": "LightGBM（アンダーサンプル）", "掛け金": bet, "払戻金": rtn, "収支": rtn - bet, "払戻率": '{:.2%}'.format((rtn/bet))})

{'モデル': 'LightGBM（アンダーサンプル）', '掛け金': 154900, '払戻金': 107610.0, '収支': -47290.0, '払戻率': '69.47%'}
