## データ収集

In [None]:
# !pip install lxml
# !pip install html5lib
# !pip install bs4
# !pip install imblearn

In [None]:
import pandas as pd

In [None]:
# # サンプル確認
# url = 'https://db.netkeiba.com/race/202009030811/'

# print(type(pd.read_html(url)[0]))
# # -> df

# pd.read_html(url)[0]
# # -> 	着順	枠番	馬番	馬名	性齢	斤量	騎手	タイム	着差	単勝	人気	馬体重	調教師

In [None]:
import time

def scrape_race_results(race_id_list: list)-> dict:
    """
    Scrape netkeiba.com result tables for the given race IDs.

    Each page ``https://db.netkeiba.com/race/<race_id>`` is parsed with
    ``pd.read_html`` and its first table is stored under the race ID, e.g.
    race_results['201901010101']
    -> df 着順	枠番	馬番	馬名	性齢	斤量	騎手	タイム	着差	単勝	人気	馬体重	調教師

    Args:
        race_id_list: Race ID strings to fetch.

    Returns:
        Dict mapping race ID -> result DataFrame. A race whose lookup raises
        IndexError is skipped. On any other error, or on a manual interrupt,
        scraping stops and the results gathered so far are returned; the
        reason is printed instead of being silently discarded.
    """
    race_results_dict = {}
    for race_id in race_id_list:
        try:
            url = 'https://db.netkeiba.com/race/' + race_id
            tables = pd.read_html(url)
            race_results_dict[race_id] = tables[0]
        except IndexError:
            # No table for this race ID: skip it, but still throttle below
            # because a request was sent.
            pass
        except KeyboardInterrupt:
            # Keep what has been scraped so far when the user interrupts.
            print(f'scrape_race_results: interrupted at race_id={race_id!r}; returning partial results')
            break
        except Exception as exc:
            # Best effort: stop here, but say why.
            print(f'scrape_race_results: stopped at race_id={race_id!r}: {exc!r}')
            break
        # Be polite to the server between consecutive requests.
        time.sleep(0.1)
    return race_results_dict

In [None]:
# Generate race IDs (they follow a regular pattern):
# '2019' + place + kai + day + r, each component zero-padded to 2 digits
# TODO: restricted to 2019
race_id_list = [
    f'2019{place:02d}{kai:02d}{day:02d}{r:02d}'
    for place in range(1, 11)
    for kai in range(1, 6)
    for day in range(1, 9)
    for r in range(1, 13)
]
# -> ['201901010101', '201901010102', ,,, ]

In [None]:
# TODO: only the first 10 race IDs are scraped for now
race_results_dict = scrape_race_results(race_id_list[0:10])

In [None]:
# Label every row of each result table with its race ID so the combined
# table stays readable
for key, frame in race_results_dict.items():
    frame.index = [key] * len(frame)

# Stack the per-race result DataFrames into a single DataFrame
race_results_df = pd.concat(list(race_results_dict.values()), sort=False)

In [None]:
# Persist the combined raw results; the preprocessing section reloads this file
race_results_df.to_pickle('../../../data/raw/race_results_df.pickle')

## 前処理

In [None]:
import pandas as pd

In [None]:
# Reload the raw race results from the pickle written by the data-collection section
race_results_df = pd.read_pickle('../../../data/raw/race_results_df.pickle')

In [None]:
def preprocess_netkeiba_past(race_results_df):
    """
    Turn scraped past race results into a numeric feature table for training.

    Args:
        race_results_df: DataFrame of race results containing at least the
            columns 着順, 性齢, 馬体重, 単勝, タイム, 着差, 調教師, 馬名 and 騎手.

    Returns:
        A new DataFrame (the input is left untouched) in which
        - rows whose 着順 contains a non-digit character are removed,
        - 性齢 is split into 性 (first character) and 年齢 (the rest, as int),
        - 馬体重 of the form "W(D)" is split into 体重 (W) and 体重変化 (D),
          both as int,
        - 単勝 is converted to float,
        - ``rank`` is 着順 capped at 4 (4th place and below share one class),
        - 着順, 馬名, 騎手, タイム, 着差, 調教師, 性齢 and 馬体重 are dropped,
        - the remaining non-numeric columns are one-hot encoded with
          ``pd.get_dummies``.
    """
    # Data shaping
    # Raw string: '\D' in a plain literal is an invalid escape sequence.
    has_non_digit = race_results_df['着順'].astype(str).str.contains(r'\D')
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the caller's data.
    df = race_results_df[~has_non_digit].copy()

    df['着順'] = df['着順'].astype(int)
    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)
    # Split "W(D)" once: part 0 is the weight, part 1 is "D)".
    weight_parts = df['馬体重'].str.split('(', expand=True)
    df['体重'] = weight_parts[0].astype(int)
    df['体重変化'] = weight_parts[1].str[:-1].astype(int)
    df['単勝'] = df['単勝'].astype(float)

    df = df.drop(['タイム', '着差', '調教師', '性齢', '馬体重'], axis=1)

    # Merge 4th place and below into a single class
    df['rank'] = df['着順'].clip(upper=4)

    # Drop the target source and the high-cardinality name columns, then
    # one-hot encode what is left
    df = df.drop(['着順', '馬名', '騎手'], axis=1)
    df = pd.get_dummies(df)

    return df

In [None]:
# Run the past-results preprocessing on the loaded raw race results
race_results_df_processed = preprocess_netkeiba_past(race_results_df)

In [None]:
# Persist the processed table; the training section reloads this file
race_results_df_processed.to_pickle('../../../data/processed/race_results_df_processed.pickle')

## 学習

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the processed table (feature columns plus the `rank` target column)
race_results_df_processed = pd.read_pickle('../../../data/processed/race_results_df_processed.pickle')

In [None]:
# Target variable: the `rank` column
y = race_results_df_processed['rank']
# Explanatory variables: every column except `rank`
X = race_results_df_processed.drop(columns=['rank'])

# Split into train and test (70/30, stratified on the target, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
# 下記で学習もできるが、今回はunder samplingを追加するのでコメントアウトする
# # 学習
# model = LogisticRegression()
# model.fit(X_train,y_train)
# print('train score: ' + str(model.score(X_train,y_train)))
# print('test score: ' + str(model.score(X_test,y_test)))

# # テストデータでの予測結果を取得し、出力する
# y_pred = model.predict(X_test)
# print(pd.DataFrame({'pred':y_pred,'actual':y_test}))

In [None]:
# ランダムアンダーサンプリング
from imblearn.under_sampling import RandomUnderSampler

# Count the classes once and reuse the result for ranks 1-3
rank_counts = y_train.value_counts()
cnt_rank_1 = rank_counts[1]
cnt_rank_2 = rank_counts[2]
cnt_rank_3 = rank_counts[3]

# Keep every rank 1-3 sample and shrink class 4 down to the size of class 1
rus = RandomUnderSampler(
    sampling_strategy={1: cnt_rank_1, 2: cnt_rank_2, 3: cnt_rank_3, 4: cnt_rank_1},
    random_state=71,
)

# fit_resample is the supported API; the older fit_sample alias was
# deprecated and is no longer available in current imbalanced-learn releases.
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [None]:
# Training: fit on the under-sampled training set, then report accuracy on
# the original (not resampled) train split and on the test split
model = LogisticRegression()
model.fit(X_train_rus, y_train_rus)
print(f'train score: {model.score(X_train, y_train)}')
print(f'test score: {model.score(X_test, y_test)}')

In [None]:
# Predict on the test split and print predictions next to the actual ranks
y_pred = model.predict(X_test)
print(pd.DataFrame({'pred':y_pred,'actual':y_test}))

In [None]:
import pickle
# Save the trained model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('../../../data/models/14_scraping-netkeiba_preprocess_train-lr_valid_predict.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Verify the saved model loads and scores the test split
# (the context manager closes the file handle after loading)
with open('../../../data/models/14_scraping-netkeiba_preprocess_train-lr_valid_predict.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)

## 検証

In [None]:
import random

In [None]:
# Prepare the validation data

race_results_df_processed_valid = pd.read_pickle('../../../data/processed/race_results_df_processed_valid_20200722.pickle')

# Target variable: the `rank` column
y_valid = race_results_df_processed_valid['rank']
# Explanatory variables: every column except `rank`
X_valid = race_results_df_processed_valid.drop(columns=['rank'])

In [None]:
# Run inference
# NOTE: unpickling can execute arbitrary code; only load model files that
# were produced by this notebook.
with open('../../../data/models/14_scraping-netkeiba_preprocess_train-lr_valid_predict.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

y_valid_pred = model.predict(X_valid)

# Reshape for aggregation
valid_results_df = pd.DataFrame({'pred': y_valid_pred, 'actual': y_valid})
# Distinct index labels (presumably race IDs - confirm against the validation pickle)
race_id_list = list(set(valid_results_df.index))
# Each row becomes [index label, pred, actual]
valid_results_list = valid_results_df.reset_index().values.tolist()
# Shuffle so that row order within a race is random
random.shuffle(valid_results_list)

In [None]:
# # 集計（1位正解率）
# correct_count = 0
# for race_id in race_id_list:
#     pred_1_cnt_by_race = 0
#     for i in range(len(valid_results_list)):
#         # 対象レースidのうち、一位と予測された馬
#         if valid_results_list[i][0] == race_id and valid_results_list[i][1] == 1:
#             pred_1_cnt_by_race += 1
#             # 対象レースidのうち一位と予測された馬が一つ目で、かつ結果も1位の場合
#             if pred_1_cnt_by_race == 1 and valid_results_list[i][2] == 1:
#                 correct_count += 1
# print('rank1_acc: ' + str(correct_count/100))

In [None]:
# # 集計（1-3位正解率）
# correct_count = 0
# for race_id in race_id_list:
#     pred_3_cnt_by_race = 0
#     for rank in [1, 2, 3]:
#         for i in range(len(valid_results_list)):
#             # 対象レースidのうち、{rank}位と予測された馬
#             if valid_results_list[i][0] == race_id and valid_results_list[i][1] == rank:
#                 pred_3_cnt_by_race += 1
#                 # 対象レースidのうち一位と予測された馬が一つ目で、かつ結果も1位の場合
#                 if pred_3_cnt_by_race <= 3 and valid_results_list[i][2] == 1 or valid_results_list[i][2] == 2 or valid_results_list[i][2] == 3:
#                     correct_count += 1
# print('rank3_acc: ' + str(correct_count/300))

In [None]:
# Aggregate (exacta)
# A race is a hit when the first horse (in shuffled row order) predicted as
# rank 1 actually finished 1st.

# Bucket the rows by race once, preserving row order, so each race only
# looks at its own rows instead of rescanning the whole list.
rows_by_race = {}
for row in valid_results_list:
    rows_by_race.setdefault(row[0], []).append(row)

target_ranks = [1]
correct_count = 0
for race_id in race_id_list:
    race_rows = rows_by_race.get(race_id, [])
    # Actual results of the horses predicted at each target rank (rank by
    # rank); only the first len(target_ranks) of them count as picks.
    picked_actuals = [
        row[2]
        for rank in target_ranks
        for row in race_rows
        if row[1] == rank
    ][:len(target_ranks)]
    all_picks_hit = all(actual in target_ranks for actual in picked_actuals)
    if len(picked_actuals) == len(target_ranks) and all_picks_hit:
        correct_count += 1

# Divide by the actual number of races rather than a fixed 100
race_count = len(race_id_list)
print('acc_exacta_1: ' + str(correct_count / race_count if race_count else 0.0))

In [None]:
# Aggregate (quinella)
# A race is a hit when the first two picks (rank-1 predictions first, then
# rank-2 predictions, each in shuffled row order) both actually finished in
# the top 2.

# Bucket the rows by race once, preserving row order, so each race only
# looks at its own rows instead of rescanning the whole list.
rows_by_race = {}
for row in valid_results_list:
    rows_by_race.setdefault(row[0], []).append(row)

target_ranks = [1, 2]
correct_count = 0
for race_id in race_id_list:
    race_rows = rows_by_race.get(race_id, [])
    # Actual results of the horses predicted at each target rank (rank by
    # rank); only the first len(target_ranks) of them count as picks.
    picked_actuals = [
        row[2]
        for rank in target_ranks
        for row in race_rows
        if row[1] == rank
    ][:len(target_ranks)]
    all_picks_hit = all(actual in target_ranks for actual in picked_actuals)
    if len(picked_actuals) == len(target_ranks) and all_picks_hit:
        correct_count += 1

# Divide by the actual number of races rather than a fixed 100
race_count = len(race_id_list)
print('acc_quinella_2: ' + str(correct_count / race_count if race_count else 0.0))

In [None]:
# Aggregate (trio)
# A race is a hit when the first three picks (rank-1 predictions first, then
# rank-2, then rank-3 predictions, each in shuffled row order) all actually
# finished in the top 3.

# Bucket the rows by race once, preserving row order, so each race only
# looks at its own rows instead of rescanning the whole list.
rows_by_race = {}
for row in valid_results_list:
    rows_by_race.setdefault(row[0], []).append(row)

target_ranks = [1, 2, 3]
correct_count = 0
for race_id in race_id_list:
    race_rows = rows_by_race.get(race_id, [])
    # Actual results of the horses predicted at each target rank (rank by
    # rank); only the first len(target_ranks) of them count as picks.
    picked_actuals = [
        row[2]
        for rank in target_ranks
        for row in race_rows
        if row[1] == rank
    ][:len(target_ranks)]
    all_picks_hit = all(actual in target_ranks for actual in picked_actuals)
    if len(picked_actuals) == len(target_ranks) and all_picks_hit:
        correct_count += 1

# Divide by the actual number of races rather than a fixed 100
race_count = len(race_id_list)
print('acc_trio_3: ' + str(correct_count / race_count if race_count else 0.0))

## 推論

In [None]:
# seleniumによるクローリングの準備
# # !wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# !dpkg -i google-chrome-stable_current_amd64.deb
# !apt update
# !apt -f install -y
# !dpkg -i google-chrome-stable_current_amd64.deb
# !apt install python3-selenium -y

In [None]:
# !apt install libgconf2-4 -y

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [None]:
# Load the trained model; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code; only load model files that
# were produced by this notebook.
with open('../../../data/models/14_scraping-netkeiba_preprocess_train-lr_valid_predict.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# サンプルで動作確認
# options = Options()
# options.binary_location = '/usr/bin/google-chrome'
# options.add_argument('--headless')
# options.add_argument('--window-size=1280,1024')
# options.add_argument("--no-sandbox")

# driver = webdriver.Chrome('chromedriver', chrome_options=options)
# url = 'https://race.netkeiba.com/race/shutuba.html?race_id=202009040511'

# driver.get(url)

# sample_element = driver.find_elements_by_class_name('HorseList')[0]
# sample_tds = sample_element.find_elements_by_tag_name('td')
# for td in sample_tds:
#     print(td.text)

In [None]:
# Target race and the URL of its entry-table (shutuba) page
race_id = '202009040611'
url = f'https://race.netkeiba.com/race/shutuba.html?race_id={race_id}'

In [None]:
class ShutubaTable:
    """Builds the entry table (出馬表, i.e. race plans) by scraping netkeiba."""

    def __init__(self):
        # Accumulated entry rows: one row per scraped 'HorseList' element,
        # indexed by race ID, with positional integer columns (0, 1, 2, ...).
        self.shutuba_table = pd.DataFrame()

    def scrape_shutuba_table(self, race_id_list):
        """
        Scrape the entry tables of the given races with headless Chrome.

        For every race ID the page
        ``https://race.netkeiba.com/race/shutuba.html?race_id=<race_id>`` is
        opened, and the text of each ``td`` inside every element of class
        ``HorseList`` becomes one row appended to ``self.shutuba_table``.

        Args:
            race_id_list: Race ID strings to scrape.

        Returns:
            None. Results accumulate in ``self.shutuba_table``; rows gathered
            before an error are still stored, then the error propagates.
        """
        options = Options()
        options.binary_location = '/usr/bin/google-chrome'
        options.add_argument('--headless')
        options.add_argument('--window-size=1280,1024')
        options.add_argument("--no-sandbox")
        driver = webdriver.Chrome('chromedriver', chrome_options=options)

        # Collect plain lists and build the DataFrame once at the end:
        # growing a DataFrame row by row copies it every time, and
        # DataFrame.append no longer exists in pandas 2.x.
        rows = []
        row_race_ids = []
        try:
            for race_id in race_id_list:
                url = 'https://race.netkeiba.com/race/shutuba.html?race_id='\
                      + race_id
                driver.get(url)
                for element in driver.find_elements_by_class_name('HorseList'):
                    tds = element.find_elements_by_tag_name('td')
                    rows.append([td.text for td in tds])
                    row_race_ids.append(race_id)
        finally:
            # quit() ends the whole browser session (and the chromedriver
            # process), and runs even when scraping fails part-way.
            driver.quit()
            if rows:
                new_rows = pd.DataFrame(rows, index=row_race_ids)
                if self.shutuba_table.empty:
                    self.shutuba_table = new_rows
                else:
                    self.shutuba_table = pd.concat([self.shutuba_table, new_rows])

In [None]:
# Scrape the entry table for the single target race
st = ShutubaTable()
st.scrape_shutuba_table([race_id])

In [None]:
# Raw scraped entry table, used as the input to preprocessing
race_plans_df = st.shutuba_table

In [None]:
def preprocess_netkeiba_future(race_plans_df):
    """
    Turn a scraped entry table (upcoming race) into model input features.

    Args:
        race_plans_df: DataFrame with positional integer columns 0-12, where
            0=枠番, 1=馬番, 3=馬名, 4=性齢, 5=斤量, 6=騎手, 7=厩舎, 8=馬体重,
            9=単勝, 10=人気; columns 2, 11 and 12 are discarded.

    Returns:
        A new DataFrame (the input is left untouched) with the columns
        枠番, 馬番, 斤量, 単勝, 人気, 年齢, 体重, 体重変化, 性_セ, 性_牝, 性_牡.
        The three sex indicator columns are always present, in this order,
        for any number of horses; a sex absent from the race is all zeros.
    """
    df = race_plans_df.copy()

    df = df.rename(columns={0 :'枠番',1:'馬番',3:'馬名',4:'性齢',5:'斤量',6:'騎手',7:'厩舎',8:'馬体重',9:'単勝',10:'人気'})
    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)
    # Split "W(D)" once: part 0 is the weight, part 1 is "D)".
    weight_parts = df['馬体重'].str.split('(', expand=True)
    df['体重'] = weight_parts[0].astype(int)
    df['体重変化'] = weight_parts[1].str[:-1].astype(int)
    df['単勝'] = df['単勝'].astype(float)
    df['人気'] = df['人気'].astype(float)
    df['枠番'] = df['枠番'].astype(int)
    df['馬番'] = df['馬番'].astype(int)
    # Text such as "55.0" cannot be cast to int directly, so go through float.
    df['斤量'] = df['斤量'].astype(float).astype(int)
    df = df.drop([2, 11, 12, '厩舎', '性齢', '馬体重', '馬名', '騎手'], axis=1)
    df = pd.get_dummies(df)

    # get_dummies only creates columns for sexes present in this race. Add
    # the missing ones as zeros and pin the order so the feature layout does
    # not depend on the field size or on which sexes happen to run.
    # NOTE(review): assumes the model was trained with exactly these three
    # sex columns in this order - confirm against the training features.
    sex_columns = ['性_セ', '性_牝', '性_牡']
    for column in sex_columns:
        if column not in df.columns:
            df[column] = 0
    other_columns = [column for column in df.columns if column not in sex_columns]
    df = df[other_columns + sex_columns]

    return df

In [None]:
# Convert the scraped entry table into model input features
race_plans_df_processed = preprocess_netkeiba_future(race_plans_df)

In [None]:
# Predict on the processed entry table and print one predicted class per row
y_pred = model.predict(race_plans_df_processed)
print(pd.DataFrame({'pred':y_pred}))