In [1]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re

In [5]:
def scrape_race_results(race_id_list, pre_race_results=None):
    """Scrape the result table of each race from db.netkeiba.com.

    For every race ID that is not already a key of ``pre_race_results``, the
    first HTML table of the race page is read into a DataFrame and two columns
    are appended: ``horse_id`` and ``jockey_id``, taken from the first run of
    digits in the ``/horse...`` and ``/jockey...`` links of the results table.

    Parameters
    ----------
    race_id_list : iterable of str
        Race IDs; each one is appended to ``https://db.netkeiba.com/race/``.
    pre_race_results : dict, optional
        Previously scraped results keyed by race ID. When given, it is updated
        in place and returned, and IDs already present are skipped. When
        omitted, a fresh dict is created for this call.

    Returns
    -------
    dict
        Mapping of race ID to its result DataFrame.

    Notes
    -----
    A race that raises ``IndexError`` (for example, a page without tables) is
    skipped. Any other exception is printed and stops the loop; whatever was
    collected up to that point is returned.
    """
    # A literal ``{}`` default would be created once and shared by every call,
    # so frames cached (and later mutated) by one call would leak into the next.
    race_results = {} if pre_race_results is None else pre_race_results
    for race_id in tqdm(race_id_list):
        if race_id in race_results:
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            df = pd.read_html(url)[0]

            # Fetch the raw HTML as well: the IDs only exist in link hrefs,
            # which read_html discards.
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            result_table = soup.find("table", attrs={"summary": "レース結果"})

            # horse_id
            horse_links = result_table.find_all(
                "a", attrs={"href": re.compile("^/horse")}
            )
            horse_id_list = [re.findall(r"\d+", a["href"])[0] for a in horse_links]
            # jockey_id
            jockey_links = result_table.find_all(
                "a", attrs={"href": re.compile("^/jockey")}
            )
            jockey_id_list = [re.findall(r"\d+", a["href"])[0] for a in jockey_links]

            df["horse_id"] = horse_id_list
            df["jockey_id"] = jockey_id_list
            race_results[race_id] = df
            # Pause between requests to avoid hammering the server.
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_results

In [6]:
# Race IDs to scrape; the scraping functions append each ID to the netkeiba race URL.
race_id_list = ['202009030811']

In [7]:
# Scrape each race, label every row of a race's table with its race ID,
# then stack all races into a single DataFrame.
results = scrape_race_results(race_id_list)
for race_id, race_df in results.items():
    race_df.index = [race_id] * len(race_df)
results = pd.concat(list(results.values()), sort=False)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
results

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師,horse_id,jockey_id
202009030811,1,8,16,クロノジェネシス,牝4,56,北村友一,2:13.5,,4.1,2,464(+10),[西] 斉藤崇史,2016104750,1102
202009030811,2,7,14,キセキ,牡6,58,武豊,2:14.5,6,14.2,6,502(-10),[西] 角居勝彦,2014101976,666
202009030811,3,6,12,モズベッロ,牡4,58,池添謙一,2:15.3,5,106.1,12,480(+2),[西] 森田直行,2016100915,1032
202009030811,4,3,5,サートゥルナーリア,牡4,58,ルメール,2:15.6,1.3/4,2.4,1,508(+6),[西] 角居勝彦,2016104505,5339
202009030811,5,5,10,メイショウテンゲン,牡4,58,松山弘平,2:15.6,クビ,206.2,16,466(-4),[西] 池添兼雄,2016102192,1126
202009030811,6,6,11,ラッキーライラック,牝5,56,Ｍ．デム,2:16.0,2.1/2,4.9,3,524(+4),[西] 松永幹夫,2015105046,5212
202009030811,7,3,6,トーセンスーリヤ,牡5,58,横山和生,2:16.3,2,150.2,14,484(+2),[東] 小野次郎,2015101105,1140
202009030811,8,1,1,トーセンカンビーナ,牡4,58,浜中俊,2:16.4,クビ,83.8,11,464(+6),[西] 角居勝彦,2016104990,1115
202009030811,9,7,13,ダンビュライト,セ6,58,松若風馬,2:16.6,1.1/4,82.9,10,488(-2),[西] 音無秀孝,2014106010,1154
202009030811,10,4,8,レッドジェニアル,牡4,58,酒井学,2:16.7,3/4,169.0,15,484(-2),[西] 高橋義忠,2016105292,1034


In [9]:
def scrape_race_info(race_id_list):
    """Scrape race-level attributes for each race ID from db.netkeiba.com.

    The text of the first two ``<p>`` elements inside the ``data_intro`` div
    is concatenated, split into word tokens, and each token is matched against
    a few fixed patterns to fill a per-race dict. Possible keys are
    ``race_type``, ``course_len``, ``ground_state``, ``weather`` and ``date``;
    a key is only present when some token matched it, and a later matching
    token overwrites an earlier one.

    Parameters
    ----------
    race_id_list : iterable of str
        Race IDs; each one is appended to ``https://db.netkeiba.com/race/``.

    Returns
    -------
    dict
        Mapping of race ID to its info dict.

    Notes
    -----
    A race that raises ``IndexError`` is skipped. Any other exception is
    printed and stops the loop; what was collected so far is returned.
    """
    race_infos = {}
    for race_id in tqdm(race_id_list):
        try:
            page = requests.get("https://db.netkeiba.com/race/" + race_id)
            page.encoding = "EUC-JP"
            soup = BeautifulSoup(page.text, "html.parser")

            paragraphs = soup.find("div", attrs={"class": "data_intro"}).find_all("p")
            texts = paragraphs[0].text + paragraphs[1].text

            info_dict = {}
            # The checks are independent on purpose: one token may satisfy
            # several of them, and they must run in this order.
            for token in re.findall(r'\w+', texts):
                if token in ["芝", "ダート"]:
                    info_dict["race_type"] = token
                if "障" in token:
                    info_dict["race_type"] = "障害"
                if "m" in token:
                    # First run of digits in the token is taken as the course length.
                    info_dict["course_len"] = int(re.findall(r"\d+", token)[0])
                if token in ["良", "稍重", "重", "不良"]:
                    info_dict["ground_state"] = token
                if token in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    info_dict["weather"] = token
                if "年" in token:
                    info_dict["date"] = token
            race_infos[race_id] = info_dict
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as err:
            print(err)
            break
    return race_infos

In [10]:
# Scrape the race-level info dict for every race ID (result is keyed by race ID).
race_infos = scrape_race_info(race_id_list)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [11]:
# One row per race: the index is the race ID, the columns are the keys of the info dicts.
df_infos = pd.DataFrame(race_infos.values(), index=race_infos.keys())

In [12]:
# Attach the race-level info to every result row by joining on the shared race-ID index.
# The inner join drops result rows whose race has no entry in df_infos.
results_addinfo = results.merge(df_infos,left_index=True,right_index=True,how='inner')

In [13]:
def preprocessing_rf(results):
    """Clean and type-convert a race-results DataFrame.

    Steps, in order:

    * keep only rows whose ``着順`` consists solely of digits, then cast it to int;
    * split ``性齢`` into ``性`` (first character) and ``年齢`` (the rest, as int);
    * split ``馬体重`` such as ``"464(+10)"`` into ``体重`` (464) and
      ``体重変化`` (10), both int;
    * cast ``単勝`` to float;
    * drop ``タイム``, ``着差``, ``調教師``, ``性齢`` and ``馬体重``;
    * parse ``date`` with the format ``%Y年%m月%d日``.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain every column named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame.
    """
    # Raw string: '\D' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    has_non_digit = results['着順'].astype(str).str.contains(r'\D')
    # Explicit copy so the column assignments below write to an independent
    # frame instead of a slice of the input (avoids SettingWithCopyWarning).
    df = results[~has_non_digit].copy()
    df['着順'] = df['着順'].astype(int)

    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)

    # Split "weight(change)" once and reuse both halves.
    weight_parts = df['馬体重'].str.split('(', expand=True)
    df['体重'] = weight_parts[0].astype(int)
    # Strip the trailing ")" before converting the signed change.
    df['体重変化'] = weight_parts[1].str[:-1].astype(int)

    df['単勝'] = df['単勝'].astype(float)

    df = df.drop(['タイム', '着差', '調教師', '性齢', '馬体重'], axis=1)

    df['date'] = pd.to_datetime(df['date'], format='%Y年%m月%d日')

    return df

In [14]:
# Clean and type-convert the merged race results (see preprocessing_rf).
test = preprocessing_rf(results_addinfo)

In [16]:
test

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,horse_id,jockey_id,course_len,weather,race_type,ground_state,date,性,年齢,体重,体重変化
202009030811,1,8,16,クロノジェネシス,56,北村友一,4.1,2,2016104750,1102,2200,曇,芝,稍重,2020-06-28,牝,4,464,10
202009030811,2,7,14,キセキ,58,武豊,14.2,6,2014101976,666,2200,曇,芝,稍重,2020-06-28,牡,6,502,-10
202009030811,3,6,12,モズベッロ,58,池添謙一,106.1,12,2016100915,1032,2200,曇,芝,稍重,2020-06-28,牡,4,480,2
202009030811,4,3,5,サートゥルナーリア,58,ルメール,2.4,1,2016104505,5339,2200,曇,芝,稍重,2020-06-28,牡,4,508,6
202009030811,5,5,10,メイショウテンゲン,58,松山弘平,206.2,16,2016102192,1126,2200,曇,芝,稍重,2020-06-28,牡,4,466,-4
202009030811,6,6,11,ラッキーライラック,56,Ｍ．デム,4.9,3,2015105046,5212,2200,曇,芝,稍重,2020-06-28,牝,5,524,4
202009030811,7,3,6,トーセンスーリヤ,58,横山和生,150.2,14,2015101105,1140,2200,曇,芝,稍重,2020-06-28,牡,5,484,2
202009030811,8,1,1,トーセンカンビーナ,58,浜中俊,83.8,11,2016104990,1115,2200,曇,芝,稍重,2020-06-28,牡,4,464,6
202009030811,9,7,13,ダンビュライト,58,松若風馬,82.9,10,2014106010,1154,2200,曇,芝,稍重,2020-06-28,セ,6,488,-2
202009030811,10,4,8,レッドジェニアル,58,酒井学,169.0,15,2016105292,1034,2200,曇,芝,稍重,2020-06-28,牡,4,484,-2


In [19]:
# Unique horse IDs appearing in the preprocessed results.
horse_id_list = test['horse_id'].unique()

In [20]:
horse_id_list

array(['2016104750', '2014101976', '2016100915', '2016104505',
       '2016102192', '2015105046', '2015101105', '2016104990',
       '2014106010', '2016105292', '2014105517', '2014104449',
       '2015105022', '2015104713', '2014105258', '2015104882',
       '2015104063', '2015101654'], dtype=object)

In [21]:
def scrape_horse_results(horse_id_list, pre_horse_id=None):
    """Scrape the past-race table of each horse from db.netkeiba.com.

    The horse page is parsed once with ``pandas.read_html``. The table at
    position 3 is used, unless its first column is ``受賞歴``, in which case
    the table at position 4 is used instead.

    Parameters
    ----------
    horse_id_list : iterable of str
        Horse IDs; each one is appended to ``https://db.netkeiba.com/horse/``.
    pre_horse_id : container of str, optional
        Horse IDs to skip. Defaults to skipping nothing.

    Returns
    -------
    dict
        Mapping of horse ID to its past-race DataFrame.

    Notes
    -----
    A horse that raises ``IndexError`` (page has too few tables) is skipped.
    Any other error prints its traceback and stops the loop; what was
    collected so far is returned.
    """
    skip_ids = () if pre_horse_id is None else pre_horse_id
    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        if horse_id in skip_ids:
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            # Download and parse the page a single time; the original fetched
            # it again whenever the fallback table was needed.
            tables = pd.read_html(url)
            df = tables[3]
            if df.columns[0]=='受賞歴':
                df = tables[4]
            horse_results[horse_id] = df
            # Pause between requests to avoid hammering the server.
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break
        except BaseException:
            # Same reach as the former bare ``except:``. It covers
            # non-Exception interrupts such as KeyboardInterrupt: stop
            # scraping and return what has been collected so far.
            break
    return horse_results

In [22]:
# Scrape every horse's past-race table, label each row with its horse ID,
# then stack all horses into a single DataFrame.
horse_results = scrape_horse_results(horse_id_list)
for horse_id, history_df in horse_results.items():
    history_df.index = [horse_id] * len(history_df)
df_horse_results = pd.concat(list(horse_results.values()))
# To cache the scraped histories: df_horse_results.to_pickle('horse_results.pickle')

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [23]:
df_horse_results

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,...,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
2016104750,2020/06/28,3阪神8,曇,11.0,宝塚記念(G1),,18,8.0,16,4.1,...,-1.0,**,7-8-7-1,34.6-36.3,36.3,464(+10),,,(キセキ),15378.0
2016104750,2020/04/05,2阪神4,晴,11.0,大阪杯(G1),,12,8.0,12,5.2,...,0.0,**,3-3-3-3,36.9-34.2,34.0,454(-6),,,ラッキーライラック,5474.4
2016104750,2020/02/16,2京都6,雨,11.0,京都記念(G2),,9,7.0,7,2.7,...,-0.4,**,3-3-3-3,36.8-36.9,35.8,460(+12),,,(カレンブーケドール),6270.0
2016104750,2019/11/10,5京都4,晴,11.0,エリザベス女王杯(G1),,18,4.0,8,3.5,...,0.3,**,5-6-6-5,37.6-34.6,33.3,448(-4),,,ラッキーライラック,1050.0
2016104750,2019/10/13,4京都4,晴,11.0,秋華賞(G1),,17,3.0,5,6.9,...,-0.3,**,6-7-5-5,34.6-36.4,36.1,452(+20),,,(カレンブーケドール),10382.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015101654,2018/05/19,3京都9,晴,9.0,メルボルンT(500万下),,10,4.0,4,7.1,...,0.2,**,2-2-2-2,36.7-36.5,36.7,462(+2),,,ドレーク,407.2
2015101654,2018/04/29,3京都4,晴,4.0,3歳未勝利,,11,4.0,4,10.5,...,0.0,**,8-8-9-8,34.8-34.6,33.9,460(-6),,,(メイケイゴールド),500.0
2015101654,2018/02/18,2京都8,晴,5.0,3歳未勝利,,13,3.0,3,16.4,...,1.7,**,4-4-4-6,36.3-36.4,37.8,466(-2),,,ネプチュナイト,
2015101654,2018/01/20,1京都6,晴,6.0,3歳未勝利,,15,2.0,2,116.0,...,0.3,**,6-6-10-8,36.7-35.8,35.4,468(-18),,,ジャックローズ,75.0


In [24]:
class HorseResults:
    """Per-horse race history used to compute past-performance averages.

    The constructor keeps only the ``日付``, ``着順`` and ``賞金`` columns of
    the given frame and preprocesses them. After construction,
    ``self.horse_results`` has the columns ``着順`` (int), ``賞金`` (NaN
    replaced by 0) and ``date`` (datetime). The index of the input frame is
    kept as is; ``average`` and ``merge`` treat it as the horse ID.
    """

    def __init__(self, horse_results):
        """Store the relevant columns of ``horse_results`` and preprocess them."""
        self.horse_results = horse_results[['日付', '着順', '賞金']]
        self.preprocessing()

    def preprocessing(self):
        """Clean ``self.horse_results`` and replace it with the cleaned frame."""
        df = self.horse_results.copy()

        # Drop rows whose finishing position is not a plain number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df = df.dropna(subset=['着順']).copy()
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])

        # Fill missing prize money with 0. Assign the result back: calling
        # fillna(inplace=True) on the selected column is a chained in-place
        # update, which leaves the frame untouched under pandas Copy-on-Write.
        df['賞金'] = df['賞金'].fillna(0)

        df = df.drop(['日付'], axis=1)

        self.horse_results = df

    def average(self, horse_id_list, date, n_samples='all'):
        """Mean finishing position and prize money before ``date``.

        Parameters
        ----------
        horse_id_list : list-like of str (or a single str)
            Horse IDs to include. IDs with no stored history are ignored.
        date : datetime-like
            Only races strictly earlier than this date are used.
        n_samples : int or 'all'
            Number of most recent prior races per horse to average, or
            ``'all'`` for every prior race.

        Returns
        -------
        pandas.DataFrame
            Indexed by horse ID, with the columns ``着順_{n_samples}R`` and
            ``賞金_{n_samples}R``.

        Raises
        ------
        ValueError
            If ``n_samples`` is a number that is not greater than 0.
        """
        if isinstance(horse_id_list, str):
            horse_id_list = [horse_id_list]
        # Boolean selection instead of ``.loc[list]``: ``.loc`` raises
        # KeyError when an ID has no scraped history, and repeats rows when an
        # ID is listed twice, which would distort the ``head(n_samples)`` cut.
        target_df = self.horse_results[self.horse_results.index.isin(horse_id_list)]
        past_df = target_df[target_df['date'] < date]

        # Choose how many past races to take per horse.
        if n_samples == 'all':
            filtered_df = past_df
        elif n_samples > 0:
            filtered_df = past_df.sort_values('date', ascending=False).\
                groupby(level=0).head(n_samples)
        else:
            # ValueError is a subclass of Exception, so existing handlers still match.
            raise ValueError('n_samples must be >0')

        average = filtered_df.groupby(level=0)[['着順', '賞金']].mean()
        return average.rename(columns={'着順':'着順_{}R'.format(n_samples), '賞金':'賞金_{}R'.format(n_samples)})

    def merge(self, results, date, n_samples='all'):
        """Left-join the past averages onto the rows of ``results`` held on ``date``.

        Rows are matched through the ``horse_id`` column of ``results``;
        horses without any prior history get NaN in the added columns.
        """
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(self.average(horse_id_list, date, n_samples), left_on='horse_id',
                             right_index=True, how='left')
        return merged_df

    def merge_all(self, results, n_samples='all'):
        """Apply ``merge`` for every distinct date in ``results`` and concatenate."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

In [25]:
# Build the per-horse history helper, then append to every result row the horse's
# mean finishing position and mean prize money over its 5 most recent earlier races.
hr = HorseResults(df_horse_results)
results_5R = hr.merge_all(test, n_samples=5)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [26]:
results_5R

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,horse_id,jockey_id,...,weather,race_type,ground_state,date,性,年齢,体重,体重変化,着順_5R,賞金_5R
202009030811,1,8,16,クロノジェネシス,56,北村友一,4.1,2,2016104750,1102,...,曇,芝,稍重,2020-06-28,牝,4,464,10,2.4,5268.2
202009030811,2,7,14,キセキ,58,武豊,14.2,6,2014101976,666,...,曇,芝,稍重,2020-06-28,牡,6,502,-10,5.6,600.0
202009030811,3,6,12,モズベッロ,58,池添謙一,106.1,12,2016100915,1032,...,曇,芝,稍重,2020-06-28,牡,4,480,2,3.0,2063.48
202009030811,4,3,5,サートゥルナーリア,58,ルメール,2.4,1,2016104505,5339,...,曇,芝,稍重,2020-06-28,牡,4,508,6,2.8,5468.56
202009030811,5,5,10,メイショウテンゲン,58,松山弘平,206.2,16,2016102192,1126,...,曇,芝,稍重,2020-06-28,牡,4,466,-4,5.8,851.24
202009030811,6,6,11,ラッキーライラック,56,Ｍ．デム,4.9,3,2015105046,5212,...,曇,芝,稍重,2020-06-28,牝,5,524,4,1.8,5755.32
202009030811,7,3,6,トーセンスーリヤ,58,横山和生,150.2,14,2015101105,1140,...,曇,芝,稍重,2020-06-28,牡,5,484,2,1.6,1660.56
202009030811,8,1,1,トーセンカンビーナ,58,浜中俊,83.8,11,2016104990,1115,...,曇,芝,稍重,2020-06-28,牡,4,464,6,2.8,1469.72
202009030811,9,7,13,ダンビュライト,58,松若風馬,82.9,10,2014106010,1154,...,曇,芝,稍重,2020-06-28,セ,6,488,-2,7.0,1803.6
202009030811,10,4,8,レッドジェニアル,58,酒井学,169.0,15,2016105292,1034,...,曇,芝,稍重,2020-06-28,牡,4,484,-2,5.6,363.64
