In [2]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re

In [3]:
race_id_list = ['202005030211']
# for year in range(2019,2020,1):
#     for place in range(1, 11, 1):
#         for kai in range(1, 6, 1):
#             for day in range(1, 9, 1):
#                 for r in range(1, 13, 1):
#                     race_id = (
#                         str(year)
#                         + str(place).zfill(2)
#                         + str(kai).zfill(2)
#                         + str(day).zfill(2)
#                         + str(r).zfill(2)
#                     )
#                     race_id_list.append(race_id)

In [4]:
def scrape_race_results(race_id_list, pre_race_results={}):
    race_results = pre_race_results
    for race_id in tqdm(race_id_list):
        if race_id in race_results.keys():
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            df = pd.read_html(url)[0]

            # horse_idとjockey_idをスクレイピング
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            # horse_id
            horse_id_list = []
            horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                "a", attrs={"href": re.compile("^/horse")}
            )

            for a in horse_a_list:
                horse_id = re.findall(r"\d+", a["href"])
                horse_id_list.append(horse_id[0])
            # jockey_id
            jockey_id_list = []
            jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                "a", attrs={"href": re.compile("^/jockey")}
            )
            for a in jockey_a_list:
                jockey_id = re.findall(r"\d+", a["href"])
                jockey_id_list.append(jockey_id[0])

            df["horse_id"] = horse_id_list
            df["jockey_id"] = jockey_id_list
            df['course_id'] = [int(race_id[4:6])]*len(horse_id_list)
            race_results[race_id] = df
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_results

In [5]:
results = scrape_race_results(race_id_list[:20])
for key in results:
    results[key].index = [key] * len(results[key])
results = pd.concat([results[key] for key in results], sort=False)
results.to_pickle('results.pickle')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [6]:
def scrape_race_info(race_id_list):
    race_infos = {}
    for race_id in tqdm(race_id_list):
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = (
                soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
            )
            info = re.findall(r'\w+', texts)
            info_dict = {}
            for text in info:
                if text in ["芝", "ダート"]:
                    info_dict["race_type"] = text
                if "障" in text:
                    info_dict["race_type"] = "障害"
                if "m" in text:
                    info_dict["course_len"] = int(re.findall(r"\d+", text)[0])
                if text in ["良", "稍重", "重", "不良"]:
                    info_dict["ground_state"] = text
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    info_dict["weather"] = text
                if "年" in text:
                    info_dict["date"] = text
                if "右" in text:
                    info_dict["course_type"] = "right"
                if "左" in text:
                    info_dict["course_type"] = "left"
                if "直線" in text:
                    info_dict["course_type"] = "straight"
            race_infos[race_id] = info_dict
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_infos

In [7]:
race_infos = scrape_race_info(race_id_list[:20])
df_infos = pd.DataFrame(race_infos.values(), index=race_infos.keys())

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
results_addinfo = results.merge(df_infos,left_index=True,right_index=True,how='inner')

In [9]:
def preprocessing_rf(results):
    df = results.copy()
    
    df = df[~(df['着順'].astype(str).str.contains('\D'))]
    df['着順'] = df['着順'].astype(int)
    
    df['性'] = df['性齢'].map(lambda x:str(x)[0])
    df['所属'] = df['調教師'].map(lambda x:str(x)[1])
    df['年齢'] = df['性齢'].map(lambda x:str(x)[1:]).astype(int)
    df['体重'] = df['馬体重'].str.split('(',expand = True)[0].astype(int)
    df['体重変化'] = df['馬体重'].str.split('(',expand = True)[1].str[:-1].astype(int)
    
    df['単勝'] = df['単勝'].astype(float)
    
    df.drop(['タイム','着差','調教師','性齢','馬体重'],axis = 1,inplace = True)
    
    df['date'] = pd.to_datetime(df['date'],format='%Y年%m月%d日')
    
    return df

In [10]:
test = preprocessing_rf(results_addinfo)

In [11]:
horse_id_list = test['horse_id'].unique()

In [12]:
def scrape_horse_results(horse_id_list, pre_horse_id=[]):
    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        if horse_id in pre_horse_id:
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            df = pd.read_html(url)[3]
            if df.columns[0]=='受賞歴':
                df = pd.read_html(url)[4]
            horse_results[horse_id] = df
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break
        except:
            break
    return horse_results

In [13]:
horse_results = scrape_horse_results(horse_id_list)
for key in horse_results:
    horse_results[key].index = [key] * len(horse_results[key])
df_horse_results = pd.concat([horse_results[key] for key in horse_results])
#df.to_pickle('horse_results.pickle')

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




In [14]:
class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金']]
        self.preprocessing()

    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)

        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)

        self.horse_results = df

    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.loc[horse_id_list]

        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')

        average = filtered_df.groupby(level=0)[['着順', '賞金']].mean()
        return average.rename(columns={'着順':'着順_{}R'.format(n_samples), '賞金':'賞金_{}R'.format(n_samples)})
    
    def max_money(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.loc[horse_id_list]
        
        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
            
        max_money = filtered_df.groupby(level=0)[['着順', '賞金']].max()
        return max_money.rename(columns={'賞金':'最高賞金_{}R'.format(n_samples)})

    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(self.average(horse_id_list, date, n_samples), left_on='horse_id',
                             right_index=True, how='left').merge(self.max_money(horse_id_list, date, n_samples), left_on='horse_id',
                             right_index=True, how='left')
        return merged_df

    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

In [15]:
hr = HorseResults(df_horse_results)
results_5R = hr.merge_all(test, n_samples='all')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [16]:
results_5R

Unnamed: 0,着順_x,枠番,馬番,馬名,斤量,騎手,単勝,人気,horse_id,jockey_id,...,date,性,所属,年齢,体重,体重変化,着順_allR,賞金_allR,着順_y,最高賞金_allR
202005030211,1,7,11,グランアレグリア,56,池添謙一,12.0,3,2016104532,1032,...,2020-06-07,牝,東,4,492,6,1.5,5161.283333,3,12982.9
202005030211,2,4,5,アーモンドアイ,56,ルメール,1.3,1,2015104961,5339,...,2020-06-07,牝,東,5,488,2,1.916667,8424.441667,9,30302.4
202005030211,3,4,6,インディチャンプ,58,福永祐一,7.0,2,2015104688,1014,...,2020-06-07,牡,西,5,480,4,2.333333,2877.813333,7,11357.0
202005030211,4,3,3,ノームコア,56,横山典弘,49.9,7,2015104765,660,...,2020-06-07,牝,東,5,466,0,3.615385,2126.415385,15,10882.2
202005030211,5,5,8,ケイアイノーテック,58,津村明秀,177.6,11,2015100344,1092,...,2020-06-07,牡,西,5,464,-4,5.526316,960.936842,11,10882.2
202005030211,6,6,9,アドマイヤマーズ,58,川田将雅,20.2,6,2016104422,1088,...,2020-06-07,牡,西,4,476,6,2.333333,3039.166667,9,10890.6
202005030211,7,2,2,ダノンキングリー,58,戸崎圭太,12.9,5,2016102179,5386,...,2020-06-07,牡,東,4,456,0,2.0,3942.377778,5,8722.2
202005030211,8,8,14,ダノンスマッシュ,58,三浦皇成,73.6,8,2015102377,1122,...,2020-06-07,牡,西,5,476,4,3.388889,1874.116667,10,5996.6
202005030211,9,5,7,ペルシアンナイト,58,田辺裕信,142.4,10,2014105258,1075,...,2020-06-07,牡,西,6,502,0,4.130435,1723.33913,11,10690.6
202005030211,10,8,13,ヴァンドギャルド,58,岩田望来,110.0,9,2016105067,1174,...,2020-06-07,牡,西,4,474,0,3.454545,791.918182,9,1852.2
