In [None]:
import pandas as pd
import os
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler

## レース結果を取得

In [None]:
def scrape_race_results(race_id_list, pre_race_results=None):
    """Scrape result tables and race conditions from db.netkeiba.com.

    Parameters
    ----------
    race_id_list : iterable of str
        Race ids. Each id is appended to the race URL, and characters 4-5 of
        the id are parsed as the integer ``course_id``.
    pre_race_results : dict, optional
        Previously scraped results keyed by race id. Ids already present are
        skipped and newly scraped tables are added to this same dict. A fresh
        dict is created when the argument is omitted.

    Returns
    -------
    tuple(dict, dict)
        ``race_results`` maps race id -> result DataFrame with ``horse_id``,
        ``jockey_id`` and ``course_id`` columns added.
        ``race_infos`` maps race id -> dict of parsed race conditions, only
        for races scraped during this call.

    Notes
    -----
    An ``IndexError`` skips the current race; any other exception is printed
    and stops the loop, returning what has been collected so far.
    """
    # A `{}` default is created once and shared by every call, so results
    # from an earlier call would silently show up in later ones.
    race_results = {} if pre_race_results is None else pre_race_results
    race_infos = {}
    for race_id in tqdm(race_id_list):
        if race_id in race_results:
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            df = pd.read_html(url)[0]

            if len(df) < 3:
                continue

            # Fetch the raw HTML as well: the ids only exist in link hrefs.
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            result_table = soup.find("table", attrs={"summary": "レース結果"})
            horse_id_list = _ids_from_links(result_table, "^/horse")
            jockey_id_list = _ids_from_links(result_table, "^/jockey")

            df["horse_id"] = horse_id_list
            df["jockey_id"] = jockey_id_list

            # Course id taken from the race id itself.
            df['course_id'] = [int(race_id[4:6])] * len(horse_id_list)

            race_results[race_id] = df

            intro_paragraphs = soup.find("div", attrs={"class": "data_intro"}).find_all("p")
            texts = intro_paragraphs[0].text + intro_paragraphs[1].text
            race_infos[race_id] = _parse_race_info(texts)
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_results,race_infos


def _ids_from_links(table, href_pattern):
    """Return the first digit run of each link in `table` whose href matches `href_pattern`."""
    anchors = table.find_all("a", attrs={"href": re.compile(href_pattern)})
    return [re.findall(r"\d+", a["href"])[0] for a in anchors]


def _parse_race_info(texts):
    """Turn the race header text into a dict of race conditions.

    The text is split into word tokens and each token is tested against
    every rule below, so a later token can overwrite an earlier value.
    """
    info_dict = {}
    for text in re.findall(r'\w+', texts):
        if text in ["芝", "ダート"]:
            info_dict["race_type"] = text
        if "障" in text:
            info_dict["race_type"] = "障害"
        if "m" in text:
            info_dict["course_len"] = int(re.findall(r"\d+", text)[0])
        if text in ["良", "稍重", "重", "不良"]:
            info_dict["ground_state"] = text
        if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
            info_dict["weather"] = text
        if "年" in text and "月" in text and "日" in text:
            info_dict["date"] = text
        # Course direction.
        if "右" in text:
            info_dict["course_type"] = "right"
        if "左" in text:
            info_dict["course_type"] = "left"
        if "直線" in text:
            info_dict["course_type"] = "straight"
    return info_dict

## 馬の戦績取得

In [None]:
def scrape_horse_results(horse_id_list, pre_horse_id=None):
    """Scrape the past-race table and breeder name for each horse.

    Parameters
    ----------
    horse_id_list : iterable of str
        Horse ids appended to the db.netkeiba.com horse URL.
    pre_horse_id : container of str, optional
        Ids to skip (tested with ``in``). Nothing is skipped when omitted.

    Returns
    -------
    dict
        horse id -> DataFrame read from the horse page, with a
        ``Borned_place`` column holding the ``title`` text of the breeder
        link, or ``None`` when the page has no such link.

    Notes
    -----
    An ``IndexError`` skips the current horse. Any other exception is printed
    with its traceback and stops the loop; a keyboard interrupt also stops
    the loop. In both cases the results collected so far are returned.
    """
    skip_ids = () if pre_horse_id is None else pre_horse_id
    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        if horse_id in skip_ids:
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            # Reset for every horse: otherwise a page without a breeder link
            # would reuse the value left over from the previous horse.
            Borned_place = None
            links = soup.find("div", attrs={"class": "db_prof_area_02"}).find_all("a")
            for link in links:
                link_html = str(link)
                if "breeder" in link_html:
                    # Slice out the text between `e="` and `">` in the tag.
                    Borned_place = link_html[link_html.find('e="')+3:link_html.find('">')]

            # Download and parse the page tables once; when table 3 is the
            # awards table the race history is table 4 instead.
            tables = pd.read_html(url)
            df = tables[3]
            if df.columns[0]=='受賞歴':
                df = tables[4]

            df["Borned_place"] = Borned_place
            horse_results[horse_id] = df
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break
        except KeyboardInterrupt:
            # Manual interrupt: stop scraping but keep what was collected.
            break
    return horse_results

## 馬の詳細戦績取得

In [None]:
class HorseResults:
    """Past race records of many horses plus per-horse aggregate features.

    The input frame must provide the columns '日付', '着順' and '賞金'. The
    aggregations group on index level 0, so the frame is expected to be
    indexed by horse id with one row per past race.
    """

    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付','着順', '賞金']]
        self.preprocessing()

    def preprocessing(self):
        """Clean the raw records and store the result on ``self.horse_results``.

        Rows whose '着順' is not numeric are dropped, '着順' becomes int,
        a parsed ``date`` column is added, and missing '賞金' becomes 0.
        """
        df = self.horse_results.copy()

        # Non-numeric values become NaN and are then dropped.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df = df.dropna(subset=['着順']).copy()
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])

        # Assign the filled column back: `df[col].fillna(..., inplace=True)`
        # only updates a temporary object under pandas Copy-on-Write.
        df['賞金'] = df['賞金'].fillna(0)

        self.horse_results = df

    def _past_races(self, horse_id_list, date, n_samples):
        """Return rows of the given horses dated strictly before `date`.

        With ``n_samples == 'all'`` every earlier race is kept; with a
        positive number only that many most recent races per horse are kept.
        Horses without any stored record are ignored rather than raising.
        """
        records = self.horse_results
        target_df = records[records.index.isin(horse_id_list)]
        earlier = target_df[target_df['date'] < date]

        if n_samples == 'all':
            return earlier
        if n_samples > 0:
            return earlier.sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        raise ValueError('n_samples must be >0')

    def average(self, horse_id_list, date, n_samples='all'):
        """Mean '着順' and '賞金' per horse over races before `date`.

        Returns a frame indexed by horse id with columns named
        '着順_{n_samples}R' and '賞金_{n_samples}R'.
        Raises ValueError when `n_samples` is a number that is not > 0.
        """
        filtered_df = self._past_races(horse_id_list, date, n_samples)
        average = filtered_df.groupby(level=0)[['着順', '賞金']].mean()
        return average.rename(columns={'着順':'着順_{}R'.format(n_samples), '賞金':'賞金_{}R'.format(n_samples)})

    def max_money(self, horse_id_list, date, n_samples='all'):
        """Maximum '賞金' per horse over races before `date`.

        Returns a frame indexed by horse id with one column named
        '最高賞金_{n_samples}R'.
        Raises ValueError when `n_samples` is a number that is not > 0.
        """
        filtered_df = self._past_races(horse_id_list, date, n_samples)
        max_money = filtered_df.groupby(level=0)[['賞金']].max()
        return max_money.rename(columns={'賞金':'最高賞金_{}R'.format(n_samples)})

    def merge(self, results, date, n_samples='all'):
        """Attach aggregate features to the rows of `results` run on `date`.

        `results` needs ``date`` and ``horse_id`` columns. The features are
        always the averages over the last 3 races, last 5 races and all
        races, plus the all-time maximum; `n_samples` is accepted for
        interface compatibility and is not used.
        """
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        feature_frames = (
            self.average(horse_id_list, date, 3),
            self.average(horse_id_list, date, 5),
            self.average(horse_id_list, date, "all"),
            self.max_money(horse_id_list, date, 'all'),
        )
        merged_df = df
        for features in feature_frames:
            merged_df = merged_df.merge(features, left_on='horse_id', right_index=True, how='left')
        return merged_df

    def merge_all(self, results, n_samples='all'):
        """Run `merge` for every distinct date in `results` and concatenate."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

## 馬の血統取得

In [None]:
def scrape_peds(horse_id_list, pre_peds=None):
    """Scrape the pedigree table of each horse into a flat Series.

    Parameters
    ----------
    horse_id_list : iterable of str
        Horse ids appended to the db.netkeiba.com pedigree URL.
    pre_peds : dict, optional
        Previously scraped pedigrees keyed by horse id. Ids already present
        are skipped and new entries are added to this same dict. A fresh
        dict is created when the argument is omitted.

    Returns
    -------
    dict
        horse id -> Series named after the horse id, built by stacking table
        columns 0..4 (each de-duplicated) with a fresh 0..n-1 index.

    Notes
    -----
    An ``IndexError`` skips the current horse; any other exception is printed
    and stops the loop, returning what has been collected so far.
    """
    # A `{}` default is created once and shared by every call, so pedigrees
    # from an earlier call would be returned again by later ones.
    peds = {} if pre_peds is None else pre_peds
    for horse_id in tqdm(horse_id_list):
        if horse_id in peds:
            continue
        try:
            url = "https://db.netkeiba.com/horse/ped/" + horse_id
            df = pd.read_html(url)[0]

            # Peel columns off from the last to the first. After a column is
            # removed, rows that only differed in that column collapse, so
            # each remaining column ends up with one row per distinct entry.
            generations = {}
            for i in reversed(range(5)):
                generations[i] = df[i]
                df = df.drop(columns=[i]).drop_duplicates()

            ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)
            peds[horse_id] = ped.reset_index(drop=True)
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return peds

## カラムカテゴリー化

In [None]:
def process_categorical(df, target_columns):
    """Encode categorical columns and return a new frame.

    Each column in `target_columns` has missing values replaced by 'Na', is
    label-encoded with a fresh ``LabelEncoder`` and finally cast to the
    ``category`` dtype. Every other categorical column is expanded into dummy
    columns by ``pd.get_dummies``. The input frame is not modified.
    """
    encoded = df.copy()
    for name in target_columns:
        filled = encoded[name].fillna('Na')
        encoded[name] = LabelEncoder().fit_transform(filled)

    # Whatever is still categorical after label encoding becomes dummies.
    encoded = pd.get_dummies(encoded)

    return encoded.astype({name: 'category' for name in target_columns})

## 血統データ結合

In [None]:
def add_blood_data(horse_id_list,df):
    """Scrape pedigrees for `horse_id_list` and left-join them onto `df`.

    The pedigree Series are laid out one row per horse id, their columns get
    the 'peds_' prefix, and the table is joined on ``df['horse_id']``. Rows
    of `df` without a scraped pedigree keep missing values.
    """
    ped_by_horse = scrape_peds(horse_id_list)
    # One column per horse after concat, so transpose to one row per horse.
    ped_table = pd.concat(list(ped_by_horse.values()), axis=1).T
    ped_table = ped_table.add_prefix('peds_')
    return df.merge(ped_table, left_on='horse_id', right_index=True, how='left')

## ジョッキー情報取得

In [None]:
def scrape_jockey_results(jockey_id_list, pre_jockey_id=None):
    """Scrape the '勝率', '連対率' and '複勝率' figures for each jockey.

    Parameters
    ----------
    jockey_id_list : iterable of str
        Jockey ids inserted into the db.netkeiba.com jockey result URL.
    pre_jockey_id : container of str, optional
        Ids to skip (tested with ``in``). Nothing is skipped when omitted.

    Returns
    -------
    dict
        jockey id -> one-row DataFrame: the first row of the first table on
        the page, restricted to the three columns above.

    Notes
    -----
    An ``IndexError`` skips the current jockey. Any other exception is
    printed with its traceback and stops the loop; a keyboard interrupt also
    stops the loop. In both cases the results collected so far are returned.
    Other interpreter-level exits (e.g. ``SystemExit``) are no longer
    swallowed.
    """
    skip_ids = () if pre_jockey_id is None else pre_jockey_id
    jockey_results = {}
    for jockey_id in tqdm(jockey_id_list):
        if jockey_id in skip_ids:
            continue
        try:
            url = 'https://db.netkeiba.com/jockey/result/' + jockey_id + '/'
            first_table = pd.read_html(url)[0]
            jockey_results[jockey_id] = first_table[['勝率','連対率','複勝率']][:1]
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break
        except KeyboardInterrupt:
            # Manual interrupt: stop scraping but keep what was collected.
            break
    return jockey_results

## データ取得手順

In [None]:
def get_race_data(race_id_list,flag):
    """Scrape the given races and build one feature table from them.

    Pipeline: race results and race conditions -> jockey rates -> breeder
    name per horse -> row filtering -> past-performance aggregates ->
    pedigree columns. The final frame is written to
    'race_results_dif_df.pickle' and returned.

    Parameters
    ----------
    race_id_list : iterable of str
        Race ids to scrape.
    flag : bool
        Accepted for compatibility with existing callers. Both values lead
        to the same output file and return value.

    Returns
    -------
    pandas.DataFrame
        One row per runner, indexed by race id.
    """
    # Start from an explicit empty dict so that only the races requested in
    # this call are returned.
    results, race_infos = scrape_race_results(race_id_list, {})
    for race_id, frame in results.items():
        frame.index = [race_id] * len(frame)
    results = pd.concat(list(results.values()), sort=False)
    df_race_infos = pd.DataFrame(list(race_infos.values()), index=list(race_infos.keys()))
    results_addinfo = results.merge(df_race_infos,left_index=True,right_index=True,how='inner')
    results_addinfo['date'] = pd.to_datetime(results_addinfo['date'],format='%Y年%m月%d日')

    horse_id_list = results_addinfo['horse_id'].unique()
    horse_results = scrape_horse_results(horse_id_list)
    for horse_id, frame in horse_results.items():
        frame.index = [horse_id] * len(frame)
    df_horse_results = pd.concat(list(horse_results.values()))

    jockey_id_list = results_addinfo['jockey_id'].unique()
    jockey_results = scrape_jockey_results(jockey_id_list)
    for jockey_id, frame in jockey_results.items():
        frame.index = [jockey_id] * len(frame)
    df_jockey_results = pd.concat(list(jockey_results.values()))
    results_addinfo = results_addinfo.merge(df_jockey_results,left_on='jockey_id',right_index=True,how='left')

    # Look the breeder up by horse id instead of by integer position: the
    # frame is indexed by race-id strings, so `series[i]` depends on a
    # deprecated positional fallback, and a horse whose page could not be
    # scraped would raise KeyError. Such horses get a missing value here.
    birthplace_by_horse = {
        horse_id: frame["Borned_place"].iloc[0]
        for horse_id, frame in horse_results.items()
        if len(frame) > 0
    }
    # to_numpy(): assign by position; the race-id index is not unique.
    results_addinfo["Borned_place"] = results_addinfo['horse_id'].map(birthplace_by_horse).to_numpy()

    # Keep only rows whose '着順' consists of digits.
    results_addinfo = results_addinfo[~(results_addinfo['着順'].astype(str).str.contains(r'\D'))]
    # NOTE(review): the index is the race id, so dropping by index label
    # removes every row of a race that has a "計不" entry, not only that
    # row — confirm this is intended.
    drop_lines = list(results_addinfo.query('馬体重 == "計不"').index)
    results_addinfo = results_addinfo.drop(index=drop_lines)

    hr = HorseResults(df_horse_results)
    results_5R = hr.merge_all(results_addinfo, n_samples=5)

    add_blood = add_blood_data(horse_id_list,results_5R)
    add_blood.to_pickle('race_results_dif_df.pickle')
    return add_blood

## メイン関数

In [None]:
def scraping_netkeiba():
    """Scrape every listed race that is not stored yet and update the pickle.

    Race URLs are read from 'race_url/<year>-<month>.txt' for 2018-2020
    (for the current year only up to the current month). On the first run
    all races are scraped and stored in 'race_results_df.pickle'; on later
    runs only races missing from that file are scraped and appended.

    Returns None; progress is reported by printing "FINISH!!!".
    """
    dt_now = datetime.datetime.now()
    race_url_list = []
    for year in range(2018,2021):
        # Months that have not happened yet have no URL file.
        last_month = dt_now.month if year == dt_now.year else 12
        for month in range(1, last_month+1):
            path = 'race_url/'+str(year)+'-'+str(month)+'.txt'
            with open(path) as f:
                race_url_list += f.readlines()

    race_id_list = []
    for url in race_url_list:
        line = url.strip()
        if not line:
            continue
        # The id is the last 12-digit run of the line. Matching it avoids
        # depending on a trailing newline, which the last line of a file may
        # lack. Lines without such a run fall back to the fixed slice.
        match = re.search(r"(\d{12})\D*$", line)
        race_id_list.append(match.group(1) if match else url[-14:-2])

    if not os.path.exists('race_results_df.pickle'):
        add_blood = get_race_data(race_id_list,False)
        # Store the full result under the name checked above; otherwise
        # every run would start from scratch again.
        add_blood.to_pickle('race_results_df.pickle')
        print("FINISH!!!")
        return

    # Note: unpickling executes code from the file; only load files that
    # this notebook wrote itself.
    race_results_df = pd.read_pickle('race_results_df.pickle')
    got_race_id_list = set(race_results_df.index)
    # Plain difference: only ids that are listed but not stored. A symmetric
    # difference would also re-scrape ids that exist only in the pickle and
    # append them a second time.
    difference_id_list = set(race_id_list) - got_race_id_list

    # This race id fails to scrape for an unknown reason, so always skip it.
    difference_id_list.discard('201305030305')

    if len(difference_id_list) > 0:
        race_results_dif_df = get_race_data(difference_id_list,True)
        race_results_df = pd.concat([race_results_df, race_results_dif_df])
        race_results_df.to_pickle('race_results_df.pickle')

    print("FINISH!!!")
    return

In [None]:
# Entry point: run the scraping pipeline defined in the cells above.
scraping_netkeiba()