In [None]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Enumerate every candidate race id for 2019.
# An id is year + place + kai + day + race number, where every part after the
# year is zero-padded to two digits (e.g. "201901010101").
# Not every generated id necessarily corresponds to a real race; the scraping
# functions below skip ids whose page yields no usable table.
race_id_list = [
    f"{year}{place:02d}{kai:02d}{day:02d}{race_no:02d}"
    for year in range(2019, 2020)
    for place in range(1, 11)
    for kai in range(1, 6)
    for day in range(1, 9)
    for race_no in range(1, 13)
]

In [None]:
def scrape_race_results(race_id_list, pre_race_results=None):
    """Scrape the result table of each race from db.netkeiba.com.

    Parameters
    ----------
    race_id_list : iterable of str
        Race ids to fetch; each is appended to the race URL.
    pre_race_results : dict, optional
        Previously scraped ``{race_id: DataFrame}`` mapping. Ids already
        present are skipped. When given, this dict is updated in place and
        returned. When omitted, a fresh dict is created for this call.

    Returns
    -------
    dict
        ``{race_id: DataFrame}`` where each frame is the first table on the
        race page plus ``horse_id``, ``jockey_id`` and ``course_id`` columns.

    Notes
    -----
    An ``IndexError`` while processing a race skips that race. Any other
    exception is printed and stops the loop; the results collected so far
    are still returned.
    """
    # A `{}` default would be created once and shared by every call, so
    # results from one call would leak into the next; use None as sentinel.
    race_results = {} if pre_race_results is None else pre_race_results
    for race_id in tqdm(race_id_list):
        if race_id in race_results:
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            df = pd.read_html(url)[0]

            # Tables with fewer than 3 rows are treated as a failed fetch.
            if len(df) < 3:
                continue

            # Fetch the raw HTML as well: the horse / jockey ids only exist
            # in the link targets, which pd.read_html does not keep.
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            result_table = soup.find("table", attrs={"summary": "レース結果"})

            # The first run of digits in each link's href is used as the id.
            horse_id_list = [
                re.findall(r"\d+", a["href"])[0]
                for a in result_table.find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
            ]
            jockey_id_list = [
                re.findall(r"\d+", a["href"])[0]
                for a in result_table.find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
            ]

            df["horse_id"] = horse_id_list
            df["jockey_id"] = jockey_id_list

            # Characters 5-6 of the race id (the zero-padded place part).
            df["course_id"] = [int(race_id[4:6])] * len(horse_id_list)

            race_results[race_id] = df
            time.sleep(0.1)  # small pause between requests
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_results

In [None]:
# Scrape the result table for every candidate race id. Each id that is not
# skipped costs two HTTP requests plus a short sleep, so this cell is slow.
results = scrape_race_results(race_id_list)

In [None]:
# Label every row with the id of the race it belongs to, then stack all
# per-race tables into one DataFrame and cache it on disk.
# NOTE: `results` is rebound from a dict to a DataFrame here, so this cell
# must not be re-run without re-running the scraping cell first.
race_frames = []
for key, frame in results.items():
    frame.index = [key] * len(frame)
    race_frames.append(frame)
results = pd.concat(race_frames, sort=False)
results.to_pickle('results.pickle')

In [None]:
# Load the concatenated race results back from the pickle written above
# (presumably so later cells can start from the cache without re-scraping).
results = pd.read_pickle('results.pickle')

In [None]:
def _parse_race_info(texts):
    """Extract race attributes from the intro text of a race page.

    The text is split into ``\\w+`` tokens and each token is matched against
    the known keywords. Returns a dict that may contain ``race_type``,
    ``course_len``, ``ground_state``, ``weather``, ``date`` and
    ``course_type``; keys whose token is absent are simply omitted.
    """
    info_dict = {}
    for text in re.findall(r'\w+', texts):
        if text in ["芝", "ダート"]:
            info_dict["race_type"] = text
        if "障" in text:
            info_dict["race_type"] = "障害"
        if "m" in text:
            # Only accept the token as a distance when it actually carries
            # digits; a digit-less token containing "m" used to raise
            # IndexError, which made the caller drop the whole race.
            digits = re.findall(r"\d+", text)
            if digits:
                info_dict["course_len"] = int(digits[0])
        if text in ["良", "稍重", "重", "不良"]:
            info_dict["ground_state"] = text
        if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
            info_dict["weather"] = text
        if "年" in text:
            info_dict["date"] = text
        # Course direction; later matches overwrite earlier ones.
        if "右" in text:
            info_dict["course_type"] = "right"
        if "左" in text:
            info_dict["course_type"] = "left"
        if "直線" in text:
            info_dict["course_type"] = "straight"
    return info_dict


def scrape_race_info(race_id_list):
    """Scrape per-race metadata (surface, distance, weather, date, ...).

    Parameters
    ----------
    race_id_list : iterable of str
        Race ids to fetch from db.netkeiba.com.

    Returns
    -------
    dict
        ``{race_id: info_dict}``; see ``_parse_race_info`` for the keys.

    Notes
    -----
    An ``IndexError`` while processing a race (e.g. fewer than two ``<p>``
    elements in the intro block) skips that race. Any other exception is
    printed and stops the loop; what was collected so far is returned.
    """
    race_infos = {}
    for race_id in tqdm(race_id_list):
        try:
            url = "https://db.netkeiba.com/race/" + race_id

            df = pd.read_html(url)[0]
            # Tables with fewer than 3 rows are treated as a failed fetch.
            if len(df) < 3:
                continue

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            # Look the intro block up once and join its first two paragraphs.
            intro_paragraphs = soup.find(
                "div", attrs={"class": "data_intro"}
            ).find_all("p")
            texts = intro_paragraphs[0].text + intro_paragraphs[1].text

            race_infos[race_id] = _parse_race_info(texts)
            time.sleep(0.1)  # small pause between requests
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_infos

In [None]:
# Scrape per-race metadata, then turn the {race_id: info_dict} mapping into a
# DataFrame with one row per race, indexed by race id.
race_infos = scrape_race_info(race_id_list)
df_race_infos = pd.DataFrame(race_infos.values(), index=race_infos.keys())

In [None]:
# Attach the race metadata to every result row. Both frames are indexed by
# race id; the inner join keeps only races present on both sides.
results_addinfo = results.merge(df_race_infos,left_index=True,right_index=True,how='inner')
# Cache the merged table and read it straight back from disk.
results_addinfo.to_pickle('results_addinfo.pickle')
results_addinfo = pd.read_pickle('results_addinfo.pickle')

In [None]:
# Collect the index labels (race ids) of every row whose 馬体重 (horse weight)
# is the literal string "計不". Because the index is the race id, dropping
# these labels removes the whole race, not just the affected horse.
drop_lines = list(results_addinfo.query('馬体重 == "計不"').index)

In [None]:
# Drop every row whose index label appears in drop_lines; the original
# frame is left untouched and the filtered copy gets a new name.
results_addinfo_new = results_addinfo.drop(index=drop_lines)

In [None]:
def preprocessing_rf(results):
    """Clean the merged race results and convert columns to usable dtypes.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain the columns 着順, 性齢, 調教師, 馬体重, 単勝, date,
        タイム and 着差. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to rows whose 着順 is purely numeric, with
        - 着順 as int,
        - 性 (first character of 性齢) and 年齢 (the rest of 性齢, as int),
        - 所属 (second character of 調教師; presumably the affiliation
          marker inside brackets such as "[東]" - confirm against the data),
        - 体重 / 体重変化 as ints parsed from a "480(+2)"-style 馬体重,
        - 単勝 as float and date as datetime,
        and with タイム, 着差, 調教師, 性齢 and 馬体重 removed.
    """
    # Raw string: '\D' in a plain literal is an invalid escape sequence.
    has_non_digit = results['着順'].astype(str).str.contains(r'\D')
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the caller's data.
    df = results[~has_non_digit].copy()

    df['着順'] = df['着順'].astype(int)
    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df['所属'] = df['調教師'].map(lambda x: str(x)[1])
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)

    # Split "weight(change)" once and reuse both halves.
    weight_parts = df['馬体重'].str.split('(', expand=True)
    df['体重'] = weight_parts[0].astype(int)
    # Strip the trailing ")" and parse with int(), which accepts "+2" / "-4".
    df['体重変化'] = weight_parts[1].str[:-1].map(int)

    df['単勝'] = df['単勝'].astype(float)
    df['date'] = pd.to_datetime(df['date'], format='%Y年%m月%d日')

    return df.drop(['タイム', '着差', '調教師', '性齢', '馬体重'], axis=1)

In [None]:
# Apply the cleaning / type-conversion step to the filtered race results.
# NOTE(review): the name `test` is rebound later by the train/test split
# cell, so after that cell runs it no longer refers to this frame.
test = preprocessing_rf(results_addinfo_new)

In [None]:
# Unique horse ids appearing in the cleaned results; this is the work list
# for scraping each horse's past results.
horse_id_list = test['horse_id'].unique()

In [None]:
def scrape_horse_results(horse_id_list, pre_horse_id=()):
    """Scrape the past-results table of each horse from db.netkeiba.com.

    Parameters
    ----------
    horse_id_list : iterable of str
        Horse ids to fetch; each is appended to the horse URL.
    pre_horse_id : iterable of str, optional
        Ids that were already scraped and should be skipped.

    Returns
    -------
    dict
        ``{horse_id: DataFrame}``. The frame is the page's 4th table, or the
        5th when the 4th one's first column is '受賞歴'.

    Notes
    -----
    An ``IndexError`` (page has too few tables) skips that horse. Any other
    exception prints a traceback and stops the loop. A keyboard interrupt
    also stops the loop quietly. In every case the results collected so far
    are returned.
    """
    # Set for O(1) membership tests; also avoids a mutable default argument.
    already_scraped = set(pre_horse_id)
    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        if horse_id in already_scraped:
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            # Download and parse the page once, then pick the right table
            # from the parsed list instead of fetching the URL a second time.
            tables = pd.read_html(url)
            df = tables[3]
            if df.columns[0] == '受賞歴':
                df = tables[4]
            horse_results[horse_id] = df
            time.sleep(0.1)  # small pause between requests
        except IndexError:
            continue
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break
        except KeyboardInterrupt:
            # Manual interruption: stop scraping but keep partial results.
            break
    return horse_results

In [None]:
# Scrape every horse's past results, label each row with the horse id it
# belongs to, stack the per-horse tables and cache the result on disk.
horse_results = scrape_horse_results(horse_id_list)
horse_frames = []
for key, frame in horse_results.items():
    frame.index = [key] * len(frame)
    horse_frames.append(frame)
df_horse_results = pd.concat(horse_frames)
df_horse_results.to_pickle('horse_results.pickle')

In [None]:
class HorseResults:
    """Past-performance features computed from per-horse result tables.

    Parameters
    ----------
    horse_results : pandas.DataFrame
        Indexed by horse id, with at least the columns 日付, 着順 and 賞金.
        Only these three columns are kept.

    Attributes
    ----------
    horse_results : pandas.DataFrame
        Cleaned table with columns 着順 (int), 賞金 (NaN replaced by 0) and
        date (datetime), still indexed by horse id.
    """

    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金']]
        self.preprocessing()

    def preprocessing(self):
        """Clean ``self.horse_results`` and store the cleaned copy back."""
        df = self.horse_results.copy()

        # Drop rows whose finishing position is not a plain number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df = df.dropna(subset=['着順']).copy()
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df = df.drop(['日付'], axis=1)

        # Fill missing prize money with 0. Assign the result back instead of
        # calling fillna(inplace=True) on the column: the chained in-place
        # form does not update `df` under pandas Copy-on-Write.
        df['賞金'] = df['賞金'].fillna(0)

        self.horse_results = df

    def _filter_past(self, horse_id_list, date, n_samples):
        """Return the rows of the given horses dated strictly before `date`.

        With ``n_samples='all'`` every earlier race is kept; with a positive
        number only the `n_samples` most recent earlier races per horse.
        Horses without any stored history are ignored rather than raising.
        """
        # isin() instead of .loc[list]: .loc raises KeyError when an id has
        # no rows (e.g. its page could not be scraped).
        in_scope = self.horse_results.index.isin(horse_id_list)
        target_df = self.horse_results[in_scope]
        past_df = target_df[target_df['date'] < date]

        if n_samples == 'all':
            return past_df
        if n_samples > 0:
            return (
                past_df.sort_values('date', ascending=False)
                .groupby(level=0)
                .head(n_samples)
            )
        raise ValueError('n_samples must be >0')

    def average(self, horse_id_list, date, n_samples='all'):
        """Mean 着順 and 賞金 per horse over races before `date`.

        Returns a DataFrame indexed by horse id with columns
        ``着順_{n_samples}R`` and ``賞金_{n_samples}R``.
        Raises ValueError when `n_samples` is a number <= 0.
        """
        filtered_df = self._filter_past(horse_id_list, date, n_samples)
        average = filtered_df.groupby(level=0)[['着順', '賞金']].mean()
        return average.rename(columns={'着順':'着順_{}R'.format(n_samples), '賞金':'賞金_{}R'.format(n_samples)})

    def max_money(self, horse_id_list, date, n_samples='all'):
        """Maximum 賞金 per horse over races before `date`.

        Returns a DataFrame indexed by horse id with the single column
        ``最高賞金_{n_samples}R``.
        Raises ValueError when `n_samples` is a number <= 0.
        """
        filtered_df = self._filter_past(horse_id_list, date, n_samples)
        max_money = filtered_df.groupby(level=0)[['賞金']].max()
        return max_money.rename(columns={'賞金':'最高賞金_{}R'.format(n_samples)})

    def merge(self, results, date, n_samples='all'):
        """Attach the average / max features to the rows of `results` on `date`.

        Rows are matched on their ``horse_id`` column with a left join, so
        horses without earlier races get NaN in the feature columns.
        """
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        averages = self.average(horse_id_list, date, n_samples)
        maxima = self.max_money(horse_id_list, date, n_samples)
        merged_df = df.merge(
            averages, left_on='horse_id', right_index=True, how='left'
        ).merge(
            maxima, left_on='horse_id', right_index=True, how='left'
        )
        return merged_df

    def merge_all(self, results, n_samples='all'):
        """Run `merge` for every distinct date in `results` and stack the parts."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

In [None]:
# Build the past-performance helper from the scraped horse tables, attach
# the per-horse average / max features (over all earlier races) to every
# race row, and cache the enriched table on disk.
hr = HorseResults(df_horse_results)
results_all = hr.merge_all(test, n_samples='all')
results_all.to_pickle('results_all.pickle')

In [None]:
def preprocessing_last(results):
    """Final clean-up before modelling.

    Removes the name / id columns (馬名, 騎手, horse_id, jockey_id), adds a
    `rank` label equal to 着順 for positions 1-3 and 4 for everything else,
    and replaces remaining missing values with 0. The input frame is left
    unchanged; a new frame is returned.
    """
    def cap_rank(position):
        # Positions below 4 are kept; all others collapse into class 4.
        return position if position < 4 else 4

    df = results.drop(['馬名', '騎手', 'horse_id', 'jockey_id'], axis=1)
    df['rank'] = df['着順'].map(cap_rank)
    return df.fillna(0)

In [None]:
def split_data(df,test_size):
    """Split `df` chronologically into a train part and a test part.

    The distinct index labels are ordered by the `date` column; the first
    ``round(n * (1 - test_size))`` labels form the training set and the
    rest the test set, so all rows sharing a label stay on the same side.
    Returns ``(train, test)``.
    """
    ordered_ids = df.sort_values('date').index.unique()
    n_train = round(len(ordered_ids) * (1 - test_size))
    train_ids, test_ids = ordered_ids[:n_train], ordered_ids[n_train:]
    return df.loc[train_ids], df.loc[test_ids]

In [None]:
# Run the final clean-up, let pd.get_dummies one-hot encode the remaining
# categorical columns, and cache the model-ready table on disk.
sample = pd.get_dummies(preprocessing_last(results_all))
sample.to_pickle('sample.pickle')

In [None]:
# Load the model-ready table back from the pickle written above.
sample = pd.read_pickle('sample.pickle')

In [None]:
# Chronological split: the earliest 70% of races train, the latest 30% test.
# NOTE(review): this rebinds `test`, which an earlier cell used for the
# preprocessed results frame.
train,test = split_data(sample,0.3)

In [None]:
# Class sizes of the training labels (computed once and reused).
rank_counts = train['rank'].value_counts()
rank_1 = rank_counts[1]
rank_2 = rank_counts[2]
rank_3 = rank_counts[3]

# NOTE(review): `model` is not used by any cell visible in this notebook.
model = LogisticRegression()
# Classes 1-3 keep their full size; class 4 is under-sampled down to the
# size of class 1. Fixed random_state for reproducibility.
rus = RandomUnderSampler(sampling_strategy={1:rank_1,2:rank_2,3:rank_3,4:rank_1},random_state=71)

# 着順 and rank carry the outcome and date is not a feature, so all three
# are excluded from the feature matrices.
X_train = train.drop(['着順','date','rank'],axis=1)
y_train = train['rank']
X_test = test.drop(['着順','date','rank'],axis=1)
y_test = test['rank']

# fit_resample replaces the old fit_sample alias, which has been removed
# from imbalanced-learn.
X_train_rus,y_train_rus = rus.fit_resample(X_train,y_train)

In [None]:
# Fit a random forest on the under-sampled training data.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train_rus,y_train_rus)

# Print the score on the full (not under-sampled) training set, then on the
# held-out test set.
print(clf.score(X_train,y_train),clf.score(X_test,y_test))

In [None]:
import pickle

# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('11_race_id_to_horse_info.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the classifier saved above and re-check its score on the test set.
# The context manager closes the file handle after loading.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
with open('11_race_id_to_horse_info.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)