In [None]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re
from selenium.webdriver import Chrome, ChromeOptions

In [None]:
# Race ids to predict; characters [4:6] of an id are used further down as the course id.
race_id_list = ['202004020211']
# Race date string; it is parsed further down with the '%Y年%m月%d日' format.
day = ["2020年7月26日"]

## 出馬表取得

In [None]:
class ShutubaTable:
    """Scrape netkeiba race-card (shutuba) pages into a single DataFrame.

    Every scraped ``HorseList`` element becomes one row whose index label is
    the race id and whose columns are the positional indices of the row's
    ``<td>`` cells.
    """

    # Locator strategy names passed to ``find_elements``. These are the string
    # values of selenium's ``By.CLASS_NAME`` / ``By.TAG_NAME``; the generic
    # ``find_elements(by, value)`` call replaces the ``find_elements_by_*``
    # helpers, which were removed in Selenium 4.
    _BY_CLASS_NAME = 'class name'
    _BY_TAG_NAME = 'tag name'

    def __init__(self):
        # Accumulated race-card rows; extended by each scrape_shutuba_table call.
        self.shutuba_table = pd.DataFrame()

    def scrape_shutuba_table(self, race_id_list):
        """Load each race-card page in Chrome and append its rows to ``shutuba_table``.

        Parameters
        ----------
        race_id_list : iterable of str
            Race ids; each one is appended to the shutuba.html URL.

        Notes
        -----
        The browser is always shut down, and rows collected before a failure
        are still stored, even when a page load raises. The exception itself
        propagates to the caller.
        """
        rows = []
        driver = Chrome()
        try:
            for race_id in tqdm(race_id_list):
                url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
                driver.get(url)
                for element in driver.find_elements(self._BY_CLASS_NAME, 'HorseList'):
                    cells = [td.text for td in element.find_elements(self._BY_TAG_NAME, 'td')]
                    rows.append(pd.Series(cells, name=race_id, dtype=object))
        finally:
            try:
                # quit() ends the whole browser session, not just the current window.
                driver.quit()
            finally:
                self._extend_table(rows)

    def _extend_table(self, rows):
        """Append the collected row Series to ``shutuba_table`` in one step."""
        if not rows:
            return
        # Building the frame once avoids the quadratic row-by-row
        # DataFrame.append (which no longer exists in pandas 2).
        scraped = pd.DataFrame(rows)
        if self.shutuba_table.empty:
            self.shutuba_table = scraped
        else:
            self.shutuba_table = pd.concat([self.shutuba_table, scraped])

In [None]:
# Scrape the race card of every id in race_id_list (this drives a Chrome browser).
st = ShutubaTable()
st.scrape_shutuba_table(race_id_list)
# Raw race-card rows, indexed by race id, with positional column labels.
race_plans_df = st.shutuba_table

## 出馬表整形

In [None]:
def preprocessing_race_plans_df(race_plans_df):
    """Turn the raw scraped race card into named, typed feature columns.

    Parameters
    ----------
    race_plans_df : pandas.DataFrame
        Raw race card with positional columns 0-12 whose index labels are the
        race ids (as produced by ``ShutubaTable``).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the positional columns
        renamed, sex/age and body weight split into separate columns, numeric
        columns cast, and ``course_id`` added. Columns 2, 11, 12, '性齢' and
        '馬体重' are dropped.

    Raises
    ------
    ValueError
        If a numeric field cannot be converted (e.g. a body weight that is
        not of the form ``'480(+4)'``).
    """
    df = race_plans_df.copy()
    df = df.rename(columns={0 :'枠番',1:'馬番',3:'馬名',4:'性齢',5:'斤量',6:'騎手',7:'厩舎',8:'馬体重',9:'単勝',10:'人気'})

    # '牡3' -> sex '牡', age 3
    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)

    # '480(+4)' -> weight 480, change +4; split once and reuse both halves.
    weight_parts = df['馬体重'].str.split('(', expand=True)
    df['体重'] = weight_parts[0].astype(int)
    weight_change = weight_parts[1].str[:-1]  # strip the trailing ')'
    # '前計不' in the change slot is treated as a change of 0.
    df['体重変化'] = weight_change.mask(weight_change == "前計不", '0').map(int)

    df['枠番'] = df['枠番'].astype(int)
    df['馬番'] = df['馬番'].astype(int)
    # Go through float first because the text looks like '54.0'; the fraction is truncated.
    df['斤量'] = df['斤量'].astype(float).astype(int)
    # The first two characters of the stable cell are kept as the affiliation.
    df['所属'] = df['厩舎'].map(lambda x: str(x)[:2])
    df['単勝'] = df['単勝'].astype(float)
    df['人気'] = df['人気'].astype(float)

    # Derive the course id from each row's own race id (the index label)
    # instead of the global race_id_list[0], so frames that hold several
    # races get the right course per row.
    df['course_id'] = [int(str(race_id)[4:6]) for race_id in df.index]

    return df.drop(columns=[2, 11, 12, '性齢', '馬体重'])

In [None]:
# Typed, renamed per-horse race card used as the base of the prediction features.
horse_table = preprocessing_race_plans_df(race_plans_df)

## 対象レース情報取得

In [None]:
def _classify_race_info(texts):
    """Map scraped text fragments onto the race-info fields they describe."""
    info_dict = {}
    for text in texts:
        if "芝" in text:
            info_dict["race_type"] = '芝'
        if "ダ" in text:
            info_dict["race_type"] = 'ダート'
        if "障" in text:
            info_dict["race_type"] = "障害"
        if "m" in text:
            # First run of digits in a fragment such as '芝1800m'.
            info_dict["course_len"] = int(re.findall(r"\d+", text)[0])
        if text in ["良", "稍","稍重", "重", "不良"]:
            info_dict["ground_state"] = text
        if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
            info_dict["weather"] = text
        if "年" in text:
            info_dict["date"] = text
        if "右" in text:
            info_dict["course_type"] = "right"
        if "左" in text:
            info_dict["course_type"] = "left"
        if "直線" in text:
            info_dict["course_type"] = "straight"
    return info_dict


def get_race_info(race_id_list):
    """Scrape race-level conditions for every race in ``race_id_list``.

    Parameters
    ----------
    race_id_list : iterable of str
        Race ids; each one is appended to the shutuba.html URL.

    Returns
    -------
    dict
        ``{race_id: info_dict}`` with one entry per requested race. An
        ``info_dict`` may hold 'race_type', 'course_len', 'ground_state',
        'weather', 'course_type' and 'date', depending on which fragments
        were recognised.

    Notes
    -----
    The date is not scraped: the strings of the module-level ``day`` list are
    appended to every race's fragments.
    """
    # Created once, outside the loop, and returned after it, so that every
    # race is included (previously only the first race was ever returned).
    race_infos = {}
    for race_id in race_id_list:
        url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
        html = requests.get(url)
        html.encoding = "EUC-JP"
        soup = BeautifulSoup(html.text, "html.parser")
        race_data = soup.find('div',attrs={'class':'RaceData01'})
        text_race_data = str(race_data)
        spans = race_data.find_all('span')

        # Weather sits between the 3-character '天候' label (plus separator)
        # and the weather-icon span.
        weather_text = text_race_data[text_race_data.find("天候")+3:text_race_data.find('<span class="Icon_Weather')]
        # Course direction is the text inside the first pair of parentheses.
        course_type_text = text_race_data[text_race_data.find("(")+1:text_race_data.find(")")]
        ground_type_text = spans[0].text
        # Ground state is whatever follows ':' in the third span.
        ground_state_span = spans[2].text
        ground_state_text = ground_state_span[ground_state_span.find(":")+1:]

        race_info = [ground_state_text, ground_type_text, weather_text, course_type_text] + day
        race_infos[race_id] = _classify_race_info(race_info)

    return race_infos
    

In [None]:
# Scrape the race-level info and turn the {race_id: info_dict} mapping into a frame indexed by race id.
race_infos = get_race_info(race_id_list)
df_infos = pd.DataFrame(race_infos.values(), index=race_infos.keys())
# Attach the race info to every horse row; both frames are joined on their index.
predict_add_race_info = horse_table.merge(df_infos,left_index=True,right_index=True,how='inner')     
# Parse the 'date' strings with the '%Y年%m月%d日' format.
predict_add_race_info['date'] = pd.to_datetime(predict_add_race_info['date'],format='%Y年%m月%d日')

## 馬idとジョッキーid追加

In [None]:
def add_horse_jockey_id(predict_add_race_info,race_id_list):
    """Add 'horse_id' and 'jockey_id' columns scraped from the race-card pages.

    Parameters
    ----------
    predict_add_race_info : pandas.DataFrame
        One row per horse. NOTE(review): the rows are assumed to be in the
        same order as the horses on the pages of ``race_id_list`` — confirm
        against the caller.
    race_id_list : iterable of str
        Race ids; each one is appended to the shutuba.html URL.

    Returns
    -------
    tuple
        ``(df, horse_id_list, jockey_id_list)`` where ``df`` is a copy of the
        input with the two id columns added, and the lists hold the ids of
        all requested races in page order.

    Raises
    ------
    ValueError
        If the number of scraped ids differs from the number of rows.
    """
    df = predict_add_race_info.copy()

    # Accumulate across all races. Resetting these per race left only the
    # last race's ids (and made an empty race_id_list fail on return).
    horse_id_list = []
    jockey_id_list = []

    for race_id in race_id_list:
        url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
        html = requests.get(url)
        html.encoding = "EUC-JP"
        soup = BeautifulSoup(html.text, "html.parser")

        # The horse id is taken as the last 10 characters of the horse link.
        for horse_soup in soup.find_all("td", attrs={"class": "HorseInfo"}):
            horse_id_list.append(horse_soup.find("a").get('href')[-10:])

        # The jockey id is taken as the 5 characters before the link's final character.
        for jockey_soup in soup.find_all("td", attrs={"class": "Jockey"}):
            jockey_id_list.append(jockey_soup.find("a").get('href')[-6:-1])

    df['horse_id'] = horse_id_list
    df['jockey_id'] = jockey_id_list

    return df,horse_id_list,jockey_id_list


In [None]:
# Add the scraped horse/jockey ids to the prediction frame and keep the id lists for later scraping.
predict_addinfo,horse_id_list,jockey_id_list = add_horse_jockey_id(predict_add_race_info,race_id_list)

## 馬の過去戦績取得

In [None]:
def scrape_horse_results(horse_id_list, pre_horse_id=None):
    """Scrape the past-results table of each horse from db.netkeiba.com.

    Parameters
    ----------
    horse_id_list : iterable of str
        Horse ids; each one is appended to the horse-page URL.
    pre_horse_id : container of str, optional
        Ids to skip. Defaults to skipping nothing.

    Returns
    -------
    dict
        ``{horse_id: DataFrame}`` for every horse scraped successfully.
        Horses whose page has too few tables are left out. On any other
        error, or on a keyboard interrupt, scraping stops and the results
        gathered so far are returned.
    """
    import traceback

    skip_ids = pre_horse_id if pre_horse_id is not None else ()
    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        if horse_id in skip_ids:
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            # Download and parse the page once; both candidate tables come
            # from the same list instead of a second request.
            tables = pd.read_html(url)
            df = tables[3]
            # When table 3 is the award-history table, the results table is the next one.
            if df.columns[0]=='受賞歴':
                df = tables[4]
            horse_results[horse_id] = df
            time.sleep(0.1)
        except IndexError:
            # The page does not have the expected tables: skip this horse.
            continue
        except KeyboardInterrupt:
            # NOTE(review): presumably meant to let a manual interrupt keep the
            # partial results; the original bare `except:` also swallowed SystemExit.
            break
        except Exception as e:
            traceback.print_exc()
            print(e)
            break
    return horse_results

In [None]:
# Scrape the past results of every horse in the race.
horse_results = scrape_horse_results(horse_id_list)
# Re-index each horse's table so that every row is labelled with its horse id.
for key in horse_results:
    horse_results[key].index = [key] * len(horse_results[key])
# Stack the per-horse tables into one frame.
df_horse_results = pd.concat([horse_results[key] for key in horse_results])

In [None]:
class HorseResults:
    """Per-horse aggregates of past finishing position ('着順') and prize money ('賞金').

    The input frame is expected to be indexed by horse id and to contain the
    columns '日付', '着順' and '賞金'.
    """

    def __init__(self, horse_results):
        # Only the three columns used by the aggregates are kept; after
        # preprocessing() this is a cleaned frame with a 'date' column.
        self.horse_results = horse_results[['日付', '着順', '賞金']]
        self.preprocessing()

    def preprocessing(self):
        """Clean the raw results: numeric positions, parsed dates, prize NaN -> 0."""
        df = self.horse_results.copy()

        # Drop rows whose finishing position is not a plain number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)

        # Fill missing prize money with 0. Assign the result back rather than
        # calling fillna(inplace=True) on the column selection, which does not
        # update the frame under pandas Copy-on-Write.
        df['賞金'] = df['賞金'].fillna(0)

        self.horse_results = df

    def _past_results(self, horse_id_list, date, n_samples):
        """Return the rows of the given horses dated before ``date``.

        With ``n_samples='all'`` every earlier race is returned; with a
        positive integer only each horse's ``n_samples`` most recent ones.
        Horses without any stored result are ignored instead of raising
        ``KeyError``.
        """
        known = self.horse_results.index.isin(horse_id_list)
        target_df = self.horse_results[known]
        past_df = target_df[target_df['date'] < date]

        if n_samples == 'all':
            return past_df
        if n_samples > 0:
            return past_df.sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        raise ValueError('n_samples must be >0')

    def average(self, horse_id_list, date, n_samples='all'):
        """Mean finishing position and prize money per horse before ``date``.

        Returns a frame indexed by horse id with columns '着順_{n}R' and
        '賞金_{n}R', where ``{n}`` is ``n_samples``.
        """
        filtered_df = self._past_results(horse_id_list, date, n_samples)
        average = filtered_df.groupby(level=0)[['着順', '賞金']].mean()
        return average.rename(columns={'着順':'着順_{}R'.format(n_samples), '賞金':'賞金_{}R'.format(n_samples)})

    def max_money(self, horse_id_list, date, n_samples='all'):
        """Highest prize money per horse before ``date``.

        Returns a frame indexed by horse id with the column '最高賞金_{n}R',
        where ``{n}`` is ``n_samples``.
        """
        filtered_df = self._past_results(horse_id_list, date, n_samples)
        max_money = filtered_df.groupby(level=0)[['賞金']].max()
        return max_money.rename(columns={'賞金':'最高賞金_{}R'.format(n_samples)})

    def merge(self, results, date, n_samples='all'):
        """Left-join the aggregates onto the rows of ``results`` dated ``date``.

        ``results`` needs 'date' and 'horse_id' columns. Horses without past
        results get NaN in the added columns.
        """
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        averages = self.average(horse_id_list, date, n_samples)
        best_prizes = self.max_money(horse_id_list, date, n_samples)
        merged_df = df.merge(averages, left_on='horse_id', right_index=True, how='left')
        return merged_df.merge(best_prizes, left_on='horse_id', right_index=True, how='left')

    def merge_all(self, results, n_samples='all'):
        """Apply ``merge`` for every distinct date in ``results`` and stack the parts."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

In [None]:
# Build the past-results helper and merge each horse's aggregates (over all earlier races) onto the prediction frame.
hr = HorseResults(df_horse_results)
predict_all = hr.merge_all(predict_addinfo, n_samples='all')

## 予想データ整形

In [None]:
def preprocessing_predict(predict_all):
    """Reduce the merged prediction frame to model features.

    Works on a copy of ``predict_all``: the name, stable, id and date columns
    are removed, the four affiliation labels are shortened to one character,
    and remaining missing values are filled with 0.
    """
    features = predict_all.copy()

    # Columns that are not used as features.
    for column in ('馬名', '騎手', '厩舎', 'horse_id', 'jockey_id', 'date'):
        features = features.drop([column], axis=1)

    # Whole-cell replacements, applied in the same order as before.
    for long_label, short_label in (('栗東', '西'), ('美浦', '東'), ('地方', '地'), ('海外', '海')):
        features = features.replace(long_label, short_label)

    return features.fillna(0)

In [None]:
# NOTE(review): this rebinds the name `preprocessing_predict` from the function to its result,
# so running this cell a second time fails until the defining cell is run again.
preprocessing_predict = preprocessing_predict(predict_all)
# Number of prediction rows; used further down to slice them back out of the combined frame.
horse_count = len(preprocessing_predict)

## 学習データ整形

In [None]:
# Load the previously pickled results frame from the working directory.
results_all = pd.read_pickle('results_all.pickle')

In [None]:
def preprocessing_results(results):
    """Reduce the stored results frame to model features.

    Works on a copy of ``results``: the finishing position, name, id and date
    columns are removed and remaining missing values are filled with 0.
    """
    features = results.copy()

    # Columns that are not used as features.
    for column in ('着順', '馬名', '騎手', 'horse_id', 'jockey_id', 'date'):
        features = features.drop([column], axis=1)

    return features.fillna(0)

# NOTE(review): this rebinds the name `preprocessing_results` from the function to its result,
# so running this cell a second time fails because the name no longer refers to a function at that point.
preprocessing_results = preprocessing_results(results_all)

In [None]:
# Stack the stored results on top of the prediction rows so both are encoded together below.
df_concat = pd.concat([preprocessing_results, preprocessing_predict])

In [None]:
# One-hot encode the combined frame.
pred_data = pd.get_dummies(df_concat)

In [None]:
# Display the encoded frame.
pred_data

In [None]:
## Extract the rows of the race being predicted (the last horse_count rows of the combined frame)
pred_data_new =  pred_data[len(pred_data)-horse_count:len(pred_data)]

## ランダムフォレストで予想

In [None]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import pickle

In [None]:
## Sometimes raises an error
## NOTE(review): presumably the prediction below fails when this dummy column is present, and the line is then uncommented — confirm.
#pred_data_new = pred_data_new.drop(['ground_state_稍'],axis=1)

In [None]:
# NOTE: pickle.load can execute arbitrary code, so only load model files from a trusted source.
# The context manager closes the file handle as soon as the model has been read.
with open('11_race_id_to_horse_info.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Predict for the rows of the target race and show the raw model output.
result = loaded_model.predict(pred_data_new)
print(result)