In [None]:
import pandas as pd
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import datetime

## Function that preprocesses the race-results data
def preprocessing(results):
    """Return a cleaned copy of a raw race-results table.

    The input frame is not modified.  On the copy this function:

    * converts 着順 (finishing position) to int, dropping rows where it is
      not numeric,
    * splits 性齢 into 性 (first character) and 年齢 (remaining characters,
      as int),
    * splits 馬体重 ("weight(change)") into 体重 and 体重変化, both int,
    * casts 単勝 to float,
    * drops the columns that are no longer needed,
    * parses ``date`` from text formatted as ``%Y年%m月%d日``.
    """
    cleaned = results.copy()

    # Rows whose finishing position is not a number become NaN and are removed.
    cleaned['着順'] = pd.to_numeric(cleaned['着順'], errors='coerce')
    cleaned.dropna(subset=['着順'], inplace=True)
    cleaned['着順'] = cleaned['着順'].astype(int)

    # 性齢: first character is the sex, the rest is the age.
    sex_age = cleaned['性齢'].map(str)
    cleaned['性'] = sex_age.map(lambda text: text[0])
    cleaned['年齢'] = sex_age.map(lambda text: text[1:]).astype(int)

    # 馬体重 looks like "480(+2)": the part before "(" is the weight, the part
    # after it (minus the closing parenthesis) is the change.
    weight_parts = cleaned['馬体重'].str.split('(', expand=True)
    cleaned['体重'] = weight_parts[0].astype(int)
    cleaned['体重変化'] = weight_parts[1].str[:-1].astype(int)

    cleaned['単勝'] = cleaned['単勝'].astype(float)

    unused_columns = ["タイム", "着差", "調教師", "性齢", "馬体重", "馬名", "騎手"]
    cleaned.drop(unused_columns, axis=1, inplace=True)

    cleaned['date'] = pd.to_datetime(cleaned['date'], format='%Y年%m月%d日')
    return cleaned

## Split the data into training data and test data
def split_data(df, test_size=0.3):
    """Split ``df`` chronologically into ``(train, test)``.

    The unique index labels are ordered by the ``date`` column; the earliest
    ``1 - test_size`` share of labels goes to the training set and the rest
    to the test set.  All rows sharing an index label stay together.  The
    ``date`` column is removed from both returned frames.
    """
    ordered_ids = df.sort_values('date').index.unique()
    cut = round(len(ordered_ids) * (1 - test_size))
    train_ids, test_ids = ordered_ids[:cut], ordered_ids[cut:]

    train = df.loc[train_ids].drop(columns=['date'])
    test = df.loc[test_ids].drop(columns=['date'])
    return train, test

## Categorical-encoding function

def categoraize(results, peds):
    """Encode the categorical columns of ``results`` and return a new frame.

    ``horse_id`` and every column named in ``peds.columns`` are label-encoded
    to integers starting at 0 and finally cast to the pandas ``category``
    dtype.  Everything else is passed through ``pd.get_dummies``.  The input
    frame is not modified.
    """
    encoded = results.copy()

    # horse_id and the pedigree columns are label-encoded, not dummy-encoded.
    id_like_columns = ['horse_id', *peds.columns]
    for column in id_like_columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    # Dummy-encode whatever get_dummies still treats as categorical.
    results_d = pd.get_dummies(encoded)

    for column in id_like_columns:
        results_d[column] = results_d[column].astype('category')
    return results_d

## Map the number of bets the simulation actually placed to the resulting return rate
def gain(return_func, X, n_samples=100, min_threshold=0.5):
    """Sweep the decision threshold and collect the return rate per bet count.

    ``return_func(X, threshold)`` must return ``(n_bets, money)``.  The
    threshold is stepped linearly from ``min_threshold`` towards 1 in
    ``n_samples`` steps.  Steps that place 50 bets or fewer are ignored.

    Returns a Series keyed by ``n_bets`` whose value is
    ``(n_bets*100 + money) / (n_bets*100)``; a later step with the same
    ``n_bets`` overwrites an earlier one.
    """
    rates = {}
    for step in tqdm(range(n_samples)):
        fraction = step / n_samples
        threshold = fraction + min_threshold * (1 - fraction)
        n_bets, money = return_func(X, threshold)
        if n_bets <= 50:
            continue
        stake = n_bets * 100
        rates[n_bets] = (stake + money) / stake
    return pd.Series(rates)

## Class that works with each horse's past race results
class HorseResults:
    """Past race results per horse, used to build rolling-average features.

    ``horse_results`` must be indexed by horse_id and contain the columns
    日付 (date), 着順 (finishing position) and 賞金 (prize money).  After
    construction ``self.horse_results`` holds the cleaned columns
    着順 (int), 賞金 (NaN replaced by 0) and ``date`` (datetime).
    """

    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金']]
        self.preprocessing()

    def preprocessing(self):
        """Clean ``self.horse_results`` and store the cleaned frame back."""
        df = self.horse_results.copy()

        # Drop rows whose finishing position is not numeric.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)

        # Missing prize money means no prize.  Assign the result back instead
        # of calling fillna(inplace=True) on the column selection: the
        # in-place form acts on a temporary object and is not guaranteed to
        # write through to ``df``.
        df['賞金'] = df['賞金'].fillna(0)

        self.horse_results = df

    def average(self, horse_id_list, date, n_samples='all'):
        """Average 着順 and 賞金 of the given horses over races before ``date``.

        Args:
            horse_id_list: horse ids to include.  Ids with no stored history
                are ignored, so they are simply absent from the result.
            date: only races strictly earlier than this date are used.
            n_samples: ``'all'`` for every earlier race, or a positive number
                to use only that many most recent earlier races per horse.

        Returns:
            DataFrame indexed by horse_id with the columns
            ``着順_{n_samples}R`` and ``賞金_{n_samples}R``.

        Raises:
            ValueError: if ``n_samples`` is a number that is not positive.
        """
        # Boolean membership instead of .loc[list]: .loc raises KeyError for
        # ids that have no history and repeats rows for repeated ids.
        known = self.horse_results.index.isin(horse_id_list)
        target_df = self.horse_results[known]
        past = target_df[target_df['date'] < date]

        if n_samples == 'all':
            filtered_df = past
        elif n_samples > 0:
            # Newest first, then keep the first n_samples rows of each horse.
            filtered_df = past.sort_values('date', ascending=False)\
                .groupby(level=0).head(n_samples)
        else:
            raise ValueError('n_samples must be >0')

        average = filtered_df.groupby(level=0)[['着順', '賞金']].mean()
        return average.rename(columns={
            '着順': '着順_{}R'.format(n_samples),
            '賞金': '賞金_{}R'.format(n_samples),
        })

    def merge(self, results, date, n_samples='all'):
        """Left-join the averages onto the rows of ``results`` held on ``date``."""
        df = results[results['date'] == date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(self.average(horse_id_list, date, n_samples),
                             left_on='horse_id', right_index=True, how='left')
        return merged_df

    def merge_all(self, results, n_samples='all'):
        """Apply :meth:`merge` for every distinct date in ``results`` and concatenate."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

## Class that works with the race payout (return) tables
class Odds:
    """Extract win (単勝) and place (複勝) payouts from scraped payout tables.

    ``return_tables`` must have exactly four columns, in the order
    bet type, horse number(s), payout(s), popularity.  Multiple values in one
    cell are expected to be separated by the literal text ``br``.
    """

    _COLUMNS = ["賭け方", "馬番", "オッズ", "人気"]

    def __init__(self, return_tables):
        # Work on a renamed copy so that (a) the caller's frame is not
        # modified and (b) tansho() does not depend on hukusho() having been
        # called first to name the columns.
        tables = return_tables.copy()
        tables.columns = list(self._COLUMNS)
        self.return_tables = tables

    @staticmethod
    def _split_places(column, name_format):
        """Split a ``br``-separated column into one column per placed horse.

        Thousands separators are removed first.  At least three columns are
        always returned (missing ones are NaN); they are named by formatting
        ``name_format`` with 1, 2, 3, ...
        """
        parts = column.str.replace(',', '').str.split('br', expand=True)
        width = max(3, parts.shape[1])
        parts = parts.reindex(columns=range(width))
        parts.columns = [name_format.format(i + 1) for i in range(width)]
        return parts

    def hukusho(self):
        """Return the place (複勝) payouts as an integer table.

        Columns are ``wins1..wins3`` (horse numbers) followed by
        ``wins1_odds..wins3_odds`` (payouts); more columns appear only if a
        row lists more than three horses.  Missing entries are 0.
        """
        place_rows = self.return_tables[self.return_tables["賭け方"] == "複勝"]
        self.hukusho_tables = place_rows.drop(columns=["人気", "賭け方"])

        num = self._split_places(self.hukusho_tables["馬番"], "wins{}")
        odds = self._split_places(self.hukusho_tables["オッズ"], "wins{}_odds")
        hukusho = pd.concat([num, odds], axis=1)
        return hukusho.fillna(0).astype(int)

    def tansho(self):
        """Return the win (単勝) payouts with numeric columns ``win`` and ``オッズ``.

        Values that cannot be parsed as a number become NaN.
        """
        win_rows = self.return_tables[self.return_tables["賭け方"] == "単勝"]
        tansho_tables = win_rows.drop(columns=["人気", "賭け方"])
        tansho_tables.columns = ['win', 'オッズ']
        for column in tansho_tables.columns:
            # Remove thousands separators first; otherwise a payout such as
            # "1,200" is coerced to NaN and silently lost.
            cleaned = tansho_tables[column].astype(str).str.replace(',', '')
            tansho_tables[column] = pd.to_numeric(cleaned, errors='coerce')
        return tansho_tables
    
## Class that wraps the prediction model
class Model:
    """A fitted classifier bundled with the payout tables used to back-test it.

    Attributes:
        model: classifier exposing ``predict_proba`` (and
            ``feature_importances_`` for :meth:`feature_importance`).
        hukusho: place (複勝) payouts, the result of ``Odds.hukusho()``.
        tansho: win (単勝) payouts, the result of ``Odds.tansho()``.
    """

    def __init__(self, model, return_tables):
        self.model = model
        # hukusho() is built before tansho(); keep this order, tansho() may
        # rely on the column names that hukusho() assigns.
        self.hukusho = Odds(return_tables).hukusho()
        self.tansho = Odds(return_tables).tansho()

    def predict_proba(self, X):
        """Return the model's probability for class 1 for every row of ``X``."""
        return self.model.predict_proba(X)[:, 1]

    def predict_value(self, X, threshold=0.5):
        """Return 1 where the probability is strictly above ``threshold``, else 0.

        The result keeps the dtype of the probability array.
        """
        proba = self.predict_proba(X)
        return (proba > threshold).astype(proba.dtype)

    def score(self, y_true, X):
        """Return the ROC AUC of the predicted probabilities against ``y_true``."""
        return roc_auc_score(y_true, self.predict_proba(X))

    def feature_importance(self, X, n_display=20):
        """Return the ``n_display`` most important features, most important first."""
        importances = pd.DataFrame({"features": X.columns,
                                    "importance": self.model.feature_importances_})
        return importances.sort_values("importance", ascending=False)[:n_display]

    def pred_table(self, X, threshold=0.5):
        """Return the 馬番 (horse number) of the rows predicted as 1.

        The result has the columns 馬番 and ``pred`` and keeps the index of ``X``.
        """
        pred_table = X[["馬番"]].copy()
        pred_table["pred"] = self.predict_value(X, threshold)
        return pred_table[pred_table.pred == 1]

    def simulate(self, X, threshold=0.5):
        """Back-test place (複勝) bets on every horse predicted as 1.

        Returns:
            ``(n_bets, money)`` where ``money`` is the sum of the payouts of
            the winning bets minus 100 per bet placed.
        """
        pred_table = self.pred_table(X, threshold)
        # Payouts are joined on the index (shared by X and the payout table).
        df = self.hukusho.merge(pred_table, left_index=True, right_index=True, how='right')
        n_bets = len(pred_table)
        money = -100 * n_bets
        for place in range(1, 4):
            hit = df['wins{}'.format(place)] == df['馬番']
            money += df[hit]['wins{}_odds'.format(place)].sum()
        return n_bets, money

    def tansho_simulate(self, X, threshold=0.5):
        """Back-test win (単勝) bets on every horse predicted as 1.

        Returns:
            ``(n_bets, money)`` where ``money`` is the sum of the payouts of
            the winning bets minus 100 per bet placed.
        """
        pred_table = self.pred_table(X, threshold)
        df = self.tansho.merge(pred_table, left_index=True, right_index=True, how='right')
        n_bets = len(pred_table)
        money = -100 * n_bets
        money += df[df['win'] == df['馬番']]['オッズ'].sum()
        return n_bets, money

In [None]:
import pandas as pd
import time
from tqdm.notebook import tqdm

def scrape_race_results(race_id_list, pre_race_results=None):
    """Scrape the result table of every race in ``race_id_list``.

    Args:
        race_id_list: iterable of race-id strings.
        pre_race_results: optional dict of already scraped tables keyed by
            race id.  It is copied, not modified; ids already present in it
            are not fetched again.

    Returns:
        dict mapping race id to the first table found on the race page.
        Scraping stops early, returning what was collected so far, when a
        request fails or the run is interrupted.
    """
    race_results = {} if pre_race_results is None else pre_race_results.copy()
    for race_id in tqdm(race_id_list):
        if race_id in race_results:
            continue
        try:
            # Wait one second before each request.
            time.sleep(1)
            url = "https://db.netkeiba.com/race/" + race_id
            race_results[race_id] = pd.read_html(url)[0]
        except IndexError:
            # Skip this race id and move on to the next one.
            continue
        except Exception as e:
            # Report the error and keep what has been scraped so far.
            print(e)
            break
        except BaseException:
            # Deliberately broad: a manual interrupt also returns the partial result.
            break
    return race_results

# Build the list of race ids: the fixed prefix "201908" followed by the
# zero-padded two-digit kai (1-5), day (1-12) and race number (1-12).
race_id_list = [
    "201908{:02d}{:02d}{:02d}".format(kai, day, r)
    for kai in range(1, 6)
    for day in range(1, 13)
    for r in range(1, 13)
]


In [None]:
test3 = scrape_race_results(race_id_list)

# Label every row of a race's table with that race's id.
for race_id, table in test3.items():
    table.index = [race_id] * len(table)

# Stack all races into one DataFrame.
results = pd.concat(list(test3.values()), sort=False)

In [None]:
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm
import re
import datetime
def scrape_race_info(race_id_list):
    """Scrape race-level information for every race in ``race_id_list``.

    The text of the first two ``<p>`` elements of the page's ``data_intro``
    block is split into word tokens, from which these keys are filled when
    found: ``race_type``, ``course_len``, ``ground_state``, ``weather`` and
    ``date``.

    Returns:
        dict mapping race id to a dict of the keys above.  Scraping stops
        early, returning what was collected so far, when an error occurs.

    NOTE(review): ``requests`` is not imported in the cells visible here;
    confirm it is imported before this function is called.
    """
    race_infos = {}
    for race_id in tqdm(race_id_list):
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            paragraphs = soup.find('div', attrs={'class': 'data_intro'}).find_all('p')
            texts = paragraphs[0].text + paragraphs[1].text
            info = re.findall(r'\w+', texts)
            info_dict = {}
            for text in info:
                if text in ['芝', 'ダート']:
                    info_dict['race_type'] = text
                if '障' in text:
                    info_dict['race_type'] = '障害'
                if 'm' in text:
                    # Only a token that also carries digits gives a course
                    # length; a token without digits no longer discards the
                    # whole race.
                    digits = re.findall(r'\d+', text)
                    if digits:
                        info_dict['course_len'] = int(digits[0])
                # The second entry was misspelled '慎重', so races run on
                # '稍重' ground never got a ground_state.
                if text in ['良', '稍重', '重', '不良']:
                    info_dict['ground_state'] = text
                if text in ['曇', '晴', '雨', '小雨', '小雪', '雪']:
                    info_dict['weather'] = text
                if '年' in text:
                    info_dict['date'] = text

            race_infos[race_id] = info_dict
            time.sleep(1)
        except IndexError:
            # Fewer than two paragraphs in data_intro: skip this race.
            continue
        except Exception as e:
            # Report the reason instead of stopping silently.
            print(e)
            break
        except BaseException:
            # Deliberately broad: a manual interrupt also returns the partial result.
            break
    return race_infos

In [None]:
# Scrape the race-level information for every race present in `results`.
race_id_list = results.index.unique()
race_infos = scrape_race_info(race_id_list)

# Turn the dict of dicts into a DataFrame with one row per race id.
race_infos = pd.DataFrame(race_infos).T

# Attach the race information to the results; the inner join keeps only the
# race ids present in both frames.
results_addinfo = pd.merge(
    results, race_infos, left_index=True, right_index=True, how="inner"
)


In [None]:
def scrape_race_add(race_id_list):
    """Scrape each race's result table and attach horse and jockey ids.

    Returns a dict mapping race id to the first table on the race page, with
    two extra columns, ``horse_id`` and ``jockey_id``, taken from the links
    inside the table whose ``summary`` attribute is "レース結果".  Scraping
    stops early, returning what was collected so far, when an error occurs.
    """
    scraped = {}
    for race_id in tqdm(race_id_list):
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            time.sleep(1)
            # Main results table.
            df = pd.read_html(url)[0]

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            # The first run of digits in each matching link is the id.
            result_table = soup.find("table", attrs={"summary": "レース結果"})
            horse_links = result_table.find_all(
                "a", attrs={"href": re.compile("^/horse")}
            )
            horse_ids = [re.findall(r"\d+", link["href"])[0] for link in horse_links]
            jockey_links = result_table.find_all(
                "a", attrs={"href": re.compile("^/jockey")}
            )
            jockey_ids = [re.findall(r"\d+", link["href"])[0] for link in jockey_links]

            df["horse_id"] = horse_ids
            df["jockey_id"] = jockey_ids

            scraped[race_id] = df
        except IndexError:
            # Skip race ids that do not exist.
            continue
        except Exception as e:
            # E.g. the connection dropped: report it and return what we have.
            print(e)
            break
        except BaseException:
            # Stop button pressed in Jupyter: return what we have.
            break

    return scraped

In [None]:
# Run the scraping.  scrape_race_add is used here (rather than
# scrape_race_results) because later cells read the horse_id column, which
# only scrape_race_add attaches to the result tables.
race_results = scrape_race_add(race_id_list)

# Use the race id as the index of every row of that race.
for race_id, table in race_results.items():
    table.index = [race_id] * len(table)

# Combine all races into a single DataFrame.
race_results_df = pd.concat(list(race_results.values()))

# Attach the race-level information; the left join keeps every result row.
results = race_results_df.merge(race_infos, left_index=True,
    right_index=True, how='left')


In [None]:
def scrape_horse_results(horse_id_list):
    """Scrape the past-results table of every horse in ``horse_id_list``.

    Returns a dict mapping horse id to a DataFrame.  The table at position 3
    of the horse page is used, unless its first column is named 受賞歴, in
    which case the table at position 4 is used instead.  Scraping stops
    early, returning what was collected so far, when an error occurs.
    """
    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            # Download and parse the page once; both candidate tables come
            # from the same parse instead of a second request.
            tables = pd.read_html(url)
            df = tables[3]
            if df.columns[0] == '受賞歴':
                df = tables[4]
            horse_results[horse_id] = df
            time.sleep(1)
        except IndexError:
            # The page has fewer tables than expected: skip this horse.
            continue
        except Exception as e:
            print(e)
            break
        except BaseException:
            # Deliberately broad: a manual interrupt also returns the partial result.
            break

    return horse_results

In [None]:
# Scrape the past results of every horse in horse_id_list.
# NOTE(review): horse_id_list is not assigned in any cell visible here;
# confirm it is defined before this cell runs.
horse_results=scrape_horse_results(horse_id_list)

In [None]:
# Use the horse id as the index of every row of that horse's table.
for horse_id, table in horse_results.items():
    table.index = [horse_id] * len(table)

# Combine all horses into a single DataFrame.
horse_results = pd.concat(list(horse_results.values()))


In [None]:
import pandas as pd
from urllib.request import urlopen

def scrape_return_tables(race_id_list, pre_return_tables=None):
    """Scrape the payout tables of every race in ``race_id_list``.

    Args:
        race_id_list: iterable of race-id strings.
        pre_return_tables: optional dict of already scraped tables.  When
            given, new tables are added to this same dict, which is also the
            return value.  When omitted, a fresh dict is created on every
            call (previously one shared default dict accumulated entries
            across calls).

    Returns:
        dict mapping race id to a DataFrame, indexed by the race id, holding
        the second and third tables of the race page stacked together.
        Scraping stops early, returning what was collected so far, when an
        error occurs.
    """
    return_tables = {} if pre_return_tables is None else pre_return_tables
    for race_id in tqdm(race_id_list):
        try:
            url = "https://db.netkeiba.com/race/" + race_id

            # Parsed as-is, multi-valued cells (e.g. place bets, wide bets)
            # run together.  Replace the line-break tag with the literal text
            # "br" so the values can be split on it later.
            with urlopen(url) as f:
                html = f.read()
            html = html.replace(b'<br />', b'br')
            dfs = pd.read_html(html)

            # dfs[1] and dfs[2] are the two payout tables of the page.
            df = pd.concat([dfs[1], dfs[2]])

            df.index = [race_id] * len(df)
            return_tables[race_id] = df
            time.sleep(1)
        except IndexError:
            # The page has fewer tables than expected: skip this race.
            continue
        except Exception as e:
            # Print the reason so the failure can be diagnosed, then stop.
            print(e)
            break
        except BaseException:
            # Deliberately broad: a manual interrupt also returns the partial result.
            break
    return return_tables

In [None]:
# Scrape the payout tables for the races in race_id_list
# (the argument was misspelled `rece_id_list`, an undefined name).
return_tables = scrape_return_tables(race_id_list)

In [None]:
# Use the race id as the index of every row of that race's payout table.
for race_id, table in return_tables.items():
    table.index = [race_id] * len(table)

In [None]:
# Stack the per-race payout tables into a single DataFrame.
return_tables = pd.concat(list(return_tables.values()))

In [None]:
def scrape_peds(horse_id_list, pre_peds=None):
    """Scrape the pedigree table of every horse in ``horse_id_list``.

    Args:
        horse_id_list: iterable of horse-id strings.
        pre_peds: optional dict of already scraped pedigrees.  When given,
            new entries are added to this same dict, which is also the
            return value.  When omitted, a fresh dict is created on every
            call (previously one shared default dict accumulated entries
            across calls).

    Returns:
        dict mapping horse id to a Series named after the horse id, with a
        fresh 0-based index, holding the pedigree entries.  Scraping stops
        early, returning what was collected so far, when an error occurs.
    """
    peds = {} if pre_peds is None else pre_peds
    for horse_id in tqdm(horse_id_list):
        try:
            url = "https://db.netkeiba.com/horse/ped/" + horse_id
            df = pd.read_html(url)[0]

            # Peel the columns off from the last (4) to the first (0),
            # dropping duplicated rows after each removal, then chain the
            # columns into one Series in column order 0..4.
            generations = {}
            for i in reversed(range(5)):
                generations[i] = df[i]
                df.drop([i], axis=1, inplace=True)
                df = df.drop_duplicates()
            ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

            peds[horse_id] = ped.reset_index(drop=True)
            time.sleep(1)
        except IndexError:
            # The page has no table: skip this horse.
            continue
        except Exception as e:
            print(e)
            break
        except BaseException:
            # Deliberately broad: a manual interrupt also returns the partial result.
            break
    return peds

In [None]:
# Scrape the pedigree of every horse in horse_id_list.
# NOTE(review): horse_id_list is not assigned in any cell visible here;
# confirm it is defined before this cell runs.
peds=scrape_peds(horse_id_list)

In [None]:
# Use the horse id as the index of every entry of that horse's pedigree.
for horse_id, ped in peds.items():
    ped.index = [horse_id] * len(ped)

In [None]:
# Stack the per-horse pedigrees on top of each other.
peds = pd.concat(list(peds.values()))

In [None]:
# Reshape the stacked pedigree data into one row per horse and one column
# (0..61) per pedigree entry.
# NOTE(review): this assumes `peds` is a DataFrame whose column 0 holds the
# stacked entries indexed by horse_id, 62 per horse — confirm against the
# cell that produces it.
a = peds[0].copy()

peds = pd.DataFrame(index=horse_id_list)
for e in tqdm(horse_id_list):
    # All entries of this horse, looked up once per horse.
    ancestors = a.loc[e]
    for i in range(62):
        # Write into `peds` (the frame created above; the original wrote to
        # the undefined name `peds3`) and pick the i-th entry by position.
        peds.loc[e, i] = ancestors.iloc[i]

In [None]:
# Prefix every pedigree column name with "peds_".
peds=peds.add_prefix('peds_')

## Start of prediction

In [None]:
# Clean the scraped race results.
results_p=preprocessing(results)

In [None]:
hr = HorseResults(horse_results)

# Add the per-horse averages over the last 5 races, the last 9 races and all
# earlier races, one merge after the other.
results_m = results_p
for n_past_races in (5, 9, 'all'):
    results_m = hr.merge_all(results_m, n_samples=n_past_races)

In [None]:
# Attach each horse's pedigree columns; the left join keeps every result row.
results_m=results_m.merge(peds,left_on='horse_id', right_index=True,how='left')

In [None]:
# Encode horse_id, the pedigree columns and the remaining categorical columns.
results_d=categoraize(results_m,peds)

In [None]:
# Target: 1 when the finishing position is better than 4th, otherwise 0.
results_d['rank'] = (results_d['着順'] < 4).astype(int)

# The finishing position itself must not remain among the features.
results_r = results_d.drop(columns=['着順'])

In [None]:
import lightgbm as lgb

train, test = split_data(results_r)

# Features are everything except the target, 単勝 and 人気.
non_feature_columns = ['rank', '単勝', '人気']
X_train = train.drop(columns=non_feature_columns)
y_train = train['rank']
X_test = test.drop(columns=non_feature_columns)
y_test = test['rank']

params = dict(
    num_leaves=4,
    n_estimators=80,
    class_weight="balanced",
    random_state=100,
)
lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X_train.values, y_train.values)

In [None]:
# Wrap the fitted classifier with the payout tables, then sweep the decision
# threshold for win bets (tansho_simulate) and place bets (simulate).
me=Model(lgb_clf,return_tables)
tansho=gain(me.tansho_simulate,X_test)
hukusho=gain(me.simulate,X_test)

In [None]:
# Plot both return-rate curves with a legend.
tansho.rename('tansho').plot(legend=True)
hukusho.rename('hukusho').plot(legend=True)

In [None]:
# Show the most important features of the fitted model.
me.feature_importance(X_test)

In [None]:
# ROC AUC of the model on the test set.
me.score(y_test,X_test)