In [181]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm
import re
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o
from itertools import combinations, permutations
import lxml
import urllib.request as req

In [182]:
class Results:
    @staticmethod
    def scrape(race_id_list):
        """
        レース結果データをスクレイピングする関数
        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト
        Returns:
        ----------
        race_results_df : pandas.DataFrame
            全レース結果データをまとめてDataFrame型にしたもの
        """
        #race_idをkeyにしてDataFrame型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id
                #メインとなるテーブルデータを取得
                df = pd.read_html(url)[0]
                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")
                #天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                texts = (
                    soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                    + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
                )
                info = re.findall(r'\w+', texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[-1])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)
                #馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list
                #インデックスをrace_idにする
                df.index = [race_id] * len(df)
                race_results[race_id] = df
            #存在しないrace_idを飛ばす
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            #wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            #Jupyterで停止ボタンを押した時の対処
            except:
                break
            #pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])
        return race_results_df

In [183]:
results = Results.scrape(race_id_list)

  0%|          | 0/20 [00:00<?, ?it/s]

In [186]:
results.head()

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師,course_len,weather,race_type,ground_state,date,horse_id,jockey_id
202105030211,1,7,11,ダノンキングリー,牡5,58,川田将雅,1:31.7,,47.6,8,456(+6),[東] 萩原清,1600,曇,芝,良,2021年6月6日,2016102179,1088
202105030211,2,4,5,グランアレグリア,牝5,56,ルメール,1:31.7,アタマ,1.5,1,502(+4),[東] 藤沢和雄,1600,曇,芝,良,2021年6月6日,2016104532,5339
202105030211,3,8,13,シュネルマイスター,牡3,54,横山武史,1:31.8,1/2,10.2,4,474(-6),[東] 手塚貴久,1600,曇,芝,良,2021年6月6日,2018110007,1170
202105030211,4,5,8,インディチャンプ,牡6,58,福永祐一,1:31.9,クビ,7.0,2,484(+6),[西] 音無秀孝,1600,曇,芝,良,2021年6月6日,2015104688,1014
202105030211,5,6,9,トーラスジェミニ,牡5,58,戸崎圭太,1:32.1,1.1/4,153.6,13,472(-4),[東] 小桧山悟,1600,曇,芝,良,2021年6月6日,2016106518,5386


In [96]:
urls = "https://db.netkeiba.com/?pid=race_list&word=%5E%B0%C2%C5%C4%B5%AD%C7%B0"

res = req.urlopen(urls)
soup = BeautifulSoup(res,"html.parser")

# tags = soup.find_all(title="安田記念(G1)")
race_id_list = []
for element in soup.find_all(title="安田記念(G1)"):
    urls = element.get("href")
    race_id_list.append(urls)

In [187]:
results.to_pickle('results.pickle')

In [189]:
#馬の過去成績データを処理するクラス
class HorseResults:
    @staticmethod
    def scrape(horse_id_list):
        """
        馬の過去成績データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            全馬の過去成績データをまとめてDataFrame型にしたもの
        """

        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df

In [190]:
horse_id_list = results['horse_id'].unique()
horse_results = HorseResults.scrape(horse_id_list)

  0%|          | 0/180 [00:00<?, ?it/s]

In [194]:
horse_results.to_pickle('horse_results.pickle')

In [192]:
#血統データを処理するクラス
class Peds:
    @staticmethod
    def scrape(horse_id_list):
        """
        血統データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        peds_df : pandas.DataFrame
            全血統データをまとめてDataFrame型にしたもの
        """

        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]

                #重複を削除して1列のSeries型データに直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #列名をpeds_0, ..., peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')

        return peds_df

In [193]:
peds_results = Peds.scrape(horse_id_list)

  0%|          | 0/180 [00:00<?, ?it/s]

In [195]:
peds_results.to_pickle("peds.pickle")

In [196]:
peds_results.shape

(180, 62)

In [197]:
#払い戻し表データを処理するクラス
class Return:
    @staticmethod
    def scrape(race_id_list):
        """
        払い戻し表データをスクレイピングする関数

        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト

        Returns:
        ----------
        return_tables_df : pandas.DataFrame
            全払い戻し表データをまとめてDataFrame型にしたもの
        """

        return_tables = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
                #そのため、改行コードを文字列brに変換して後でsplitする
                f = urlopen(url)
                html = f.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
        return return_tables_df

In [136]:
return_results = Return.scrape(race_id_list)

  0%|          | 0/20 [00:00<?, ?it/s]

In [129]:
# 数値部分のみ取得
race_id_list_tmp = []
for tmp_race_id in race_id_list:
    race_id_list_tmp.append(tmp_race_id[6:18])

In [137]:
return_results.to_pickle('return_tables.pickle')

In [138]:
def update_data(old, new):
    """
    Parameters:
    ----------
    old : pandas.DataFrame
        古いデータ
    new : pandas.DataFrame
        新しいデータ
    """

    filtered_old = old[~old.index.isin(new.index)]
    return pd.concat([filtered_old, new])

In [227]:
class DataProcessor:
    """    
    Attributes:
    ----------
    data : pd.DataFrame
        rawデータ
    data_p : pd.DataFrame
        preprocessing後のデータ
    data_h : pd.DataFrame
        merge_horse_results後のデータ
    data_pe : pd.DataFrame
        merge_peds後のデータ
    data_c : pd.DataFrame
        process_categorical後のデータ
    no_peds: Numpy.array
        merge_pedsを実行した時に、血統データが存在しなかった馬のhorse_id一覧
    """
    
    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        
    def merge_horse_results(self, hr, n_samples_list=[5, 9, 'all']):
        """
        馬の過去成績データから、
        n_samples_listで指定されたレース分の着順と賞金の平均を追加してdata_hに返す

        Parameters:
        ----------
        hr : HorseResults
            馬の過去成績データ
        n_samples_list : list, default [5, 9, 'all']
            過去何レース分追加するか
        """

        self.data_h = self.data_p.copy()
        for n_samples in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)
            
        #6/6追加： 馬の出走間隔追加
        self.data_h['interval'] = (self.data_h['date'] - self.data_h['latest']).dt.days
        self.data_h.drop(['開催', 'latest'], axis=1, inplace=True)
        
    def merge_peds(self, peds):
        """
        5世代分血統データを追加してdata_peに返す

        Parameters:
        ----------
        peds : Peds.peds_e
            Pedsクラスで加工された血統データ。
        """

        self.data_pe = \
            self.data_h.merge(peds, left_on='horse_id', right_index=True,
                                                             how='left')
        self.no_peds = self.data_pe[self.data_pe['peds_0'].isnull()]\
            ['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')
            
    def process_categorical(self, le_horse, le_jockey, results_m):
        """
        カテゴリ変数を処理してdata_cに返す

        Parameters:
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            horse_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        le_jockey : sklearn.preprocessing.LabelEncoder
            jockey_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        results_m : Results.data_pe
            ダミー変数化のとき、ResultsクラスとShutubaTableクラスで列を合わせるためのもの
        """

        df = self.data_pe.copy()
        
        #ラベルエンコーディング。horse_id, jockey_idを0始まりの整数に変換
        mask_horse = df['horse_id'].isin(le_horse.classes_)
        new_horse_id = df['horse_id'].mask(mask_horse).dropna().unique()
        le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
        df['horse_id'] = le_horse.transform(df['horse_id'])
        mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
        new_jockey_id = df['jockey_id'].mask(mask_jockey).dropna().unique()
        le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
        df['jockey_id'] = le_jockey.transform(df['jockey_id'])
        
        #horse_id, jockey_idをpandasのcategory型に変換
        df['horse_id'] = df['horse_id'].astype('category')
        df['jockey_id'] = df['jockey_id'].astype('category')
        
        #そのほかのカテゴリ変数をpandasのcategory型に変換してからダミー変数化
        #列を一定にするため
        weathers = results_m['weather'].unique()
        race_types = results_m['race_type'].unique()
        ground_states = results_m['ground_state'].unique()
        sexes = results_m['性'].unique()
        df['weather'] = pd.Categorical(df['weather'], weathers)
        df['race_type'] = pd.Categorical(df['race_type'], race_types)
        df['ground_state'] = pd.Categorical(df['ground_state'], ground_states)
        df['性'] = pd.Categorical(df['性'], sexes)
        df = pd.get_dummies(df, columns=['weather', 'race_type', 'ground_state', '性'])
        
        self.data_c = df

In [202]:
class Results(DataProcessor):
    def __init__(self, results):
        super(Results, self).__init__()
        self.data = results
        
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(race_id_list):
        """
        レース結果データをスクレイピングする関数

        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト

        Returns:
        ----------
        race_results_df : pandas.DataFrame
            全レース結果データをまとめてDataFrame型にしたもの
        """

        #race_idをkeyにしてDataFrame型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id
                #メインとなるテーブルデータを取得
                df = pd.read_html(url)[0]

                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")

                #天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                texts = (
                    soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                    + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
                )
                info = re.findall(r'\w+', texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[-1])] * len(df) #20211212：[0]→[-1]に修正
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)

                #馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                #インデックスをrace_idにする
                df.index = [race_id] * len(df)

                race_results[race_id] = df
            #存在しないrace_idを飛ばす
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            #wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            #Jupyterで停止ボタンを押した時の対処
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])

        return race_results_df
    
    #前処理    
    def preprocessing(self):
        df = self.data.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        df['rank'] = df['着順'].map(lambda x:1 if x<4 else 0)

        # 性齢を性と年齢に分ける
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df["体重"] = df["馬体重"].str.split("(", expand=True)[0]
        df["体重変化"] = df["馬体重"].str.split("(", expand=True)[1].str[:-1]
        
        #errors='coerce'で、"計不"など変換できない時に欠損値にする
        df['体重'] = pd.to_numeric(df['体重'], errors='coerce')
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')

        # 単勝をfloatに変換
        df["単勝"] = df["単勝"].astype(float)
        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100

        # 不要な列を削除
        df.drop(["タイム", "着差", "調教師", "性齢", "馬体重", '馬名', '騎手', '人気', '着順'],
                axis=1, inplace=True)

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")
        
        #開催場所
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        #6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())

        self.data_p = df
    
    #カテゴリ変数の処理
    def process_categorical(self):
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
        super().process_categorical(self.le_horse, self.le_jockey, self.data_pe)

In [228]:
class ShutubaTable(DataProcessor):
    def __init__(self, shutuba_tables):
        super(ShutubaTable, self).__init__()
        self.data = shutuba_tables
    
    @classmethod
    def scrape(cls, race_id_list, date):
        data = pd.DataFrame()
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            df = df.T.reset_index(level=0, drop=True).T

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            texts = re.findall(r'\w+', texts)
            for text in texts:
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[-1])] * len(df) #20211212：[0]→[-1]に修正
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良", "稍重", "重"]:
                    df["ground_state"] = [text] * len(df)
                if '不' in text:
                    df["ground_state"] = ['不良'] * len(df)
                # 2020/12/13追加
                if '稍' in text:
                    df["ground_state"] = ['稍重'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)

            # horse_id
            horse_id_list = []
            horse_td_list = soup.find_all("td", attrs={'class': 'HorseInfo'})
            for td in horse_td_list:
                horse_id = re.findall(r'\d+', td.find('a')['href'])[0]
                horse_id_list.append(horse_id)
            # jockey_id
            jockey_id_list = []
            jockey_td_list = soup.find_all("td", attrs={'class': 'Jockey'})
            for td in jockey_td_list:
                jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
                jockey_id_list.append(jockey_id)
            df['horse_id'] = horse_id_list
            df['jockey_id'] = jockey_id_list

            df.index = [race_id] * len(df)
            data = data.append(df)
        return cls(data)
             
    #前処理            
    def preprocessing(self):
        df = self.data.copy()
        
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df = df[df["馬体重(増減)"] != '--']
        df["体重"] = df["馬体重(増減)"].str.split("(", expand=True)[0].astype(int)
        df["体重変化"] = df["馬体重(増減)"].str.split("(", expand=True)[1].str[:-1]
        # 2020/12/13追加：増減が「前計不」などのとき欠損値にする
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')
        
        df["date"] = pd.to_datetime(df["date"])
        
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        #6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())

        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100

        # 使用する列を選択
        df = df[['枠', '馬番', '斤量', 'course_len', 'weather','race_type',
        'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢',
       '体重', '体重変化', '開催', 'n_horses']]
        
        self.data_p = df.rename(columns={'枠': '枠番'})

In [204]:
class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過', '開催', '距離']]
        self.preprocessing()
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        馬の過去成績データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            全馬の過去成績データをまとめてDataFrame型にしたもの
        """

        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df
    
    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)
        
        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)
        
        #1着の着差を0にする
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)
        
        #レース展開データ
        #n=1: 最初のコーナー位置, n=4: 最終コーナー位置
        def corner(x, n):
            if type(x) != str:
                return x
            elif n==4:
                return int(re.findall(r'\d+', x)[-1])
            elif n==1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))
        
        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']
        
        #開催場所
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        #race_type
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        #距離は10の位を切り捨てる
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)
        #インデックス名を与える
        df.index.name = 'horse_id'
        
        self.horse_results = df
        self.target_list = ['着順', '賞金', '着差', 'first_corner', 'final_corner',
                            'first_to_rank', 'first_to_final','final_to_rank']
    
    #n_samplesレース分馬ごとに平均する
    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.query('index in @horse_id_list')
        
        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
        
        #集計して辞書型に入れる
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list].mean()\
            .add_suffix('_{}R'.format(n_samples))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filtered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))

        #6/6追加: 馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            self.latest = filtered_df.groupby('horse_id')['date'].max().rename('latest')
    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')
        for column in ['course_len','race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column], 
                                        left_on=['horse_id', column],
                                        right_index=True, how='left')

        #6/6追加：馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            merged_df = merged_df.merge(self.latest, left_on='horse_id',
                             right_index=True, how='left')
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

#開催場所をidに変換するための辞書型
place_dict = {
    '札幌':'01',  '函館':'02',  '福島':'03',  '新潟':'04',  '東京':'05', 
    '中山':'06',  '中京':'07',  '京都':'08',  '阪神':'09',  '小倉':'10'
}

#レースタイプをレース結果データと整合させるための辞書型
race_type_dict = {
    '芝': '芝', 'ダ': 'ダート', '障': '障害'
}

In [205]:
# horse_idの距離が芝のみのデータがあると
horse_results[horse_results['距離'] == '芝'].index

Index(['2005190008'], dtype='object')

In [206]:
horse_results = horse_results.drop(['2005190008'])

In [207]:
hr = HorseResults(horse_results)

In [208]:
hr.horse_results.head()

Unnamed: 0_level_0,着順,賞金,着差,通過,開催,date,first_corner,final_corner,final_to_rank,first_to_rank,first_to_final,race_type,course_len
horse_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016102179,8,0.0,,,11,2021-12-12,,,,,,芝,16
2016102179,2,2726.0,0.0,12-5-5,5,2021-10-10,12.0,5.0,3.0,10.0,7.0,芝,18
2016102179,1,13298.2,0.0,8-8,5,2021-06-06,8.0,8.0,7.0,7.0,0.0,芝,16
2016102179,12,0.0,2.9,5-6-6,5,2020-11-01,5.0,6.0,-6.0,-7.0,-1.0,芝,20
2016102179,7,0.0,0.8,6-5,5,2020-06-07,6.0,5.0,-2.0,-1.0,1.0,芝,16


In [209]:
class Peds:
    def __init__(self, peds):
        self.peds = peds
        self.peds_e = pd.DataFrame() #after label encoding and transforming into category
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        血統データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        peds_df : pandas.DataFrame
            全血統データをまとめてDataFrame型にしたもの
        """

        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]

                #重複を削除して1列のSeries型データに直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #列名をpeds_0, ..., peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')

        return peds_df
    
    def encode(self):
        df = self.peds.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.peds_e = df.astype('category')

In [210]:
p = Peds.read_pickle(['peds.pickle'])
p.encode()

In [218]:
p.peds_e.head()

Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2016102179,58,132,34,49,66,9,5,39,2,8,...,14,10,16,25,1,45,63,42,31,95
2016104532,58,87,34,49,67,24,5,39,2,8,...,52,68,28,62,19,95,56,41,3,90
2018110007,15,84,17,43,64,62,3,28,49,23,...,11,51,18,5,70,48,36,8,47,105
2015104688,45,38,34,55,82,133,5,39,54,72,...,6,102,44,7,67,10,32,76,23,114
2016106518,31,43,19,0,116,120,10,20,28,1,...,39,100,25,37,54,1,88,102,21,5


In [212]:
r = Results.read_pickle(['results.pickle'])
r.preprocessing()
r.data_p.head() #jupyterで出力

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,rank,性,年齢,体重,体重変化,開催,n_horses
202105030211,7,11,58,47.6,16.0,曇,芝,良,2021-06-06,2016102179,1088,1,牡,5,456,6,5,14
202105030211,4,5,56,1.5,16.0,曇,芝,良,2021-06-06,2016104532,5339,1,牝,5,502,4,5,14
202105030211,8,13,54,10.2,16.0,曇,芝,良,2021-06-06,2018110007,1170,1,牡,3,474,-6,5,14
202105030211,5,8,58,7.0,16.0,曇,芝,良,2021-06-06,2015104688,1014,0,牡,6,484,6,5,14
202105030211,6,9,58,153.6,16.0,曇,芝,良,2021-06-06,2016106518,5386,0,牡,5,472,-4,5,14


In [213]:
r.merge_horse_results(hr, n_samples_list=[5, 9, 'all'])
r.data_h.head() #jupyterで出力

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,final_to_rank_race_type_allR,着順_開催_allR,賞金_開催_allR,着差_開催_allR,first_corner_開催_allR,final_corner_開催_allR,first_to_rank_開催_allR,first_to_final_開催_allR,final_to_rank_開催_allR,interval
202105030211,7,11,58,47.6,16.0,曇,芝,良,2021-06-06,2016102179,...,1.363636,4.0,3336.233333,0.616667,5.333333,5.166667,1.333333,0.166667,1.166667,217.0
202105030211,4,5,56,1.5,16.0,曇,芝,良,2021-06-06,2016104532,...,4.454545,1.0,7054.55,0.0,5.5,5.25,4.5,0.25,4.25,21.0
202105030211,8,13,54,10.2,16.0,曇,芝,良,2021-06-06,2018110007,...,3.0,1.0,10882.2,0.0,9.0,9.0,8.0,0.0,8.0,28.0
202105030211,5,8,58,7.0,16.0,曇,芝,良,2021-06-06,2015104688,...,3.631579,2.0,5086.8,0.225,5.5,5.0,3.5,0.5,3.0,70.0
202105030211,6,9,58,153.6,16.0,曇,芝,良,2021-06-06,2016106518,...,-3.62963,6.875,283.175,0.875,1.25,1.125,-5.625,0.125,-5.75,64.0


In [214]:
r.merge_peds(p.peds_e)
r.data_pe.head() #jupyterで出力

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
202105030211,7,11,58,47.6,16.0,曇,芝,良,2021-06-06,2016102179,...,14,10,16,25,1,45,63,42,31,95
202105030211,4,5,56,1.5,16.0,曇,芝,良,2021-06-06,2016104532,...,52,68,28,62,19,95,56,41,3,90
202105030211,8,13,54,10.2,16.0,曇,芝,良,2021-06-06,2018110007,...,11,51,18,5,70,48,36,8,47,105
202105030211,5,8,58,7.0,16.0,曇,芝,良,2021-06-06,2015104688,...,6,102,44,7,67,10,32,76,23,114
202105030211,6,9,58,153.6,16.0,曇,芝,良,2021-06-06,2016106518,...,39,100,25,37,54,1,88,102,21,5


In [215]:
r.process_categorical() #r.le_horse, r.le_jockeyに対応関係が保存される

In [217]:
r.data_c.head()

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,rank,年齢,...,weather_曇,weather_晴,weather_小雨,race_type_芝,ground_state_良,ground_state_稍重,ground_state_不良,性_牡,性_牝,性_セ
202105030211,7,11,58,47.6,16.0,2021-06-06,169,31,1,5,...,1,0,0,1,1,0,0,1,0,0
202105030211,4,5,56,1.5,16.0,2021-06-06,172,58,1,5,...,1,0,0,1,1,0,0,0,1,0
202105030211,8,13,54,10.2,16.0,2021-06-06,179,46,1,3,...,1,0,0,1,1,0,0,1,0,0
202105030211,5,8,58,7.0,16.0,2021-06-06,164,19,0,6,...,1,0,0,1,1,0,0,1,0,0
202105030211,6,9,58,153.6,16.0,2021-06-06,175,59,0,5,...,1,0,0,1,1,0,0,1,0,0


In [233]:
# 予想したいデータ (体重が発表されたのち、実行可能になる)
# 現時点では試しに21年のを取得(テストデータに入っているので、そこは排除)
race_id_list = ['202105030211']
st = ShutubaTable.scrape(race_id_list, '2021/06/06')

  0%|          | 0/1 [00:00<?, ?it/s]

In [234]:
#前処理
st.preprocessing()

#馬の過去成績データの追加。新馬はNaNが追加される
st.merge_horse_results(hr)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [236]:
st.no_peds

array([], dtype=object)

In [237]:
st.merge_peds(p.peds_e)

In [240]:
#5世代分の血統データの追加
st.merge_peds(p.peds_e)

#scrape peds at horse_id_list "no_peds"と表示された場合
if(len(st.no_peds) > 0):
    peds_new = Peds.scrape(st.no_peds)
    p.peds_e.to_pickle('peds_h.pickle') #pedsを更新する前にバックアップ
    peds = update_data(p.peds, peds_new)
    peds.to_pickle('peds.pickle')
    
p = Peds.read_pickle(['peds.pickle'])
p.encode()
st.merge_peds(p.peds_e)

In [241]:
st.process_categorical(r.le_horse, r.le_jockey, r.data_pe)

In [242]:
st.data_c

Unnamed: 0,枠番,馬番,斤量,course_len,date,horse_id,jockey_id,年齢,体重,体重変化,...,weather_曇,weather_晴,weather_小雨,race_type_芝,ground_state_良,ground_state_稍重,ground_state_不良,性_牡,性_牝,性_セ
202105030211,1,1,58,16.0,2021-06-06,178,40,4,536,-2,...,1,0,0,1,1,0,0,1,0,0
202105030211,2,2,58,16.0,2021-06-06,163,47,6,512,0,...,1,0,0,1,1,0,0,1,0,0
202105030211,3,3,58,16.0,2021-06-06,153,29,7,492,-2,...,1,0,0,1,1,0,0,0,0,1
202105030211,3,4,58,16.0,2021-06-06,176,49,5,526,-4,...,1,0,0,1,1,0,0,1,0,0
202105030211,4,5,56,16.0,2021-06-06,172,58,5,502,4,...,1,0,0,1,1,0,0,0,1,0
202105030211,4,6,58,16.0,2021-06-06,161,24,6,510,10,...,1,0,0,1,1,0,0,1,0,0
202105030211,5,7,58,16.0,2021-06-06,177,52,4,516,2,...,1,0,0,1,1,0,0,1,0,0
202105030211,5,8,58,16.0,2021-06-06,164,19,6,484,6,...,1,0,0,1,1,0,0,1,0,0
202105030211,6,9,58,16.0,2021-06-06,175,59,5,472,-4,...,1,0,0,1,1,0,0,1,0,0
202105030211,6,10,58,16.0,2021-06-06,148,5,7,480,-4,...,1,0,0,1,1,0,0,1,0,0


In [243]:
#時系列に沿って訓練データとテストデータに分ける関数
def split_data(df, test_size=0.3):
    sorted_id_list = df.sort_values("date").index.unique()
    train_id_list = sorted_id_list[: round(len(sorted_id_list) * (1 - test_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list) * (1 - test_size)) :]
    train = df.loc[train_id_list]
    test = df.loc[test_id_list]
    return train, test

In [244]:
train, test = split_data(r.data_c)

In [245]:
train, valid = split_data(train)

#説明変数と目的変数に分ける。dateはこの後不要なので省く。単勝オッズも学習時には使わない。
X_train = train.drop(['rank', 'date', '単勝'], axis=1)
y_train = train['rank']
X_valid = valid.drop(['rank', 'date', '単勝'], axis=1)
y_valid = valid['rank']

In [248]:
#データセットを作成
lgb_train = lgb_o.Dataset(X_train.values, y_train.values)
lgb_valid = lgb_o.Dataset(X_valid.values, y_valid.values)

params = {
    'objective': 'binary', #今回は0or1の二値予測なのでbinaryを指定
    'random_state': 100
}

#チューニング実行
lgb_clf_o = lgb_o.train(params, lgb_train,
                        valid_sets=(lgb_train, lgb_valid),
                        verbose_eval=100,
                        early_stopping_rounds=10,
                        optuna_seed=100 #optunaのseed固定
                        )

[32m[I 2022-06-03 02:12:39,112][0m A new study created in memory with name: no-name-e52b60cc-7d61-472d-953a-a5a7f4b9bb7d[0m
feature_fraction, val_score: 0.429655:   0%|          | 0/7 [00:00<?, ?it/s][32m[I 2022-06-03 02:12:39,142][0m Trial 0 finished with value: 0.4296554747111902 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.4296554747111902.[0m
feature_fraction, val_score: 0.429655:  14%|#4        | 1/7 [00:00<00:00, 20.93it/s][32m[I 2022-06-03 02:12:39,166][0m Trial 1 finished with value: 0.4306351859907726 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.4296554747111902.[0m
feature_fraction, val_score: 0.408874:  29%|##8       | 2/7 [00:00<00:00, 27.59it/s][32m[I 2022-06-03 02:12:39,190][0m Trial 2 finished with value: 0.4088736507061187 and parameters: {'feature_fraction': 0.7}. Best is trial 2 with value: 0.4088736507061187.[0m
feature_fraction, val_score: 0.408874:  43%|####2     | 3/7 [00:00<00:00, 

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[9]	valid_0's binary_logloss: 0.289984	valid_1's binary_logloss: 0.429655
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
E

num_leaves, val_score: 0.408874:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176


num_leaves, val_score: 0.408874:   0%|          | 0/20 [00:00<?, ?it/s][32m[I 2022-06-03 02:12:39,324][0m Trial 7 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 140}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:   5%|5         | 1/20 [00:00<00:00, 19.52it/s][32m[I 2022-06-03 02:12:39,347][0m Trial 8 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 72}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  10%|#         | 2/20 [00:00<00:00, 21.81it/s][32m[I 2022-06-03 02:12:39,388][0m Trial 9 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 110}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  20%|##        | 4/20 [00:00<00:00, 32.12it/s][32m[I 2022-06-03 02:12:39,420][0m Trial 10 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 217}. Best is trial 7 with value: 0.4088736507061187.

[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.245421	valid_1's binary_logloss: 0.408874
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.245421	valid_1's binary_logloss: 0.408874
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info

num_leaves, val_score: 0.408874:  30%|###       | 6/20 [00:00<00:00, 32.12it/s][32m[I 2022-06-03 02:12:39,488][0m Trial 13 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 173}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  35%|###5      | 7/20 [00:00<00:00, 32.12it/s]

Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.245421	valid_1's binary_logloss: 0.408874


num_leaves, val_score: 0.408874:  35%|###5      | 7/20 [00:00<00:00, 32.12it/s][32m[I 2022-06-03 02:12:39,519][0m Trial 14 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 212}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  45%|####5     | 9/20 [00:00<00:00, 37.26it/s][32m[I 2022-06-03 02:12:39,543][0m Trial 15 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 36}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  45%|####5     | 9/20 [00:00<00:00, 37.26it/s][32m[I 2022-06-03 02:12:39,569][0m Trial 16 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 148}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  50%|#####     | 10/20 [00:00<00:00, 37.26it/s][32m[I 2022-06-03 02:12:39,604][0m Trial 17 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 248}. Best is trial 7 with value: 0.408873

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.245421	valid_1's binary_logloss: 0.408874
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.408874:  65%|######5   | 13/20 [00:00<00:00, 35.78it/s][32m[I 2022-06-03 02:12:39,691][0m Trial 20 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 140}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  70%|#######   | 14/20 [00:00<00:00, 35.78it/s][32m[I 2022-06-03 02:12:39,720][0m Trial 21 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 71}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  75%|#######5  | 15/20 [00:00<00:00, 35.78it/s][32m[I 2022-06-03 02:12:39,750][0m Trial 22 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 166}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874:  85%|########5 | 17/20 [00:00<00:00, 34.90it/s][32m[I 2022-06-03 02:12:39,778][0m Trial 23 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 118}. Best is trial 7 with value: 0.408

Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.245421	valid_1's binary_logloss: 0.408874
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.245421	valid_1's binary_logloss: 0.408874
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[Ligh

num_leaves, val_score: 0.408874:  95%|#########5| 19/20 [00:00<00:00, 34.90it/s][32m[I 2022-06-03 02:12:39,871][0m Trial 26 finished with value: 0.4088736507061187 and parameters: {'num_leaves': 119}. Best is trial 7 with value: 0.4088736507061187.[0m
num_leaves, val_score: 0.408874: 100%|##########| 20/20 [00:00<00:00, 34.67it/s]


Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.245421	valid_1's binary_logloss: 0.408874


bagging, val_score: 0.408874:   0%|          | 0/10 [00:00<?, ?it/s][32m[I 2022-06-03 02:12:39,896][0m Trial 27 finished with value: 0.4491265134443783 and parameters: {'bagging_fraction': 0.7260429650751228, 'bagging_freq': 2}. Best is trial 27 with value: 0.4491265134443783.[0m
bagging, val_score: 0.408874:  10%|#         | 1/10 [00:00<00:00, 23.42it/s][32m[I 2022-06-03 02:12:39,918][0m Trial 28 finished with value: 0.45866569100589905 and parameters: {'bagging_fraction': 0.6547105544499044, 'bagging_freq': 6}. Best is trial 27 with value: 0.4491265134443783.[0m
bagging, val_score: 0.385607:  20%|##        | 2/10 [00:00<00:00, 28.63it/s][32m[I 2022-06-03 02:12:39,944][0m Trial 29 finished with value: 0.38560689218374256 and parameters: {'bagging_fraction': 0.4028313137145883, 'bagging_freq': 1}. Best is trial 29 with value: 0.38560689218374256.[0m
bagging, val_score: 0.385607:  30%|###       | 3/10 [00:00<00:00, 32.15it/s][32m[I 2022-06-03 02:12:39,969][0m Trial 30 finishe

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[6]	valid_0's binary_logloss: 0.361913	valid_1's binary_logloss: 0.449127
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
E

bagging, val_score: 0.385607:  70%|#######   | 7/10 [00:00<00:00, 44.19it/s][32m[I 2022-06-03 02:12:40,050][0m Trial 34 finished with value: 0.48550475943619015 and parameters: {'bagging_fraction': 0.531818495575215, 'bagging_freq': 7}. Best is trial 29 with value: 0.38560689218374256.[0m
bagging, val_score: 0.385607:  80%|########  | 8/10 [00:00<00:00, 44.19it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.419849	valid_1's binary_logloss: 0.485505
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.385607:  80%|########  | 8/10 [00:00<00:00, 44.19it/s][32m[I 2022-06-03 02:12:40,074][0m Trial 35 finished with value: 0.44383404225736495 and parameters: {'bagging_fraction': 0.8870098894544057, 'bagging_freq': 2}. Best is trial 29 with value: 0.38560689218374256.[0m
bagging, val_score: 0.385607: 100%|##########| 10/10 [00:00<00:00, 45.37it/s][32m[I 2022-06-03 02:12:40,097][0m Trial 36 finished with value: 0.4419225135462358 and parameters: {'bagging_fraction': 0.8897348492363203, 'bagging_freq': 2}. Best is trial 29 with value: 0.38560689218374256.[0m
bagging, val_score: 0.385607: 100%|##########| 10/10 [00:00<00:00, 44.82it/s]


Early stopping, best iteration is:
[8]	valid_0's binary_logloss: 0.318717	valid_1's binary_logloss: 0.443834
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[9]	valid_0's binary_logloss: 0.305249	valid_1's binary_logloss: 0.441923


feature_fraction_stage2, val_score: 0.385607:   0%|          | 0/6 [00:00<?, ?it/s][32m[I 2022-06-03 02:12:40,131][0m Trial 37 finished with value: 0.4294873231198179 and parameters: {'feature_fraction': 0.652}. Best is trial 37 with value: 0.4294873231198179.[0m
feature_fraction_stage2, val_score: 0.385607:  17%|#6        | 1/6 [00:00<00:00, 17.22it/s][32m[I 2022-06-03 02:12:40,160][0m Trial 38 finished with value: 0.38723113105707563 and parameters: {'feature_fraction': 0.748}. Best is trial 38 with value: 0.38723113105707563.[0m
feature_fraction_stage2, val_score: 0.385607:  33%|###3      | 2/6 [00:00<00:00, 21.91it/s][32m[I 2022-06-03 02:12:40,194][0m Trial 39 finished with value: 0.38560689218374256 and parameters: {'feature_fraction': 0.716}. Best is trial 39 with value: 0.38560689218374256.[0m
feature_fraction_stage2, val_score: 0.385607:  67%|######6   | 4/6 [00:00<00:00, 33.21it/s][32m[I 2022-06-03 02:12:40,222][0m Trial 40 finished with value: 0.38593174450192597 a

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[11]	valid_0's binary_logloss: 0.376383	valid_1's binary_logloss: 0.429487
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


feature_fraction_stage2, val_score: 0.385607:  67%|######6   | 4/6 [00:00<00:00, 33.21it/s][32m[I 2022-06-03 02:12:40,263][0m Trial 41 finished with value: 0.4014101552300085 and parameters: {'feature_fraction': 0.62}. Best is trial 39 with value: 0.38560689218374256.[0m
feature_fraction_stage2, val_score: 0.385607:  83%|########3 | 5/6 [00:00<00:00, 33.21it/s]

Early stopping, best iteration is:
[50]	valid_0's binary_logloss: 0.256976	valid_1's binary_logloss: 0.40141
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


feature_fraction_stage2, val_score: 0.385588:  83%|########3 | 5/6 [00:00<00:00, 33.21it/s][32m[I 2022-06-03 02:12:40,294][0m Trial 42 finished with value: 0.3855879725773789 and parameters: {'feature_fraction': 0.6839999999999999}. Best is trial 42 with value: 0.3855879725773789.[0m
feature_fraction_stage2, val_score: 0.385588: 100%|##########| 6/6 [00:00<00:00, 30.86it/s]


Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285594	valid_1's binary_logloss: 0.385588


regularization_factors, val_score: 0.385588:   0%|          | 0/20 [00:00<?, ?it/s][32m[I 2022-06-03 02:12:40,332][0m Trial 43 finished with value: 0.3856202196758068 and parameters: {'lambda_l1': 0.0007773998922821829, 'lambda_l2': 3.2012859298995277e-06}. Best is trial 43 with value: 0.3856202196758068.[0m
regularization_factors, val_score: 0.385588:   5%|5         | 1/20 [00:00<00:01, 18.33it/s][32m[I 2022-06-03 02:12:40,353][0m Trial 44 finished with value: 0.4321634610542389 and parameters: {'lambda_l1': 6.616957066014342e-05, 'lambda_l2': 0.400853048601546}. Best is trial 43 with value: 0.3856202196758068.[0m
regularization_factors, val_score: 0.385588:  10%|#         | 2/20 [00:00<00:00, 24.12it/s][32m[I 2022-06-03 02:12:40,381][0m Trial 45 finished with value: 0.38558797414840346 and parameters: {'lambda_l1': 1.1027313099672533e-08, 'lambda_l2': 1.242001404761155e-07}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.385588:  20

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.28563	valid_1's binary_logloss: 0.38562
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Ea

regularization_factors, val_score: 0.385588:  20%|##        | 4/20 [00:00<00:00, 38.61it/s][32m[I 2022-06-03 02:12:40,439][0m Trial 47 finished with value: 0.3856254010664742 and parameters: {'lambda_l1': 1.6996492507894156e-07, 'lambda_l2': 0.0014991323116035308}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.385588:  25%|##5       | 5/20 [00:00<00:00, 38.61it/s]

Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285649	valid_1's binary_logloss: 0.385625
[LightGBM] [Info] Number of positive: 24, number of negative: 120


regularization_factors, val_score: 0.385588:  25%|##5       | 5/20 [00:00<00:00, 38.61it/s][32m[I 2022-06-03 02:12:40,461][0m Trial 48 finished with value: 0.435712397666919 and parameters: {'lambda_l1': 1.0517138394360073, 'lambda_l2': 7.635176818135586e-07}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.385588:  30%|###       | 6/20 [00:00<00:00, 38.61it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[11]	valid_0's binary_logloss: 0.390035	valid_1's binary_logloss: 0.435712
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.385588:  30%|###       | 6/20 [00:00<00:00, 38.61it/s][32m[I 2022-06-03 02:12:40,490][0m Trial 49 finished with value: 0.38558797954673474 and parameters: {'lambda_l1': 4.655367559816141e-07, 'lambda_l2': 9.449134137745608e-08}. Best is trial 45 with value: 0.38558797414840346.[0m


Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285594	valid_1's binary_logloss: 0.385588


regularization_factors, val_score: 0.385588:  40%|####      | 8/20 [00:00<00:00, 35.23it/s][32m[I 2022-06-03 02:12:40,523][0m Trial 50 finished with value: 0.43290368572725435 and parameters: {'lambda_l1': 9.490245203532942e-07, 'lambda_l2': 6.421168438428032}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.385588:  40%|####      | 8/20 [00:00<00:00, 35.23it/s][32m[I 2022-06-03 02:12:40,556][0m Trial 51 finished with value: 0.40487127530789463 and parameters: {'lambda_l1': 0.2019055894080857, 'lambda_l2': 3.5275169933928286e-07}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.385588:  45%|####5     | 9/20 [00:00<00:00, 35.23it/s][32m[I 2022-06-03 02:12:40,585][0m Trial 52 finished with value: 0.40494687461896833 and parameters: {'lambda_l1': 0.22183125618514202, 'lambda_l2': 2.9286247167445133e-06}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.3855

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.351713	valid_1's binary_logloss: 0.432904
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.385588:  50%|#####     | 10/20 [00:00<00:00, 35.23it/s][32m[I 2022-06-03 02:12:40,620][0m Trial 53 finished with value: 0.38560603379839664 and parameters: {'lambda_l1': 1.0730217089799505e-08, 'lambda_l2': 0.00018582152608880508}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.385588:  55%|#####5    | 11/20 [00:00<00:00, 35.23it/s]

Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285611	valid_1's binary_logloss: 0.385606
[LightGBM] [Info] Number of positive: 24, number of negative: 120


regularization_factors, val_score: 0.385588:  60%|######    | 12/20 [00:00<00:00, 32.90it/s][32m[I 2022-06-03 02:12:40,655][0m Trial 54 finished with value: 0.38560423707650093 and parameters: {'lambda_l1': 3.3746414037369566e-06, 'lambda_l2': 2.0899032843310533e-08}. Best is trial 45 with value: 0.38558797414840346.[0m
regularization_factors, val_score: 0.385588:  60%|######    | 12/20 [00:00<00:00, 32.90it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285606	valid_1's binary_logloss: 0.385604
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.385588:  60%|######    | 12/20 [00:00<00:00, 32.90it/s][32m[I 2022-06-03 02:12:40,691][0m Trial 55 finished with value: 0.38558797297196257 and parameters: {'lambda_l1': 1.4762226478780532e-08, 'lambda_l2': 1.5458004122318363e-08}. Best is trial 55 with value: 0.38558797297196257.[0m
regularization_factors, val_score: 0.385588:  65%|######5   | 13/20 [00:00<00:00, 32.90it/s][32m[I 2022-06-03 02:12:40,726][0m Trial 56 finished with value: 0.38558797294553065 and parameters: {'lambda_l1': 1.48957750785298e-08, 'lambda_l2': 1.4673017387722897e-08}. Best is trial 56 with value: 0.38558797294553065.[0m
regularization_factors, val_score: 0.385588:  70%|#######   | 14/20 [00:00<00:00, 32.90it/s][32m[I 2022-06-03 02:12:40,762][0m Trial 57 finished with value: 0.3855882191468465 and parameters: {'lambda_l1': 2.018767681185217e-05, 'lambda_l2': 1.2817565414492942e-08}. Best is trial 56 with value: 0.38558797294553065.[0m
regularization_factors, val_s

Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285594	valid_1's binary_logloss: 0.385588
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285594	valid_1's binary_logloss: 0.385588
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[Ligh

regularization_factors, val_score: 0.385588:  80%|########  | 16/20 [00:00<00:00, 30.15it/s]

Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285608	valid_1's binary_logloss: 0.385605


[32m[I 2022-06-03 02:12:40,804][0m Trial 58 finished with value: 0.38560479753079907 and parameters: {'lambda_l1': 7.36106425148854e-08, 'lambda_l2': 6.071300770585118e-05}. Best is trial 56 with value: 0.38558797294553065.[0m
regularization_factors, val_score: 0.385588:  80%|########  | 16/20 [00:00<00:00, 30.15it/s]

[LightGBM] [Info] Number of positive: 24, number of negative: 120


regularization_factors, val_score: 0.385588:  80%|########  | 16/20 [00:00<00:00, 30.15it/s][32m[I 2022-06-03 02:12:40,845][0m Trial 59 finished with value: 0.38562535872222087 and parameters: {'lambda_l1': 0.0011925209334834297, 'lambda_l2': 1.4343283815030283e-05}. Best is trial 56 with value: 0.38558797294553065.[0m
regularization_factors, val_score: 0.385588:  85%|########5 | 17/20 [00:00<00:00, 30.15it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285642	valid_1's binary_logloss: 0.385625
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.385588:  85%|########5 | 17/20 [00:00<00:00, 30.15it/s][32m[I 2022-06-03 02:12:40,886][0m Trial 60 finished with value: 0.38564349841239065 and parameters: {'lambda_l1': 5.1650414338416995e-06, 'lambda_l2': 0.003347394226752911}. Best is trial 56 with value: 0.38558797294553065.[0m
regularization_factors, val_score: 0.385588:  90%|######### | 18/20 [00:00<00:00, 30.15it/s][32m[I 2022-06-03 02:12:40,924][0m Trial 61 finished with value: 0.3855879738089805 and parameters: {'lambda_l1': 7.504358535399918e-08, 'lambda_l2': 1.0099406079428058e-08}. Best is trial 56 with value: 0.38558797294553065.[0m
regularization_factors, val_score: 0.385588: 100%|##########| 20/20 [00:00<00:00, 28.30it/s][32m[I 2022-06-03 02:12:40,961][0m Trial 62 finished with value: 0.38578672096475447 and parameters: {'lambda_l1': 4.3030824008308316e-05, 'lambda_l2': 0.01796879758736836}. Best is trial 56 with value: 0.38558797294553065.[0m
regularization_factors, val_scor

Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.2857	valid_1's binary_logloss: 0.385643
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.285594	valid_1's binary_logloss: 0.385588
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightG

min_data_in_leaf, val_score: 0.385588:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.


min_data_in_leaf, val_score: 0.385588:   0%|          | 0/5 [00:00<?, ?it/s][32m[I 2022-06-03 02:12:40,987][0m Trial 63 finished with value: 0.4611660286391853 and parameters: {'min_child_samples': 25}. Best is trial 63 with value: 0.4611660286391853.[0m
min_data_in_leaf, val_score: 0.385588:  20%|##        | 1/5 [00:00<00:00, 26.67it/s][32m[I 2022-06-03 02:12:41,003][0m Trial 64 finished with value: 0.6690007658448855 and parameters: {'min_child_samples': 100}. Best is trial 63 with value: 0.4611660286391853.[0m
min_data_in_leaf, val_score: 0.385588:  40%|####      | 2/5 [00:00<00:00, 49.38it/s]

[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[7]	valid_0's binary_logloss: 0.39564	valid_1's binary_logloss: 0.461166
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.5757	valid_1's binary_logloss: 0.669001


min_data_in_leaf, val_score: 0.385588:  40%|####      | 2/5 [00:00<00:00, 32.48it/s][32m[I 2022-06-03 02:12:41,027][0m Trial 65 finished with value: 0.43593803145390336 and parameters: {'min_child_samples': 5}. Best is trial 65 with value: 0.43593803145390336.[0m
min_data_in_leaf, val_score: 0.385588:  60%|######    | 3/5 [00:00<00:00, 46.45it/s]

[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[11]	valid_0's binary_logloss: 0.274193	valid_1's binary_logloss: 0.435938
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438

min_data_in_leaf, val_score: 0.385588:  60%|######    | 3/5 [00:00<00:00, 37.96it/s][32m[I 2022-06-03 02:12:41,044][0m Trial 66 finished with value: 0.6690007658448855 and parameters: {'min_child_samples': 50}. Best is trial 65 with value: 0.43593803145390336.[0m
min_data_in_leaf, val_score: 0.385588:  80%|########  | 4/5 [00:00<00:00, 48.63it/s]


[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.5757	valid_1's binary_logloss: 0.669001
[LightGBM] [Info] Number of positive: 24, number of negative: 120
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5375
[LightGBM] [Info] Number of data points in the train set: 144, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438


min_data_in_leaf, val_score: 0.385588: 100%|##########| 5/5 [00:00<00:00, 49.18it/s][32m[I 2022-06-03 02:12:41,066][0m Trial 67 finished with value: 0.414635734617476 and parameters: {'min_child_samples': 10}. Best is trial 67 with value: 0.414635734617476.[0m
min_data_in_leaf, val_score: 0.385588: 100%|##########| 5/5 [00:00<00:00, 48.23it/s]


[LightGBM] [Info] Start training from score -1.609438
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[9]	valid_0's binary_logloss: 0.325681	valid_1's binary_logloss: 0.414636


In [252]:
params = {
    'objective': 'binary',
    'random_state': 100,
    'feature_pre_filter': False,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'num_leaves': 31,
    'feature_fraction': 0.6839999999999999,
    'bagging_fraction': 0.4028313137145883,
    'bagging_freq': 1,
    'min_child_samples': 20,
    'num_iterations': 1000,
}

In [254]:
train, test = split_data(r.data_c)

#説明変数と目的変数に分ける。dateはこの後不要なので省く。
X_train = train.drop(['rank', 'date', '単勝'], axis=1)
y_train = train['rank']
#2021/3/12追加： テストデータの単勝オッズはシミュレーション時に使用するので残しておく
X_test = test.drop(['rank', 'date'], axis=1)
y_test = test['rank']

lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X_train.values, y_train.values)



LGBMClassifier(bagging_fraction=0.4028313137145883, bagging_freq=1,
               feature_fraction=0.6839999999999999, feature_pre_filter=False,
               lambda_l1=0.0, lambda_l2=0.0, num_iterations=1000,
               objective='binary', random_state=100)

In [255]:
class Return:
    def __init__(self, return_tables):
        self.return_tables = return_tables
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(race_id_list):
        return_tables = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
                #そのため、改行コードを文字列brに変換して後でsplitする
                f = urlopen(url)
                html = f.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
        return return_tables_df
    
    @property
    def fukusho(self):
        fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
        wins = fukusho[1].str.split('br', expand=True)[[0,1,2]]
        
        wins.columns = ['win_0', 'win_1', 'win_2']
        returns = fukusho[2].str.split('br', expand=True)[[0,1,2]]
        returns.columns = ['return_0', 'return_1', 'return_2']
        
        df = pd.concat([wins, returns], axis=1)
        for column in df.columns:
            df[column] = df[column].str.replace(',', '')
        return df.fillna(0).astype(int)
    
    @property
    def tansho(self):
        tansho = self.return_tables[self.return_tables[0]=='単勝'][[1,2]]
        tansho.columns = ['win', 'return']
        
        for column in tansho.columns:
            tansho[column] = pd.to_numeric(tansho[column], errors='coerce')
            
        return tansho
    
    @property
    def umaren(self):
        umaren = self.return_tables[self.return_tables[0]=='馬連'][[1,2]]
        wins = umaren[1].str.split('-', expand=True)[[0,1]].add_prefix('win_')
        return_ = umaren[2].rename('return')  
        df = pd.concat([wins, return_], axis=1)        
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def umatan(self):
        umatan = self.return_tables[self.return_tables[0]=='馬単'][[1,2]]
        wins = umatan[1].str.split('→', expand=True)[[0,1]].add_prefix('win_')
        return_ = umatan[2].rename('return')  
        df = pd.concat([wins, return_], axis=1)        
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def wide(self):
        wide = self.return_tables[self.return_tables[0]=='ワイド'][[1,2]]
        wins = wide[1].str.split('br', expand=True)[[0,1,2]]
        wins = wins.stack().str.split('-', expand=True).add_prefix('win_')
        return_ = wide[2].str.split('br', expand=True)[[0,1,2]]
        return_ = return_.stack().rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x: pd.to_numeric(x.str.replace(',',''), errors='coerce'))
    
    @property
    def sanrentan(self):
        rentan = self.return_tables[self.return_tables[0]=='三連単'][[1,2]]
        wins = rentan[1].str.split('→', expand=True)[[0,1,2]].add_prefix('win_')
        return_ = rentan[2].rename('return')
        df = pd.concat([wins, return_], axis=1) 
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def sanrenpuku(self):
        renpuku = self.return_tables[self.return_tables[0]=='三連複'][[1,2]]
        wins = renpuku[1].str.split('-', expand=True)[[0,1,2]].add_prefix('win_')
        return_ = renpuku[2].rename('return')
        df = pd.concat([wins, return_], axis=1) 
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))

In [258]:
rt = Return(return_tables)
rt.fukusho #jupyterで表示

NameError: name 'return_tables' is not defined

In [262]:
rt = Return(return_tables_df)
rt.fukusho #jupyterで表示

NameError: name 'return_tables_df' is not defined

In [261]:
retern_tables

Unnamed: 0,0,1,2,3
202105030211,単勝,11,4760,8
202105030211,複勝,11br5br13,710br110br240,9br1br4
202105030211,枠連,4 - 7,720,4
202105030211,馬連,5 - 11,2950,9
202105030211,ワイド,5 - 11br11 - 13br5 - 13,"1,160br5,460br400",12br38br3
202105030211,馬単,11 → 5,12090,29
202105030211,三連複,5 - 11 - 13,8860,26
202105030211,三連単,11 → 5 → 13,110420,247
