In [1]:
import pandas as pd
import pickle

import scraping
import create_rawdf
import create_prediction_population
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Data directories, expressed relative to the notebook's working directory.
DATA_DIR = Path("..") / "data"
RAWDF_DIR = DATA_DIR.joinpath("rawdf")


# Load IPython's autoreload extension so the %autoreload magic is available
# for picking up edits to the imported project modules.
%load_ext autoreload

In [21]:
# Run this cell when you want changes made to the modules to take effect
# (a bare %autoreload triggers a one-off reload).
%autoreload

In [None]:
def create_return_tables(
    html_path_list: list[Path],
    save_dir: Path = RAWDF_DIR,
    save_filename: str = "return_tables.csv",
) -> pd.DataFrame:
    """
    Build the payout ("return") table from saved race-page HTML files.

    Each file is parsed with ``pd.read_html``; the tables at index 1 and 2
    are stacked and a ``race_id`` column (the file name without its
    extension) is inserted as the first column. Files that do not contain
    those tables are reported on stdout and skipped. The combined frame is
    handed to ``update_rawdf`` and then returned.

    Parameters
    ----------
    html_path_list : list[Path]
        Paths of the saved race pages; ``Path.stem`` is used as the race_id.
    save_dir : Path
        Directory passed to ``update_rawdf``; created if it does not exist.
    save_filename : str
        File name passed to ``update_rawdf``.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-race payout tables.

    Raises
    ------
    ValueError
        From ``pd.concat`` when no file yielded a table (this includes an
        empty ``html_path_list``).
    """
    dfs = {}
    for html_path in tqdm(html_path_list):
        # Take the race_id from the file name *before* parsing, so the skip
        # message below always names the file being processed (previously it
        # printed the preceding file's id, or hit a NameError on the first
        # file).
        race_id = html_path.stem
        with open(html_path, "rb") as f:
            html = f.read()
        try:
            df_list = pd.read_html(html)
            df = pd.concat([df_list[1], df_list[2]])
        except IndexError:
            # Fewer than three tables on the page: nothing to extract.
            print(f"table not found at {race_id}")
            continue
        # Insert race_id as the first column.
        df.insert(0, "race_id", race_id)
        dfs[race_id] = df
    concat_df = pd.concat(dfs.values())
    save_dir.mkdir(exist_ok=True, parents=True)
    # NOTE(review): update_rawdf is not defined or imported in the visible
    # part of this notebook -- presumably it is the helper from the
    # create_rawdf module; confirm it is in scope before running this cell.
    update_rawdf(
        concat_df,
        key="race_id",
        save_filename=save_filename,
        save_dir=save_dir,
    )
    return concat_df

In [4]:
# Load the pickled race_id list (presumably written by an earlier scraping run).
# NOTE(review): unpickling can execute arbitrary code -- only load files this
# project produced itself.
with Path("race_id_list.pickle").open("rb") as f:
    race_id_list = pickle.load(f)
    

In [5]:
# If processing was interrupted partway (or similar), build the HTML file
# paths directly from the race ids instead of re-scraping.
html_paths_race = list(
    scraping.HTML_RACE_DIR / f"{race_id}.bin"
    for race_id in race_id_list
)

In [22]:
# Build the payout table from the saved race pages.
# Note: this calls the create_rawdf module's create_return_tables, not the
# notebook-local definition of the same name.
return_tables = create_rawdf.create_return_tables(html_path_list=html_paths_race)

  0%|          | 0/6912 [00:00<?, ?it/s]

In [23]:
# Display the payout table as this cell's output.
return_tables

Unnamed: 0,race_id,0,1,2,3
0,202206010101,単勝,15,680,4
1,202206010101,複勝,15 10 4,"210 1,600 170",3 13 1
2,202206010101,枠連,5 - 8,2680,13
3,202206010101,馬連,10 - 15,31040,61
4,202206010101,ワイド,10 - 15 4 - 15 4 - 10,"6,890 660 5,640",58 4 50
...,...,...,...,...,...
54925,202309050912,馬連,2 - 12,9930,28
54926,202309050912,ワイド,2 - 12 2 - 5 5 - 12,"2,590 1,380 460",27 15 2
54927,202309050912,馬単,2 → 12,29610,69
54928,202309050912,三連複,2 - 5 - 12,7370,21


In [25]:
# Scrape the race ids for the 20240601 date.
# NOTE(review): this rebinds race_id_list, replacing the list loaded from
# race_id_list.pickle; re-running the html_paths_race cell after this one
# would silently use this date's ids instead.
race_id_list = scraping.scrape_race_id_list(["20240601"])

  0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# Build the prediction population for kaisai_date 20240601; per the logged
# output, create() scrapes the race_id list and then the horse_id list itself.
prediction_population = create_prediction_population.create(kaisai_date="20240601")

scraping race_id_list...


  0%|          | 0/1 [00:00<?, ?it/s]

scraping horse_id_list...


  0%|          | 0/24 [00:00<?, ?it/s]

In [29]:
# Display the prediction population (date / race_id / horse_id rows).
prediction_population

Unnamed: 0,date,race_id,horse_id
0,2024-06-01,202405030101,2019100108
1,2024-06-01,202405030101,2019104899
2,2024-06-01,202405030101,2016103092
3,2024-06-01,202405030101,2020102800
4,2024-06-01,202405030101,2019105143
...,...,...,...
9,2024-06-01,202408040112,2019104288
10,2024-06-01,202408040112,2018104208
11,2024-06-01,202408040112,2020104845
12,2024-06-01,202408040112,2021100161


In [30]:
# Fetch the horse pages for every distinct horse in the prediction population.
html_path_horse = scraping.scrape_html_horse(
    horse_id_list=prediction_population["horse_id"].unique(),
    # presumably skip=False forces a re-download even when the HTML is already
    # saved -- confirm against scraping.scrape_html_horse
    skip=False,
)

  0%|          | 0/328 [00:00<?, ?it/s]

In [31]:
# Sanity check: number of horse HTML paths returned by the scrape.
len(html_path_horse)

328

In [32]:
# NOTE(review): create_rawdf is already imported in the first cell; this
# repeated import is redundant and would be better kept with the other imports.
import create_rawdf

# Parse the scraped horse pages into the results table, saved under a
# prediction-specific file name; the returned frame is shown as cell output.
create_rawdf.create_horse_results(
    html_path_list=html_path_horse,
    save_filename="horse_results_prediction.csv",
)

  0%|          | 0/328 [00:00<?, ?it/s]

Unnamed: 0,horse_id,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,...,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
0,2019100108,2024/08/18,2中京4,晴,1.0,障害3歳以上未勝利,,12,2.0,2,...,-0.4,,2-2-3-2,103.1-38.9,13.1,504(+4),,,(トーアモルペウス),790.0
1,2019100108,2024/08/03,2新潟3,晴,6.0,柳都S(3勝クラス),,15,6.0,10,...,0.4,**,8-7-9-10,36.2-38.1,37.7,500(0),,,ゴールドバランサー,184.0
2,2019100108,2024/06/01,3東京1,晴,1.0,障害3歳以上未勝利,,10,1.0,1,...,0.2,,4-4-4-4,105.1-38.7,13.7,500(-6),,,オメガリッチマン,320.0
3,2019100108,2024/05/12,1新潟6,曇,4.0,障害4歳以上未勝利,,11,6.0,7,...,0.3,,6-5-4-4,105.0-38.2,13.1,506(-6),,,ラストドラフト,200.0
4,2019100108,2024/02/24,2小倉5,晴,4.0,障害4歳以上未勝利,,10,8.0,9,...,2.9,,8-8-7-7,105.0-39.0,13.7,512(+6),,,イフティファール,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,2020101230,2023/05/28,2東京12,晴,5.0,3歳1勝クラス,,16,4.0,7,...,0.8,**,1-1,34.5-37.5,38.3,542(-18),,,ボールドゾーン,
13,2020101230,2023/02/19,1東京8,曇,3.0,3歳1勝クラス,,16,3.0,6,...,2.2,**,4-4,35.5-36.3,38.1,560(+8),,,スマートフォルス,
14,2020101230,2022/11/26,5東京7,曇,9.0,カトレアS(OP),,10,6.0,6,...,0.8,**,2-2,34.9-37.2,37.8,552(-8),,,コンティノアール,
15,2020101230,2022/11/05,5東京1,曇,2.0,2歳未勝利,,14,6.0,10,...,-0.4,**,2-2,35.6-37.9,37.6,560(+10),,,(プラチナジュビリー),520.0
