In [1]:

import json
import re
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from tqdm.notebook import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from io import StringIO 
import create_population
import condition_prediction
# commonディレクトリのパス
COMMON_DATA_DIR = Path("..", "..", "common", "data")
POPULATION_DIR = COMMON_DATA_DIR / "prediction_population"
MAPPING_DIR = COMMON_DATA_DIR / "mapping"


DATA_DIR = Path("..", "data")
OLD_POPULATION_DIR = DATA_DIR / "00_population"
INPUT_DIR = DATA_DIR / "01_preprocessed"
OUTPUT_DIR = DATA_DIR / "02_features"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# カテゴリ変数を数値に変換するためのマッピング
with open(MAPPING_DIR / "sex.json", "r") as f:
    sex_mapping = json.load(f)
with open(MAPPING_DIR / "race_type.json", "r") as f:
    race_type_mapping = json.load(f)
with open(MAPPING_DIR / "around.json", "r") as f:
    around_mapping = json.load(f)
with open(MAPPING_DIR / "weather.json", "r") as f:
    weather_mapping = json.load(f)
with open(MAPPING_DIR / "ground_state.json", "r") as f:
    ground_state_mapping = json.load(f)
with open(MAPPING_DIR / "race_class.json", "r") as f:
    race_class_mapping = json.load(f)
%load_ext autoreload


In [2]:
%autoreload

In [3]:
population = create_population.create(from_="2023-01-01", to_="2023-12-31")


In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.reset_option('display.precision')
pd.reset_option('display.float_format')

In [5]:
population_dir  = POPULATION_DIR
population_filename  = "population.csv"

old_population_dir  = OLD_POPULATION_DIR
old_population_filename  = "population.csv"
old_results_filename  = "results.csv"
old_race_info_filename  = "race_info.csv"
old_horse_results_filename  = "horse_results.csv"

input_dir  = INPUT_DIR

horse_results_filename  = "horse_results_prediction.csv"
jockey_leading_filename  = "jockey_leading.csv"
trainer_leading_filename  = "trainer_leading.csv"
peds_filename  = "peds_prediction.csv"
sire_leading_filename  = "sire_leading.csv"
output_dir  = OUTPUT_DIR
output_filename  = "features_prediction.csv"


old_results_condition_filename  = "results_prediction.csv"
old_race_info_condition_filename  = "race_info_prediction.csv"
bms_leading_filename  = "bms_leading.csv"

population = pd.read_csv(population_dir / population_filename, sep="\t")
old_results = pd.read_csv(input_dir / old_results_filename, sep="\t")
old_race_info = pd.read_csv(input_dir / old_race_info_filename, sep="\t")
old_population = pd.read_csv(old_population_dir / old_population_filename, sep="\t")
old_horse_results = pd.read_csv(input_dir / old_horse_results_filename, sep="\t")

old_results_condition = pd.read_csv(input_dir / old_results_condition_filename, sep="\t")
old_race_info_condition = pd.read_csv(input_dir / old_race_info_condition_filename, sep="\t")


horse_results = pd.read_csv(input_dir / horse_results_filename, sep="\t")
jockey_leading = pd.read_csv(input_dir / jockey_leading_filename, sep="\t")
trainer_leading = pd.read_csv(
    input_dir / trainer_leading_filename, sep="\t"
)
peds = pd.read_csv(input_dir / peds_filename, sep="\t")
sire_leading = pd.read_csv(input_dir / sire_leading_filename, sep="\t")
output_dir = output_dir
output_filename = output_filename
htmls = {}
agg_horse_per_group_cols_dfs = {}

bms_leading = pd.read_csv(input_dir / bms_leading_filename, sep="\t")       


In [6]:
race_id = "202406050811"
print("fetching shutuba page html...")
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# chrome driverをインストール
driver_path = ChromeDriverManager().install()
url = f"https://race.netkeiba.com/race/shutuba.html?race_id={race_id}"
with webdriver.Chrome(service=Service(driver_path), options=options) as driver:
    driver.implicitly_wait(10000)
    driver.get(url)
    htmls[race_id] = driver.page_source
print("fetching shutuba page html...comp")

fetching shutuba page html...
fetching shutuba page html...comp


In [30]:
html = str(htmls)  # 辞書やリストを文字列として扱う
df = pd.read_html(StringIO(html))[0]
df.columns = df.columns.get_level_values(1)
soup = BeautifulSoup(html, "lxml").find("table", class_="Shutuba_Table")
horse_id_list = []
a_list = soup.find_all("a", href=re.compile(r"/horse/"))
for a in a_list:
    horse_id = re.findall(r"\d{10}", a["href"])[0]
    horse_id_list.append(int(horse_id))
df["horse_id"] = horse_id_list
jockey_id_list = []
a_list = soup.find_all("a", href=re.compile(r"/jockey/"))
for a in a_list:
    jockey_id = re.findall(r"\d{5}", a["href"])[0]
    jockey_id_list.append(int(jockey_id))
df["jockey_id"] = jockey_id_list
trainer_id_list = []
a_list = soup.find_all("a", href=re.compile(r"/trainer/"))
for a in a_list:
    trainer_id = re.findall(r"\d{5}", a["href"])[0]
    trainer_id_list.append(int(trainer_id))
df["trainer_id"] = trainer_id_list

df = df[df.iloc[:, 9] != '--']
df["tansho_odds"] = df.iloc[:, 9].astype(float)
# 前処理
df["wakuban"] = df.iloc[:, 0].astype(int)
df["umaban"] = df.iloc[:, 1].astype(int)
df["sex"] = df.iloc[:, 4].str[0].map(sex_mapping)
df["age"] = df.iloc[:, 4].str[1:].astype(int)

df["impost"] = df.iloc[:, 5].astype(float)
# df["weight"] = df.iloc[:, 8].str.extract(r"(\d+)").astype(int)
# df["weight_diff"] = df.iloc[:, 8].str.extract(r"\((.+)\)").astype(int)
df["weight"] = df.iloc[:, 8].astype(str).str.extract(r"(\d+)").astype(float)
# df["weight_diff"] = df.iloc[:, 8].astype(str).str.extract(r"\((.+)\)").astype(float)
# '前計不'をNaNに置き換え、その後の処理を続行

In [37]:
df.iloc[:, 10]

0      \n2\n
2      \n1\n
3     \n11\n
4      \n3\n
5      \n7\n
6      \n8\n
7      \n4\n
8     \n12\n
9      \n6\n
10     \n5\n
11    \n14\n
12     \n9\n
13    \n15\n
14    \n13\n
15    \n10\n
Name: 人気, dtype: object

In [41]:
# 改行や不要な空白を完全に削除して、整数に変換
df["popularity"] = df.iloc[:, 10].str.replace(r'\\n', '', regex=True).str.replace(r'\n', '', regex=True).str.strip().astype(int)


In [42]:
df["popularity"]

0      2
2      1
3     11
4      3
5      7
6      8
7      4
8     12
9      6
10     5
11    14
12     9
13    15
14    13
15    10
Name: popularity, dtype: int64

In [35]:
df["weight_diff"]

0     -2.0
2      6.0
3      2.0
4     -2.0
5      NaN
6     -2.0
7     -4.0
8    -14.0
9      NaN
10     0.0
11     0.0
12    -2.0
13     4.0
14    12.0
15     NaN
Name: weight_diff, dtype: float64

In [31]:
["weight_diff"] = df.iloc[:, 8].astype(str).str.extract(r"\((.+)\)").astype(float)

SyntaxError: cannot assign to literal (2803496908.py, line 1)

In [26]:
df.iloc[:, 8]

0      \n520(-2)\n
2      \n516(+6)\n
3      \n432(+2)\n
4      \n512(-2)\n
5     \n506(前計不)\n
6      \n498(-2)\n
7      \n468(-4)\n
8     \n502(-14)\n
9     \n474(前計不)\n
10      \n468(0)\n
11      \n478(0)\n
12     \n496(-2)\n
13     \n496(+4)\n
14    \n492(+12)\n
15    \n458(前計不)\n
Name: 馬体重 (増減), dtype: object

In [24]:
# '前計不'をNaNに置き換えた後、数値部分を抽出
df["weight_diff"] = df.iloc[:, 8].replace("前計不", 0)  # '前計不'をNaNに置き換え


In [25]:
df

Unnamed: 0,枠,馬 番,印\n\n\n\n\n\n\n,\n\n馬名\n\n,性齢,斤量,騎手,厩舎,馬体重 (増減),\n\nオッズ\n\n,人気,登録,グループ,馬メモ切替,horse_id,jockey_id,trainer_id,tansho_odds,wakuban,umaban,sex,age,impost,weight,weight_diff
0,1,1,\n\n--\n,\n\n\n ダノンデサイル\n\n\n\n,牡3,56.0,\n 横山典\n\n\n,栗東安田,\n520(-2)\n,4.0,\n2\n,\n\n\n\n,,編集,2021105143,660,1164,4.0,1,1,0,3,56.0,520.0,\n520(-2)\n
2,2,3,\n\n--\n,\n\n\n アーバンシック\n\n\n\n,牡3,56.0,\n ルメール\n\n\n,美浦武井,\n516(+6)\n,2.8,\n1\n,\n\n\n\n,,編集,2021105436,5339,1147,2.8,2,3,0,3,56.0,516.0,\n516(+6)\n
3,2,4,\n\n--\n,\n\n\n ブローザホーン\n\n\n\n,牡5,58.0,\n 菅原明\n\n\n,栗東吉岡,\n432(+2)\n,35.1,\n11\n,\n\n\n\n,,編集,2019105748,1179,1176,35.1,2,4,0,5,58.0,432.0,\n432(+2)\n
4,3,5,\n\n--\n,\n\n\n ベラジオオペラ\n\n\n\n,牡4,58.0,\n 横山和\n\n\n,栗東上村,\n512(-2)\n,7.1,\n3\n,\n\n\n\n,,編集,2020102781,1140,1168,7.1,3,5,0,4,58.0,512.0,\n512(-2)\n
5,3,6,\n\n--\n,\n\n\n ローシャムパーク\n\n\n\n,牡5,58.0,\n マーカンド\n\n\n,美浦田中博,\n506(前計不)\n,17.2,\n7\n,\n\n\n\n,,編集,2019105552,5626,1162,17.2,3,6,0,5,58.0,506.0,\n506(前計不)\n
6,4,7,\n\n--\n,\n\n\n スターズオンアース\n\n\n\n,牝5,56.0,\n 川田\n\n\n,美浦高柳瑞,\n498(-2)\n,17.6,\n8\n,\n\n\n\n,,編集,2019104740,1088,1118,17.6,4,7,1,5,56.0,498.0,\n498(-2)\n
7,4,8,\n\n--\n,\n\n\n レガレイラ\n\n\n\n,牝3,54.0,\n 戸崎圭\n\n\n,美浦木村,\n468(-4)\n,10.9,\n4\n,\n\n\n\n,,編集,2021105898,5386,1126,10.9,4,8,1,3,54.0,468.0,\n468(-4)\n
8,5,9,\n\n--\n,\n\n\n ディープボンド\n\n\n\n,牡7,58.0,\n 幸\n\n\n,栗東大久保,\n502(-14)\n,49.7,\n12\n,\n\n\n\n,,編集,2017102170,732,1058,49.7,5,9,0,7,58.0,502.0,\n502(-14)\n
9,5,10,\n\n--\n,\n\n\n プログノーシス\n\n\n\n,牡6,58.0,\n 三浦\n\n\n,栗東中内田,\n474(前計不)\n,16.4,\n6\n,\n\n\n\n,,編集,2018104541,1122,1137,16.4,5,10,0,6,58.0,474.0,\n474(前計不)\n
10,6,11,\n\n--\n,\n\n\n ジャスティンパレス\n\n\n\n,牡5,58.0,\n 坂井\n\n\n,栗東杉山晴,\n468(0)\n,10.9,\n5\n,\n\n\n\n,,編集,2019105346,1163,1157,10.9,6,11,0,5,58.0,468.0,\n468(0)\n


In [None]:

# '前計不'をNaNに置き換えた後、数値部分を抽出
df["weight_diff"] = df.iloc[:, 8].replace("前計不", np.nan)  # '前計不'をNaNに置き換え
df["weight_diff"] = df["weight_diff"].astype(str).str.extract(r"\((.+)\)").astype(float)  # 数値を抽出してfloatに変換
df["popularity"] = df.iloc[:, 10].astype(int)

In [84]:
htmls = str(htmls)  # 辞書やリストを文字列として扱う
info_dict = {}
info_dict["race_id"] = int(race_id)
soup = BeautifulSoup(htmls, "lxml").find("div", class_="RaceList_Item02")
title = soup.find("h1").text.strip()
divs = soup.find_all("div")
div1 = divs[0].text.replace(" ", "")
info1 = re.findall(r"[\w:]+", div1)
info_dict["race_type"] = race_type_mapping[info1[1][0]]
info_dict["around"] = (
    around_mapping[info1[2][0]] if info_dict["race_type"] != 2 else np.nan
)
info_dict["course_len"] = int(re.findall(r"\d+", info1[1])[0])
info_dict["weather"] = weather_mapping[re.findall(r"天候:(\w+)", div1)[0]]
info_dict["ground_state"] = ground_state_mapping[
    re.findall(r"馬場:(\w+)", div1)[0]
]
# レース階級情報の取得
regex_race_class = "|".join(race_class_mapping)
race_class_title = re.findall(regex_race_class, title)
# タイトルからレース階級情報が取れない場合
race_class = re.findall(regex_race_class, divs[1].text)
if len(race_class_title) != 0:
    info_dict["race_class"] = race_class_mapping[race_class_title[0]]
elif len(race_class) != 0 and race_class != ['オープン']:
    info_dict["race_class"] = race_class_mapping[race_class[0]]
elif len(race_class) != 0 and race_class ==['オープン']:
    #オープンの場合
    #賞金
    #2900未満でオープン
    #2900-5000G3
    #5000-10000G2
    #10000-G1
    # 本賞金部分の抽出
    prize_text = divs[1].find("span", string=lambda text: text and "本賞金:" in text).text
    prize_amount = int(prize_text.split(":")[1].split(",")[0])  # 最初の金額を取得
    # 本賞金に基づいてレースクラスを決定
    if prize_amount < 2900:
        race_grade = "オープン"
        info_dict["race_class"] = race_class_mapping[race_grade]
    elif 2900 <= prize_amount <= 5000:
        race_grade = "GⅢ"
        info_dict["race_class"] = race_class_mapping[race_grade]
    elif 5000 < prize_amount < 10000:
        race_grade = "GⅡ"
        info_dict["race_class"] = race_class_mapping[race_grade]
    elif 10000 <= prize_amount:
        race_grade = "GⅠ"
        info_dict["race_class"] = race_class_mapping[race_grade]
else:
    info_dict["race_class"] = None

In [85]:
info_dict

{'race_id': 202408070811,
 'race_type': 0,
 'around': 0,
 'course_len': 1200,
 'weather': 1,
 'ground_state': 0,
 'race_class': 5}

In [51]:
race_class = re.findall(regex_race_class, divs[1].text)
race_class

['オープン']

In [39]:
divs[1]

<div class="RaceData02">\n<span>5回</span>\n<span>中山</span>\n<span>8日目</span>\n<span>サラ系３歳以上</span>\n<span>オープン</span>\n     \n<span>(国際)(指)</span>\n<span>定量</span>\n<span>16頭</span>\n<br/>\n<span>本賞金:50000,20000,12500,7500,5000万円</span>\n</div>

In [44]:
# 本賞金部分の抽出
prize_text = divs[1].find("span", string=lambda text: text and "本賞金:" in text).text
prize_amount = int(prize_text.split(":")[1].split(",")[0])  # 最初の金額を取得
# 本賞金に基づいてレースクラスを決定
if prize_amount < 2900:
    race_class = "オープン"
    info_dict["race_class"] = race_class_mapping.get(race_class, None)
elif 2900 <= prize_amount <= 5000:
    race_class = "G3"
    info_dict["race_class"] = race_class_mapping.get(race_class, None)
elif 5000 < prize_amount <= 10000:
    race_class = "G2"
    info_dict["race_class"] = race_class_mapping.get(race_class, None)
else:
    race_class = "G1"
    info_dict["race_class"] = race_class_mapping.get(race_class, None)

In [43]:
prize_amount

50000