In [121]:
import requests
from bs4 import BeautifulSoup
import re
from bs4 import XMLParsedAsHTMLWarning
import warnings
import pandas as pd
import numpy as np
import concurrent.futures
import time
import random
import pandas as pd
from tqdm import tqdm
import unicodedata
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [122]:
def create_hd_list(start_date, end_date):
    date_range = pd.date_range(start=start_date, end=end_date)
    hd_list = [date.strftime("%Y%m%d") for date in date_range]
    return hd_list

In [123]:
# jcd_list = [str(i).zfill(2) for i in range(1, 24)]
jcd_list = [str(i).zfill(2) for i in range(1, 25)]
hd_list = create_hd_list("20250101", "20250101")
rno_list = [str(i).zfill(2) for i in range(1, 13)]

In [124]:
def get_course_len(soup):
    header = soup.find('h3', class_='title16_titleDetail__add2020')
    header_text = header.get_text(strip=True)
    match = re.search(r'(\d{3,4}m)', header_text)
    return match.group(1)[:-1]

In [125]:
def make_racelist_df(hd, jcd, rno):
    prefix = "https://www.boatrace.jp/owpc/pc/race/racelist?"
    url = f"{prefix}rno={rno}&jcd={jcd}&hd={hd}"
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    
    course_len = get_course_len(soup)

    rows = soup.find_all('tbody', class_='is-fs12')
    data_list = []
    for row in rows:
        cols = row.find_all('td')
        wakuban = unicodedata.normalize('NFKC', cols[0].get_text(strip=True))
        
        racer_infos =  cols[2].find_all('div')
        id_grade = racer_infos[0].get_text(strip=True)
        id_match = re.search(r'(\d+)', id_grade)
        racer_id = id_match.group(1)
        grade_match = re.search(r'([A-Z]\d)', id_grade)
        grade = grade_match.group(1)
        
        start_info = list(cols[3].stripped_strings)
        f_count = start_info[0][1:]
        l_count = start_info[1][1:]
        st_time = start_info[2]
        
        global_info = list(cols[4].stripped_strings)
        g_1rate = global_info[0]
        g_2rate = global_info[1]
        g_3rate = global_info[2]
        
        local_info = list(cols[5].stripped_strings)
        l_1rate = local_info[0]
        l_2rate = local_info[1]
        l_3rate = local_info[2]
        
        boat_info = list(cols[6].stripped_strings)
        b_no = boat_info[0]
        b_2rate = boat_info[1]
        b_3rate = boat_info[2]
        
        motor_info = list(cols[7].stripped_strings)
        m_no = motor_info[0]
        m_2rate = motor_info[1]
        m_3rate = motor_info[2]
        data = {
            "wakuban": wakuban,
            "racer_id": racer_id,
            "grade": grade,
            "f_count": f_count,
            "l_count": l_count,
            "st_time": st_time,
            "g_1rate": g_1rate,
            "g_2rate": g_2rate,
            "g_3rate": g_3rate,
            "l_1rate": l_1rate,
            "l_2rate": l_2rate,
            "l_3rate": l_3rate,
            "b_no": b_no,
            "b_2rate": b_2rate,
            "b_3rate": b_3rate,
            "m_no": m_no,
            "m_2rate": m_2rate,
            "m_3rate": m_3rate,
            "course_len": course_len
        }
        data_list.append(data)
    df = pd.DataFrame(data_list)
    df["course_len"] = df["course_len"].astype(int)
    return df

In [126]:
def make_beforeinfo_df(hd, jcd, rno):
    prefix = "https://www.boatrace.jp/owpc/pc/race/beforeinfo?"
    url = f"{prefix}rno={rno}&jcd={jcd}&hd={hd}"
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    
    racer_table = soup.find('table', class_='is-w748')
    rows = racer_table.find_all('tbody', class_='is-fs12')
    racer_data_list = []
    for row in rows:
        cols = row.find_all('td')
        wakuban = cols[0].get_text(strip=True)
        weight = cols[3].get_text(strip=True)[:-2]
        prev_time = cols[4].get_text(strip=True)
        tilt = cols[5].get_text(strip=True)
        weight_adj = cols[12].get_text(strip=True)
        racer_data = {
            "wakuban": wakuban,
            "weight": weight,
            "prev_time": prev_time,
            "tilt": tilt,
            "weight_adj": weight_adj
        }
        racer_data_list.append(racer_data)
    racer_df = pd.DataFrame(racer_data_list)
    
    start_table = soup.find('table', class_='is-w238')
    rows = start_table.find('tbody', class_='is-p10-0').find_all('tr')
    start_data_list = []
    for row in rows:
        cols = row.find_all('span')
        wakuban = cols[0].get_text(strip=True)
        start_place_match = re.search(r'left: (\d+)%', str(cols[2]))
        start_place = start_place_match.group(1) if start_place_match else None
        st = cols[3].get_text(strip=True)
        start_data = {
            "wakuban": wakuban,
            "start_place": start_place,
            "st": st
        }
        start_data_list.append(start_data)
    start_df = pd.DataFrame(start_data_list)
    
    df = pd.merge(racer_df, start_df, on="wakuban", how="left")
    
    weather_table = soup.find('div', class_='weather1_body')
    infos = weather_table.find_all('div', class_="weather1_bodyUnitLabel")
    direction_match = re.search(r'is-direction(\d+)', str(weather_table.find('div', class_="weather1_bodyUnit is-direction").find('p')))
    direction = direction_match.group(1)
    temp_match = re.search(r'(\d+\.\d+)℃', str(infos[0]))
    temp = temp_match.group(1)
    weather = infos[1].get_text(strip=True)
    wind_match = re.search(r'(\d+)m', str(infos[2]))
    wind = wind_match.group(1)
    wind_direction_match = re.search(r'is-wind(\d+)', str(weather_table.find('div', class_="weather1_bodyUnit is-windDirection").find('p')))
    wind_direction = wind_direction_match.group(1)
    water_temp_match = re.search(r'(\d+\.\d+)℃', str(infos[3]))
    water_temp = water_temp_match.group(1)
    wave_height_match = re.search(r'(\d+)cm', str(infos[4]))
    wave_height = wave_height_match.group(1)
    df['direction'] = direction
    df['temp'] = temp
    df['weather'] = weather
    df['wind'] = wind
    df['wind_direction'] = wind_direction
    df['water_temp'] = water_temp
    df['wave_height'] = wave_height
    df
    return df

In [127]:
def convert_time_to_seconds(time_str):
    if time_str == "":
        return np.nan
    time_str = time_str.replace('\"', '\'')
    minutes, seconds, deciseconds = map(float, time_str.split('\''))
    total_seconds = minutes * 60 + seconds + deciseconds / 10
    return total_seconds

In [128]:
def make_raceresult_df(hd, jcd, rno):
    prefix = "https://www.boatrace.jp/owpc/pc/race/raceresult?"
    url = f"{prefix}rno={rno}&jcd={jcd}&hd={hd}"
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find('table', class_='is-w495')
    rows = table.find_all('tbody')
    data_list = []
    for row in rows:
        cols = row.find_all('td')
        rank_value = cols[0].get_text(strip=True)
        if rank_value in ['転']:
            continue
        else:
            rank = unicodedata.normalize('NFKC', rank_value)
        wakuban = cols[1].get_text(strip=True)
        time = convert_time_to_seconds(cols[3].get_text(strip=True))
        data = {
            "rank": rank,
            "wakuban": wakuban,
            "time": time
        }
        data_list.append(data)
    df = pd.DataFrame(data_list)
    return df

In [None]:
race_held_list = []
for hd in hd_list:
    for jcd in jcd_list:
        check=url = "https://www.boatrace.jp/owpc/pc/race/raceindex?jcd=" + jcd + "&hd=" + hd
        response = requests.get(check)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        if "データがありません" not in soup.text:
            race_held_list.append((hd, jcd))
# race_held_list = [('20250101', '01')]

In [130]:
def fetch_race_data(args):
    hd, jcd, rno = args
    race_id = f"{hd}{jcd}{rno}"
    
    time.sleep(random.uniform(0.5, 1.5))
    
    try:
        racelist_df = make_racelist_df(hd, jcd, rno)
        beforeinfo_df = make_beforeinfo_df(hd, jcd, rno)
        raceresult_df = make_raceresult_df(hd, jcd, rno)
        
        df = pd.merge(racelist_df, beforeinfo_df, on='wakuban', how='left')
        df = pd.merge(df, raceresult_df, on='wakuban', how='left')
        
        df["race_id"] = race_id
        return df

    except Exception as e:
        return None

In [None]:
tasks = [
    (hd, jcd, rno)
    for hd, jcd in race_held_list
    for rno in rno_list
]

results = []

print(f"Starting scraping for {len(tasks)} races with {5} workers...")

# ThreadPoolExecutorで並列実行
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(fetch_race_data, task) for task in tasks]
    
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        result = future.result()
        if result is not None:
            results.append(result)
df = pd.concat(results, ignore_index=True)

Starting scraping for 12 races with 5 workers...


  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
def convert_st_time(st_str):
    if st_str[0] == "F":
        return -1 * float(st_str[1:])
    else:
        return float(st_str)

In [None]:
int_cols = [
    "wakuban",
    "racer_id",
    "f_count",
    "l_count",
    "b_no",
    "m_no",
    "course_len",
    "f_count",
    "l_count",
    "direction",
    "start_place",
    "wind",
    "wind_direction",
    "wave_height",
    "rank",
]
for col in int_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
float_cols = [
    "st_time",
    "g_1rate",
    "g_2rate",
    "g_3rate",
    "l_1rate",
    "l_2rate",
    "l_3rate",
    "b_2rate",
    "b_3rate",
    "m_2rate",
    "m_3rate",
    "weight",
    "prev_time",
    "tilt",
    "weight_adj",
    "temp",
    "water_temp",
    "time"
]
for col in float_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
grade_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4}
df['grade'] = df['grade'].map(grade_mapping).astype('Int64')
df['st'] = df['st'].map(convert_st_time)
df.set_index(['race_id', 'wakuban'], inplace=True)

KeyError: 'wakuban'

In [None]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,racer_id,grade,f_count,l_count,st_time,g_1rate,g_2rate,g_3rate,l_1rate,l_2rate,l_3rate,b_no,b_2rate,b_3rate,m_no,m_2rate,m_3rate,course_len,weight,prev_time,tilt,weight_adj,start_place,st,direction,temp,weather,wind,wind_direction,water_temp,wave_height,rank,time
race_id,wakuban,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
202501010102,1,3652,3,0,0,0.18,3.82,15.15,30.30,4.02,18.75,36.61,42,0.0,0.0,62,30.00,55.00,1800,53.8,6.90,-0.5,0.0,75,-0.10,14,7.0,晴,6,4,1.0,5,5,116.2
202501010102,2,4888,2,0,0,0.17,5.70,40.48,53.17,5.94,38.96,58.44,14,0.0,0.0,45,43.75,43.75,1800,52.0,6.93,-0.5,0.0,64,0.01,14,7.0,晴,6,4,1.0,5,1,110.9
202501010102,3,4645,2,1,0,0.13,6.61,47.50,70.83,6.15,44.10,61.49,32,0.0,0.0,37,47.37,63.16,1800,52.0,6.90,0.0,0.0,69,-0.04,14,7.0,晴,6,4,1.0,5,3,113.6
202501010102,4,5301,3,0,0,0.20,3.55,8.47,24.58,2.47,2.78,9.72,11,0.0,0.0,33,33.33,55.56,1800,51.5,6.88,0.0,0.5,74,-0.09,14,7.0,晴,6,4,1.0,5,4,115.2
202501010102,5,5390,4,0,0,,1.33,0.00,0.00,1.22,0.00,0.00,26,0.0,0.0,71,23.53,76.47,1800,46.0,6.99,-0.5,1.0,53,0.12,14,7.0,晴,6,4,1.0,5,6,118.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202501010112,2,4188,1,0,0,0.13,7.32,58.59,73.44,7.27,59.33,77.33,20,0.0,0.0,68,50.00,64.29,1800,52.0,6.75,-0.5,0.0,61,0.04,14,3.0,晴,2,2,1.0,1,2,109.7
202501010112,3,4769,1,0,0,0.13,6.01,44.66,61.17,6.56,40.74,62.96,31,0.0,0.0,39,31.25,37.50,1800,52.0,6.89,-0.5,0.0,54,0.11,14,3.0,晴,2,2,1.0,1,3,111.6
202501010112,4,4888,2,0,0,0.17,5.70,40.48,53.17,5.94,38.96,58.44,14,0.0,0.0,45,43.75,43.75,1800,52.0,6.75,-0.5,0.0,46,0.19,14,3.0,晴,2,2,1.0,1,4,112.3
202501010112,5,4539,2,0,0,0.14,6.61,48.84,66.67,6.19,46.38,57.97,34,0.0,0.0,26,25.00,50.00,1800,52.0,6.85,-0.5,0.0,59,0.06,14,3.0,晴,2,2,1.0,1,5,113.5
