In [1]:
import requests
from bs4 import BeautifulSoup
import re
from bs4 import XMLParsedAsHTMLWarning
import warnings
import pandas as pd
import numpy as np
import concurrent.futures
import time
import random
import pandas as pd
from tqdm import tqdm
import unicodedata
import os
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Suppress bs4's XMLParsedAsHTMLWarning for the whole session.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [2]:
def create_hd_list(ym):
    """Return every calendar day of the month ``ym`` as ``YYYYMMDD`` strings.

    Args:
        ym: Month in ``YYYYMM`` form, e.g. ``"202302"``.

    Returns:
        list[str]: One entry per day of that month, in ascending order.
    """
    first_day = pd.to_datetime(ym, format="%Y%m")
    # One month forward and one day back lands on the last day of the month.
    last_day = first_day + pd.DateOffset(months=1) - pd.DateOffset(days=1)
    days = pd.date_range(start=first_day, end=last_day)
    return days.strftime("%Y%m%d").tolist()

In [3]:
def get_course_len(soup):
    """Extract the course length from a race page's title header.

    Looks up the ``<h3 class="title16_titleDetail__add2020">`` element and
    returns the first run of 3-4 digits that is directly followed by ``m``.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style ``find``.

    Returns:
        str: The digits of the course length without the trailing ``m``,
        e.g. ``"1800"``.

    Raises:
        ValueError: If the header element is missing, or its text contains
            no ``<digits>m`` token.
    """
    header = soup.find('h3', class_='title16_titleDetail__add2020')
    if header is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError("race title header not found; cannot determine course length")

    header_text = header.get_text(strip=True)
    match = re.search(r'(\d{3,4})m', header_text)
    if match is None:
        raise ValueError(f"no course length found in header text: {header_text!r}")
    return match.group(1)

In [4]:
def make_racelist_df(hd, jcd, rno, timeout=10):
    """Scrape the ``racelist`` page of one race into a DataFrame.

    Args:
        hd: Race date as a ``YYYYMMDD`` string.
        jcd: Venue code string (the crawl loop passes zero-padded ``"01"``-``"24"``).
        rno: Race number string (the crawl loop passes zero-padded ``"01"``-``"12"``).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        pandas.DataFrame: One row per ``tbody.is-fs12`` element of the page.
        Every column holds the scraped text as-is except ``course_len``,
        which is cast to ``int``. ``wakuban`` is NFKC-normalised.

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        requests.Timeout: If the server does not answer within ``timeout``.
        ValueError: If the page contains no ``tbody.is-fs12`` rows.
        IndexError, AttributeError: If a row does not have the expected
            cell layout.
    """
    prefix = "https://www.boatrace.jp/owpc/pc/race/racelist?"
    url = f"{prefix}rno={rno}&jcd={jcd}&hd={hd}"
    # Without a timeout a stalled connection blocks the calling worker forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    course_len = get_course_len(soup)

    rows = soup.find_all('tbody', class_='is-fs12')
    if not rows:
        # An empty frame would otherwise fail below with a KeyError on "course_len".
        raise ValueError(f"no racer rows found on racelist page: {url}")

    data_list = []
    for row in rows:
        cols = row.find_all('td')
        # NFKC folds full-width digits into ASCII so the value can be used as a merge key.
        wakuban = unicodedata.normalize('NFKC', cols[0].get_text(strip=True))

        # The first <div> of the racer cell carries both the id and the grade.
        id_grade = cols[2].find_all('div')[0].get_text(strip=True)
        racer_id = re.search(r'(\d+)', id_grade).group(1)
        grade = re.search(r'([A-Z]\d)', id_grade).group(1)

        start_info = list(cols[3].stripped_strings)
        global_info = list(cols[4].stripped_strings)
        local_info = list(cols[5].stripped_strings)
        boat_info = list(cols[6].stripped_strings)
        motor_info = list(cols[7].stripped_strings)

        data_list.append({
            "wakuban": wakuban,
            "racer_id": racer_id,
            "grade": grade,
            # [1:] drops the first character of the cell text
            # (presumably an "F"/"L" label prefix - confirm against the page).
            "f_count": start_info[0][1:],
            "l_count": start_info[1][1:],
            "st_time": start_info[2],
            "g_1rate": global_info[0],
            "g_2rate": global_info[1],
            "g_3rate": global_info[2],
            "l_1rate": local_info[0],
            "l_2rate": local_info[1],
            "l_3rate": local_info[2],
            "b_no": boat_info[0],
            "b_2rate": boat_info[1],
            "b_3rate": boat_info[2],
            "m_no": motor_info[0],
            "m_2rate": motor_info[1],
            "m_3rate": motor_info[2],
            "course_len": course_len,
        })

    df = pd.DataFrame(data_list)
    df["course_len"] = df["course_len"].astype(int)
    return df

In [5]:
def make_beforeinfo_df(hd, jcd, rno, timeout=10):
    """Scrape the ``beforeinfo`` page of one race into a DataFrame.

    Three parts of the page are combined:

    * the racer table (``table.is-w748``): weight, previous time, tilt and
      weight adjustment per ``wakuban``;
    * the start table (``table.is-w238``): start position and ``st`` per
      ``wakuban``, left-joined onto the racer table;
    * the weather box (``div.weather1_body``): values broadcast to every row.

    Args:
        hd: Race date as a ``YYYYMMDD`` string.
        jcd: Venue code string.
        rno: Race number string.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        pandas.DataFrame: One row per racer-table entry; all values are the
        scraped strings (``start_place`` may be ``None`` when no
        ``left: N%`` style is present).

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        requests.Timeout: If the server does not answer within ``timeout``.
        AttributeError, IndexError, KeyError: If the page does not have the
            expected structure.
    """
    prefix = "https://www.boatrace.jp/owpc/pc/race/beforeinfo?"
    url = f"{prefix}rno={rno}&jcd={jcd}&hd={hd}"
    # Without a timeout a stalled connection blocks the calling worker forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # --- racer table -------------------------------------------------------
    racer_table = soup.find('table', class_='is-w748')
    racer_data_list = []
    for row in racer_table.find_all('tbody', class_='is-fs12'):
        cols = row.find_all('td')
        racer_data_list.append({
            # Normalised like make_racelist_df so the merge keys always agree.
            "wakuban": unicodedata.normalize('NFKC', cols[0].get_text(strip=True)),
            # [:-2] drops the last two characters (presumably a "kg" unit - confirm).
            "weight": cols[3].get_text(strip=True)[:-2],
            "prev_time": cols[4].get_text(strip=True),
            "tilt": cols[5].get_text(strip=True),
            "weight_adj": cols[12].get_text(strip=True),
        })
    racer_df = pd.DataFrame(racer_data_list)

    # --- start table -------------------------------------------------------
    start_table = soup.find('table', class_='is-w238')
    start_data_list = []
    for row in start_table.find('tbody', class_='is-p10-0').find_all('tr'):
        cols = row.find_all('span')
        # The start position is read from the inline "left: N%" style.
        start_place_match = re.search(r'left: (\d+)%', str(cols[2]))
        start_data_list.append({
            "wakuban": unicodedata.normalize('NFKC', cols[0].get_text(strip=True)),
            "start_place": start_place_match.group(1) if start_place_match else None,
            "st": cols[3].get_text(strip=True),
        })
    start_df = pd.DataFrame(start_data_list)

    df = pd.merge(racer_df, start_df, on="wakuban", how="left")

    # --- weather box -------------------------------------------------------
    weather_table = soup.find('div', class_='weather1_body')
    infos = weather_table.find_all('div', class_="weather1_bodyUnitLabel")

    # Direction values are encoded in CSS class names (is-directionN / is-windN).
    direction_tag = weather_table.find('div', class_="weather1_bodyUnit is-direction").find('p')
    direction = re.search(r'is-direction(\d+)', str(direction_tag)).group(1)
    wind_direction_tag = weather_table.find('div', class_="weather1_bodyUnit is-windDirection").find('p')
    wind_direction = re.search(r'is-wind(\d+)', str(wind_direction_tag)).group(1)

    # The optional minus sign keeps the sign of a sub-zero reading; the
    # previous pattern would have captured only the digits.
    temp = re.search(r'(-?\d+\.\d+)℃', str(infos[0])).group(1)
    weather = infos[1].get_text(strip=True)
    wind = re.search(r'(\d+)m', str(infos[2])).group(1)
    water_temp = re.search(r'(-?\d+\.\d+)℃', str(infos[3])).group(1)
    wave_height = re.search(r'(\d+)cm', str(infos[4])).group(1)

    df['direction'] = direction
    df['temp'] = temp
    df['weather'] = weather
    df['wind'] = wind
    df['wind_direction'] = wind_direction
    df['water_temp'] = water_temp
    df['wave_height'] = wave_height
    return df

In [6]:
def convert_time_to_seconds(time_str):
    """Convert a ``M'SS"D`` time string into seconds.

    Args:
        time_str: Text with three numeric fields separated by ``'`` and/or
            ``"``, or the empty string.

    Returns:
        float: ``minutes * 60 + seconds + third_field / 10``, or ``NaN``
        for the empty string.

    Raises:
        ValueError: If the text does not split into exactly three numeric
            fields.
    """
    if time_str == "":
        return np.nan
    # Unify both separators so a single split yields the three fields.
    fields = time_str.replace('"', "'").split("'")
    minutes, seconds, deciseconds = (float(field) for field in fields)
    return minutes * 60 + seconds + deciseconds / 10

In [7]:
def make_raceresult_df(hd, jcd, rno, timeout=10):
    """Scrape the ``raceresult`` page of one race into a DataFrame.

    Args:
        hd: Race date as a ``YYYYMMDD`` string.
        jcd: Venue code string.
        rno: Race number string.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        pandas.DataFrame: Columns ``rank`` (NFKC-normalised text),
        ``wakuban`` (NFKC-normalised text) and ``time`` (seconds as float,
        ``NaN`` when the cell is empty). Rows whose rank cell is ``転`` are
        left out.

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        requests.Timeout: If the server does not answer within ``timeout``.
        AttributeError, IndexError, ValueError: If the page does not have
            the expected structure or a time cell cannot be parsed.
    """
    prefix = "https://www.boatrace.jp/owpc/pc/race/raceresult?"
    url = f"{prefix}rno={rno}&jcd={jcd}&hd={hd}"
    # Without a timeout a stalled connection blocks the calling worker forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    table = soup.find('table', class_='is-w495')
    data_list = []
    for row in table.find_all('tbody'):
        cols = row.find_all('td')
        rank_value = cols[0].get_text(strip=True)
        if rank_value == '転':
            continue
        data_list.append({
            "rank": unicodedata.normalize('NFKC', rank_value),
            # Normalised like make_racelist_df so the merge keys always agree.
            "wakuban": unicodedata.normalize('NFKC', cols[1].get_text(strip=True)),
            # Built inline: a local called `time` would shadow the `time` module.
            "time": convert_time_to_seconds(cols[3].get_text(strip=True)),
        })
    return pd.DataFrame(data_list)

In [8]:
def race_exists(args, timeout=10):
    """Check whether the race index page for a date/venue pair has data.

    Sleeps 0.5-1.5 s (uniformly random) before the request to throttle
    the crawl.

    Args:
        args: ``(hd, jcd)`` tuple - date as ``YYYYMMDD`` and venue code,
            both strings.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        tuple | None: ``(hd, jcd)`` unless the page text contains
        ``データがありません``, in which case ``None``.

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    hd, jcd = args
    url = "https://www.boatrace.jp/owpc/pc/race/raceindex?jcd=" + jcd + "&hd=" + hd
    time.sleep(random.uniform(0.5, 1.5))
    # Without a timeout a stalled connection blocks the calling worker forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    if "データがありません" in soup.text:
        return None
    return (hd, jcd)

In [9]:
def fetch_race_data(args):
    """Fetch and join the three per-race pages for a single race.

    Sleeps 0.5-1.5 s (uniformly random) first to throttle the crawl, then
    left-joins the ``beforeinfo`` and ``raceresult`` frames onto the
    ``racelist`` frame by ``wakuban`` and tags every row with ``race_id``.

    Args:
        args: ``(hd, jcd, rno)`` tuple of strings.

    Returns:
        pandas.DataFrame | None: The joined frame, or ``None`` when any
        step failed. Failures are best-effort: they are reported through
        ``warnings.warn`` together with the race id, never raised.
    """
    hd, jcd, rno = args
    race_id = f"{hd}{jcd}{rno}"
    time.sleep(random.uniform(0.5, 1.5))
    try:
        racelist_df = make_racelist_df(hd, jcd, rno)
        beforeinfo_df = make_beforeinfo_df(hd, jcd, rno)
        raceresult_df = make_raceresult_df(hd, jcd, rno)

        df = pd.merge(racelist_df, beforeinfo_df, on='wakuban', how='left')
        df = pd.merge(df, raceresult_df, on='wakuban', how='left')
    except Exception as e:
        # Keep the crawl going, but leave a trace of which race was lost and why.
        warnings.warn(f"race {race_id} skipped: {type(e).__name__}: {e}")
        return None

    df["race_id"] = race_id
    return df

In [10]:
# Inclusive crawl window; both ends are months written as YYYYMM.
start = "202301"
end = "202511"

start_dt, end_dt = (pd.to_datetime(bound, format="%Y%m") for bound in (start, end))

# freq='MS' (month start) yields one timestamp per month of the window.
ym_list = [
    month.strftime("%Y%m")
    for month in pd.date_range(start=start_dt, end=end_dt, freq='MS')
]

In [None]:
# Venue codes "01".."24" and race numbers "01".."12", zero-padded for the URLs.
jcd_list = [str(i).zfill(2) for i in range(1, 25)]
rno_list = [str(i).zfill(2) for i in range(1, 13)]

# Created once up front; one CSV per month is written into it.
output_dir = "dataframes"
os.makedirs(output_dir, exist_ok=True)

for ym in ym_list:
    hd_list = create_hd_list(ym)

    # Stage 1: keep the (date, venue) pairs whose race index page has data.
    tasks = [
        (hd, jcd)
        for hd in hd_list
        for jcd in jcd_list
    ]
    race_held_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Map each future back to its task so a failure can be attributed.
        futures = {executor.submit(race_exists, task): task for task in tasks}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                result = future.result()
            except Exception as e:
                # A single failed probe must not abort the whole crawl.
                warnings.warn(
                    f"race_exists failed for {futures[future]}: {type(e).__name__}: {e}"
                )
                continue
            if result is not None:
                race_held_list.append(result)

    # Stage 2: fetch every race of every day/venue found above.
    tasks = [
        (hd, jcd, rno)
        for hd, jcd in race_held_list
        for rno in rno_list
    ]

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_race_data, task) for task in tasks]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            result = future.result()
            if result is not None:
                results.append(result)

    if not results:
        # pd.concat raises on an empty list; skip the month instead of crashing.
        warnings.warn(f"no race data collected for {ym}; no file written")
        continue
    df = pd.concat(results, ignore_index=True)

    file_name = f"{ym}.csv"
    save_path = os.path.join(output_dir, file_name)
    df.to_csv(save_path, index=False)

100%|██████████| 744/744 [25:30<00:00,  2.06s/it]
100%|██████████| 5112/5112 [8:07:54<00:00,  5.73s/it]  
100%|██████████| 672/672 [23:00<00:00,  2.05s/it]
100%|██████████| 4260/4260 [6:50:49<00:00,  5.79s/it]  
100%|██████████| 744/744 [25:54<00:00,  2.09s/it]
100%|██████████| 4668/4668 [7:33:03<00:00,  5.82s/it]  
100%|██████████| 720/720 [24:56<00:00,  2.08s/it]
100%|██████████| 4572/4572 [7:17:32<00:00,  5.74s/it]  
100%|██████████| 744/744 [25:44<00:00,  2.08s/it]
100%|██████████| 4800/4800 [7:41:31<00:00,  5.77s/it]  
100%|██████████| 720/720 [24:58<00:00,  2.08s/it]
100%|██████████| 4812/4812 [7:45:10<00:00,  5.80s/it]  
100%|██████████| 744/744 [25:39<00:00,  2.07s/it]
100%|██████████| 4884/4884 [7:49:01<00:00,  5.76s/it]   
100%|██████████| 744/744 [25:34<00:00,  2.06s/it]
100%|██████████| 5172/5172 [8:11:20<00:00,  5.70s/it]   
100%|██████████| 720/720 [24:38<00:00,  2.05s/it]
100%|██████████| 4428/4428 [7:07:40<00:00,  5.80s/it]  
100%|██████████| 744/744 [25:45<00:00,  2.08

In [None]:
def convert_st_time(st_str):
    """Convert a scraped ``st`` value into a signed float.

    A leading ``F`` makes the value negative (``"F.01"`` -> ``-0.01``);
    any other numeric text is parsed as-is (``".14"`` -> ``0.14``).
    Values that carry no number yield ``NaN`` instead of raising, which
    matches the ``errors='coerce'`` handling used for the other columns:

    * ``None`` and other non-string, non-numeric objects;
    * empty or whitespace-only text;
    * text that is not a number after the optional ``F`` (e.g. ``"L"``).

    Args:
        st_str: Scraped text, or an already numeric value (returned as
            ``float`` unchanged, so ``NaN`` passes through).

    Returns:
        float: The signed value, or ``NaN``.
    """
    nan = float("nan")
    if isinstance(st_str, (int, float)):
        # Already numeric (including NaN from a left merge): nothing to parse.
        return float(st_str)
    if not isinstance(st_str, str):
        return nan

    text = st_str.strip()
    if not text:
        return nan

    is_f = text[0] == "F"
    try:
        value = float(text[1:] if is_f else text)
    except ValueError:
        return nan
    return -1 * value if is_f else value

In [None]:
# Whole-number columns. Nullable Int64 keeps unparsable/missing values as <NA>.
int_cols = [
    "wakuban",
    "racer_id",
    "f_count",
    "l_count",
    "b_no",
    "m_no",
    "course_len",
    "direction",
    "start_place",
    "wind",
    "wind_direction",
    "wave_height",
    "rank",
]
for col in int_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

# Real-valued columns; anything unparsable becomes NaN.
float_cols = [
    "st_time",
    "g_1rate",
    "g_2rate",
    "g_3rate",
    "l_1rate",
    "l_2rate",
    "l_3rate",
    "b_2rate",
    "b_3rate",
    "m_2rate",
    "m_3rate",
    "weight",
    "prev_time",
    "tilt",
    "weight_adj",
    "temp",
    "water_temp",
    "time"
]
for col in float_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# The two mappings below are only valid on the raw text columns. Guarding on
# the dtype keeps the cell safe to re-run: mapping the already-encoded grade
# again would turn every value into <NA>, and `st` would fail on floats.
grade_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4}
if not pd.api.types.is_numeric_dtype(df['grade']):
    df['grade'] = df['grade'].map(grade_mapping).astype('Int64')
if not pd.api.types.is_numeric_dtype(df['st']):
    df['st'] = df['st'].map(convert_st_time)
# df.set_index(['race_id', 'wakuban'], inplace=True)

In [None]:
# Print each column's dtype and non-null count for `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   wakuban         984 non-null    Int64  
 1   racer_id        984 non-null    Int64  
 2   grade           984 non-null    Int64  
 3   f_count         984 non-null    Int64  
 4   l_count         984 non-null    Int64  
 5   st_time         946 non-null    float64
 6   g_1rate         984 non-null    float64
 7   g_2rate         984 non-null    float64
 8   g_3rate         984 non-null    float64
 9   l_1rate         984 non-null    float64
 10  l_2rate         984 non-null    float64
 11  l_3rate         984 non-null    float64
 12  b_no            984 non-null    Int64  
 13  b_2rate         984 non-null    float64
 14  b_3rate         984 non-null    float64
 15  m_no            984 non-null    Int64  
 16  m_2rate         984 non-null    float64
 17  m_3rate         984 non-null    flo