スクレイピング

In [35]:
%pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup
import re


In [3]:
jcd = "01" # boat race venue code
hd = "20251129" # date (YYYYMMDD format)
rno = "12" # race number

# 出走表

In [4]:
# Base URL of the race-list page, ending in "?" so query parameters can be appended directly.
racelist_prefix = "https://www.boatrace.jp/owpc/pc/race/racelist?"

In [5]:
# Append race number, venue code and date as query parameters.
# The values are inserted verbatim (no URL-encoding is applied).
racelist_url = f"{racelist_prefix}rno={rno}&jcd={jcd}&hd={hd}"
racelist_url

'https://www.boatrace.jp/owpc/pc/race/racelist?rno=12&jcd=01&hd=20251129'

In [6]:
# Download the race-list page. requests has no default timeout, so one is set
# explicitly to keep the cell from hanging forever if the server never answers.
response = requests.get(racelist_url, timeout=10)
response.raise_for_status()  # raise on HTTP 4xx/5xx instead of parsing an error page
# NOTE(review): the recorded run emitted bs4's "XML parsed as HTML" warning for
# this page — confirm that 'lxml' in HTML mode is the intended parser.
soup = BeautifulSoup(response.content, 'lxml')


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.content, 'lxml')


In [7]:
class Racer_info:
    """Basic facts about one racer: wakuban (frame number), id, grade, weight and age."""

    def __init__(self, wakuban, id, grade, weight, age):
        # Keep this assignment order: print() emits the attributes in the
        # order they were first stored on the instance.
        self.wakuban, self.id, self.grade = wakuban, id, grade
        self.weight, self.age = weight, age

    def print(self):
        """Write all stored attribute values to stdout on one space-separated line."""
        stored_values = vars(self).values()
        print(*stored_values)

In [8]:
# Extract the course length from the race title heading.
# `distance` is always bound afterwards: it stays None when the heading or the
# "<digits>m" pattern is missing, instead of being left undefined.
distance = None
race_header = soup.find('h3', class_='title16_titleDetail__add2020')
# Heading text with surrounding whitespace stripped; empty when the heading is absent.
header_text = race_header.get_text(strip=True) if race_header is not None else ""
# Capture only the 3-4 digits that precede "m" (e.g. "1800m" -> "1800").
match = re.search(r'(\d{3,4})m', header_text)
if match:
    distance = match.group(1)
    print(f"course length: {distance}")
else:
    # Also reached when the heading itself was not found.
    print("距離情報が見つかりませんでした。")

course length: 1800


In [9]:
# Collect every <tbody class="is-fs12">; the cells below treat each one as a single racer's block.
racer_containers = soup.find_all('tbody', class_='is-fs12')

In [10]:
# Inspect the first block: all of its text split on whitespace, to see which tokens it holds.
racer_container = racer_containers[0]
racer_container.get_text().split()

['１',
 '3532',
 '/',
 'A1',
 '柴田',
 '光',
 '群馬/群馬',
 '53歳/52.0kg',
 'F0',
 'L0',
 '0.16',
 '6.97',
 '50.41',
 '73.17',
 '6.41',
 '48.03',
 '66.45',
 '74',
 '36.36',
 '56.28',
 '73',
 '0.00',
 '0.00',
 '7',
 '7R',
 '3',
 '.15',
 '３']

In [11]:
# Build one Racer_info per racer block; wakuban is the block's 1-based position.
racer_info_list = []
for wakuban, racer_container in enumerate(racer_containers, start=1):
    # Start every racer from a clean slate. Without this reset, a field that
    # fails to match would silently reuse the previous racer's value (or raise
    # NameError for the very first racer).
    racer_id = grade = weight = age = None
    for elem in racer_container.find_all('div', class_='is-fs11'):
        text = elem.get_text(strip=True)
        grade_match = re.search(r'([A-Z]\d)', text)
        id_match = re.search(r'(\d{4})', text)
        age_match = re.search(r'(\d+)歳', text)
        weight_match = re.search(r'(\d+\.\d+)kg', text)
        # Every div is tried against every pattern; when several divs match
        # the same pattern, the last one wins.
        if grade_match:
            grade = grade_match.group(1)
        if id_match:
            racer_id = id_match.group(1)
        if age_match:
            age = age_match.group(1)
        if weight_match:
            weight = weight_match.group(1)
    # Fields that were not found are passed through as None.
    racer_info_list.append(Racer_info(wakuban, racer_id, grade, weight, age))

    

In [12]:
# Select the block at index 5 (the sixth one) as the sample for the statistics parsing below.
racer_container = racer_containers[5]

In [13]:
class Racer_data:
    """Start data plus global, local, motor and boat rates, stored as fifteen flat attributes.

    Every constructor argument must unpack into exactly three values.
    """

    def __init__(self, start_data, global_rates, local_rates, motor_rates, boat_rates):
        # Attribute names paired with the argument that fills them, in the
        # order the attributes must appear on the instance.
        grouped = (
            (("f_num", "l_num", "st_ave"), start_data),
            (("g_winrate", "g_2rate", "g_3rate"), global_rates),
            (("l_winrate", "l_2rate", "l_3rate"), local_rates),
            (("m_winrate", "m_2rate", "m_3rate"), motor_rates),
            (("b_winrate", "b_2rate", "b_3rate"), boat_rates),
        )
        for names, triple in grouped:
            # Strict three-way unpacking: a group of any other size raises ValueError.
            first, second, third = triple
            for name, value in zip(names, (first, second, third)):
                setattr(self, name, value)

    def print(self):
        """Write all stored attribute values to stdout on one space-separated line."""
        stored_values = vars(self).values()
        print(*stored_values)

In [14]:
# Gather the whitespace-split text of every <td class="is-lineH2"> in the selected block.
data_source = []
for stat_cell in racer_container.find_all('td', class_='is-lineH2'):
    data_source.append(stat_cell.get_text().split())
# Only the first five token lists are used, one per Racer_data constructor argument.
racer_data = Racer_data(*data_source[:5])

直前情報

In [15]:
# URL of the pre-race information page, built from the same rno / jcd / hd values as the race-list URL.
beforeinfo_prefix = "https://www.boatrace.jp/owpc/pc/race/beforeinfo?"
beforeinfo_url = f"{beforeinfo_prefix}rno={rno}&jcd={jcd}&hd={hd}"
beforeinfo_url

'https://www.boatrace.jp/owpc/pc/race/beforeinfo?rno=1&jcd=01&hd=20251129'

In [16]:
# Download the pre-race information page. requests has no default timeout, so
# one is set explicitly to keep the cell from hanging if the server never answers.
response = requests.get(beforeinfo_url, timeout=10)
response.raise_for_status()  # raise on HTTP 4xx/5xx instead of parsing an error page
# NOTE: this rebinds `response` and `soup`, replacing the race-list document
# parsed earlier; race-list cells re-run after this point would see this page.
# NOTE(review): the race-list cell parses with 'lxml' while this one uses
# 'html.parser' — confirm whether the difference is intentional.
soup = BeautifulSoup(response.content, 'html.parser')

In [17]:
# Query the <tbody class="is-fs12"> blocks again from the current `soup` and keep the first one.
# NOTE: this rebinds racer_containers / racer_container, replacing the values set by earlier cells.
racer_containers = soup.find_all('tbody', class_='is-fs12')
racer_container = racer_containers[0]

In [20]:
# Show the selected block's text as whitespace-separated tokens.
racer_container.get_text().split()

['1', '小倉', '康典', '52.0kg', '6.80', '-0.5', 'R', '進入', '0.0', 'ST', '着順']