In [1]:
import sys
import os
import logging
from datetime import datetime, date
import pandas as pd
import json

# プロジェクトのルートディレクトリをパスに追加
sys.path.append('..')

# ログ設定
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

print("=== 競馬レースデータ収集システム ===")
print(f"実行時刻: {datetime.now()}")


=== 競馬レースデータ収集システム ===
実行時刻: 2025-08-31 20:09:40.470750


In [2]:
# 必要なモジュールをインポート
from src.scraping.scrapers.race_scraper import RaceScraper
from src.scraping.storage.race_storage import RaceStorage
from src.database.schemas.race_schema import Race, RaceResult

# スクレイパーとストレージを初期化
scraper = RaceScraper(delay=1.0)  # 1秒間隔でリクエスト
storage = RaceStorage()

print("✅ システム初期化完了")

✅ システム初期化完了


In [3]:
# テスト: 2024年G1レースリストを取得（実際のスクレイピングは行わない）
print("=== Testing Race List Extraction ===")
race_list = scraper.scrape_race_list_by_conditions(
    start_year=2010,
    end_year=2012,
    grades=['1'],
    limit=20
)

print(f"Found {len(race_list)} races")
for race in race_list[:3]:  # 最初の3件のみ表示
    print(f"  - {race}")

2025-08-31 20:09:43,674 - src.scraping.scrapers.race_scraper - INFO - Fetching race list: 2010-2012, grades: ['1']


=== Testing Race List Extraction ===


2025-08-31 20:09:45,471 - src.scraping.scrapers.race_scraper - INFO - Final URL: https://db.netkeiba.com/?pid=race_list&word=&start_year=2010&start_mon=none&end_year=2012&end_mon=none&list=20&sort=date&track%5B%5D=1&jyo%5B%5D=01&jyo%5B%5D=02&jyo%5B%5D=03&jyo%5B%5D=04&jyo%5B%5D=05&jyo%5B%5D=06&jyo%5B%5D=07&jyo%5B%5D=08&jyo%5B%5D=09&jyo%5B%5D=10&grade%5B%5D=1
2025-08-31 20:09:45,472 - src.scraping.scrapers.race_scraper - INFO - Response status: 200
2025-08-31 20:09:45,499 - src.scraping.extractors.race.race_list_extractor - INFO - Extracted 20 races from list
2025-08-31 20:09:45,500 - src.scraping.scrapers.race_scraper - INFO - Found 20 races


Found 20 races
  - {'race_date': '2012-12-23', 'track_name': '中山', 'meeting_number': 5, 'day_number': 8, 'weather': '晴', 'race_number': 10, 'race_name': '有馬記念(GI)', 'race_id': '201206050810', 'grade': 'G1', 'track_type': '芝', 'distance': 2500, 'total_horses': 16, 'track_condition': '良', 'winning_time': '2:31.9', 'pace': '29.9-36.0', 'winner_name': 'ゴールドシップ', 'winner_jockey': '内田博幸', 'winner_trainer': '須貝尚介', 'winner_trainer_region': '西'}
  - {'race_date': '2012-12-16', 'track_name': '中山', 'meeting_number': 5, 'day_number': 6, 'weather': '晴', 'race_number': 11, 'race_name': '朝日フューチュリティ(GI)', 'race_id': '201206050611', 'grade': 'G1', 'track_type': '芝', 'distance': 1600, 'total_horses': 16, 'track_condition': '良', 'winning_time': '1:33.4', 'pace': '33.9-36.1', 'winner_name': 'ロゴタイプ', 'winner_jockey': 'Ｍ．デム', 'winner_trainer': '田中剛', 'winner_trainer_region': '東'}
  - {'race_date': '2012-12-09', 'track_name': '阪神', 'meeting_number': 5, 'day_number': 4, 'weather': '晴', 'race_number': 11, 'ra

In [4]:
# レース詳細取得のテスト

test_race = race_list[3]
test_race_id = test_race['race_id']

print(f"TEST RACE ID: {test_race_id}")

# --------------------------------------

race_detail = scraper.scrape_race_detail(test_race_id)

2025-08-31 20:09:47,623 - src.scraping.scrapers.race_scraper - INFO - race detail url: https://db.netkeiba.com/race/201209050411/


TEST RACE ID: 201209050411


2025-08-31 20:09:48,245 - src.scraping.extractors.race.race_detail_extractor - INFO - Extracted 18 race results for 201209050411
2025-08-31 20:09:48,245 - src.scraping.extractors.race.race_detail_extractor - INFO - Successfully extracted race data: 201209050411 (18 horses)
2025-08-31 20:09:48,245 - src.scraping.extractors.race.race_detail_extractor - INFO - RACE IS: Race(race_id='201209050411', race_date=datetime.date(2025, 8, 31), track_name='', race_number=11, race_name='Âè64²óºå¿À¥¸¥å¥Ù¥Ê¥¤¥ëF(GI)', distance=0, track_type='', total_horses=18, grade='G1', track_direction=None, weather=None, track_condition=None, start_time=None, winning_time='1:34.2', pace=None, prize_1st=None, race_class=None, race_conditions=None, created_at=None, updated_at=None)
2025-08-31 20:09:48,246 - src.scraping.extractors.race.race_detail_extractor - INFO - RESULT: RaceResult(race_id='201209050411', horse_id='2010104278', horse_name='¥í¡¼¥Ö¥Æ¥£¥µ¡¼¥¸¥å', bracket_number=1, horse_number=1, age=0, sex='不明', jo

In [5]:
race, race_results = race_detail

print(race)

Race(race_id='201209050411', race_date=datetime.date(2025, 8, 31), track_name='', race_number=11, race_name='Âè64²óºå¿À¥¸¥å¥Ù¥Ê¥¤¥ëF(GI)', distance=0, track_type='', total_horses=18, grade='G1', track_direction=None, weather=None, track_condition=None, start_time=None, winning_time='1:34.2', pace=None, prize_1st=None, race_class=None, race_conditions=None, created_at=None, updated_at=None)


In [6]:
from src.scraping.storage.postgresql_storage import PostgreSQLStorage

storage = PostgreSQLStorage()

storage.insert_race(race)


2025-08-31 20:10:00,874 - src.scraping.storage.postgresql_storage - INFO - PostgreSQL connection test successful
2025-08-31 20:10:00,875 - src.scraping.storage.postgresql_storage - INFO - Race Is: Race(race_id='201209050411', race_date=datetime.date(2025, 8, 31), track_name='', race_number=11, race_name='Âè64²óºå¿À¥¸¥å¥Ù¥Ê¥¤¥ëF(GI)', distance=0, track_type='', total_horses=18, grade='G1', track_direction=None, weather=None, track_condition=None, start_time=None, winning_time='1:34.2', pace=None, prize_1st=None, race_class=None, race_conditions=None, created_at=None, updated_at=None)


True