In [None]:
import sys
import os
import pandas as pd




# Absolute path of the directory one level above the current working
# directory; it is added to the import search path so that the project-local
# `utils` package imported below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Append only once so that re-running this cell does not grow sys.path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from utils.fetchPayloads import get_payloads_to_scrape, get_match_payloads, main_payload_scraper

In [14]:
"""
This script fetches the raw JSON "match payloads" (match metadata)
    for a given WTT event ID using a GET request.
    
    The match codes contained in each payload are required for the subsequent API call that gets the full match details.
    
    Reverse engineered from WTT event pages such as:
    https://www.worldtabletennis.com/eventInfo?eventId=3085&selectedTab=Matches

    EVENTS_FILE is a CSV listing the events to be scraped, identified by their unique event ID.

    One CSV file is written per event, containing all match payloads for that event.

    Threading has been implemented to speed up the process.
    
"""
# --- CONFIGURATION ---

# Input: CSV listing the events to scrape (read with pandas in the run cell).
EVENTS_FILE = "../Data/Processed/Events/shortlist_events.csv"

# Output: directory that receives one CSV of match payloads per event.
OUTPUT_DIR = "../Data/Raw/Match_payloads"

# NOTE(review): the run cell passes only OUTPUT_DIR to the scraper helpers;
# confirm how the tuning values below reach utils.fetchPayloads.

# Lower/upper bounds (seconds) of the random pause used for API politeness.
MIN_PAUSE, MAX_PAUSE = 0.1, 0.2

# Number of threads for the I/O processing.
# Based on reading, 20 is a good starting number.
MAX_WORKERS = 20

# Maximum number of retries for failed requests.
MAX_RETRIES = 10

In [15]:
try:
    shortlist_df = pd.read_csv(EVENTS_FILE) 
    print(f"--- ‚úÖ Shortlist file found at {EVENTS_FILE} with {len(shortlist_df)} events ---")             
except FileNotFoundError:
    print(f"--- ‚ùå ERROR: Shortlist file not found at {EVENTS_FILE}. ---"); sys.exit(1)

payloads_to_scrape = get_payloads_to_scrape(shortlist_df, OUTPUT_DIR)


if len(payloads_to_scrape) == 0:
    print("\n--- ‚úÖ PROCESS COMPLETE: No events remaining to scrape. ---"); sys.exit(0)

else:
    await main_payload_scraper(payloads_to_scrape, OUTPUT_DIR)

--- ✅ Shortlist file found at ../Data/Processed/Events/shortlist_events.csv with 361 events ---
--- 🟠 Reconciling event list with existing payload files... ---
Total events in master list: 361
Found 1242 existing payload files in ../Data/Raw/Match_payloads.
🟢 0 new/ongoing events to scrape.

--- ✅ PROCESS COMPLETE: No events remaining to scrape. ---


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
