In [3]:
import json
import os
import random
import time
import pandas as pd
import requests
import sys
import concurrent.futures 
from typing import Union, Dict, List, Tuple, Any
import glob 

In [4]:
"""
Fetch the raw JSON "match payloads" (match metadata) for each WTT event ID using a GET request.

    The match codes contained in each payload are required for the subsequent API call that
    retrieves the full match details.

    Reverse engineered from WTT event pages such as:
    https://www.worldtabletennis.com/eventInfo?eventId=3085&selectedTab=Matches

    EVENTS_FILE is a CSV listing the events to be scraped, identified by their unique event ID.

    One CSV file is written per event, containing all match payloads for that event.

    Threading is used to speed up the process.

"""
# --- CONFIGURATION ---

# CSV containing the shortlist of events to scrape.
EVENTS_FILE = "../Data/Processed/Events/shortlist_events.csv"

# One CSV of match payloads per event is saved to this directory.
# NOTE(review): the trailing "t" in "Match_payloadst" looks like a typo - confirm before renaming,
# because the existing-file check is run against this same directory.
OUTPUT_DIR = "../Data/Raw/Match_payloadst"

# Lower/upper bounds (in seconds) of the random pause used for API politeness.
MIN_PAUSE = 0.1
MAX_PAUSE = 0.2

# Number of threads for the I/O work (API requests and file writes).
# Based on reading, 20 is a good starting number.
MAX_WORKERS = 20


In [5]:
# Helper function used by each worker.
# The API call as well as the file saving occurs inside the function.

def fetch_and_save_payload(event_id: Union[int, str], output_dir: str, min_pause: float, max_pause: float) -> Tuple[Union[int, str], bool, int, str]:
    """
    Fetch the match payloads for one event and save them as a CSV in ``output_dir``.

    Intended for use with thread-pool workers: any failure inside the request/save
    sequence is reported through the returned tuple instead of being raised.

    Args:
        event_id: WTT event ID, sent as the ``EventId`` query parameter.
        output_dir: Directory in which ``{event_id}_match_payloads.csv`` is written.
        min_pause: Lower bound (seconds) of the random pause taken after a successful save.
        max_pause: Upper bound (seconds) of that pause.

    Returns:
        (event_id, status_bool, match_count, status_message). ``status_bool`` is False and
        ``match_count`` is 0 when the request, JSON decoding or saving fails, or when the
        decoded JSON is not a list.
    """

    # API endpoint URL plus the query parameters and headers sent with the request.
    url = "https://liveeventsapi.worldtabletennis.com/api/cms/GetOfficialResult"
    params = {'EventId': str(event_id), "DocumentCode": "TTE"}
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Referer': 'https://www.worldtabletennis.com/',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36'
    }

    try:
        # The timeout keeps a stalled request from blocking its worker thread indefinitely.
        response = requests.get(url, params=params, headers=headers, timeout=15)

        # Raise for bad status codes (4xx request errors or 5xx server errors).
        response.raise_for_status()
        raw_payloads = response.json()

        # The endpoint is expected to return a list of payloads; anything else is a failure.
        if not isinstance(raw_payloads, list):
            # type(...).__name__ gives the type name as a string for the log message.
            return event_id, False, 0, f"JSON was not a list ({type(raw_payloads).__name__})"

        # Convert the match payloads to a DataFrame; its length is the number of matches.
        payloads_df = pd.DataFrame(raw_payloads)
        match_count = len(payloads_df)

        # Save to the caller-supplied directory (a file is written even if there are no matches).
        filename = os.path.join(output_dir, f"{event_id}_match_payloads.csv")
        payloads_df.to_csv(filename, index=False)

        # Random pause for API politeness.
        sleep_duration = random.uniform(min_pause, max_pause)
        time.sleep(sleep_duration)

        return event_id, True, match_count, f"Found {match_count} matches for event:{event_id}. Pausing for {sleep_duration:.1f}s."

    except requests.exceptions.HTTPError as e:
        status_msg = f"HTTP Error: {e.response.status_code}"
    except Exception as e:
        # Anything else (timeouts, connection errors, bad JSON, file errors) is reported by type name.
        status_msg = f"Error: {type(e).__name__}"

    return event_id, False, 0, status_msg




In [6]:
def filter_events_to_scrape(shortlist_df: pd.DataFrame, output_dir: str) -> pd.Series:
    """
    Return the event IDs from the shortlist that still need to be scraped.

    An event is skipped only when a non-empty payload CSV for it already exists in
    ``output_dir`` AND the shortlist marks it as completed. Events that are not yet
    complete are kept, because more matches can be added to them.

    Args:
        shortlist_df (pd.DataFrame): Shortlist of events to be scraped; the "eventId" and
            "Completed" columns are used.
        output_dir (str): The directory where existing match payload CSVs are saved.
            A directory that does not exist is treated as containing no files.

    Returns:
        pd.Series: The "eventId" values that need to be scraped (availability of data
        for them is checked later).
    """

    # Only CSV files are inspected, so they are what gets counted. glob returns an empty
    # list for a missing directory instead of raising.
    existing_files = glob.glob(os.path.join(output_dir, "*.csv"))

    print(f"\n--- 🟠 Starting Check on {len(existing_files)} existing files in {output_dir}... 🟠 ---")

    events_total_initial = len(shortlist_df)

    # Event IDs whose payloads are on disk and whose event is completed.
    already_obtained_events = set()
    files_checked_count = 0

    for file_path in existing_files:
        files_checked_count += 1

        try:
            existing_payloads_df = pd.read_csv(file_path)

            # No rows means no usable data: leave the event in the to-scrape list.
            if existing_payloads_df.empty:
                continue

            # Take the event id from the payload data rather than from the filename.
            current_event_id = existing_payloads_df["eventId"].iloc[0]

            # Look up the shortlist row that holds this event's metadata.
            event_entry = shortlist_df[shortlist_df["eventId"] == current_event_id]

            # Files for events that are not in the shortlist are ignored.
            if event_entry.empty:
                continue

            # Only completed events are skipped; ongoing ones are scraped again.
            if event_entry["Completed"].iloc[0]:
                already_obtained_events.add(current_event_id)

        except pd.errors.EmptyDataError:
            # A file with nothing to parse; presumably one the scraper wrote for an event
            # with zero matches. Treated as "no data yet", so it is skipped quietly.
            continue
        except (KeyError, IndexError) as err:
            # Missing column or row in an otherwise readable file.
            print(f" Skipping file: {os.path.basename(file_path)}. Data error: {type(err).__name__}")
            continue
        except Exception as err:
            # Any other read problem is reported rather than hidden, then skipped.
            print(f"ERROR reading file {os.path.basename(file_path)}: {type(err).__name__}")
            continue

    # Keep only the shortlist rows whose event has not already been obtained.
    mask = shortlist_df["eventId"].isin(already_obtained_events)
    events_to_scrape = shortlist_df.loc[~mask, "eventId"]

    # Count for the final print statements.
    events_to_scrape_count = len(events_to_scrape)

    print(f"\n--- CHECK COMPLETE: {events_to_scrape_count}/{events_total_initial} events remaining to scrape. ---")

    print(f"✅ Total Events: {events_total_initial} | Files Checked: {files_checked_count} | Already Obtained: {len(already_obtained_events)} | To Scrape: {events_to_scrape_count}")

    return events_to_scrape

In [None]:
# Driver: load the shortlist, work out which events still need scraping, fetch them
# concurrently with a thread pool, and print a run summary.
if __name__ == "__main__":

    start_time = time.time() # Start timing the entire run
    print("---🚀 Starting Obtaining Match Payloads 🚀---")

    # 1. Load Shortlist DF and Perform Skip Check
    try:
        shortlist_df_object = pd.read_csv(EVENTS_FILE)
        # Ensure 'Completed' column is boolean after loading
        # NOTE(review): astype(bool) presumably maps missing values and any non-empty string
        # to True - confirm read_csv already parses this column as real booleans.
        shortlist_df_object['Completed'] = shortlist_df_object['Completed'].astype(bool)
    except FileNotFoundError:
        print(f"--- ❌ ERROR: Shortlist file not found at {EVENTS_FILE}. ---")
        # NOTE(review): sys.exit raises SystemExit; inside a notebook kernel this presumably
        # shows up as an exception message rather than ending a process - confirm intended.
        sys.exit(1)
    except KeyError as e:
        print(f"--- ❌ ERROR: Necessary column '{e}' not found in {EVENTS_FILE}. ---")
        sys.exit(1)

    # Create output directory (must exist before the skip check and before workers write files)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Call the filter function to get the Series of IDs to actually scrape
    events_to_scrape_ids = filter_events_to_scrape(shortlist_df_object, OUTPUT_DIR)
    # Convert Series to Python list for the executor
    event_ids_to_process = events_to_scrape_ids.tolist()
    events_to_scrape_count = len(event_ids_to_process)

    # Exit cleanly if no events need scraping after the skip check
    if events_to_scrape_count == 0:
        print("\n--- ✅ PROCESS COMPLETE: No events remaining to scrape. ---")
        sys.exit(0)

    # --- Counters & Setup for Parallel Execution ---
    processed_count = 0
    successful_count = 0
    failed_event_ids: List[int] = []
    start_time_loop = time.time() # Start timer specifically for the concurrent loop

    print(f"\n---🚀 Starting Concurrent Scraping for {events_to_scrape_count} Events 🚀---")
    print(f"--- Using {MAX_WORKERS} threads. API pause: {MIN_PAUSE:.1f}s - {MAX_PAUSE:.1f}s ---")

    # 2. Launch parallel execution
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

        # Submit all tasks immediately.
        # Map the 'Future' object back to the event_id for easy lookup.
        futures = {
            executor.submit(fetch_and_save_payload, event_id, OUTPUT_DIR, MIN_PAUSE, MAX_PAUSE): event_id
            for event_id in event_ids_to_process
        }

        # Process results as they complete (completion order, not submission order)
        for future in concurrent.futures.as_completed(futures):

            processed_count += 1
            event_id = futures[future] # Get the event_id for this completed future

            try:
                # Retrieve the result tuple: (event_id, status_bool, match_count, status_message)
                # result_id is not used below; the event_id from the futures map is used instead.
                result_id, status, match_count, status_msg = future.result()

                # --- In-Place Status Line (Overwrites itself) ---
                if status:
                    successful_count += 1
                    # Construct the status message
                    log_line = f"Event {event_id}: ({processed_count}/{events_to_scrape_count})  ✅ {status_msg}"
                else:

                    failed_event_ids.append(event_id)
                    # Construct the failure message
                    log_line = f"Event {event_id}: ({processed_count}/{events_to_scrape_count})  ❌ Failed: {status_msg}"

                # Print the status line in place ('\r' returns to line start), padded to clear previous content
                print(log_line.ljust(100), end='\r') # Increased padding slightly

                # --- Checkpoint Log (Every 10 events, prints a NEW LINE) ---
                if processed_count % 10 == 0:
                    elapsed_time = time.time() - start_time_loop
                    minutes = int(elapsed_time // 60)
                    seconds = int(elapsed_time % 60)
                    # Print checkpoint on a new line for clarity
                    print(f"\n--- {processed_count}/{events_to_scrape_count} processed. Success Rate: {successful_count/processed_count:.1%}. Elapsed: {minutes}m {seconds}s ---")
                    # After checkpoint, immediately reprint the current status in-place to avoid blank line
                    print(log_line.ljust(100), end='\r')


            except Exception as e:
                # Catches unexpected errors *during result retrieval*; the event is recorded as failed.
                # Print fatal error on a new line, padded
                print(f"\n--- ❌ FATAL ERROR processing result for Event {event_id}: {type(e).__name__} ---".ljust(100))
                failed_event_ids.append(event_id)

    # --- End of Loop ---
    # Final cleanup print to clear the last in-place status line before the summary
    print(" " * 100, end='\r')

    # 3. Final Summary
    total_run_time = time.time() - start_time
    total_minutes = int(total_run_time // 60)
    total_seconds = int(total_run_time % 60)

    print("\n" + "=" * 50)
    print(f"✅ Finished! Match payloads obtained for {successful_count}/{events_to_scrape_count} requested events.")
    if failed_event_ids:
        print(f"⚠️ Failed Event IDs ({len(failed_event_ids)}): {failed_event_ids}")
    print(f"Total run time = {total_minutes} m and {total_seconds} s. (Parallel Mode)")
    print("---🟢 Scraping finished. 🟢---")

---🚀 Starting Obtaining Match Payloads 🚀---

--- 🟠 Starting Check on 74 existing files in ../Data/Raw/Match_payloadst... 🟠 ---

--- CHECK COMPLETE: 281/302 events remaining to scrape. ---
✅ Total Events: 302 | Files Checked: 74 | Already Obtained: 21 | To Scrape: 281

---🚀 Starting Concurrent Scraping for 281 Events 🚀---
--- Using 20 threads. API pause: 0.1s - 0.2s ---
Event 2099: (3/281)  ✅ Found 0 matches for event:2099. Pausing for 0.2s.                            