In [15]:
import json
import os
import random
import time
from datetime import datetime
import re
import pandas as pd
import requests
from typing import Union, Dict, List
import sys

In [16]:
"""
This script fetches the raw JSON containing all "match payloads" (match metadata)
    for each WTT event ID listed in an events file, using one GET request per event.

    Match codes contained in the payloads are required for the subsequent API call
    that gets full match details.

    Reverse engineered from WTT events pages such as:
    https://www.worldtabletennis.com/eventInfo?eventId=3085&selectedTab=Matches

    EVENTS_FILE is a csv listing the events to be scraped; it is read with pandas and
    must contain an "EventId" column holding each event's unique ID.

    One csv file is written per event, containing all match payloads for that event.
"""

# --- CONFIGURATION ---

# Path to the csv file containing the events listing (read via its "EventId" column).
EVENTS_FILE = "Data/Processed/Events/shortlist_events.csv"

# Directory where payloads are saved, as a separate csv for each event scraped
# (files are named "<event_id>_match_payloads.csv"). Created if it does not exist.
OUTPUT_DIR = "Data/Raw/Match_payloads_test"

# Bounds, in seconds, for the random pause taken after each successfully scraped
# event to avoid overloading the API. The pause is drawn uniformly between the two.
MIN_PAUSE = 0.1
MAX_PAUSE = 0.2


In [17]:
def get_raw_match_payloads(
    event_id: Union[int, str], timeout: float = 30.0
) -> Union[Dict, List, None]:
    """
    Fetches the raw JSON containing all "match payloads" (match metadata)
    for a given WTT event ID using a GET request.

    Match codes contained in the payloads are required for the subsequent API call
    that gets full match details.

    Reverse engineered from WTT events pages such as:
    https://www.worldtabletennis.com/eventInfo?eventId=3085&selectedTab=Matches

    All failures are reported with a printed message and signalled by returning
    None; no exception from the request or from JSON parsing is propagated.

    Args:
        event_id (int or str): Unique id code for the WTT event to be scraped.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, requests may block indefinitely on a stalled
            connection.

    Returns:
        dict, list or None: The parsed JSON body of the response, returned as-is
        (the scraping loop in this file expects a list of match payloads).
        None on HTTP error, connection error/timeout, or an invalid JSON body.
    """

    url = "https://liveeventsapi.worldtabletennis.com/api/cms/GetOfficialResult"

    # The API expects the event id as a string query parameter.
    params = {
        'EventId': str(event_id),
        "DocumentCode": "TTE"
    }

    # Define essential headers for robust scraping
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Referer': 'https://www.worldtabletennis.com/',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36'
    }

    try:
        # Execute GET request. 'params' automatically builds the query string.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print(f"❌ HTTP Error for Event {event_id}: {err}")
        return None
    except requests.exceptions.RequestException as err:
        # Covers connection failures and timeouts.
        print(f"❌ Connection Error for Event {event_id}: {err}")
        return None

    # Parse the body in its own try block: the decode error raised by
    # response.json() can itself be a RequestException subclass, so handling it
    # alongside the network errors would mislabel it as a connection error.
    # ValueError is the common base of the JSON decode errors.
    try:
        return response.json()
    except ValueError:
        print(f"❌ JSON Decode Error for Event {event_id}: Invalid response.")
        return None

In [18]:
if __name__ == "__main__":
    # Driver: for every event in EVENTS_FILE, fetch its match payloads and save
    # them to "<OUTPUT_DIR>/<event_id>_match_payloads.csv", then print a summary.

    # Start timing the run to monitor time elapsed during scraping
    start_time = time.time()

    print("---🚀 Starting Obtaining Match payloads  🚀---")

    # create output directory if it does not yet exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # load events file into a dataframe if the file is found
    try:
        events_df = pd.read_csv(EVENTS_FILE)
    except FileNotFoundError:
        print(f"--- ❌ FILE ERROR: Input file not found at {EVENTS_FILE}. Check events_file configuration ---")
        # Non-zero status so callers can tell the run did not succeed.
        sys.exit(1)

    # The rest of the run reads the "EventId" column; fail with a clear message
    # rather than a KeyError if it is absent.
    if "EventId" not in events_df.columns:
        print(f"--- ❌ FILE ERROR: No 'EventId' column found in {EVENTS_FILE}. Check events_file configuration ---")
        sys.exit(1)

    # ensure events are sorted by eventID for clarity
    events_df = events_df.sort_values("EventId").reset_index(drop=True)

    events_total = len(events_df)
    successful_events_ids: List[int] = []
    failed_events_ids: List[int] = []

    if events_total == 0:
        print("--- ❌ PROCESS CANCELLED: No events found: Check previous steps and events_file ---")

    # enumerate gives a 1-based progress counter that does not depend on the
    # dataframe index.
    for position, event_id in enumerate(events_df["EventId"], start=1):

        # print progress as scraping occurs
        print(f"Processing event: {event_id}: ({position}/{events_total})")

        # fetch the raw match payloads json from the api
        raw_payloads = get_raw_match_payloads(event_id)

        # None signals a failed request; an empty container means no matches.
        if not raw_payloads:
            print (f"Failed or EMPTY repsonse, skipping event: {event_id}")
            failed_events_ids.append(event_id)
            continue

        # check if response is in the expected format for current API, i.e. a list of match payloads in this case
        if not isinstance(raw_payloads, list):
            print(f"❌ Json response for event: {event_id} was not a list as expected. Skipping.")
            failed_events_ids.append(event_id)
            continue

        # convert the list of match payloads to a dataframe
        try:
            payloads_df = pd.DataFrame(raw_payloads)
        except ValueError:
            print(f"❌ Error converting response to DataFrame for event: {event_id}. Skipping.")
            failed_events_ids.append(event_id)
            continue

        filename = os.path.join(OUTPUT_DIR, f"{event_id}_match_payloads.csv")
        payloads_df.to_csv(filename, index=False)

        # Record the event id (not the dataframe) only once its csv is on disk.
        successful_events_ids.append(event_id)

        # get a random duration to pause for
        sleep_duration = random.uniform(MIN_PAUSE, MAX_PAUSE)

        print(f" ✅ Obtained {len(payloads_df)} matches. Pausing for {sleep_duration:.1f}s.")

        time.sleep(sleep_duration)

    total_run_time = time.time() - start_time
    total_minutes = int(total_run_time // 60)
    total_seconds = int(total_run_time % 60)

    total_events_scraped = len(successful_events_ids)

    print("\n" + "="*50)
    print(f"✅ Finished! Match payloads obtained for {total_events_scraped}/{events_total} events.")
    if failed_events_ids:
        # Surface the skipped events so they can be retried or investigated.
        print(f"Failed or empty events ({len(failed_events_ids)}): {failed_events_ids}")
    print(f"Total run time = {total_minutes} m and {total_seconds} s.")
    print("---🟢 Scraping finished. 🟢---")
        
        

    
    

    


---🚀 Starting Obtaining Match payloads  🚀---
Processing event: 2263: (1/166)
Failed or EMPTY repsonse, skipping event: 2263
Processing event: 2265: (2/166)
Failed or EMPTY repsonse, skipping event: 2265
Processing event: 2345: (3/166)
 ✅ Obtained 183 matches. Pausing for 0.2s.
Processing event: 2346: (4/166)


KeyboardInterrupt: 