In [1]:
import requests
import json
import pandas as pd
import os
from typing import Union

In [2]:
"""
This script scrapes the WTT API for the
complete list of available events.
The event list for each year is saved as a CSV file.
"""

# --- CONFIGURATION ---

# Directory that the per-year CSV files are written to.
OUTPUT_DIR = "../Data/Raw/Events/"

# Years to scrape. `range` excludes its stop value, so this covers
# 2021 through 2025 (2025 being the current year at the time of writing).
YEARS = range(2021, 2026)


In [3]:
def get_events_by_year(year: Union[int, str], timeout: float = 30.0) -> Union[pd.DataFrame, bool]:
    """
    Fetch every event the WTT event-calendar API lists for one year.

    Data available in the API includes all events listed in the ITTF/WTT
    database, including non-WTT events.

    Reverse engineered from https://www.worldtabletennis.com/events_calendar

    Args:
        year (int or str): The year to fetch events for (e.g., 2024).
        timeout (float): Value passed to ``requests.post`` as its ``timeout``
            (seconds). Without it a stalled server would block the call
            indefinitely. Defaults to 30.0.

    Returns:
        pd.DataFrame or bool: DataFrame with one row per event for the year,
                              or False if an error occurs or no data is found.
    """

    # The year is interpolated into the filter string below, so normalise it to str.
    year = str(year)

    # API endpoint that can return all event listings for a specified year.
    url = 'https://wtt-website-api-prod-3-frontdoor-bddnb2haduafdze9.a01.azurefd.net/api/eventcalendar'

    # Including all headers from the cURL to maximize request fidelity and bypass simple API checks.
    # NOTE(review): 'secapimkey' is hardcoded here; presumably it is the key the public
    # site itself sends. Consider loading it from an environment variable -- confirm.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-GB,en;q=0.9,es;q=0.8',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'dnt': '1',
        'origin': 'https://www.worldtabletennis.com',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'referer': 'https://www.worldtabletennis.com/',
        'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'cross-site',
        'secapimkey': 'S_WTT_882jjh7basdj91834783mds8j2jsd81',
        'user-agent': 'Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36'
    }

    # The API expects the year inside a JSON-encoded filter string (a string field,
    # not a nested object), hence the hand-built text with the year embedded unquoted.
    data = {
        "custom_filter": f"[{{\"name\":\"StartDateTime\",\"value\":{year},\"custom_handling\":\"multimatch_year_or_filter\",\"condition\":\"or_start\"}},{{\"name\":\"FromStartDate\",\"value\":{year},\"custom_handling\":\"multimatch_year_or_filter\",\"condition\":\"or_end\"}}]"
    }

    try:
        print(f"Fetching events for {year}...")
        # POST is required because the year filter travels in the JSON payload.
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        # Raise an exception for bad status codes (4xx request errors or 5xx server errors).
        response.raise_for_status()

        response_data = response.json()
        # Expected shape: a list whose first item is a dict holding the events
        # under "rows" (each event is a dict). Anything else is treated as
        # "no data" rather than left to surface as an IndexError/AttributeError.
        first_item = response_data[0] if isinstance(response_data, list) and response_data else None
        events_list = first_item.get('rows') if isinstance(first_item, dict) else None

        # Check that data has been returned and accessed successfully.
        if not (events_list and isinstance(events_list, list)):
            print(f"No event data found for {year}.")
            return False

        # Build one DataFrame row per event and return it.
        df = pd.DataFrame(events_list)
        print(f"Found {len(df)} events from {year}")

        return df

    # Known failure modes first, then a catch-all so one bad year cannot abort a batch run.
    except requests.exceptions.HTTPError as err:
        print(f"❌ HTTP Error: {err}")
    except json.JSONDecodeError:
        print("❌ Error: The response was not valid JSON.")
    except requests.exceptions.RequestException as err:
        # Connection failures and timeouts end up here.
        print(f"❌ Request failed: {err}")
    except Exception as err:
        print(f"❌ An unexpected error occurred: {err}")

    return False





In [4]:
if __name__ == "__main__":

    print("--- 🚀 Starting WTT Events Scraper 🚀 --- \n    ")

    # Create the output directory if it does not exist yet.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for year in YEARS:

        # Returns a DataFrame of events for the year, or False on error / no data.
        year_df = get_events_by_year(year=year)

        if isinstance(year_df, pd.DataFrame) and not year_df.empty:
            # os.path.join avoids the doubled separator ("Events//raw_events_...")
            # that plain string formatting produced when OUTPUT_DIR ends with "/".
            filename = os.path.join(OUTPUT_DIR, f'raw_events_{year}.csv')

            # Save the individual year's events as a CSV file.
            year_df.to_csv(filename, index=False)
            print(f"✅ Data successfully saved to {filename}")

        # If an error occurred (or no data was found), skip the year.
        elif year_df is False:
            print(f"Skipping year {year} due to error.")

        # A DataFrame with nothing in it: report it instead of skipping silently.
        else:
            print(f"Skipping year {year}: no rows to save.")

    print("\n--- 🟢 Scraping finished. 🟢 ---")

--- 🚀 Starting WTT Events Scraper 🚀 --- 
    
Fetching events for 2021...
Found 100 events from 2021
✅ Data successfully saved to ../Data/Raw/Events//raw_events_2021.csv
Fetching events for 2022...
Found 94 events from 2022
✅ Data successfully saved to ../Data/Raw/Events//raw_events_2022.csv
Fetching events for 2023...
Found 107 events from 2023
✅ Data successfully saved to ../Data/Raw/Events//raw_events_2023.csv
Fetching events for 2024...
Found 137 events from 2024
✅ Data successfully saved to ../Data/Raw/Events//raw_events_2024.csv
Fetching events for 2025...
Found 172 events from 2025
✅ Data successfully saved to ../Data/Raw/Events//raw_events_2025.csv

--- 🟢 Scraping finished. 🟢 ---
