In [1]:
import json
import os
import random
import time
import pandas as pd
import requests
import sys
import concurrent.futures 
from typing import Union, Dict, List, Tuple, Any
import glob 

In [2]:
"""
This script fetches the raw JSON containing all "match payloads" (match metadata)
    for a given WTT event ID using a GET request.
    
    The match codes contained in each payload are required for the subsequent API call that gets full match details.
    
    Reverse engineered from WTT events pages such as:
    https://www.worldtabletennis.com/eventInfo?eventId=3085&selectedTab=Matches

    EVENTS_FILE is a csv listing the events to be scraped, identified by their unique event ID.

    A csv file is made for each event containing all match payloads for that event.

    Threading has been implemented to speed up the process.
    
"""
# --- CONFIGURATION ---

# Path to the csv listing the events to scrape (loaded with pd.read_csv in the main cell).
EVENTS_FILE = "../Data/Processed/Events/shortlist_events.csv"

# Directory that receives one csv per event, named "<eventId>_match_payloads.csv".
OUTPUT_DIR = "../Data/Raw/Match_payloads"

# Lower and upper bounds (in seconds) of the random pause each worker takes
# after a successful fetch, for API politeness.
MIN_PAUSE = 0.1 
MAX_PAUSE = 0.2 

# Number of worker threads used for the IO-bound requests.
# Based on reading - 20 is a good starting number:
MAX_WORKERS = 20 

# Maximum number of retry rounds for events whose request failed.
MAX_RETRIES = 10 

In [3]:
def fetch_and_save_payloads(event_id: Union[int, str], output_dir: str, min_pause: float, max_pause: float) -> Tuple[int, bool, int, str]:
    """
    Fetch the match payloads for one event and save them to a CSV file.

    Intended for use with thread-pool workers: exceptions raised while
    requesting, parsing or saving are caught and reported through the
    returned tuple instead of propagating.

    Args:
        event_id: Event ID sent to the API and used as the CSV filename prefix.
        output_dir: Directory the "<event_id>_match_payloads.csv" file is written to.
        min_pause: Lower bound (seconds) of the random pause after a successful fetch.
        max_pause: Upper bound (seconds) of the random pause after a successful fetch.

    Returns:
        (event_id, status_bool, match_count, status_message). On failure
        status_bool is False and match_count is 0.
    """

    # Define API endpoint URL and necessary params + headers.
    url = "https://liveeventsapi.worldtabletennis.com/api/cms/GetOfficialResult"
    params = {'EventId': str(event_id), "DocumentCode": "TTE"}
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Referer': 'https://www.worldtabletennis.com/',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36'
    }

    try:
        # The timeout stops a stalled request from blocking the worker thread indefinitely.
        response = requests.get(url, params=params, headers=headers, timeout=20)

        # Raise an exception for bad status codes (4xx request errors or 5xx server errors).
        response.raise_for_status()
        raw_payloads = response.json()

        # The endpoint is expected to return a list of payloads; anything else is a failure.
        if not isinstance(raw_payloads, list):
            return event_id, False, 0, f"JSON was not a list ({type(raw_payloads).__name__})"

        # Convert the payloads to a DataFrame; its length is the number of matches.
        payloads_df = pd.DataFrame(raw_payloads)
        match_count = len(payloads_df)

        # Save into the directory the caller asked for (even if the content is blank).
        filename = os.path.join(output_dir, f"{event_id}_match_payloads.csv")
        payloads_df.to_csv(filename, index=False)

        # Random pause for API politeness.
        sleep_duration = random.uniform(min_pause, max_pause)
        time.sleep(sleep_duration)

        return event_id, True, match_count, f"Found {match_count} matches for event:{event_id}. Pausing for {sleep_duration:.1f}s."

    except requests.exceptions.HTTPError as e:
        # e.response can be None when the error was raised without a response attached.
        status_code = e.response.status_code if e.response is not None else "unknown"
        status_msg = f"HTTP Error: {status_code}"
    except Exception as e:
        # Any other failure (connection, timeout, JSON decoding, file write, ...).
        status_msg = f"Error: {type(e).__name__}"

    return event_id, False, 0, status_msg




In [4]:
def filter_events_to_scrape(shortlist_df: pd.DataFrame, output_dir: str) -> pd.Series:
    """
    Checks the event shortlist and output directory to return the event IDs
    that need to be scraped. An event needs scraping if its 'EventStatus' is
    'Ongoing' OR if its payload file does not exist in the output directory.

    Args:
        shortlist_df (pd.DataFrame): Shortlist of events (must contain 'eventId' and 'EventStatus').
        output_dir (str): The directory where existing match payloads are saved.

    Returns:
        pd.Series: The 'eventId' values of the events that need to be scraped.
            An empty Series is returned if a required column is missing.
    """

    events_total_initial = len(shortlist_df)

    print(f"\n--- 🟠 Starting Check on {events_total_initial} events to determine scrape list... 🟠 ---")

    # Validate every column used below up front, so a malformed shortlist is
    # reported cleanly instead of raising a KeyError part-way through.
    missing_columns = [col for col in ("eventId", "EventStatus") if col not in shortlist_df.columns]
    if missing_columns:
        print(f"--- ❌ ERROR: Missing required column(s) {missing_columns} in shortlist DataFrame. Cannot proceed. ---")
        return pd.Series([], dtype=int) # Return empty Series

    # Scrape if EITHER:
    # the event is ongoing (i.e. matches are still being played)
    # or the payload file does NOT exist (not yet been scraped)
    ids_to_scrape = [
        event_id
        for event_id, event_status in zip(shortlist_df['eventId'], shortlist_df['EventStatus'])
        if event_status == "Ongoing"
        or not os.path.exists(os.path.join(output_dir, f"{event_id}_match_payloads.csv"))
    ]

    # Filter the original DataFrame on the IDs identified for scraping
    events_to_scrape_df = shortlist_df[shortlist_df['eventId'].isin(ids_to_scrape)].copy()

    events_to_scrape_count = len(events_to_scrape_df)
    already_obtained_and_completed_count = events_total_initial - events_to_scrape_count # Calculate skipped count

    print(f"\n--- CHECK COMPLETE: {events_to_scrape_count}/{events_total_initial} events identified for scraping. ---")
    print(f"✅ Total Events: {events_total_initial} | Already Obtained & Completed: {already_obtained_and_completed_count} | To Scrape: {events_to_scrape_count}")

    # Return the clean Series of IDs that need scraping
    return events_to_scrape_df["eventId"]

In [10]:
def _run_scrape_batch(event_ids: List[Any], loop_start_time: float, retry_attempt: int = 0) -> Tuple[int, int, List[Any]]:
    """
    Scrape one batch of events concurrently and report failures/checkpoints.

    Args:
        event_ids: Event IDs to submit to fetch_and_save_payloads.
        loop_start_time: time.time() value the "elapsed" checkpoint figures are measured from.
        retry_attempt: 0 for the initial pass, otherwise the retry round number
            (only changes the wording of the log messages).

    Returns:
        (successful_count, matches_found, failed_ids) for this batch.
    """
    batch_total = len(event_ids)
    processed = 0
    succeeded = 0
    matches_found = 0
    failed_ids: List[Any] = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future back to its event ID so failures can be attributed.
        futures = {
            executor.submit(fetch_and_save_payloads, event_id, OUTPUT_DIR, MIN_PAUSE, MAX_PAUSE): event_id
            for event_id in event_ids
        }

        for future in concurrent.futures.as_completed(futures):
            processed += 1
            event_id = futures[future]
            try:
                _, status, match_count, status_msg = future.result()
                if status:
                    # Successes are counted silently; only failures are printed.
                    succeeded += 1
                    matches_found += match_count
                else:
                    failed_ids.append(event_id)
                    if retry_attempt:
                        print(f"Retry {retry_attempt}: Event {event_id} ({processed}/{batch_total}) ❌ Failed again: {status_msg}")
                    else:
                        print(f"Event {event_id}: ({processed}/{batch_total})  ❌ Failed: {status_msg}")

                # --- Checkpoint Log (Every 10 events within this batch) ---
                if processed % 10 == 0:
                    elapsed_time = time.time() - loop_start_time
                    minutes = int(elapsed_time // 60); seconds = int(elapsed_time % 60)
                    success_rate = succeeded / processed
                    if retry_attempt:
                        print(f"\n--- RETRY CHECKPOINT: {processed}/{batch_total} processed in attempt {retry_attempt}. Batch Success: {success_rate:.1%}. Total Elapsed: {minutes}m {seconds}s ---")
                    else:
                        print(f"\n---  {processed}/{batch_total} processed. Success Rate: {success_rate:.1%}. Elapsed: {minutes}m {seconds}s ---")

            except Exception as e:
                # Safety net: the worker reports its own errors, so this only
                # triggers for unexpected failures while retrieving the result.
                if retry_attempt:
                    print(f"\n--- ❌ FATAL ERROR during retry for Event {event_id}: {type(e).__name__} ---")
                else:
                    print(f"\n--- ❌ FATAL ERROR processing result for Event {event_id}: {type(e).__name__} ---")
                failed_ids.append(event_id)

    return succeeded, matches_found, failed_ids


if __name__ == "__main__":

    start_time = time.time() # Start timing the entire run
    print("---🚀 Starting Obtaining Match Payloads 🚀---")

    # Load the shortlist and work out which events still need scraping.
    try:
        shortlist_df = pd.read_csv(EVENTS_FILE)
    except FileNotFoundError:
        print(f"--- ❌ ERROR: Shortlist file not found at {EVENTS_FILE}. ---"); sys.exit(1)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    events_to_scrape_ids = filter_events_to_scrape(shortlist_df, OUTPUT_DIR)
    event_ids_to_process = events_to_scrape_ids.tolist()
    events_to_scrape_count = len(event_ids_to_process)

    if events_to_scrape_count == 0:
        print("\n--- ✅ PROCESS COMPLETE: No events remaining to scrape. ---"); sys.exit(0)

    start_time_loop = time.time()

    print(f"\n---🚀 Starting Initial Concurrent Scraping for {events_to_scrape_count} Events 🚀---")

    print(f"--- Using {MAX_WORKERS} threads. API pause: {MIN_PAUSE:.1f}s - {MAX_PAUSE:.1f}s ---")

    # --- Initial Parallel Execution ---
    successful_count, new_payoads_count, failed_event_ids = _run_scrape_batch(event_ids_to_process, start_time_loop)

    print(f"\n--- Initial scraping phase complete. {successful_count}/{events_to_scrape_count} succeeded initially. ---")

    # --- RETRY LOGIC BLOCK ---
    # Re-run only the failed events, up to MAX_RETRIES rounds or until none remain.
    retry_attempt = 0
    while failed_event_ids and retry_attempt < MAX_RETRIES:
        retry_attempt += 1
        ids_to_retry = list(failed_event_ids)

        print(f"\n--- Starting retry number {retry_attempt}/{MAX_RETRIES} for {len(ids_to_retry)} failed events ---")
        time.sleep(2) # Small pause before retry batch

        current_retry_successful_count, retry_match_count, failed_event_ids = _run_scrape_batch(
            ids_to_retry, start_time_loop, retry_attempt=retry_attempt
        )

        successful_count += current_retry_successful_count # Add successful retries to total
        # Count the matches found by THIS retry round (previously a stale value
        # left over from the initial loop was added instead).
        new_payoads_count += retry_match_count

        print(f"\n--- Retry Attempt {retry_attempt} complete. {current_retry_successful_count}/{len(ids_to_retry)} succeeded this attempt. ---")
        if failed_event_ids:
            print(f"--- {len(failed_event_ids)} events still failing after {retry_attempt} retries. ---")

    # --- End of Retry Logic ---

    # --- Final Summary ---
    total_run_time = time.time() - start_time
    total_minutes = int(total_run_time // 60)
    total_seconds = int(total_run_time % 60)

    print("\n" + "=" * 50)
    print(f"✅ Finished! Match payloads obtained for {successful_count}/{events_to_scrape_count} requested events (including retries).")
    print(f"{new_payoads_count}  matches found for new / ongoing events")
    if failed_event_ids:
        print(f"⚠️ Permanently Failed Event IDs ({len(failed_event_ids)}): {failed_event_ids}")
    print(f"Total run time = {total_minutes} m and {total_seconds} s.")
    print("---🟢 Scraping finished. 🟢---")
    

---🚀 Starting Obtaining Match Payloads 🚀---

--- 🟠 Starting Check on 302 events to determine scrape list... 🟠 ---

--- CHECK COMPLETE: 1/302 events identified for scraping. ---
✅ Total Events: 302 | Already Obtained & Completed: 301 | To Scrape: 1

---🚀 Starting Initial Concurrent Scraping for 1 Events 🚀---
--- Using 20 threads. API pause: 0.1s - 0.2s ---

--- Initial scraping phase complete. 1/1 succeeded initially. ---

✅ Finished! Match payloads obtained for 1/1 requested events (including retries).
18  matches found for new / ongoing events
Total run time = 0 m and 1 s.
---🟢 Scraping finished. 🟢---


In [9]:
%tb

NameError: name 'shortlist_df_object' is not defined