In [None]:
import concurrent.futures
import glob
import json
import os
import random
import re
import sys
import time
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
import requests

In [None]:
#----------- Configuration--------------------
# Date stamp (YYYYMMDD) taken at execution time; used to name the output file.
# NOTE(review): MAX_WORKERS, MIN_PAUSE, MAX_PAUSE, LINKS_MAP, LOGOS_MAP and
# EVENTS_TABLES_DF are referenced by later cells but are not defined in the cells
# shown here - confirm where they are defined, otherwise a fresh
# "Restart & Run All" fails with NameError.
date = datetime.now().strftime("%Y%m%d")


# Intermediate events files: directory, filename suffix, and a regex that matches
# base names of the form <8 digits><suffix>.
INTERMEDIATE_DIR= "../Data/Processed/Events/Intermediate/"
INTERMEDIATE_EVENTS_FILENAME_SUFFIX = "_intermediate_events.csv"
INTERMEDIATE_EVENTS_REGEX = rf"^\d{{8}}{re.escape(INTERMEDIATE_EVENTS_FILENAME_SUFFIX)}$"


# Master events files: directory, filename suffix, and a regex that matches
# base names of the form <8 digits><suffix>.
MASTER_EVENTS_DIR = "../Data/Master/Events"
MASTER_EVENTS_FILENAME_SUFFIX = "_master_events.csv"
MASTER_EVENTS_REGEX = rf"^\d{{8}}{re.escape(MASTER_EVENTS_FILENAME_SUFFIX)}$"

# Name and full path of the master file written at the end of the notebook.
# NOTE(review): this name ends in "_events_master.csv" while MASTER_EVENTS_REGEX
# only matches "<8 digits>_master_events.csv", so a file saved under
# MASTER_OUTPUT_NAME is not matched by MASTER_EVENTS_REGEX - confirm which
# naming is intended before aligning them.
MASTER_OUTPUT_NAME = f"{date}_events_master.csv"
MASTER_OUTPUT_PATH = os.path.join(MASTER_EVENTS_DIR, MASTER_OUTPUT_NAME)





In [None]:
def get_latest_master_events(master_dir:str, master_regex) -> Optional[pd.DataFrame]:
    """
    Load the most recent MASTER events CSV found in ``master_dir``.

    Every ``*.csv`` file in the directory whose base name matches
    ``master_regex`` is a candidate. Candidates are sorted by path and the
    last one is read, so with a zero-padded date prefix (e.g. YYYYMMDD)
    "last" is the newest file.

    Args:
        master_dir (str): Folder that holds the master files
            (e.g., '../Data/Master/Events').
        master_regex: Pattern applied with ``re.match`` to each file's base
            name (e.g., r'^\\d{8}_master_events\\.csv$').

    Returns:
        Optional[pd.DataFrame]: The contents of the latest matching file, or
        None if the directory is missing, nothing matches, or reading fails.
    """
    if not os.path.isdir(master_dir):
        print (f"❌{master_dir} does not exist as a directory")
        return None

    csv_paths = glob.glob(f"{master_dir}/*.csv")
    if not csv_paths:
        print(f"❌ No existing *.csv files found in MASTER Events Directory: {master_dir} ")
        return None

    # Keep only files whose base name matches the expected pattern; sorting
    # puts the newest date-prefixed file last.
    master_files = sorted(
        path for path in csv_paths
        if re.match(master_regex, os.path.basename(path))
    )
    if not master_files:
        print(f"❌ No existing MASTER files in format: {master_regex} in {master_dir} ")
        return None

    latest_master = master_files[-1]

    try:
        latest_master_df = pd.read_csv(latest_master)
    except Exception as e:
        # Report the file that was actually being read, not the last globbed path.
        print (f"❌ Error reading lastest MASTER, {latest_master}: {e}")
        return None

    print(f"✅ {len(latest_master_df)} events found in latest MASTER: {latest_master} ")
    return latest_master_df

In [None]:
def get_latest_intermediate_events(intermediate_dir:str, intermediate_regex) -> Optional[pd.DataFrame]:
    """
    Load the most recent INTERMEDIATE events CSV found in ``intermediate_dir``.

    Every ``*.csv`` file in the directory whose base name matches
    ``intermediate_regex`` is a candidate. Candidates are sorted by path and
    the last one is read, so with a zero-padded date prefix (e.g. YYYYMMDD)
    "last" is the newest file.

    Args:
        intermediate_dir (str): Folder that holds the intermediate files
            (e.g., '../Data/Processed/Events/Intermediate/').
        intermediate_regex: Pattern applied with ``re.match`` to each file's
            base name (e.g., r'^\\d{8}_intermediate_events\\.csv$').

    Returns:
        Optional[pd.DataFrame]: The contents of the latest matching file, or
        None if the directory is missing, nothing matches, or reading fails.
    """
    if not os.path.isdir(intermediate_dir):
        print (f"❌{intermediate_dir} does not exist as a directory")
        return None

    csv_paths = glob.glob(f"{intermediate_dir}/*.csv")
    if not csv_paths:
        print(f"❌ No existing *.csv files found in INTERMEDIATE Events Directory: {intermediate_dir} ")
        return None

    # Keep only files whose base name matches the expected pattern; sorting
    # puts the newest date-prefixed file last.
    intermediate_files = sorted(
        path for path in csv_paths
        if re.match(intermediate_regex, os.path.basename(path))
    )
    if not intermediate_files:
        print(f"❌ No existing INTERMEDIATE files in format: {intermediate_regex} in {intermediate_dir}")
        return None

    latest_intermediate = intermediate_files[-1]

    try:
        latest_intermediate_df = pd.read_csv(latest_intermediate)
    except Exception as e:
        # Report the file that was actually being read, not the last globbed path.
        print (f"❌ Error reading lastest INTERMEDIATE, {latest_intermediate}: {e}")
        return None

    print(f"✅ {len(latest_intermediate_df)} events found in latest INTERMEDIATE: {latest_intermediate} ")
    return latest_intermediate_df

In [None]:
def get_unenriched_event_ids(df: pd.DataFrame) -> List[int]:
    """
    Return the ids of events that still need sponsor enrichment.

    An event is unenriched when its 'BallSponsor' or its 'TableSponsor' value
    is the placeholder string 'TBC' (sponsor data has not been searched for
    yet). Missing values (None/NaN) are not selected: they mean a previous
    search found no sponsor.

    Args:
        df (pd.DataFrame): Events table with 'eventId', 'BallSponsor' and
            'TableSponsor' columns.

    Returns:
        List[int]: Unique event ids, in order of first appearance in ``df``.
    """
    # isin() always yields plain booleans, so missing values can never select a row.
    needs_enrichment = df[['BallSponsor', 'TableSponsor']].isin(['TBC']).any(axis=1)

    return df.loc[needs_enrichment, 'eventId'].drop_duplicates().tolist()

In [None]:
def get_sponsors(event_id:Union[int,str], min_pause:float, max_pause:float) -> Optional[List[Dict[str, Any]]]:
    """
    Function used for parralel, threaded api calls to fetch sponsor details,
    for one event specified by event_id. Returns JSON Dict of the 

    Args:
        event_id (int): The unique id of the event.
        min_pause (float): Minimum pause duration (seconds).
        max_pause (float): Maximum pause duration (seconds).

    Returns:
        (sponsors_list)  A list of sponsors data. 
    """

    # initialise variables to be returned in the final dictionary.
   
    
    sponsors_list = None    

    
    # define api url and headers.
    url = f"https://wtt-website-api-prod-3-frontdoor-bddnb2haduafdze9.a01.azurefd.net/api/cms/GetEventEquipmentwithLogo/{event_id}" 
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "en-GB,en;q=0.9,es=q=0.8",
        "cache-control": "no-cache",
        "dnt": "1",
        "origin": "https://www.worldtabletennis.com",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://www.worldtabletennis.com/",
        "sec-ch-ua": "\"Chromium\";v=\"140\", \"Not=A?Brand\";v=\"24\", \"Google Chrome\";v=\"140\"",
        "sec-ch-ua-mobile": "?1",
        "sec-ch-ua-platform": "\"Android\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "secapimkey": "S_WTT_882jjh7basdj91834783mds8j2jsd81",
        "user-agent": "Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36"
    }

    # default status
    status_msg = "OK" 

    # make the api call and get response as a json. Raise errors if they occur
    try:       
        print(f"[{event_id}] Making request...", flush=True) 
        response = requests.get(url, headers=headers, timeout=15)
        # raise an error for bad status codes (4xx or 5xx)
        response.raise_for_status() 
        print(f"[{event_id}] Request success (Status: {response.status_code}). Parsing JSON...", flush=True) # Log success
        sponsors_json = response.json()

        #  check that response contains data and is a list:        
        if sponsors_json and isinstance(sponsors_json,list):
            sponsors_list = sponsors_json
            # raise an error for bad status codes (4xx or 5xx)
            sleep_duration = random.uniform(min_pause, max_pause)
            time.sleep(sleep_duration)
    
            # Return the list (can be empty if API returned [], or None if checks failed)
            return sponsors_list, event_id 
        else:
            status_msg = "Invalid JSON structure or empty list"
            print(f"--- ⚠️ [{event_id}] Warning: {status_msg} ---", flush=True) # Log invalid data
        
   
    except requests.exceptions.HTTPError as e:
        status_msg = f"HTTP Error {e.response.status_code}"
        print(f"--- ❌ [{event_id}] Failed: {status_msg} ---", flush=True) # Log HTTP Error
    except requests.exceptions.Timeout:
        status_msg = "Timeout"
        print(f"--- ❌ [{event_id}] Failed: {status_msg} ---", flush=True) # Log Timeout
    except requests.exceptions.RequestException as e:
        status_msg = f"Request Error {type(e).__name__}"
        print(f"--- ❌ [{event_id}] Failed: {status_msg} ---", flush=True) # Log other Request Errors
    except Exception as e: # Catch any other unexpected errors (like JSON decode error)
        status_msg = f"Unexpected Error {type(e).__name__}: {e}"
        print(f"--- ❌ [{event_id}] Failed: {status_msg} ---", flush=True) # Log unexpected errors

    # Pause for API politness before returning None and continuing
    sleep_duration = random.uniform(min_pause, max_pause)
    time.sleep(sleep_duration)
    return None, event_id

   

In [None]:
def _overlay_sponsor_column(events_df, sponsor_df, column):
    """Left-merge ``sponsor_df[column]`` onto ``events_df`` by eventId, preferring the new value."""
    fresh_values = sponsor_df[['eventId', column]]

    # Without an existing column there is nothing to reconcile (and pandas would
    # not apply the merge suffixes), so the merged column is used as-is.
    if column not in events_df.columns:
        return events_df.merge(fresh_values, on='eventId', how='left')

    combined = events_df.merge(
        fresh_values,
        on='eventId',
        how='left',
        suffixes=('_original', '_new')
    )
    # Use the new value where there is one, otherwise keep the original value.
    combined[column] = combined[f'{column}_new'].combine_first(combined[f'{column}_original'])
    return combined.drop(columns=[f'{column}_original', f'{column}_new'])


def merge_sponsors_and_overwrite(intermediate_df, balls_df, tables_df):
    """
    Merge scraped sponsor data onto the events table.

    'TBC' placeholders are replaced by missing values first. Then, for
    'BallSponsor' and 'TableSponsor' in turn, the value from ``balls_df`` /
    ``tables_df`` is used where one exists for the event, otherwise the
    value already in ``intermediate_df`` is kept.

    Args:
        intermediate_df (pd.DataFrame): Events table with an 'eventId' column
            and, optionally, 'BallSponsor' / 'TableSponsor' columns. It is
            not modified.
        balls_df (pd.DataFrame): Has 'eventId' and 'BallSponsor' columns.
        tables_df (pd.DataFrame): Has 'eventId' and 'TableSponsor' columns.

    Returns:
        pd.DataFrame: A new DataFrame with the reconciled sponsor columns.
        NOTE(review): duplicate eventIds in ``balls_df`` / ``tables_df`` will
        duplicate event rows (left merge) - confirm inputs are unique per event.
    """
    # replace() without inplace returns a new frame, so the caller's data is untouched.
    merged_df = intermediate_df.replace('TBC', pd.NA)

    merged_df = _overlay_sponsor_column(merged_df, balls_df, 'BallSponsor')
    merged_df = _overlay_sponsor_column(merged_df, tables_df, 'TableSponsor')

    return merged_df

In [None]:
if __name__ == "__main__":
    # Scraping driver: pick the base events table (latest master, else latest
    # intermediate), find events whose sponsors are still 'TBC', and fetch their
    # sponsor records concurrently into all_raw_sponsors_list.
    # MAX_WORKERS, MIN_PAUSE and MAX_PAUSE must already be defined when this
    # cell runs; they are not set in this cell.

    # record time for logging purposes.
    start_time = time.time()

    # Ensure the master output directory exists for the check
    os.makedirs(MASTER_EVENTS_DIR, exist_ok=True)
    # Ensure the intermediate output directory exists for saving the output
    os.makedirs(INTERMEDIATE_DIR, exist_ok=True)

    # Check if a master file already exists: returns None if no master is found
    master_df = get_latest_master_events(MASTER_EVENTS_DIR, MASTER_EVENTS_REGEX)

    if master_df is not None and not master_df.empty:
        base_df = master_df.copy()
        # Placeholder - need to consider how to handle this ?
        print(f"Using existing master file as base data ({len(base_df)} events).")
    else:
        # check for intermediate files
        intermediate_df = get_latest_intermediate_events(INTERMEDIATE_DIR, INTERMEDIATE_EVENTS_REGEX)

        if intermediate_df is not None and not intermediate_df.empty:
            base_df = intermediate_df.copy()
            print(f"Using intermediate file ({len(base_df)} events) for enrichment.")
        else:
            print("--- ❌ FATAL ERROR: No base file (Master or Intermediate) found. Cannot proceed. ---")
            sys.exit(1)

    # Use base_df (set on both branches above); intermediate_df does not exist
    # when a master file was used as the base.
    ids_to_get_sponsors = get_unenriched_event_ids(base_df)
    ids_to_get_sponsors_count = len(ids_to_get_sponsors)

    # if nothing to get sponsors for - sys.exit(0) as the job is done (or something went wrong?)
    if ids_to_get_sponsors_count == 0:
        print("\n--- ✅ All events are already enriched. ---")
        time_taken = time.time() - start_time
        print(f"Total time taken = {time_taken:.2f} s.")
        print("---🟢 Scraping finished. 🟢---")
        sys.exit(0)

    print(f"\n---🚀 Starting Concurrent Scraping for {ids_to_get_sponsors_count} Events 🚀---")
    print(f"--- Using {MAX_WORKERS} threads. API pause: {MIN_PAUSE:.1f}s - {MAX_PAUSE:.1f}s ---")

    # initialise list and counts to be added to.
    all_raw_sponsors_list = []
    processed_count = 0
    success_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

        # Map each future back to the event id it was submitted for.
        futures = {
            executor.submit(get_sponsors, event_id, MIN_PAUSE, MAX_PAUSE): event_id
            for event_id in ids_to_get_sponsors}

        for future in concurrent.futures.as_completed(futures):
            processed_count += 1
            # Look the id up from the futures map: if result() raises, nothing
            # has been unpacked yet, so the id cannot come from the result.
            submitted_event_id = futures[future]
            try:
                result, _ = future.result()
            except Exception as e:
                print(f"Error with event id {submitted_event_id}: {e}")
                continue

            if result is not None:
                all_raw_sponsors_list.extend(result)
                success_count += 1

    end_time = time.time()
    time_taken = end_time - start_time

    print(f"✅ Finished! Sponsor data collected for {success_count}/{ids_to_get_sponsors_count} events.")
    print(f"Time Elapsed = {time_taken}")
    print("---🟢 Scraping finished. 🟢---")
    

     

--- ❌ [3031] Failed: Request Error ConnectionError ---
--- ❌ [2521] Failed: Request Error ConnectionError ---
--- ❌ [2522] Failed: Request Error ConnectionError ---
--- ❌ [2531] Failed: Request Error ConnectionError ---
--- ❌ [2532] Failed: Request Error ConnectionError ---
--- ❌ [2535] Failed: Request Error ConnectionError ---
--- ❌ [2533] Failed: Request Error ConnectionError ---
--- ❌ [2534] Failed: Request Error ConnectionError ---
--- ❌ [2536] Failed: Request Error ConnectionError ---
--- ❌ [2537] Failed: Request Error ConnectionError ---
--- ❌ [2539] Failed: Request Error ConnectionError ---


In [None]:
# Build the enriched master events file from base_df and the scraped sponsors.
# Priority per event:
#   TableSponsor: manual entry (EVENTS_TABLES_DF) > scraped value > value already in base_df
#   BallSponsor:  scraped value > value already in base_df
# 'TBC' placeholders in base_df count as missing.
# LINKS_MAP, LOGOS_MAP and EVENTS_TABLES_DF must already be defined when this
# cell runs; EVENTS_TABLES_DF is assumed to carry 'eventId' and 'TableSponsor'
# columns - TODO confirm.

SPONSOR_COLUMNS = ['BallSponsor', 'TableSponsor']
REQUIRED_SPONSOR_FIELDS = ['eventId', 'sponsorTypeName', 'sponsorLink', 'logo']


def _first_available(*candidates):
    """Return, element-wise, the first non-missing value among index-aligned Series."""
    chosen = candidates[0]
    for candidate in candidates[1:]:
        chosen = chosen.where(chosen.notna(), candidate)
    return chosen


# Sponsor values already present in the base file, keyed by eventId. They are the
# last-resort fallback so events that were not re-scraped keep their values.
existing_sponsors_df = (
    base_df.reindex(columns=['eventId', *SPONSOR_COLUMNS])
    .replace('TBC', pd.NA)
    .dropna(subset=['eventId'])
    .drop_duplicates(subset='eventId')
    .set_index('eventId')
)

# Base events without the sponsor columns; errors='ignore' drops whichever of the
# two columns exist instead of dropping nothing when one is absent.
events_df = base_df.drop(columns=SPONSOR_COLUMNS, errors='ignore')

sponsors_df = pd.DataFrame(all_raw_sponsors_list)

# When every request failed the frame has no columns at all; add the ones used
# below as empty so the rest of the cell still runs and simply finds no sponsors.
missing_fields = [field for field in REQUIRED_SPONSOR_FIELDS if field not in sponsors_df.columns]
sponsors_checked_df = sponsors_df.assign(**{field: pd.NA for field in missing_fields})

# filter for equipment sponsors (balls and tables); na=False keeps rows with a
# missing sponsorTypeName out of the mask instead of breaking the indexing.
sponsor_type = sponsors_checked_df['sponsorTypeName']
equipment_mask = (
    sponsor_type.str.contains("ball", case=False, na=False)
    | sponsor_type.str.contains("table", case=False, na=False)
)
# Create EQUIPMENT sponsors dataframe
equipment_df = sponsors_checked_df[equipment_mask].copy()
all_equipment_df = equipment_df.copy()

# Where link is available, use that to map to brand name using LINKS_MAP (manually created);
# otherwise fall back to the logo mapping (LOGOS_MAP).
brand_from_link = equipment_df['sponsorLink'].map(LINKS_MAP.get)
brand_from_logo = equipment_df['logo'].map(LOGOS_MAP.get)
equipment_df['Brand'] = _first_available(brand_from_link, brand_from_logo)

# get balls and tables dataframes separately: one row per event, rows without a
# resolved brand removed, so the lookups below cannot multiply event rows.
equipment_type = equipment_df['sponsorTypeName']

balls_df = (
    equipment_df.loc[equipment_type.str.contains("ball", case=False, na=False), ['eventId', 'Brand']]
    .rename(columns={"Brand": "BallSponsor"})
    .dropna(subset=['eventId', 'BallSponsor'])
    .drop_duplicates(subset='eventId')
)

tables_df = (
    equipment_df.loc[equipment_type.str.contains("table", case=False, na=False)]
    .rename(columns={"Brand": "TableSponsor"})
    .dropna(subset=['eventId', 'TableSponsor'])
    .drop_duplicates(subset='eventId')
)

# Add the manual table sponsor data from EVENTS_TABLES_DF
enriched_df = pd.merge(left=events_df, right=EVENTS_TABLES_DF, on='eventId', how='left')

# Look values up by eventId; events with no scraped/existing value simply stay
# missing instead of raising (the previous per-event .values[0] lookup failed
# for any event without a scraped table sponsor).
event_ids = enriched_df['eventId']

enriched_df['TableSponsor'] = _first_available(
    enriched_df['TableSponsor'],
    event_ids.map(tables_df.set_index('eventId')['TableSponsor']),
    event_ids.map(existing_sponsors_df['TableSponsor']),
)

enriched_df['BallSponsor'] = _first_available(
    event_ids.map(balls_df.set_index('eventId')['BallSponsor']),
    event_ids.map(existing_sponsors_df['BallSponsor']),
)

enriched_df.to_csv(MASTER_OUTPUT_PATH, index=False)

print(f"✅ Enriched master events file saved to: {MASTER_OUTPUT_PATH} ")


In [None]:
# Count how many events have each TableSponsor value in the enriched output.
enriched_df["TableSponsor"].value_counts()

In [None]:
# Preview the raw scraped sponsor records (first rows only, not the whole frame).
sponsors_df.head()