In [128]:
import pandas as pd
import os
import sys
import glob
import requests
from typing import List, Union,  Optional
import re
from datetime import datetime
import json

In [129]:
#----------- Configuration--------------------


INTERMEDIATE_EVENTS_DIR= "../Data/Processed/Events/Intermediate/"
INTERMEDIATE_EVENTS_SUFFIX = "_intermediate_events.csv"
INTERMEDIATE_EVENTS_REGEX = rf"^\d{{8}}{re.escape(INTERMEDIATE_EVENTS_SUFFIX)}$"


# Manual made table sponsor information 
EVENTS_SPONSORS_DIR = "../Data/Processed/Sponsors"
EVENTS_SPONSORS_SUFFIX = "_event_sponsors.csv"
EVENTS_SPONSORS_REGEX = rf"^\d{{8}}{re.escape(EVENTS_SPONSORS_SUFFIX)}$"


#used to return dummy /placeholder Df if no existing data is found 
# currently just used to check if df is empty: can be used 
MINIMAL_EVENT_COLUMNS = ["eventId"]

LINKS_MAP_FILE = "../Data/Processed/Sponsors/sponsor_links_map.json"
LOGOS_MAP_FILE = "../Data/Processed/Sponsors/sponsor_logos_map.json"

In [130]:
def get_latest_event_sponsor(event_sponsor_dir:str, event_sponsor_regex) -> Optional[pd.DataFrame]:
    """
    Parses specified directory for events files in format yyyy_mm_dd. 
    Attempts to read latest file in this format. 

    Args:
        directory (str): The folder where the event_sponsor files are stored (e.g., '../Data/Events/Intermediate').
        filename_pattern (str): The pattern to match (e.g., '*_event_sponsor.csv').

    Returns:
        Optional[pd.DataFrame]: The DataFrame of the latest file, or None if no files are found or reading fails.
    """
    if not os.path.isdir(event_sponsor_dir):
        print (f"‚ùå{event_sponsor_dir} does not exist as a directory")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS)   
    
    # Get csv files in 
    files = glob.glob(f"{event_sponsor_dir}/*.csv")
   

    event_sponsor_files = []
    
    if not files:
        print(f"‚ùå No existing *.csv files found in EVENTS_SPONSORS  Directory: {event_sponsor_dir} ")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None  

    for file in files:
        filename = os.path.basename(file)
       
        if re.match(event_sponsor_regex,filename):
           event_sponsor_files.append(file)

    if not event_sponsor_files:
        print(f"‚ùå No existing EVENTS_SPONSORS files in format: {event_sponsor_regex} in {event_sponsor_dir}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS)   
    event_sponsor_files.sort()    
    latest_event_sponsor = event_sponsor_files[-1]

    try: 
        latest_event_sponsor_df = pd.read_csv(latest_event_sponsor)
        print(f"‚úÖ {len(latest_event_sponsor_df)} events found in latest EVENTS_SPONSORS: {latest_event_sponsor} ")
        return latest_event_sponsor_df, latest_event_sponsor
        
    except Exception as e:
        print (f"‚ùå Error reading lastest EVENTS_SPONSORS, {file}: {e}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

In [131]:
def get_latest_intermediate_events(intermediate_events_dir:str, intermediate_events_regex) -> Optional[pd.DataFrame]:
    """
    Parses specified directory for events files in format yyyy_mm_dd. 
    Attempts to read latest file in this format. 

    Args:
        directory (str): The folder where the intermediate files are stored (e.g., '../Data/Events/Intermediate').
        filename_pattern (str): The pattern to match (e.g., '*_events_intermediate.csv').

    Returns:
        Optional[pd.DataFrame]: The DataFrame of the latest file, or None if no files are found or reading fails.
    """
    if not os.path.isdir(intermediate_events_dir):
        print (f"‚ùå{intermediate_events_dir} does not exist as a directory")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None   
    
    # Get csv files in 
    files = glob.glob(f"{intermediate_events_dir}/*.csv")
   

    intermediate_files = []
    
    if not files:
        print(f"‚ùå No existing *.csv files found in INTERMEDIATE Events Directory: {intermediate_events_dir} ")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    for file in files:
        filename = os.path.basename(file)
       
        if re.match(intermediate_events_regex,filename):
           intermediate_files.append(file)

    if not intermediate_files:
        print(f"‚ùå No existing INTERMEDIATE files in format: {intermediate_events_regex} in {intermediate_events_dir}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS),None 
    intermediate_files.sort()    
    latest_intermediate = intermediate_files[-1]

    try: 
        latest_intermediate_df = pd.read_csv(latest_intermediate)
        print(f"‚úÖ {len(latest_intermediate_df)} events found in latest INTERMEDIATE: {latest_intermediate} ")
        return latest_intermediate_df, latest_intermediate
        
    except Exception as e:
        print (f"‚ùå Error reading lastest INTERMEDIATE, {latest_intermediate}: {e}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None 

In [132]:
def get_missing_sponsor_ids(df: pd.DataFrame,sponsor:str) -> List[int]:
    """
    Identifies events in the DataFrame that already have valid sponsor data 
    (i.e., BallSponsor or TableSponsor is not 'TBC' and not None).
    Returns a list of these event is.
    """
    # Identify rows where either sponsor column is NOT 'TBC' AND NOT NaN.
    #'|' to see if either column has data 
    # TBC means no sponsor data has been searched for 
    # None means no sponsor data was found previously

    unenriched_mask = (df[sponsor] == 'TBC)') | (df[sponsor].isnull())
    
    ids = list(set(df[unenriched_mask]['eventId'].tolist()))

   
    return ids

In [133]:
def get_sponsors(event_id:Union[int,str]) -> Optional[pd.DataFrame]:
    """
    Function used for parralel, threaded api calls to fetch sponsor details,
    for one event specified by event_id. Returns dict of ball and table sponsor
       
    Returns:
        (sponsors_list)  A list of sponsors data direct from the API 
    """   
    
    # define api url and headers.
    url = f"https://wtt-website-api-prod-3-frontdoor-bddnb2haduafdze9.a01.azurefd.net/api/cms/GetEventEquipmentwithLogo/{event_id}" 
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "en-GB,en;q=0.9,es=q=0.8",
        "cache-control": "no-cache",
        "dnt": "1",
        "origin": "https://www.worldtabletennis.com",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://www.worldtabletennis.com/",
        "sec-ch-ua": "\"Chromium\";v=\"140\", \"Not=A?Brand\";v=\"24\", \"Google Chrome\";v=\"140\"",
        "sec-ch-ua-mobile": "?1",
        "sec-ch-ua-platform": "\"Android\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "secapimkey": "S_WTT_882jjh7basdj91834783mds8j2jsd81",
        "user-agent": "Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36"
    }
    
   

    # make the api call and get response as a json. Raise errors if they occur
    try:       
        
        response = requests.get(url, headers=headers, timeout=15)
        # raise an error for bad status codes (4xx or 5xx)
        response.raise_for_status()         
        sponsors_json = response.json()

        #  check that response contains data and is a list:        
        if sponsors_json and isinstance(sponsors_json,list):  
            print(f"Obtained Raw Sponsors API Data for {event_id}")          
            return pd.DataFrame(sponsors_json)
        
   
    except requests.exceptions.HTTPError as e:
        status_msg = f"HTTP Error {e.response.status_code}"
        print(f"--- ‚ùå [{event_id}] Failed:") 
    except requests.exceptions.Timeout:
        status_msg = "Timeout"
        print(f"--- ‚ùå [{event_id}] Failed: ") 
    except Exception as e:
        status_msg = f"Unexpected Error {type(e).__name__}: {e}"
        print(f"--- ‚ùå [{event_id}] Failed: {e}") # Log unexpected errors

    # Pause for API politness before returning None and continuing
   
   
    return None

   

In [134]:
def map_sponsor_name(sponsor_row, links_map, logos_map):
    """
    Tries to map a sponsor name, first by link, then by logo.
    """    
    links_map = json.load(open(LINKS_MAP_FILE))
    logos_map = json.load(open(LOGOS_MAP_FILE))
    if sponsor_row is None:
        return None
        
    # 1. Try to map the sponsorLink
    link = sponsor_row.get("sponsorLink")
    name = links_map.get(link)
    
    # 2. If mapping the link fails, fall back to mapping the logo
    if name is None:
        logo = sponsor_row.get("logo")
        name = logos_map.get(logo)
        
    return name

In [135]:
if __name__ == "__main__":   

    now_date_str = datetime.now().strftime("%Y%m%d")

    
    
    intermediate_df, _ = get_latest_intermediate_events(INTERMEDIATE_EVENTS_DIR, INTERMEDIATE_EVENTS_REGEX)
    if intermediate_df.empty:
        print("‚ùå FAILURE: Missing intermediate data")
        sys.exit(1)

    
    event_sponsors_df, event_sponsors_file = get_latest_event_sponsor(EVENTS_SPONSORS_DIR, EVENTS_SPONSORS_REGEX)
    if event_sponsors_df.empty:
        print ("‚ùå FAILURE: Missing event sponsor data")
        sys.exit(1)

    latest_sponsors_date_str = os.path.basename(event_sponsors_file).split("_")[0]
    latest_sponsors_date = pd.to_datetime(latest_sponsors_date_str)
    
    existing_sponsor_ids = event_sponsors_df["eventId"].to_list()
    new_sponsor_mask = intermediate_df["eventId"].isin(existing_sponsor_ids)
    new_ids = intermediate_df[~new_sponsor_mask]["eventId"].tolist()
   
   
        
    # get IDS which can still be scraped if info is available!
    missing_ball_ids = get_missing_sponsor_ids(event_sponsors_df,"BallSponsor") + new_ids
    missing_ball_set = set(missing_ball_ids)

    missing_table_ids = get_missing_sponsor_ids(event_sponsors_df,"TableSponsor") + new_ids
    missing_table_set = set(missing_table_ids)

    intermediate_df["StartDate"] = pd.to_datetime(intermediate_df["StartDate"])
    unupdated_events_filter = intermediate_df["StartDate"] > latest_sponsors_date
    unupdated_ids = intermediate_df[unupdated_events_filter]["eventId"].tolist()

    ids_to_scrape = missing_ball_set.union(missing_table_set)
    # ids_to_scrape = [event_id for event_id in ids_to_scrape if event_id in unupdated_ids]
    ids_to_scrape = set(ids_to_scrape).union(new_ids)
    ids_to_scrape = list(ids_to_scrape)

    

    

    

    new_sponsor_data = []

    print("--- üü¢ Commencing Sponsor Scrape üü¢---")
    try:
        with open(LINKS_MAP_FILE, 'r') as f:
            LOADED_LINKS_MAP = json.load(f)
        with open(LOGOS_MAP_FILE, 'r') as f:
            LOADED_LOGOS_MAP = json.load(f)
    except Exception as e:
        print(f"FATAL: Could not load sponsor map files. {e}")
        sys.exit(1)


    for i, event_id in enumerate(ids_to_scrape):
    
        print(f"Processing Event: {i+1}/{len(ids_to_scrape)} (ID: {event_id})    \r", end="", flush=True)
        
        # Request to get sponsors details from API call 
        sponsors_df = get_sponsors(event_id)
        # If response is bad / None/ empty, skip to next event to check
        if sponsors_df is None or sponsors_df.empty:            continue
        
        
        # Find the *first* row that matches "ball"
        ball_sponsor_row = sponsors_df[
            sponsors_df["sponsorTypeName"].str.contains("ball", case=False, na=False)
        ].iloc[0] if not sponsors_df[sponsors_df["sponsorTypeName"].str.contains("ball", case=False, na=False)].empty else None
        
        # Find the *first* row that matches "table"
        table_sponsor_row = sponsors_df[
            sponsors_df["sponsorTypeName"].str.contains("table", case=False, na=False)
        ].iloc[0] if not sponsors_df[sponsors_df["sponsorTypeName"].str.contains("table", case=False, na=False)].empty else None

        # try to map data 

        ball_sponsor_name = map_sponsor_name(ball_sponsor_row, LINKS_MAP_FILE, LOGOS_MAP_FILE)
        table_sponsor_name = map_sponsor_name(table_sponsor_row, LINKS_MAP_FILE, LOGOS_MAP_FILE)

        found_new_ball = ball_sponsor_name and (event_id in missing_ball_set)
        found_new_table = table_sponsor_name and (event_id in missing_table_set)

        # Only proceed if we found at least one *new* piece of data
        if found_new_ball or found_new_table:            
            # Log the success
            found_items = []
            if found_new_ball:
                found_items.append("Ball Sponsor")
            if found_new_table:
                found_items.append("Table Sponsor")
            
            found_str = " and ".join(found_items)
            print(f"\n--- üèì Found new {found_str} for Event ID: {event_id} üèì---", flush=True)

        
        new_sponsor_data.append({
            "eventId": event_id,
            "BallSponsor": ball_sponsor_name if found_new_ball else None,
            "TableSponsor": table_sponsor_name if found_new_table else None
        })


    print("\n--- ‚úÖ Scraping complete. Consolidating results... ---")
    new_sponsor_df = pd.DataFrame(new_sponsor_data)
   
    if new_sponsor_df.empty:
        print("--- ‚ö†Ô∏è No new sponsor data was found for any missing events sponsors. ---")
    else:
        print(f"--- Found {len(new_sponsor_df)} events with new sponsor data. ---")##

    if not new_sponsor_df.empty:
        print(f"--- Combining {len(new_sponsor_df)} new sponsor records with existing data... ---")
    

        SPONSOR_COLUMNS = ['eventId', 'BallSponsor', 'TableSponsor']
  
        event_sponsors_df = event_sponsors_df.set_index("eventId")
        new_sponsor_df = new_sponsor_df.set_index("eventId")
        combined_df = new_sponsor_df.combine_first(event_sponsors_df)
        final_sponsors_df = combined_df.reset_index()

        try:
            date_string = now_date_str
            final_sponsors_name = f"{date_string}{EVENTS_SPONSORS_SUFFIX}"
            final_sponsors_path = os.path.join(EVENTS_SPONSORS_DIR,final_sponsors_name)
            final_sponsors_df.to_csv(final_sponsors_path, index=False)
            print(f"--- üü¢ Successfully saved {len(final_sponsors_df)} total sponsored events to {final_sponsors_path} üü¢---")
        except Exception as e:
            print(f"--- ‚ùå FAILED to save updated sponsor file: {e} ---")
   


‚úÖ 187 events found in latest INTERMEDIATE: ../Data/Processed/Events/Intermediate/20251110_intermediate_events.csv 
‚úÖ 186 events found in latest EVENTS_SPONSORS: ../Data/Processed/Sponsors/20251108_event_sponsors.csv 
--- üü¢ Commencing Sponsor Scrape üü¢---
Processing Event: 1/27 (ID: 3066)    

Obtained Raw Sponsors API Data for 3066
Obtained Raw Sponsors API Data for 3191

--- üèì Found new Ball Sponsor for Event ID: 3191 üèì---
Obtained Raw Sponsors API Data for 2591
Obtained Raw Sponsors API Data for 2537
Processing Event: 27/27 (ID: 2234)    
--- ‚úÖ Scraping complete. Consolidating results... ---
--- Found 4 events with new sponsor data. ---
--- Combining 4 new sponsor records with existing data... ---
--- üü¢ Successfully saved 187 total sponsored events to ../Data/Processed/Sponsors/20251110_event_sponsors.csv üü¢---
