In [185]:
import pandas as pd
import os
import sys
import glob
import requests
import json
from typing import List, Dict, Union, Tuple, Any, Optional
from datetime import datetime
import re
import random
import time
import concurrent.futures

In [215]:
#----------- Configuration--------------------

# Sponsors from the api are most easily accesssible as sponsors urls
# This maps those urls to the sponsor brand ( had to be parsed manually due to many different links being used)


date = datetime.now().strftime("%Y%m%d")


INTERMEDIATE_DIR= "../Data/Processed/Events/Intermediate/"
INTERMEDIATE_EVENTS_FILENAME_SUFFIX = "_intermediate_events.csv"
INTERMEDIATE_EVENTS_REGEX = rf"^\d{{8}}{re.escape(INTERMEDIATE_EVENTS_FILENAME_SUFFIX)}$"


MASTER_EVENTS_DIR = "../Data/Master/Events"
MASTER_EVENTS_FILENAME_SUFFIX = "_master_events.csv"
MASTER_EVENTS_REGEX = rf"^\d{{8}}{re.escape(MASTER_EVENTS_FILENAME_SUFFIX)}$"

MASTER_OUTPUT_NAME = f"{date}_events_master.csv"
MASTER_OUTPUT_PATH = os.path.join(MASTER_EVENTS_DIR, MASTER_OUTPUT_NAME)




sponsor_map = {
    'https://www.dhs-sportsglobal.com/': 'DHS',
    'http://www.dhs-sports.com/': 'DHS',
    'https://en.dhs-sports.com/': 'DHS',
    'https://www.doublefish.com/': 'Double Fish',
    'https://www.doublefish.com': 'Double Fish',
    'http://www.yinhe1986.cn/': 'Yinhe',
    'http://www.dhs-sportsglobal.com': 'DHS',
    'https://joola.com/pages/table-tennis?srsltid=AfmBOoquv7qYw0rrAhtEXdZaAsWcFDBkpTNksiGhnOxFif6xCoAajsx_': 'Joola',
    'http://www.yinhe1986.cn': 'Yinhe',
    'https://www.andro.de/en': 'Andro',
    'https://www.stigasports.com/en': 'Stiga',
    'www.doublefish.com': 'Double Fish',
    'https://www.tibhar.com/en/': 'Tibhar',
    'https://butterfly.tt/': 'Butterfly',
    'https://joola.com/': 'Joola',
    'https://www.butterfly-global.com/': 'Butterfly',
    'https://www.stag.in/': 'Stag',
    'https://www.dhs-tt.com/en': 'DHS',
    'www.stigasports.cn': 'Stiga',
    'https://stag.in/': 'Stag',
    'https://www.donic.com/donic/en/': 'Donic',
    'https://tibhar.info/en/': 'Tibhar'
}




# Max number of threads for parallel calls.
MAX_WORKERS = 20

# Used to generate random pause times for API politeness.
MIN_PAUSE = 0.1
MAX_PAUSE = 0.2

In [216]:
def get_latest_master_events(master_dir:str, master_regex) -> Optional[pd.DataFrame]:
    """
    Parses specified directory for events files in format yyyy_mm_dd. 
    Attempts to read latest file in this format. 

    Args:
        directory (str): The folder where the master files are stored (e.g., '../Data/Master/Events').
        filename_pattern (str): The pattern to match (e.g., '*_events_master.csv').

    Returns:
        Optional[pd.DataFrame]: The DataFrame of the latest file, or None if no files are found or reading fails.
    """
    if not os.path.isdir(master_dir):
        print (f"❌{master_dir} does not exist as a directory")
        return None 
    
    # Get csv files in 
    files = glob.glob(f"{master_dir}/*.csv")
   

    master_files = []
    
    if not files:
        print(f"❌ No existing *.csv files found in MASTER Events Directory: {master_dir} ")
        return None

    for file in files:
        filename = os.path.basename(file)
       
        if re.match(master_regex,filename):
            master_files.append(file)

    if not master_files:
        print(f"❌ No existing MASTER files in format: {master_regex} in {master_dir} ")
        return None
    master_files.sort()    
    latest_master = master_files[-1]

    try: 
        latest_master_df = pd.read_csv(latest_master)
        print(f"✅ {len(latest_master_df)} events found in latest MASTER: {file} ")
        return latest_master_df
        
    except Exception as e:
        print (f"❌ Error reading lastest MASTER, {file}: {e}")
        return None    

In [217]:
def get_latest_intermediate_events(intermediate_dir:str, intermediate_regex) -> Optional[pd.DataFrame]:
    """
    Parses specified directory for events files in format yyyy_mm_dd. 
    Attempts to read latest file in this format. 

    Args:
        directory (str): The folder where the intermediate files are stored (e.g., '../Data/Events/Intermediate').
        filename_pattern (str): The pattern to match (e.g., '*_events_intermediate.csv').

    Returns:
        Optional[pd.DataFrame]: The DataFrame of the latest file, or None if no files are found or reading fails.
    """
    if not os.path.isdir(intermediate_dir):
        print (f"❌{intermediate_dir} does not exist as a directory")
        return None 
    
    # Get csv files in 
    files = glob.glob(f"{intermediate_dir}/*.csv")
   

    intermediate_files = []
    
    if not files:
        print(f"❌ No existing *.csv files found in INTERMEDIATE Events Directory: {intermediate_dir} ")
        return None

    for file in files:
        filename = os.path.basename(file)
       
        if re.match(intermediate_regex,filename):
           intermediate_files.append(file)

    if not intermediate_files:
        print(f"❌ No existing INTERMEDIATE files in format: {intermediate_regex} in {intermediate_dir}")
        return None
    intermediate_files.sort()    
    latest_intermediate = intermediate_files[-1]

    try: 
        latest_intermediate_df = pd.read_csv(latest_intermediate)
        print(f"✅ {len(latest_intermediate_df)} events found in latest INTERMEDIATE: {file} ")
        return latest_intermediate_df
        
    except Exception as e:
        print (f"❌ Error reading lastest INTERMEDIATE, {file}: {e}")
        return None    

In [218]:
def get_unenriched_event_ids(df: pd.DataFrame) -> List[int]:
    """
    Identifies events in the DataFrame that already have valid sponsor data 
    (i.e., BallSponsor or TableSponsor is not 'TBC' and not None).
    Returns a list of these event is.
    """
    # Identify rows where either sponsor column is NOT 'TBC' AND NOT NaN.
    #'|' to see if either column has data 
    # TBC means no sponsor data has been searched for 
    # None means no sponsor data was found previously
    unenriched_mask = (
        (df['BallSponsor'] == 'TBC') & (df['BallSponsor'].notna())
    ) | (
        (df['TableSponsor'] == 'TBC') & (df['TableSponsor'].notna())
    )
    
    
    ids = list(set(df[unenriched_mask]['eventId'].tolist()))

   
    return ids

In [234]:
def get_sponsors(event_id:Union[int,str], min_pause:float, max_pause:float) -> Optional[List[Dict[str, Any]]]:
    """
    Function used for parralel, threaded api calls to fetch sponsor details,
    for one event specified by event_id. Returns JSON Dict of the 

    Args:
        event_id (int): The unique id of the event.
        min_pause (float): Minimum pause duration (seconds).
        max_pause (float): Maximum pause duration (seconds).

    Returns:
        (sponsors_list)  A list of sponsors data. 
    """

    # initialise variables to be returned in the final dictionary.
   
    
    sponsors_list = None    

    
    # define api url and headers.
    url = f"https://wtt-website-api-prod-3-frontdoor-bddnb2haduafdze9.a01.azurefd.net/api/cms/GetEventEquipmentwithLogo/{event_id}" 
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "en-GB,en;q=0.9,es=q=0.8",
        "cache-control": "no-cache",
        "dnt": "1",
        "origin": "https://www.worldtabletennis.com",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://www.worldtabletennis.com/",
        "sec-ch-ua": "\"Chromium\";v=\"140\", \"Not=A?Brand\";v=\"24\", \"Google Chrome\";v=\"140\"",
        "sec-ch-ua-mobile": "?1",
        "sec-ch-ua-platform": "\"Android\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "secapimkey": "S_WTT_882jjh7basdj91834783mds8j2jsd81",
        "user-agent": "Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36"
    }

    # make the api call and get response as a json. Raise errors if they occur
    try:
        # longer timeout 
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() # Raise an error for bad status codes (4xx or 5xx)
        sponsors_json = response.json()

        #  check that response contains data and is a list:        
        if sponsors_json and isinstance(sponsors_json,list):
            sponsors_list = sponsors_json
            sleep_duration = random.uniform(min_pause, max_pause)
            time.sleep(sleep_duration)
    
            # Return the list (can be empty if API returned [], or None if checks failed)
            return sponsors_list
            
            
            
           
            
            
            
        
   
    except requests.exceptions.HTTPError as e:
        status_msg = f"HTTP Error {e.response.status_code}"
    except requests.exceptions.Timeout:
        status_msg = "Timeout"
    except requests.exceptions.RequestException as e:
         status_msg = f"Request Error {type(e).__name__}"
    except Exception as e: # Catch any other unexpected errors
        status_msg = f"Unexpected Error {type(e).__name__}"

    # Pause for API politness before returning None and continuing
    sleep_duration = random.uniform(min_pause, max_pause)
    time.sleep(sleep_duration)
    return None

   

In [235]:
if __name__ == "__main__":

    # record time for logging purposes.
    start_time = time.time()

     # Ensure the master output directory exists for the check
    os.makedirs(MASTER_EVENTS_DIR, exist_ok=True) 
    # Ensure the intermediate output directory exists for saving the output
    os.makedirs(INTERMEDIATE_DIR, exist_ok=True)
    
    # Check if a master file already exists: returns None if no master is found
    master_df = get_latest_master_events(MASTER_EVENTS_DIR, MASTER_EVENTS_REGEX)

    if master_df is not None and not master_df.empty:
        base_df = master_df.copy()
        # Placeholder - need to consider how to handle this ? 
        print(f"sing existing master file as base data ({len(base_df)} events).")
    else:
        # check for intermediate files
        intermediate_df = get_latest_intermediate_events(INTERMEDIATE_DIR, INTERMEDIATE_EVENTS_REGEX)
        
        if intermediate_df is not None and not intermediate_df.empty:
            base_df = intermediate_df.copy()
            print(f"Using intermediate file ({len(base_df)} events) for enrichment.")
        else:
            print("--- ❌ FATAL ERROR: No base file (Master or Intermediate) found. Cannot proceed. ---")
            sys.exit(1)

   
    ids_to_get_sponsors = get_unenriched_event_ids(intermediate_df) 
    ids_to_get_sponsors_count = len(ids_to_get_sponsors)

    # if nothing to get sponsors for - sys.exit(0) as the job is done (or something went wrong?)
    if ids_to_get_sponsors_count == 0:
        print("\n--- ✅ All events are already enriched. ---")
        time_taken = time.time() - start_time
        print(f"Total time taken = {total_time_taken:.2f} s.")
        print("---🟢 Scraping finished. 🟢---")
        sys.exit(0)

    
    print(f"\n---🚀 Starting Concurrent Scraping for {ids_to_get_sponsors_count} Events 🚀---")
    print(f"--- Using {MAX_WORKERS} threads. API pause: {MIN_PAUSE:.1f}s - {MAX_PAUSE:.1f}s ---")
    
    # initialise list and counts to be added to.
    all_raw_sponsors_list = []
    processed_count = 0
    success_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

        futures = {
            executor.submit(get_sponsors, event_id, MIN_PAUSE, MAX_PAUSE): event_id
            for event_id in ids_to_get_sponsors}


        for future in concurrent.futures.as_completed(futures):
            processed_count +=1
            try:
                result = future.result()

                if result is not None:
                    all_raw_sponsors_list.extend(result)
                    success_count += 1 
            except Exception as e:
                print(f"Error with event id {event_id}")
                
    end_time = time.time()
    time_taken = end_time - start_time

   

    print(f"✅ Finished! Sponsor data collected for {success_count}/{ids_to_get_sponsors_count} requested events.")
    print(f"Time Elapsed = {time_taken}")
    print("---🟢 Scraping finished. 🟢---")
    

     

❌ No existing *.csv files found in MASTER Events Directory: ../Data/Master/Events 
✅ 183 events found in latest INTERMEDIATE: ../Data/Processed/Events/Intermediate/20251027_intermediate_events.csv 
Using intermediate file (183 events) for enrichment.

---🚀 Starting Concurrent Scraping for 183 Events 🚀---
--- Using 20 threads. API pause: 0.1s - 0.2s ---
✅ Finished! Sponsor data collected for 160/183 requested events.
Time Elapsed = 2.3209280967712402
---🟢 Scraping finished. 🟢---


In [249]:
all_sponsors_df = pd.DataFrame(all_raw_sponsors_list)

In [250]:

mask = all_sponsors_df['sponsorTypeName'].str.contains("table", case=False) | all_sponsors_df['sponsorTypeName'].str.contains("ball", case= False)

In [251]:
equipment_df = all_sponsors_df[mask].copy()

Index(['eventId', 'eventSponsorId', 'sponsorTypeId', 'logo', 'darkLogo',
       'sponsorLink', 'sponsorTypeName'],
      dtype='object')

In [271]:

all_sponsor_names = list(equipment_df['sponsorLink'].unique()) +  list(equipment_df['logo'].unique())

In [273]:
all_sponsor_names

['http://www.dhs-sports.com/',
 'https://www.doublefish.com',
 'http://www.yinhe1986.cn/',
 'https://www.dhs-sportsglobal.com/',
 None,
 'https://www.doublefish.com/',
 'http://www.dhs-sports.com',
 'http://www.yinhe1986.cn',
 'https://www.stigasports.com/en',
 'https://www.nittaku.com',
 'https://stag.in/',
 'https://en.dhs-sports.com/',
 'https://www.butterfly-global.com/',
 'https://www.dhs-sportsglobal.com/\t',
 'https://www.doublefish.com/\t',
 'https://www.stag.in/',
 'http://en.dhs-sports.com',
 'https://www.tibhar.com/en/',
 'https://joola.com/pages/table-tennis?srsltid=AfmBOoquv7qYw0rrAhtEXdZaAsWcFDBkpTNksiGhnOxFif6xCoAajsx_',
 'https://www.donic.com/donic/en/',
 'http://www.dhs-sportsglobal.com/',
 'www.stigasports.cn',
 'https://butterfly.tt/',
 'www.doublefish.com',
 'https://www.andro.de/en',
 'https://tibhar.info/en/',
 'http://www.dhs-sportsglobal.com',
 'https://joola.com/',
 'https://www.dhs-tt.com/en',
 'org_logos/14November2024_15_2_5_DHS_Website.png',
 'org_logos/14

In [275]:
equipment_df

Unnamed: 0,eventId,eventSponsorId,sponsorTypeId,logo,darkLogo,sponsorLink,sponsorTypeName
1,3096,2724,4,org_logos/14November2024_15_2_5_DHS_Website.png,,http://www.dhs-sports.com/,Official Ball
14,3091,1631,4,org_logos/14November2024_15_3_16_DHS_Website.png,,http://www.dhs-sports.com/,Official Ball
25,3097,2730,5,org_logos/14November2024_14_57_9_DHS_Website.png,,http://www.dhs-sports.com/,Official Table
26,3097,3152,4,org_logos/10May2023_15_21_23_doublefish_dark.png,org_logos/10May2023_15_21_26_doublfish_white.png,https://www.doublefish.com,Official Ball
31,3083,2842,4,org_logos/14November2024_14_41_21_DHS_Website.png,,http://www.dhs-sports.com/,Official Ball
...,...,...,...,...,...,...,...
874,2751,1058,5,org_logos/DHS 110x55px.png,,http://www.dhs-sportsglobal.com/,Official Table
875,2751,1059,4,org_logos/24November2023_10_18_53_RGB.png,org_logos/24November2023_10_18_57_RGB on dark.png,https://www.butterfly-global.com/,Official Ball
885,3069,1532,4,org_logos/20November2024_18_1_27_Yinhe-TO USE ...,org_logos/20November2024_17_59_38_Yinhe-TO USE...,http://www.yinhe1986.cn/,Official Ball
889,2871,946,4,org_logos/DHS 110x55px.png,,https://www.dhs-sportsglobal.com/,Official Ball


In [1]:
as

SyntaxError: invalid syntax (1239779345.py, line 1)