In [2]:
import pandas as pd
import os
from typing import List, Union, Tuple
from datetime import datetime 

In [3]:
"""
This script reads and filters events data that have been scraped by year. 
All desired events are saved into an events_shortlist. ALl removed events are also savec
"""

# --- CONFIGURATION ---

# Define directory where events by year files are stored as csv.
EVENTS_DIRECTORY = "../Data/Raw/Events"

# Output directory where the master file will be saved
OUTPUT_DIRECTORY ="../Data/Processed/Events"

SHORTLIST_OUTPUT_NAME = "shortlist_events.csv"

REMOVED_OUTPUT_NAME = "removed_events.csv"

REMOVE_STRINGS = {
    # Non-Senior events
    "cadet", 
    "junior", 
    "youth",
    "under",
    # Para Categories
    "para", 
    "paralympic",     
    # Veteran series
    "vet", 
    "veteran"
}

# Regex pattern used to filter out age restricted events (e.g U13, U21 etc)
AGE_PATTERN = r"u\d{2}"


# This can be used to rename events after filtering for increased clarity
# Passed into the rename_events function.
NAME_MAP = {
    "Singles World Cup": "World Cup",
    "WTT Cup Finals": "WTT Finals",
    "WTTC": "World Championship"
}


In [4]:
def collate_raw_events (directory: str) -> pd.DataFrame:
    """
    Loads all individual event CSV files from the specified directory and compiles them 
    into a single DataFrame.
    """
    all_events_list = [] 
    print("--- 🟠 Combining Raw Event Files 🟠 ---")

    # Iterate through csv files in search directory
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            
            print(f"Reading file: {file}")
    
            # using os.path.join to create the full file path from the directory and filename
            # read the file, convert to DF, ana store in all_events_list container.
            
            full_path = os.path.join(directory, file)
            df = pd.read_csv(full_path)
            all_events_list.append(df)

    if not all_events_list:
        print(f"❌ Error: No CSV files found in {directory}.")
        # Return blank DataFrame if no data found
        return pd.DataFrame() 

    all_events_df = pd.concat(all_events_list, ignore_index=True)
    
    all_events_df.rename(columns={'EventId': 'eventId'}, inplace=True)

    return all_events_df

    
def filter_selected_events(df: pd.DataFrame,
                           remove_strings: List[str],
                           age_pattern: str
                           ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Filters a dataframe of events data in order to keep only the desired events as specified by the inputs.
    Here: Keeping all standard, senior events. Checks event names and event types for patterns to be removed.
    Returns a tuple of [kept_df, removed_df

    """
    print(f"\n--- 🟠 Filtering from {len(df)} Events 🟠 ---")



    # Create a copy of the df for the function scope and remove duplicates from it.
    function_df = df.copy()
    function_df = function_df.drop_duplicates(subset=["eventId"], keep = "first", inplace = False)
    
    
    
   

  
    # here "|" denotes "OR" for regex pattern. If the event name has any of these terms it will be removed
    string_pattern = "|".join(remove_strings)
    
    # create a mask for conditions to filter the DF - here select all event names with those strings.
    string_mask = (function_df["EventName"].str.contains(string_pattern, case=False, na=False) | 
                 function_df["EventType"].str.contains(string_pattern, case=False, na=False))
                 
    # mask for age using age pattern as defined in config (UXX regex to remove U13, U21 etc)
    age_mask = (function_df["EventName"].str.contains(age_pattern, case=False, na=False) | 
                 function_df["EventType"].str.contains(age_pattern, case=False, na=False))
                                                      
    # define a mask to check event name and type for any strings to be removed.
    remove_mask = string_mask | age_mask

    # use filter condition ~mask to select entries that DO NOT contain the filtered patterns.
    kept_df = function_df[~remove_mask].copy() 
    # use filter condition ~mask to select entries that  contain the filtered patterns.
    removed_df = function_df[remove_mask].copy()

    print(f"From total: {len(df)} events , kept: {len(kept_df)}, removed: {len(removed_df)}, duplicates: {len(df) - len(function_df)}") 
    
    return kept_df, removed_df
    

   

def standardize_event_names(df: pd.DataFrame, name_map: dict) -> pd.DataFrame:
    """
    Changes event names from the original event list to simpler names for Clarity.
    e.g., ['Singles World Cup', "WTTC"] to ["World Cup", "World Championships"]
    
    World Cup (newer, annual) and World Championship (older, biannnual) are often confused.
    Name changes can be specified in name_map variable.
    
    """
    # Use .loc to explicitly change names of input df.
    df["EventType"] = df["EventType"].replace(name_map)
    return df


def convert_dates(df:pd.DataFrame) -> pd.DataFrame:
    """Converts the dates in the events data to pd.datetime objects for easier processing.
    Current format returned by WTT API is (YYYY-MM-DDTHH:MM:SS)
    """
    # create a copy for safety
    working_df = df.copy()
    
    
    working_df['StartDateTime'] = pd.to_datetime(working_df['StartDateTime'])
    working_df['EndDateTime'] = pd.to_datetime(working_df['EndDateTime'])

    return  working_df

def tag_event_status(df: pd.DataFrame) -> pd.DataFrame:
    """Adds a column to flag events that are currently ongoing or in the future.
    completed events , completed = True, future and ongoing events, completed = False
    This uses the Start and End dates from api response. There are fields for rearranged start and end times.
    But these are blank for all relevant events and so only StartDate and EndDate from api will be considered.
    """
    # create a copy for safety
    working_df = df.copy()
    
    now = pd.to_datetime(datetime.now())
    
    # Tag True if the EndDateTime is in the future relative to 'now'
    working_df['Completed'] =  working_df['EndDateTime'] < now
    
    return  working_df



In [5]:
if __name__ == "__main__":
    
    print("---🚀 Starting WTT Event Processing 🚀---")
    
    # Create the output directory if it does not exist.
    os.makedirs(OUTPUT_DIRECTORY, exist_ok=True) 

    
    
    # Load and combine all raw data
    raw_events_df = collate_raw_events(EVENTS_DIRECTORY)
    
    if raw_events_df.empty:
        print("--- ❌ Processing failed: No raw data loaded. Check the input directory. ---")
    else:
        
        # Filter out events as specified
        # This returns seperate df for kept and removed events.
        kept_df, removed_df = filter_selected_events(
            df=raw_events_df,                     
            remove_strings=REMOVE_STRINGS,
            age_pattern = AGE_PATTERN
        )

        # convert the dates 
        time_converted_df = convert_dates(kept_df)
        # tag if event is ongoing for easier future processing
        tagged_df = tag_event_status(time_converted_df)       
        
            
        # Standardize the event names for consistency - only for the kept_df        
        shortlist_df = standardize_event_names(
            df=tagged_df, 
            name_map=NAME_MAP
        )
        
        # Sort by Date for consistency
        shortlist_df = shortlist_df.sort_values(["StartDateTime"])
        removed_df = removed_df.sort_values(["StartDateTime"])
        
        # Sort the shortlist_df        
        shortlist_path = os.path.join(OUTPUT_DIRECTORY, f"{SHORTLIST_OUTPUT_NAME}")
        shortlist_df.to_csv(shortlist_path, index=False)
        print(f"✅ Kept {len(shortlist_df)} events saved to {shortlist_path}")

        # Save the removed_df so that it can be checkedabs

        removed_path = os.path.join(OUTPUT_DIRECTORY, f"{REMOVED_OUTPUT_NAME}")
        removed_df.to_csv(removed_path, index=False)
        print(f"✅ Removed {len(removed_df)} events saved to {removed_path}")
        
         
        

        

    print("\n---🟢 Processing finished. 🟢---")

---🚀 Starting WTT Event Processing 🚀---
--- 🟠 Combining Raw Event Files 🟠 ---
Reading file: raw_events_2022.csv
Reading file: raw_events_2025.csv
Reading file: raw_events_2020.csv
Reading file: raw_events_2023.csv
Reading file: raw_events_2024.csv
Reading file: raw_events_2021.csv

--- 🟠 Filtering from 677 Events 🟠 ---
From total: 677 events , kept: 302, removed: 329, duplicates: 46
✅ Kept 302 events saved to ../Data/Processed/Events/shortlist_events.csv
✅ Removed 329 events saved to ../Data/Processed/Events/removed_events.csv

---🟢 Processing finished. 🟢---


In [6]:
raw_events_df

Unnamed: 0,PageLink,EventName,EventType,EventTypeId,Country,City,ContinentCode,Subcontinent,StartDateTime,EndDateTime,...,Comments,EventDateChangeId,eventId,FromStartDate,FromEndDate,ToStartDate,ToEndDate,ShowInCalendar,Type,Event_Tier_Name
0,,WTT Feeder Düsseldorf I,WTT Feeder,81,Germany,Düsseldorf,europe,western europe,2022-01-12T00:00:00,2022-01-15T00:00:00,...,,,2521,,,,,,,WTT Feeder Series
1,,WTT Feeder Düsseldorf II,WTT Feeder,81,Germany,Düsseldorf,europe,western europe,2022-01-17T00:00:00,2022-01-20T00:00:00,...,,,2522,,,,,,,WTT Feeder Series
2,,WTT Youth Star Contender Tunis 2022,WTT Youth Star Contender,68,Tunisia,Rades,africa,north africa,2022-02-02T00:00:00,2022-02-06T00:00:00,...,,,2523,,,,,,,WTT Youth Series
3,,WTT Youth Contender Spa 2022,WTT Youth Contender,69,Belgium,Spa,europe,western europe,2022-02-14T00:00:00,2022-02-20T00:00:00,...,,,2525,,,,,,,WTT Youth Series
4,,WTT Youth Contender Metz 2022,WTT Youth Contender,69,France,Metz,europe,western europe,2022-02-14T00:00:00,2022-02-20T00:00:00,...,,,2526,,,,,,,WTT Youth Series
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,,Central American Veteran Championships,Veteran Championships,55,El Salvador,El Salvador,americas,central america,2021-09-14T00:00:00,2021-09-18T00:00:00,...,,,2515,,,,,,,
673,,WTT Cup Finals Singapore,WTT Cup Finals,91,Singapore,Singapore,asia,south east asia,2021-12-04T00:00:00,2021-12-07T00:00:00,...,,,2516,,,,,,,WTT Series
674,,WTT Feeder Düsseldorf,WTT Feeder,81,Germany,Düsseldorf,europe,western europe,2021-12-07T00:00:00,2021-12-10T00:00:00,...,,,2519,,,,,,,WTT Feeder Series
675,,2021 Europe Top 16 Cup,European Cups,59,Greece,Thessaloniki,europe,southern europe,2021-09-18T00:00:00,2021-09-19T00:00:00,...,,,2781,,,,,,,
