In [1]:
import sys
import os
import pandas as pd

# Resolve the directory one level above the current working directory and
# register it on the import path; the `utils` package imported below is
# expected to live there.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the path only once so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from utils.processEvents import collate_raw_events, \
filter_selected_events, \
standardize_event_names, \
convert_dates ,\
tag_event_status





In [2]:
"""
This script reads and filters event data that has been scraped by year.
All desired events are saved to a shortlist file. All removed events are also saved
to a separate file so that they can be checked.
"""

# --- CONFIGURATION ---

# Input: folder holding the scraped events, stored as one CSV file per year.
EVENTS_DIRECTORY = "../Data/Raw/Events"

# Output: folder that receives the processed CSV files, followed by the file
# names used for the kept (shortlist) and the removed events respectively.
OUTPUT_DIRECTORY = "../Data/Processed/Events"
SHORTLIST_OUTPUT_NAME = "shortlist_events.csv"
REMOVED_OUTPUT_NAME = "removed_events.csv"

# Strings handed to filter_selected_events as `remove_strings`, grouped by the
# reason an event is excluded.
# NOTE(review): whether these are matched as substrings or as whole words is
# decided inside filter_selected_events — confirm there.
REMOVE_STRINGS = (
    {"cadet", "junior", "youth", "under"}  # non-senior events
    | {"para", "paralympic"}               # para categories
    | {"vet", "veteran"}                   # veteran series
)

# Regex handed to filter_selected_events as `age_pattern` to catch
# age-restricted events (e.g. U13, U21): a "u" followed by two digits.
# NOTE(review): the pattern is lower-case, so it presumably relies on
# case-insensitive matching or lower-cased event names — confirm in the helper.
AGE_PATTERN = r"u\d{2}"

# Renames applied after filtering to make event names clearer.
# Handed to standardize_event_names as `name_map`.
NAME_MAP = dict(
    [
        ("Singles World Cup", "World Cup"),
        ("WTT Cup Finals", "WTT Finals"),
        ("WTTC", "World Championship"),
    ]
)

# --- PIPELINE ---
# Collate the raw per-year event files, split them into kept and removed
# events, post-process the kept events, and write both sets to
# OUTPUT_DIRECTORY as CSV files.

print("---🚀 Starting WTT Event Processing 🚀---")

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)

raw_events_df = collate_raw_events(EVENTS_DIRECTORY)

if raw_events_df.empty:
    # Nothing was loaded, so there is nothing to filter or save. The
    # "finished" banner is deliberately not printed on this path.
    print("--- ❌ Processing failed: No raw data loaded. Check the input directory. ---")
else:
    # Filter out events as specified in the configuration.
    # This returns separate DataFrames for the kept and the removed events.
    kept_df, removed_df = filter_selected_events(
        df=raw_events_df,
        remove_strings=REMOVE_STRINGS,
        age_pattern=AGE_PATTERN,
    )

    # Convert the dates of the kept events.
    time_converted_df = convert_dates(kept_df)

    # Tag whether an event is ongoing, for easier future processing.
    tagged_df = tag_event_status(time_converted_df)

    # Standardize the event names for consistency (kept events only), then
    # sort by start date so the saved file has a stable order.
    shortlist_df = standardize_event_names(
        df=tagged_df,
        name_map=NAME_MAP,
    ).sort_values(["StartDateTime"])

    # NOTE(review): removed_df never passes through convert_dates, so this
    # sort uses the StartDateTime values exactly as filter_selected_events
    # returned them — confirm that they order chronologically.
    removed_df = removed_df.sort_values(["StartDateTime"])

    # Save the shortlist of kept events.
    shortlist_path = os.path.join(OUTPUT_DIRECTORY, SHORTLIST_OUTPUT_NAME)
    shortlist_df.to_csv(shortlist_path, index=False)
    print(f"✅ Kept {len(shortlist_df)} events saved to {shortlist_path}")

    # Save the removed events as well so that the filtering can be checked.
    removed_path = os.path.join(OUTPUT_DIRECTORY, REMOVED_OUTPUT_NAME)
    removed_df.to_csv(removed_path, index=False)
    print(f"✅ Removed {len(removed_df)} events saved to {removed_path}")

    print("\n---🟢 Processing finished. 🟢---")


---üöÄ Starting WTT Event Processing üöÄ---
--- üü† Combined 7 Raw Event Files from 7 files (years) in ../Data/Raw/Events üü† ---

--- üü† Filtering from 784 Events üü† ---
From total: 784 events , kept: 361, removed: 377, duplicates: 46
‚úÖ Kept 361 events saved to ../Data/Processed/Events/shortlist_events.csv
‚úÖ Removed 377 events saved to ../Data/Processed/Events/removed_events.csv

---üü¢ Processing finished. üü¢---
