In [1]:
import pandas as pd
import os
import sys
import glob
from typing import List, Dict, Union, Tuple, Any, Optional
from datetime import datetime

In [2]:
# ----------- Configuration --------------------
#
# The events shortlist is processed to reformat dates for better readability.
# Only the desired columns are kept.
# Placeholder columns are added for ball and table sponsors; these are filled
# in later by 06_gets_sponsors_and_finalise_events.

# Input file from script 02 (processed and tagged events).
SHORTLIST_FILE = "../Data/Processed/Events/shortlist_events.csv"

# Payloads directory, where all singles match payloads are stored.
SINGLES_PAYLOADS_DIR = "../Data/Processed/Singles_match_payloads"

# Run date as YYYYMMDD; used as the prefix of the output file name.
date = datetime.now().strftime("%Y%m%d")

# Where the intermediate events file is written.
INTERMEDIATE_DIR = "../Data/Processed/Events/Intermediate"
INTERMEDIATE_NAME = f"{date}_intermediate_events.csv"

# Columns that are dropped after all processing is done.
# Each name is listed exactly once.
COLUMNS_TO_DROP = [
    'StartDateTime', 'EndDateTime',  # Replaced by StartDate, EndDate
    'Subcontinent',
    'ContinentCode',
    'EventCode',
    'Tags',
    'EventTypeId',
    # Columns for rearranged events, blank for relevant events at time of writing this code,
    # so will simply be overlooked for simplicity.
    'ToStartDate', 'ToEndDate',
    'PageLink',
    'Comments',
    'EventDateChangeId',
    'FromStartDate',
    'FromEndDate',
    'Type',
    'ShowInCalendar',
    'Event_Tier_Name',
]

In [3]:
def filter_by_payloads(shortlist_df: pd.DataFrame, singles_payloads_dir: str) -> pd.DataFrame:
    """
    Keep only the shortlisted events that have a singles match payload file.

    Every ``*.csv`` file in ``singles_payloads_dir`` is opened and the
    ``eventId`` found in its first data row is recorded. Files that cannot be
    read, or that have no ``eventId`` value, are reported on stdout and skipped
    so that one bad file does not abort the run.

    Args:
        shortlist_df: Events shortlist; must contain an ``eventId`` column.
        singles_payloads_dir: Directory holding the payload CSV files.

    Returns:
        A new DataFrame (index reset to 0..n-1) with the rows of
        ``shortlist_df`` whose ``eventId`` was found in a payload file.
        ``shortlist_df`` itself is not modified.
    """
    print("--- 🟠 Filtering Events By Payloads 🟠 ---")

    valid_event_ids = set()
    # Match real ".csv" files only; a bare "*csv" would also match any file
    # whose name merely ends in the letters "csv".
    payload_pattern = os.path.join(singles_payloads_dir, "*.csv")
    for file in sorted(glob.glob(payload_pattern)):
        try:
            # Only the first row's eventId is needed, so avoid parsing the
            # whole payload file.
            first_row = pd.read_csv(file, usecols=["eventId"], nrows=1)
            valid_event_ids.add(first_row["eventId"].iloc[0])
        except Exception as e:
            # Deliberately best-effort: report the problem file and carry on.
            print(e, file)

    mask = shortlist_df["eventId"].isin(list(valid_event_ids))
    # Boolean indexing already yields a new frame; reset_index returns another.
    valid_events_df = shortlist_df[mask].reset_index(drop=True)

    print(f"\n--- ✅ Found {len(valid_events_df)} events with valid payloads ---")
    return valid_events_df

In [4]:
def resolve_duplicate_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Disambiguate events that share a name by appending their start month and year.

    Every row whose 'EventName' occurs more than once gets the abbreviated
    month and full year of its 'StartDateTime' appended, e.g.
    'Event Y' -> 'Event Y (Oct 2021)'.

    Rows that need a suffix but have no parseable start date keep their
    original name (a warning is printed) instead of raising.

    Note: two events with the same name that start in the same month and year
    will still share a name after this step.

    Args:
        df (pd.DataFrame): DataFrame containing 'EventName' and 'StartDateTime'.

    Returns:
        pd.DataFrame: A copy of ``df`` with disambiguated 'EventName' values
        and 'StartDateTime' converted to datetime. ``df`` is not modified.
    """
    working_df = df.copy()  # Work on a copy for safety

    # 1. Make sure StartDateTime is datetime so the month/year can be formatted.
    working_df['StartDateTime'] = pd.to_datetime(working_df['StartDateTime'])

    # 2. Identify event names that occur more than once.
    name_counts = working_df['EventName'].value_counts()
    duplicate_names = name_counts[name_counts > 1].index.tolist()

    # 3. Rows that need renaming.
    needs_suffix = working_df['EventName'].isin(duplicate_names)
    if not needs_suffix.any():
        print("--- INFO: No event names required disambiguation. ---")
        return working_df

    # 4. Only rows with a real start date can receive a '(Mon YYYY)' suffix;
    #    formatting NaT would otherwise raise.
    can_suffix = needs_suffix & working_df['StartDateTime'].notna()
    skipped = int((needs_suffix & ~can_suffix).sum())

    if can_suffix.any():
        # Vectorised equivalent of formatting each row individually.
        month_year = working_df.loc[can_suffix, 'StartDateTime'].dt.strftime('%b %Y')
        working_df.loc[can_suffix, 'EventName'] = (
            working_df.loc[can_suffix, 'EventName'].astype(str) + " (" + month_year + ")"
        )

    if skipped:
        print(f"--- ⚠️ {skipped} duplicate-named events have no start date and were left unchanged.")
    print(f"--- ✅ Resolved {len(duplicate_names)} duplicate event names by adding month and year.")

    return working_df

In [5]:
if __name__ == "__main__":

    # Load the shortlist produced by the earlier script.
    # Without it there is nothing to process, so report and exit.
    try:
        shortlist_df = pd.read_csv(SHORTLIST_FILE)
    except FileNotFoundError:
        print(f"--- ❌ ERROR: Shortlist file not found: {SHORTLIST_FILE}. ---")
        sys.exit(1)

    # Valid events are events where singles match payloads exist.
    valid_events_df = filter_by_payloads(shortlist_df, SINGLES_PAYLOADS_DIR)

    # Add date-only versions of the start/end timestamps for legibility
    # (normalize() sets the time-of-day component to midnight).
    valid_events_df['StartDate'] = pd.to_datetime(valid_events_df['StartDateTime']).dt.normalize()
    valid_events_df['EndDate'] = pd.to_datetime(valid_events_df['EndDateTime']).dt.normalize()

    # Needs StartDateTime, so this must run before the columns are dropped.
    valid_events_df = resolve_duplicate_names(valid_events_df)

    valid_events_df = valid_events_df.drop(columns=COLUMNS_TO_DROP)

    # Placeholder sponsor columns, to be filled in by a later script.
    valid_events_df["BallSponsor"] = "TBC"
    valid_events_df["TableSponsor"] = "TBC"

    # Create the output directory if needed; to_csv does not create it and
    # would otherwise fail on a fresh checkout.
    os.makedirs(INTERMEDIATE_DIR, exist_ok=True)
    output_path = os.path.join(INTERMEDIATE_DIR, INTERMEDIATE_NAME)
    valid_events_df.to_csv(output_path, index=False)

    print(f"--- ✅ Intermediate Events File saved to {output_path}")

    


--- 🟠 Filtering Events By Payloads 🟠 ---

--- ✅ Found 184 events with valid payloads ---
--- ✅ Resolved 3 duplicate event names by adding month and year.
--- ✅ Intermediate Events File saved to ../Data/Processed/Events/Intermediate/20251029_intermediate_events.csv
