In [1]:
import pandas as pd
import os
import sys
import glob
from typing import List, Dict, Union, Tuple, Any, Optional
from datetime import datetime
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.cleanEvents import filter_by_payloads,  resolve_duplicate_names

In [2]:
#----------- Configuration--------------------
#
# The events shortlist is processed to reformat dates for better readability.
# Only the desired columns are kept.
# Placeholder columns are added for ball and table sponsors; these are filled
# in later by 06_gets_sponsors_and_finalise_events.

# Input file from script 02 (processed and tagged events)
SHORTLIST_FILE = "../Data/Processed/Events/shortlist_events.csv"

# Payloads directory, where all singles match payloads are stored
SINGLES_PAYLOADS_DIR = "../Data/Processed/Singles_match_payloads"

# Run date (YYYYMMDD), used as the prefix of the output file name
date = datetime.now().strftime("%Y%m%d")

# Destination of the intermediate events file written by this notebook
INTERMEDIATE_DIR = "../Data/Processed/Events/Intermediate"
INTERMEDIATE_NAME = f"{date}_intermediate_events.csv"

# Columns that are dropped after all processing is done.
# Each name is listed exactly once.
COLUMNS_TO_DROP = [
    'StartDateTime', 'EndDateTime',  # Replaced by StartDate, EndDate
    'Subcontinent',
    'EventCode',
    'Tags',
    'EventTypeId',
    # Columns for rearranged events: blank for the relevant events at the time
    # of writing this code, so they are simply dropped for simplicity.
    'ToStartDate', 'ToEndDate',
    'PageLink',
    'Comments',
    'EventDateChangeId',
    'FromStartDate',
    'FromEndDate',
    'Type',
    'ShowInCalendar',
    'Event_Tier_Name',
]

In [3]:
if __name__ == "__main__":
    # Build the intermediate events file:
    #   shortlist CSV -> keep events with singles payloads -> add date-only
    #   columns -> resolve duplicate names -> drop unwanted columns -> add
    #   sponsor placeholders -> write CSV.

    # Read the shortlist; if the file does not exist there is nothing to
    # process, so report the problem and exit.
    try:
        shortlist_df = pd.read_csv(SHORTLIST_FILE)
    except FileNotFoundError:
        print(f"--- ❌ ERROR: Shortlist file not found: {SHORTLIST_FILE}. ---")
        sys.exit(1)

    # Valid events are events where singles match payloads exist.
    # Work on a copy: the helper may hand back a slice of shortlist_df
    # (not visible from this notebook), and the column assignments below
    # must not trigger SettingWithCopyWarning or touch the helper's data.
    valid_events_df = filter_by_payloads(shortlist_df, SINGLES_PAYLOADS_DIR).copy()

    # Date-only versions of the start/end timestamps for legibility;
    # normalize() resets the time-of-day to midnight.
    valid_events_df['StartDate'] = pd.to_datetime(valid_events_df['StartDateTime']).dt.normalize()
    valid_events_df['EndDate'] = pd.to_datetime(valid_events_df['EndDateTime']).dt.normalize()

    # Runs after the date columns are added, as in the original ordering
    # (NOTE(review): confirm whether the helper relies on StartDate/EndDate).
    valid_events_df = resolve_duplicate_names(valid_events_df)

    # Raises KeyError if any configured column is absent from the shortlist.
    valid_events_df = valid_events_df.drop(columns=COLUMNS_TO_DROP)

    # Placeholders until the sponsors are filled in by a later step.
    valid_events_df["BallSponsor"] = "TBC"
    valid_events_df["TableSponsor"] = "TBC"

    # Make sure the destination exists so a fresh checkout does not fail
    # at the very last step.
    os.makedirs(INTERMEDIATE_DIR, exist_ok=True)
    output_path = os.path.join(INTERMEDIATE_DIR, INTERMEDIATE_NAME)
    valid_events_df.to_csv(output_path, index=False)

    print(f"--- ✅ Intermediate Events File saved to {output_path}")

    


--- 🟠 Filtering Events By Payloads 🟠 ---



--- ✅ Found 189 events with valid payloads ---
--- ✅ Resolved 3 duplicate event names by adding month and year.
--- ✅ Intermediate Events File saved to ../Data/Processed/Events/Intermediate/20251125_intermediate_events.csv
