In [5]:
import pandas as pd
import os
import glob
from typing import List, Tuple

In [6]:
# --- CONFIGURATION ---

# Input directory containing the RAW match payload CSVs (produced by script 03).
RAW_PAYLOADS_DIR = "../Data/Raw/Match_payloads"  # or the "_test" variant, depending on your setup

# Output directory for the PROCESSED (singles-only) match payload CSVs.
OUTPUT_DIR = "../Data/Processed/Singles_match_payloads"

# Paths of the raw payload CSVs present when this cell is run.
# The pattern is "*.csv" (with the dot) so that only files carrying a real
# .csv extension are picked up, not any file whose name merely ends in "csv".
PAYLOADS_FILES = glob.glob(f"{RAW_PAYLOADS_DIR}/*.csv")

# Substrings that mark a NON-singles sub-event. Any row whose "subEventType"
# contains one of these (case-insensitive) is REMOVED by the filter below.
REMOVE_STRINGS = ["double", "U21", "class"]

In [7]:
def filter_singles_payloads(raw_payloads_dir: str, output_dir: str, REMOVE_STRINGS: List[str]) -> Tuple[List[str], List[str], List[str], List[str]]:
    """
    Filter raw match payload CSVs so that only singles matches remain.

    Every ``*.csv`` file in ``raw_payloads_dir`` is read, rows whose
    ``subEventType`` contains any of ``REMOVE_STRINGS`` (case-insensitive)
    are dropped, and the remaining rows are written to
    ``<output_dir>/<event_id>_singles_payloads.csv``.

    Team matches are kept alongside singles matches. There MAY be cases where
    a team match was played in doubles format; those are not detected here and
    are expected to be filtered out later.

    Args:
        raw_payloads_dir: Directory holding the raw payload CSVs. The event id
            is taken from the part of each filename before the first "_".
        output_dir: Directory for the filtered CSVs (created if missing).
        REMOVE_STRINGS: Substrings identifying non-singles sub-events. Entries
            are joined with "|" and matched as a regular expression, so regex
            metacharacters keep their special meaning. A single string is
            accepted and treated as a one-element list. Empty entries are
            ignored; with nothing to remove, all rows are kept.

    Returns:
        Four lists of event ids (as strings), for user checking of results:
        1. success_ids: events for which singles payloads were found and saved
        2. no_singles_ids: events with data but no singles matches
           (e.g. only doubles matches)
        3. no_data_ids: events whose payload file contained no payloads
           (zero-byte file or header-only file)
        4. error_ids: events whose file name did not start with a numeric id,
           or whose processing raised an unexpected error
    """

    print("--- üü† Filtering Singles Payloads üü† ---")

    os.makedirs(output_dir, exist_ok=True)

    # Collect the raw payload files in a deterministic order. "*.csv" (with the
    # dot) avoids picking up files whose name merely ends in the letters "csv".
    payloads_files = sorted(glob.glob(os.path.join(raw_payloads_dir, "*.csv")))

    # A bare string would otherwise be joined character by character below.
    if isinstance(REMOVE_STRINGS, str):
        REMOVE_STRINGS = [REMOVE_STRINGS]

    # Drop empty entries: an empty alternative in the regex matches every row,
    # which would silently discard all payloads.
    remove_terms = [term for term in REMOVE_STRINGS if term]
    pattern = "|".join(remove_terms)

    success_ids: List[str] = []
    no_singles_ids: List[str] = []
    no_data_ids: List[str] = []
    error_ids: List[str] = []

    for file in payloads_files:
        filename = os.path.basename(file)

        # Take the event id from the filename rather than from the data, so an
        # id is available even when the file itself cannot be read.
        event_id = filename.split('_')[0]
        if not event_id.isdigit():
            print(f"SKIPPING file {filename}: not in expected naming format")
            error_ids.append(event_id)
            continue

        try:
            payloads_df = pd.read_csv(file)

            # A header-only file parses fine but holds no payloads; report it
            # as "no data" rather than as "no singles".
            if payloads_df.empty:
                no_data_ids.append(event_id)
                continue

            # Always access the column so that a file without "subEventType"
            # is reported as an error regardless of the removal pattern.
            sub_event_types = payloads_df["subEventType"]

            if pattern:
                # na=False: rows with a missing subEventType are kept.
                remove_mask = sub_event_types.str.contains(pattern, na=False, case=False)
                singles_payloads_df = payloads_df[~remove_mask].copy()
            else:
                singles_payloads_df = payloads_df.copy()

            if singles_payloads_df.empty:
                no_singles_ids.append(event_id)
            else:
                filepath = os.path.join(output_dir, f"{event_id}_singles_payloads.csv")
                # NOTE(review): the DataFrame index is written as an extra
                # unnamed first column. Kept as-is because downstream readers
                # may rely on it -- confirm, and pass index=False if not needed.
                singles_payloads_df.to_csv(filepath)
                success_ids.append(event_id)

        # pandas raises EmptyDataError for a file with nothing to parse.
        except pd.errors.EmptyDataError:
            no_data_ids.append(event_id)

        # Anything else (I/O problems, missing column, ...) is reported and the
        # loop moves on, so one bad file does not abort the whole run.
        except Exception as e:
            print(f"‚ùå Error processing event {event_id} ({filename}): {e}")
            error_ids.append(event_id)

    print(f"\n--- ‚úÖ Filtering Complete ---")
    print(f"Successfully processed and saved singles payloads for: {len(success_ids)} events")
    print(f"Events found with ONLY non-singles payloads: {len(no_singles_ids)} events")
    print(f"Events found with EMPTY payload files: {len(no_data_ids)} events")
    print(f"‚ö†Ô∏è Encountered unexpected errors processing: {len(error_ids)} events")

    return success_ids, no_singles_ids, no_data_ids, error_ids

In [8]:
if __name__ == "__main__":
    # Run the filter with the configuration constants defined above and
    # unpack the four returned lists of event ids.
    success, no_singles, empty, error = filter_singles_payloads(
        raw_payloads_dir=RAW_PAYLOADS_DIR,
        output_dir=OUTPUT_DIR,
        REMOVE_STRINGS=REMOVE_STRINGS,
    )

--- üü† Filtering Singles Payloads üü† ---

--- ‚úÖ Filtering Complete ---
Successfully processed and saved singles payloads for: 186 events
Events found with ONLY non-singles payloads: 1 events
Events found with EMPTY payload files: 115 events
‚ö†Ô∏è Encountered unexpected errors processing: 0 events
