In [142]:
import pandas as pd
import os
import glob
from typing import List, Tuple

In [143]:
# --- CONFIGURATION ---

# Input directory containing RAW match payload CSVs (from script 03)
RAW_PAYLOADS_DIR = "Data/Raw/Match_payloads" # Or _test depending on your setup

# Output directory for PROCESSED (singles-only) match payload CSVs
OUTPUT_DIR = "Data/Processed/Singles_match_payloads"
PAYLOADS_FILES = glob.glob (f"{RAW_PAYLOADS_DIR}/*csv")

# Define string to  be kept for subeeventtype data. 
REMOVE_STRING = "singles"

In [198]:
def filter_singles_payloads(raw_payloads_dir: str, output_dir: str, keep_string: str) -> Tuple[List[int], List[int], List[int], List[int]]:
    """
    Filters raw match payload CSVs to keep only singles matches.
    Saves filtered singles payloads to the specified output directory.
    Returns lists of event ids:
    1. success_ids: Events where singles payloads were found
    2. only_doubles_ids: Events with data but no singles matches (e.g only doubles matches)
    3. empty_payload_ids: Events whose payload files contiained no payloads
    4. error_ids: Events where reading the file caused an unexpected error
    """
    
    print("--- 🟠 Filtering Singles Payloads 🟠 ---")
    
    os.makedirs(output_dir, exist_ok= True)
    # get the paths to all raw ppayloads_files
    payloads_files = glob.glob(f"{raw_payloads_dir}/*csv")

    success_ids = []
    no_singles_ids = []
    no_data_ids = []
    error_ids = [] 

    # loop through each file
    for file in payloads_files:
        
        # get event id from filename, less robust but  (avoids errors if df can not be read)
        event_id = os.path.basename(file).split('_')[0]
        if not event_id.isdigit():
            print(f"SKIPPING file: not in expected naming formnat")
            continue        

    
        try:
            # get df of the match payloads
            payloads_df = pd.read_csv(file)

            # if the df is empty, add to no data list and continue. 
            if payloads_df.empty:
                no_data_ids.append(event_id)
                continue

            # # get event_id - reobtain this from the file contents just incase there is a file naming error.
            # event_id = payloads_df["eventId"].iloc[0]
    
            # filter to select only the singles match paylods 
            singles_mask = payloads_df["subEventType"].str.contains(keep_string, na=False, case=False)
            
            singles_payloads_df = payloads_df[singles_mask].copy()

        
            # If the singles payloads df is not empty, 
            if not singles_payloads_df.empty:
                # get filepath for saving as csv
                filepath = os.path.join(output_dir,f"{event_id}_singles_payloads.csv" )
                singles_payloads_df.to_csv(filepath)
                success_ids.append(event_id)
            else:               
                no_singles_ids.append(event_id)

                
        except pd.errors.EmptyDataError:
            
            no_data_ids.append(event_id)
            continue # Go to the next file

        except Exception as e: 
            print(f"❌ Error processing event {event_id} ({os.path.basename(file)}): {e}")
            error_ids.append(event_id)
            continue
            
    print(f"\n--- ✅ Filtering Complete ---")
    print(f"Successfully processed and saved singles payloads for: {len(success_ids)} events")
    print(f"Events found with ONLY non-singles payloads: {len(no_singles_ids)} events")
    print(f"Events found with EMPTY payload files: {len(no_data_ids)} events")
    print(f"⚠️ Encountered unexpected errors processing: {len(error_ids)} events") 
        
        
    return success_ids, no_singles_ids, no_data_ids, error_ids             

In [201]:
if __name__ == "__main__":
    
    success, no_singles, missing, error = filter_singles_payloads(RAW_PAYLOADS_DIR, OUTPUT_DIR, KEEP_STRING)

--- 🟠 Filtering Singles Payloads 🟠 ---

--- ✅ Filtering Complete ---
Successfully processed and saved singles payloads for: 178 events
Events found with ONLY non-singles payloads: 6 events
Events found with EMPTY payload files: 118 events
⚠️ Encountered unexpected errors processing: 0 events


In [202]:
no_singles

['2275', '2979', '2535', '2860', '2717', '2751']

In [34]:
big_df['subEventType'].values

array(['Men Singles', 'Women Singles', 'Men Doubles', ...,
       'Mixed Doubles', 'Mixed Doubles', 'Men Singles'],
      shape=(32869,), dtype=object)

In [170]:
a = PAYLOADS_FILES[0]

In [175]:
a = os.path.basename(a).split('_')[0]

In [176]:
a.isdigit()

True

In [174]:
a

'Data/Raw/Match_payloads/2868_match_payloads.csv'