In [189]:
import pandas as pd 
import glob 
from typing import Optional
import os 
import re
import numpy as np

In [190]:
# Folder that holds the cleaned match exports (relative to the notebook location).
CLEANED_MATCHES_DIR = "../Data/Processed/Matches/"
CLEANED_MATCHES_SUFFIX = "_cleaned_matches.csv"
# Whole-filename pattern: exactly eight digits followed by the suffix
# (presumably a yyyymmdd date stamp, e.g. 20251108_cleaned_matches.csv).
CLEANED_MATCHES_REGEX = rf"^\d{{8}}{re.escape(CLEANED_MATCHES_SUFFIX)}$"
# Columns removed immediately after loading. Each name is listed once;
# the previous version repeated several entries.
START_DROP_COLS = [
    "bestOf",
    "calcBestOf",
    "currentGameNumber",
    "gameScores",
    "resultsGameScores",
    "overallScores",
    "resultOverallScores",
    "homePlayerGameScores",
    "awayPlayerGameScores",
    "actionType",
    "gameScoreConsistent",
    "scoreConsistent",
    "calcGameScores",
    "calcOverallScores",
    "gameScoreFlags",
    "tableNumber",
]

In [201]:
def get_latest_cleaned_matches(cleaned_matches_dir:str, cleaned_matches_regex) -> Optional[pd.DataFrame]:
    """
    Load the most recent cleaned_matches CSV from a directory.

    Every ``*.csv`` file directly inside ``cleaned_matches_dir`` whose file name
    matches ``cleaned_matches_regex`` is a candidate. The candidate whose name
    sorts last is read, so a sortable prefix in the file name (such as a
    zero-padded date stamp) decides which file counts as "latest".

    Args:
        cleaned_matches_dir (str): The folder where the cleaned_matches files are stored.
        cleaned_matches_regex: Pattern (string or compiled regex) applied with
            ``re.match`` to each file's base name.

    Returns:
        Optional[pd.DataFrame]: The contents of the latest matching file, or
        ``None`` when the directory does not exist, no file matches, or the
        file cannot be read. A message is printed in each of those cases.
    """
    if not os.path.isdir(cleaned_matches_dir):
        print (f"❌{cleaned_matches_dir} does not exist as a directory")
        return None

    # Escape the directory part so glob metacharacters in the path are taken literally.
    csv_glob = os.path.join(glob.escape(cleaned_matches_dir), "*.csv")

    cleaned_matches_files = []
    for file in glob.glob(csv_glob):
        if re.match(cleaned_matches_regex, os.path.basename(file)):
            cleaned_matches_files.append(file)
            print(f"✅ Found cleaned_matches file: {file}")

    if not cleaned_matches_files:
        print(f"❌ No existing cleaned_matches files in format: {cleaned_matches_regex} in {cleaned_matches_dir}")
        # Return here: indexing the empty list below would raise IndexError.
        return None

    # All candidates share one directory, so ordering by base name picks the
    # same file as sorting the full paths.
    latest_cleaned_matches = max(cleaned_matches_files, key=os.path.basename)

    try:
        latest_cleaned_matches_df = pd.read_csv(latest_cleaned_matches)
    except Exception as e:
        # Best effort: report the problem and let the caller handle the None.
        print (f"❌ Error reading lastest cleaned_matches, {latest_cleaned_matches}: {e}")
        return None

    print(f"✅ {len(latest_cleaned_matches_df)} matches found in latest cleaned_matches: {latest_cleaned_matches} ")
    return latest_cleaned_matches_df

In [243]:
# Load the latest cleaned_matches export.
cleaned_matches_df = get_latest_cleaned_matches(CLEANED_MATCHES_DIR, CLEANED_MATCHES_REGEX)
# The loader can hand back None (it is annotated Optional and returns nothing
# after a failed read). Stop here with a clear message rather than hitting an
# AttributeError on `.drop`.
if cleaned_matches_df is None:
    raise RuntimeError(
        f"Could not load a cleaned_matches file from {CLEANED_MATCHES_DIR}; see the messages printed above."
    )
# Discard the columns that are not needed for the rest of the notebook.
cleaned_matches_df = cleaned_matches_df.drop(columns=START_DROP_COLS)

✅ Found cleaned_matches file: ../Data/Processed/Matches/20251108_cleaned_matches.csv
✅ 23684 matches found in latest cleaned_matches: ../Data/Processed/Matches/20251108_cleaned_matches.csv 


  latest_cleaned_matches_df = pd.read_csv(latest_cleaned_matches)


In [244]:
# Sanity check: for rows whose "dnf" value is the string "False", the winner
# derived from the overall score should equal the winner derived from the
# game scores.
# NOTE(review): this cell cannot be re-run without reloading the frame, because
# it drops/renames the very columns it reads.
dnf_filter = cleaned_matches_df["dnf"].ne("False")
winner_mismatch = cleaned_matches_df["reconciledOverallScoreWinner"].ne(
    cleaned_matches_df["calculatedGameScoreWinner"]
)

winner_mismatch_df = cleaned_matches_df.loc[~dnf_filter & winner_mismatch]
num_mismatches = len(winner_mismatch_df)

print(
    f"❌ {num_mismatches} matches have a mismatch between reconciledOverallScoreWinner and calculatedGameScoreWinner"
    if num_mismatches > 0
    else "✅ No mismatches between reconciledOverallScoreWinner and calculatedGameScoreWinner"
)

# Keep a single winner column, named "Winner".
cleaned_matches_df.drop(columns=["reconciledOverallScoreWinner"], inplace=True)
cleaned_matches_df.rename(columns={"calculatedGameScoreWinner": "Winner"}, inplace=True)

✅ No mismatches between reconciledOverallScoreWinner and calculatedGameScoreWinner


In [245]:
# Show the frame after the two winner columns were collapsed into "Winner".
cleaned_matches_df

Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,venueName,tableName,dnf,homeCompetitorId,homeCompetitorOrg,homePlayer,...,ttrReview,serverNext,duration (unreliable),EventName,EventStartDate,matchDate,reconciledOverallScore,reconciledGameScore,Winner,trueBestOf
0,2410,TTEMSINGLES-----------RND1000700--,Men Singles,Men's Singles - Preliminary Round 1 - Match 7,Lusail Sports Arena,Table 6,False,115449.0,AUT,Andreas LEVENKO,...,,,00:15:15,WTT Contender Doha 2021,2021-02-28 11:30:00+00:00,2021-02-28 07:40:00+00:00,3-0,"13-11,12-10,11-8",home,5.0
1,2410,TTEMSINGLES-----------RND1002600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 26,Lusail Sports Arena,Table 4,False,100189.0,QAT,Ahmed ALAWLAQI,...,,,00:00:21,WTT Contender Doha 2021,2021-02-28 11:30:00+00:00,2021-02-28 07:40:00+00:00,0-3,"3-11,10-12,4-11",away,5.0
2,2410,TTEMSINGLES-----------RND1004600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 46,Lusail Sports Arena,Table 3,False,131375.0,JPN,Yuta TANAKA,...,,,00:00:00,WTT Contender Doha 2021,2021-02-28 11:30:00+00:00,2021-02-28 07:40:00+00:00,3-0,"11-9,11-9,11-8",home,5.0
3,2410,TTEMSINGLES-----------RND1001900--,Men Singles,Men's Singles - Preliminary Round 1 - Match 19,Lusail Sports Arena,Table 5,False,101192.0,ITA,Mihai BOBOCICA,...,,,00:27:19,WTT Contender Doha 2021,2021-02-28 11:30:00+00:00,2021-02-28 07:40:00+00:00,3-0,"11-6,11-9,15-13",home,5.0
4,2410,TTEMSINGLES-----------RND1001000--,Men Singles,Men's Singles - Preliminary Round 1 - Match 10,Lusail Sports Arena,Table 7,False,111959.0,EGY,Khalid ASSAR,...,,,00:13:28,WTT Contender Doha 2021,2021-02-28 11:30:00+00:00,2021-02-28 07:40:00+00:00,0-3,"12-14,7-11,6-11",away,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23679,3066,TTEWSINGLES-----------QFNL000300----------,Women's Singles,Women's Singles - Quarterfinal - Match 3,Pavilhao Municipal de Vila Nova de Gaia,Table 1,False,112453.0,CRO,JEGER Mateja,...,False,112453.0,00:24:32,WTT Feeder Vila Nova de Gaia 2025,2025-11-05 00:00:00+00:00,2025-11-08 19:05:00+00:00,1-3,"0-11,11-7,8-11,6-11",away,5.0
23680,3066,TTEMSINGLES-----------QFNL000300----------,Men's Singles,Men's Singles - Quarterfinal - Match 3,Pavilhao Municipal de Vila Nova de Gaia,Table 2,False,112074.0,SWE,FALCK Mattias,...,False,111050.0,00:42:01,WTT Feeder Vila Nova de Gaia 2025,2025-11-05 00:00:00+00:00,2025-11-08 20:29:00.297000+00:00,2-3,"7-11,11-6,8-11,11-4,5-11",away,5.0
23681,3066,TTEMSINGLES-----------QFNL000100----------,Men's Singles,Men's Singles - Quarterfinal - Match 1,Pavilhao Municipal de Vila Nova de Gaia,Table 1,False,201466.0,GER,VERDONSCHOT Wim,...,False,201466.0,00:43:51,WTT Feeder Vila Nova de Gaia 2025,2025-11-05 00:00:00+00:00,2025-11-08 19:40:00+00:00,2-3,"12-10,9-11,7-11,11-9,7-11",away,5.0
23682,3066,TTEMSINGLES-----------QFNL000200----------,Men's Singles,Men's Singles - Quarterfinal - Match 2,Pavilhao Municipal de Vila Nova de Gaia,Table 1,False,121684.0,ENG,JARVIS Tom,...,False,102841.0,00:50:21,WTT Feeder Vila Nova de Gaia 2025,2025-11-05 00:00:00+00:00,2025-11-08 21:18:11.670000+00:00,3-2,"6-11,4-11,13-11,12-10,11-3",home,5.0


In [246]:
# Replace the home/away player columns with winner/loser columns, choosing the
# side according to the "Winner" column, so the home/away terms disappear.
# This MAY cause issues later if further analysis is done from other match
# data sources (e.g. match logs).
# NOTE(review): any row whose Winner is not exactly "home" takes the away-side
# values as the winner - confirm Winner only ever holds "home"/"away".
is_home_winner = cleaned_matches_df["Winner"].eq("home")

# new column -> (source column when home won, source column otherwise)
column_map = dict(
    winnerName=("homePlayer", "awayPlayer"),
    winnerId=("homeCompetitorId", "awayCompetitorId"),
    winnerCountry=("homeCompetitorOrg", "awayCompetitorOrg"),
    loserName=("awayPlayer", "homePlayer"),
    loserId=("awayCompetitorId", "homeCompetitorId"),
    loserCountry=("awayCompetitorOrg", "homeCompetitorOrg"),
)

for target, (source_if_home_won, source_otherwise) in column_map.items():
    values_if_home_won = cleaned_matches_df[source_if_home_won]
    values_otherwise = cleaned_matches_df[source_otherwise]
    cleaned_matches_df[target] = np.where(is_home_winner, values_if_home_won, values_otherwise)

# The original home/away columns are no longer needed once mapped.
cols_to_drop = [
    "homePlayer",
    "awayPlayer",
    "homeCompetitorId",
    "awayCompetitorId",
    "homeCompetitorOrg",
    "awayCompetitorOrg",
]
cleaned_matches_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Nullable integer dtype, so missing values are allowed in these columns.
integer_columns = ["trueBestOf", "winnerId", "loserId"]
for int_col in integer_columns:
    cleaned_matches_df[int_col] = cleaned_matches_df[int_col].astype("Int64")


In [254]:
# Every match contributes one appearance for its winner and one for its loser;
# stacking both name columns and counting gives matches played per player.
all_appearances = pd.concat(
    [cleaned_matches_df[name_col] for name_col in ("winnerName", "loserName")]
)
total_matches_played = all_appearances.value_counts()

print("--- Total Matches Played per Player ---")
print(total_matches_played.head(10))

--- Total Matches Played per Player ---
HARIMOTO Miwa        194
CALDERANO Hugo       181
SUN Yingsha          177
AN Jaehyun           174
SHIN Yubin           164
HARIMOTO Tomokazu    164
WANG Yidi            163
DUDA Benedikt        162
LIN Shidong          161
WANG Chuqin          161
Name: count, dtype: int64


In [256]:
# Boolean mask: True where subEventDescription contains " final" (leading
# space, case-insensitive); rows with a missing description count as False.
# NOTE(review): the leading space presumably keeps "Quarterfinal"-style
# descriptions out of the mask - confirm against the data.
final_filter = cleaned_matches_df["subEventDescription"].str.contains(" final", case=False, na=False)

In [258]:
# Keep only the rows selected by the finals mask.
finals = cleaned_matches_df.loc[final_filter]

In [261]:
# Ten players with the most wins among the selected finals.
finals["winnerName"].value_counts().iloc[:10]

winnerName
SUN Yingsha       21
WANG Chuqin       16
CALDERANO Hugo    14
WANG Manyu         9
FAN Zhendong       8
LIN Shidong        8
LIANG Jingkun      7
HAYATA Hina        7
HARIMOTO Miwa      7
ODO Satsuki        7
Name: count, dtype: int64