In [242]:
import pandas as pd 
import glob 
from typing import Optional
import os 
import re
import numpy as np

In [243]:
CLEANED_MATCHES_DIR = "../Data/Processed/Matches/"
CLEANED_MATCHES_SUFFIX = "_cleaned_matches.csv"
CLEANED_MATCHES_REGEX = rf"^\d{{8}}{re.escape(CLEANED_MATCHES_SUFFIX)}$"
START_DROP_COLS = [
    "A_rawGameScores",
    "A_rawOverallScore",
    "B_rawGameScores",
    "B_rawOverallScore",
    "actionType",
    "awayNestedGameScores",
    "bestOf",
    "calcBestOf",
    "calcNestedGameScores",
    "calcNestedOverallScores",
    
    "currentGameNumber",
    "gameScoreConsistent",
    "gameScoreFlags",
    "homeNestedGameScores",
    "scoreConsistent",
    "tableNumber",
    
]

In [244]:
def get_latest_cleaned_matches(cleaned_matches_dir:str, cleaned_matches_regex) -> Optional[pd.DataFrame]:
    """
    Parses specified directory for cleaned_matches_files in format yyyy_mm_dd. 
    Attempts to read latest file in this format. 

    Args:
        directory (str): The folder where the clened_matches_files are stored 
        filename_pattern (str): The pattern to match 

    Returns:
        Tuple[pd.DataFrame,Optional]: returns DF with data if available or blank df if data unavailable
    """
    if not os.path.isdir(cleaned_matches_dir):
        print (f"❌{cleaned_matches_dir} does not exist as a directory")
            
        
    # Get csv files in 
    files = glob.glob(f"{cleaned_matches_dir}/*.csv")


    cleaned_matches_files = []


    for file in files:
        filename = os.path.basename(file)   
         
        
        if re.match(cleaned_matches_regex,filename):
            cleaned_matches_files.append(file)
            print(f"✅ Found cleaned_matches file: {file}")

    if not cleaned_matches_files:
        print(f"❌ No existing cleaned_matches files in format: {cleaned_matches_regex} in {cleaned_matches_dir}")
        None 
    cleaned_matches_files.sort()    
    latest_cleaned_matches = cleaned_matches_files[-1]

    try: 
        latest_cleaned_matches_df = pd.read_csv(latest_cleaned_matches)
        print(f"✅ {len(latest_cleaned_matches_df)} matches found in latest cleaned_matches: {latest_cleaned_matches} ")
        return latest_cleaned_matches_df
        
    except Exception as e:
        print (f"❌ Error reading lastest cleaned_matches, {latest_cleaned_matches}: {e}")
        None 

In [245]:
cleaned_matches_df = get_latest_cleaned_matches(CLEANED_MATCHES_DIR, CLEANED_MATCHES_REGEX)
cleaned_matches_df = cleaned_matches_df.drop(columns=START_DROP_COLS)

✅ Found cleaned_matches file: ../Data/Processed/Matches/20251111_cleaned_matches.csv
✅ Found cleaned_matches file: ../Data/Processed/Matches/20251108_cleaned_matches.csv
✅ Found cleaned_matches file: ../Data/Processed/Matches/20251110_cleaned_matches.csv
✅ 23690 matches found in latest cleaned_matches: ../Data/Processed/Matches/20251111_cleaned_matches.csv 


  latest_cleaned_matches_df = pd.read_csv(latest_cleaned_matches)


In [246]:
dnf_filter = cleaned_matches_df["dnf"] != "False"
winner_mismatch = cleaned_matches_df["reconciledOverallScoreWinner"]!= cleaned_matches_df["calculatedGameScoreWinner"]

winner_mismatch_df = cleaned_matches_df[~dnf_filter & winner_mismatch]
num_mismatches = len(winner_mismatch_df)

if num_mismatches > 0:
    print(f"❌ {num_mismatches} matches have a mismatch between reconciledOverallScoreWinner and calculatedGameScoreWinner")
else: 
    print("✅ No mismatches between reconciledOverallScoreWinner and calculatedGameScoreWinner")
cleaned_matches_df.drop("reconciledOverallScoreWinner",axis=1, inplace= True)
cleaned_matches_df.rename(columns={"calculatedGameScoreWinner" : "Winner"}, inplace=True)

✅ No mismatches between reconciledOverallScoreWinner and calculatedGameScoreWinner


In [247]:
"""
Shortened code for mapping home and away to winner and loser 
dependent on who won - thus removing home and away terms
MAY need to restore home and away terms later if further analysis is done from other match data sources
e.g match logs 
"""
is_home_winner = cleaned_matches_df["Winner"] == "home"

column_map = {
    
    'winnerName':    ('homePlayer',         'awayPlayer'),
    'winnerId':      ('homeCompetitorId',   'awayCompetitorId'),
    'winnerCountry': ('homeCompetitorOrg',  'awayCompetitorOrg'),
    
    
    'loserName':     ('awayPlayer',         'homePlayer'),
    'loserId':       ('awayCompetitorId',   'homeCompetitorId'),
    'loserCountry':  ('awayCompetitorOrg',  'homeCompetitorOrg'),
}

for new_col, (home_case, false_case) in column_map.items():
    cleaned_matches_df[new_col] = np.where(
        is_home_winner,
        cleaned_matches_df[home_case],  # Value if True
        cleaned_matches_df[false_case]  # Value if False
    )

cols_to_drop = ["homePlayer", "awayPlayer", "homeCompetitorId", "awayCompetitorId", "homeCompetitorOrg", "awayCompetitorOrg"]
integer_columns = ["trueBestOf", "winnerId", "loserId"]

cleaned_matches_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
for col in integer_columns:
    cleaned_matches_df[col] = cleaned_matches_df[col].astype("Int64")


In [248]:
cleaned_matches_df.rename(columns={
    "reconciledOverallScore": "overallScore",
    "reconciledGameScore": "gameScore",
    "trueBestOf": "bestOf",
}, inplace=True)

In [249]:
cleaned_matches_df

Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,venueName,tableName,dnf,ttrReview,serverNext,duration (unreliable),...,overallScore,gameScore,Winner,bestOf,winnerName,winnerId,winnerCountry,loserName,loserId,loserCountry
0,2410,TTEMSINGLES-----------RND1000700--,Men Singles,Men's Singles - Preliminary Round 1 - Match 7,Lusail Sports Arena,Table 6,False,,,00:15:15,...,3-0,"13-11,12-10,11-8",home,5,Andreas LEVENKO,115449,AUT,Aleksandar KARAKASEVIC,104359,SRB
1,2410,TTEMSINGLES-----------RND1002600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 26,Lusail Sports Arena,Table 4,False,,,00:00:21,...,0-3,"3-11,10-12,4-11",away,5,LIAO Cheng-Ting,119565,TPE,Ahmed ALAWLAQI,100189,QAT
2,2410,TTEMSINGLES-----------RND1004600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 46,Lusail Sports Arena,Table 3,False,,,00:00:00,...,3-0,"11-9,11-9,11-8",home,5,Yuta TANAKA,131375,JPN,Alexandre CASSIN,117003,FRA
3,2410,TTEMSINGLES-----------RND1001900--,Men Singles,Men's Singles - Preliminary Round 1 - Match 19,Lusail Sports Arena,Table 5,False,,,00:27:19,...,3-0,"11-6,11-9,15-13",home,5,Mihai BOBOCICA,101192,ITA,Daniel GONZALEZ,111512,PUR
4,2410,TTEMSINGLES-----------RND1001000--,Men Singles,Men's Singles - Preliminary Round 1 - Match 10,Lusail Sports Arena,Table 7,False,,,00:13:28,...,0-3,"12-14,7-11,6-11",away,5,Ricardo WALTHER,109946,GER,Khalid ASSAR,111959,EGY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23685,3191,TTEMSINGLES-----------GP04000100----------,Men's Singles,Men's Singles - Group 4 - Match 1,Centrum Szkolenia PZTS,Table 1,False,False,119273.0,00:38:01,...,1-3,"10-12,8-11,21-19,5-11",away,5,FELKEL Grzegorz,119273,POL,CAO Andrew,145852,USA
23686,3191,TTEMSINGLES-----------GP03000100----------,Men's Singles,Men's Singles - Group 3 - Match 1,Centrum Szkolenia PZTS,Table 3,False,False,135473.0,00:13:28,...,3-0,"11-5,11-2,11-7",home,5,SHUSHAN Eitay,135219,ISR,CHOJNACKI Jakub,135473,POL
23687,3191,TTEMSINGLES-----------GP02000100----------,Men's Singles,Men's Singles - Group 2 - Match 1,Centrum Szkolenia PZTS,Table 2,False,False,121073.0,00:16:28,...,3-0,"11-6,11-3,11-3",home,5,GIARDI Federico,121073,SMR,CABRERA Tomas,221863,ARG
23688,3191,TTEMSINGLES-----------GP09000100----------,Men's Singles,Men's Singles - Group 9 - Match 1,Centrum Szkolenia PZTS,Table 2,WO,False,132196.0,00:00:00,...,0-3,"0-11,0-11,0-11",away,5,MONGIUSTI Mattias,132196,SMR,MISAL John Russel,145472,PHI


In [250]:
score_splits = cleaned_matches_df["overallScore"].str.split("-", expand=True)
home_score_numeric = pd.to_numeric(score_splits[0], errors='coerce')
away_score_numeric = pd.to_numeric(score_splits[1], errors='coerce')
winner_correct = (home_score_numeric >= away_score_numeric)

In [251]:
cleaned_matches_df[~winner_correct]

Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,venueName,tableName,dnf,ttrReview,serverNext,duration (unreliable),...,overallScore,gameScore,Winner,bestOf,winnerName,winnerId,winnerCountry,loserName,loserId,loserCountry
1,2410,TTEMSINGLES-----------RND1002600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 26,Lusail Sports Arena,Table 4,False,,,00:00:21,...,0-3,"3-11,10-12,4-11",away,5,LIAO Cheng-Ting,119565,TPE,Ahmed ALAWLAQI,100189,QAT
4,2410,TTEMSINGLES-----------RND1001000--,Men Singles,Men's Singles - Preliminary Round 1 - Match 10,Lusail Sports Arena,Table 7,False,,,00:13:28,...,0-3,"12-14,7-11,6-11",away,5,Ricardo WALTHER,109946,GER,Khalid ASSAR,111959,EGY
6,2410,TTEMSINGLES-----------RND1003800--,Men Singles,Men's Singles - Preliminary Round 1 - Match 38,Lusail Sports Arena,Table 2,False,,,00:00:52,...,0-3,"6-11,7-11,8-11",away,5,Jeremy HAZIN,117604,CAN,Ahmad Khalil AL-MOHANNADI,137380,QAT
7,2410,TTEMSINGLES-----------RND1004700--,Men Singles,Men's Singles - Preliminary Round 1 - Match 47,Lusail Sports Arena,Table 8,False,,,00:20:26,...,0-3,"5-11,10-12,6-11",away,5,Pavel PLATONOV,107456,BLR,Mohamed EL-BEIALI,102486,EGY
8,2410,TTEMSINGLES-----------RND1005500--,Men Singles,Men's Singles - Preliminary Round 1 - Match 55,Lusail Sports Arena,Table 3,False,,,00:47:04,...,2-3,"8-11,7-11,11-9,11-6,8-11",away,5,Masaki YOSHIDA,110411,JPN,Zokhid KENJAEV,112261,UZB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23679,3066,TTEWSINGLES-----------QFNL000300----------,Women's Singles,Women's Singles - Quarterfinal - Match 3,Pavilhao Municipal de Vila Nova de Gaia,Table 1,False,False,112453.0,00:24:32,...,1-3,"0-11,11-7,8-11,6-11",away,5,KIM Seongjin,135370,KOR,JEGER Mateja,112453,CRO
23680,3066,TTEMSINGLES-----------QFNL000300----------,Men's Singles,Men's Singles - Quarterfinal - Match 3,Pavilhao Municipal de Vila Nova de Gaia,Table 2,False,False,111050.0,00:42:01,...,2-3,"7-11,11-6,8-11,11-4,5-11",away,5,ROBLES Alvaro,111050,ESP,FALCK Mattias,112074,SWE
23681,3066,TTEMSINGLES-----------QFNL000100----------,Men's Singles,Men's Singles - Quarterfinal - Match 1,Pavilhao Municipal de Vila Nova de Gaia,Table 1,False,False,201466.0,00:43:51,...,2-3,"12-10,9-11,7-11,11-9,7-11",away,5,GERALDO Joao,115396,POR,VERDONSCHOT Wim,201466,GER
23685,3191,TTEMSINGLES-----------GP04000100----------,Men's Singles,Men's Singles - Group 4 - Match 1,Centrum Szkolenia PZTS,Table 1,False,False,119273.0,00:38:01,...,1-3,"10-12,8-11,21-19,5-11",away,5,FELKEL Grzegorz,119273,POL,CAO Andrew,145852,USA


In [252]:
def switch_overall_score(score_str):
    """
    Safely switches a single score string like "2-4" to "4-2".
    Uses regex to be robust and handle DNF strings (like 'WO') safely.
    """
    if pd.isna(score_str):
        return pd.NA
    return re.sub(r'(\d+)-(\d+)', r'\2-\1', str(score_str))


def switch_game_score(score_str):
    """
    Switches a comma-separated list of scores.
    (This is your function, slightly hardened with a try-except block)
    """
    if pd.isna(score_str) or not score_str:
        return pd.NA
    try:
        
        return re.sub(r'(\d+)-(\d+)', r'\2-\1', str(score_str))
    except Exception:
        return pd.NA 
    
rows_to_switch = (winner_correct == False)
print(f"--- ⚙️ Standardizing Scores to 'Winner-First' Format ---")
print(f"Identifying {rows_to_switch.sum()} rows to switch...")
cleaned_matches_df.loc[rows_to_switch, 'overallScore'] =  cleaned_matches_df.loc[rows_to_switch, 'overallScore'].apply(switch_overall_score)

# Apply the switch to 'gameScore'
cleaned_matches_df.loc[rows_to_switch, 'gameScore'] = cleaned_matches_df.loc[rows_to_switch, 'gameScore'].apply(switch_game_score)

print("✅ Scores successfully standardized.")

--- ⚙️ Standardizing Scores to 'Winner-First' Format ---
Identifying 11454 rows to switch...
✅ Scores successfully standardized.


In [259]:
score_splits = cleaned_matches_df["overallScore"].str.split("-", expand=True)
home_score_numeric = pd.to_numeric(score_splits[0], errors='coerce')
away_score_numeric = pd.to_numeric(score_splits[1], errors='coerce')
winner_correct = (home_score_numeric >= away_score_numeric)

In [None]:
ENGLAND_CODE = ''

print(f"--- 🔎 Isolating Only {ENGLAND_CODE} Players 🔎---")

# 1. Create the mask (filter) for matches where the winner was English
winner_is_english = (cleaned_matches_df['winnerCountry'] == ENGLAND_CODE)

# 2. Create the mask (filter) for matches where the loser was English
loser_is_english = (cleaned_matches_df['loserCountry'] == ENGLAND_CODE)

# 3. Extract names only from the correct column based on the mask
#    We use np.where to select the name ONLY if the country code is 'ENG', otherwise we use pd.NA
english_winners = np.where(winner_is_english, cleaned_matches_df['winnerName'], pd.NA)
english_losers = np.where(loser_is_english, cleaned_matches_df['loserName'], pd.NA)

# 4. Concatenate the two resulting arrays (Series) and remove missing values (opponents)
#    This Series now contains only the names of English players.
all_english_players_series = pd.concat([
    pd.Series(english_winners), 
    pd.Series(english_losers)
]).dropna()

# 5. Get the unique list of names
unique_english_players = all_english_players_series.unique()

# --- Reporting ---
for player in unique_english_players:
    print (player)


--- 🔎 Isolating Only CHN Players 🔎---
CHEN Meng
FAN Zhendong
MA Long
SUN Yingsha
LIU Dingshuo
WANG Yidi
WANG Chuqin
LIN Gaoyuan
LIU Weishan
LIANG Jingkun
ZHOU Qihao
CHEN Xingtong
WANG Manyu
WANG Xiaotong
LIN Shidong
ZHANG Rui
XU Yingbin
LIU Shiwen
XU Xin
XIANG Peng
YUAN Licen
KUAI Man
FAN Siqi
YANG Huijing
QI Fei
SHI Xunyao
CHEN Yi
YU Ziyang
CAO Wei
XUE Fei
ZHANG Yudong
XU Haidong
SUN Wen
ZHAO Zihao
HE Zhuojia
QIAN Tianyi
CHEN Yuanyu
ZENG Beixun
WANG Tingyu
HUANG Youzheng
ZHU Sibing
ZONG Geman
QIN Yuxuan
XU Yi
XIONG Mengyang
LIANG Guodong
WANG Chen Ce
LI Yuqi
HAN Feier
LENG Yutong
WANG Tianyi
REN Hao
QUAN Kaiyuan
LIU Yebo
SAI Linwei
CHU Hanwen
ZANG Xiaotong
FAN Shuhan
ZHOU Kai
GAO Yang
LIANG Yanning
ZHU Ziyu
WU Yangchen
WANG Yiduo
NIU Guankai
YAN Sheng
ZHANG Xiangyu
LI Yake
GUO Yuhan
YANG Yiyun
SUN Zheng
XIE Congfan
SONG Zhuoheng
ZHU Jiaqi
SHI Cancheng
SUN Yang
DING Yijie
YAN Yutong
CHEN Hengda
SUN Sinan
WEN Ruibo
GAO Yuxin
LI Hechen
HU Dongshen
FEI Junhang
KANG Youde
LUO Jiecheng
NING