In [94]:
import glob 
import pandas as pd
import numpy as np
import os 
import json
import re
from typing import Tuple, Optional
from thefuzz import fuzz, process

In [95]:
# --- Input / output locations -------------------------------------------------
RAW_MATCH_DETAILS_DIR = "../Data/Raw/Match_details"
MASTER_EVENTS_DIR = "../Data/Master/Events"
CLEANED_MATCHES_DIR = "../Data/Processed/Matches"

# --- DNF keywords and the regex alternations built from them -----------------
DNF_KEYWORDS = ["WO", "INJ", "RET", "DSQ", "DNS"]
DNF_PATTERN = "|".join(DNF_KEYWORDS)
DNF_PATTERN_CAPTURE = f"({DNF_PATTERN})"

# Columns dropped up front because they are not of interest for the project.
# resultStatus is "official" for every entry, so it carries no information.
DROP_COLUMNS_START = ["resultStatus", "playByPlaySequenceNumber"]

# --- Master events files: eight digits, an underscore, then the fixed suffix --
MASTER_EVENTS_SUFFIX = "master_events.csv"
MASTER_EVENTS_REGEX = rf"^\d{{8}}_{re.escape(MASTER_EVENTS_SUFFIX)}$"
MINIMAL_EVENT_COLUMNS = ["eventId"]

# --- Anything in a score string that is not a digit, comma or hyphen ----------
JUNK_PATTERN_CLEAN = r"[^\d,-]+"
JUNK_PATTERN_CAPTURE = f"({JUNK_PATTERN_CLEAN})"

In [96]:
def get_latest_master_events(master_dir:str, master_regex) -> pd.DataFrame:
    """
    Load the most recent master events CSV found in a directory.

    Every ``*.csv`` file directly inside ``master_dir`` whose base name matches
    ``master_regex`` is a candidate. Candidates are sorted by path and the last
    one is read with ``pd.read_csv``.

    Args:
        master_dir (str): Folder that holds the master events files.
        master_regex: Regular expression applied with ``re.match`` to each file's base name.

    Returns:
        pd.DataFrame: Contents of the latest matching file. An empty frame with
        only the ``MINIMAL_EVENT_COLUMNS`` columns is returned when the directory
        does not exist, contains no matching CSV, or the file cannot be read.
    """
    def _no_events() -> pd.DataFrame:
        # Shared fallback for every failure path.
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS)

    if not os.path.isdir(master_dir):
        print (f"‚ùå{master_dir} does not exist as a directory")
        return _no_events()

    csv_paths = glob.glob(f"{master_dir}/*.csv")
    if not csv_paths:
        print(f"‚ùå No existing *.csv files found in MASTER Events Directory: {master_dir} ")
        return _no_events()

    # Keep only the files whose base name has the master-file shape, in sorted order.
    candidates = sorted(
        path for path in csv_paths if re.match(master_regex, os.path.basename(path))
    )
    if not candidates:
        print(f"‚ùå No existing MASTER files in format: {master_regex} in {master_dir}")
        return _no_events()

    latest_master = candidates[-1]
    try:
        latest_master_df = pd.read_csv(latest_master)
        print(f"‚úÖ {len(latest_master_df)} events found in latest MASTER: {latest_master} ")
        return latest_master_df
    except Exception as e:
        print (f"‚ùå Error reading lastest MASTER, {latest_master}: {e}")
        return _no_events()

In [97]:
# Load every raw match-details JSON file in RAW_MATCH_DETAILS_DIR into all_matches_df.
# Many of these records are match-ups between teams rather than players and need to be
# filtered out later; singles matches that come from team events can be kept.

# Sort the paths: glob's ordering depends on the file system, and an unsorted listing
# makes the row order (and therefore the index labels) of all_matches_df vary between runs.
all_match_details_files = sorted(glob.glob(os.path.join(RAW_MATCH_DETAILS_DIR, "*.json")))
all_matches = []
for file in all_match_details_files:
    with open(file, 'r', encoding='utf-8') as f:
        matches_data = json.load(f)

    # list.extend() on anything other than a list of records (e.g. a dict) would
    # silently append its keys as bogus rows, so skip such payloads loudly instead.
    if not isinstance(matches_data, list):
        print(f"Skipping {file}: expected a JSON list of matches, got {type(matches_data).__name__}")
        continue

    all_matches.extend(matches_data)

# create the df
all_matches_df = pd.DataFrame(all_matches)







In [98]:
# Build the working frame WITHOUT mutating all_matches_df. Previously the two names were
# aliases of one object and the drops ran in place, which altered the raw frame as well and
# forced it to be rebuilt from all_matches on every run of this cell.
print(f"cleaned_matches_df before  dropna and dropping irrelevant columns: {len(all_matches_df)} with {len(all_matches_df.columns)} cols")
cleaned_matches_df = (
    all_matches_df
    .dropna(axis=0, how='all')          # rows without any data
    .dropna(axis=1, how='all')          # columns without any data
    .drop(columns=DROP_COLUMNS_START)   # columns not of interest for the project
)
print(f"cleaned_matches_df after dropna and dropping irrelevant columns: {len(cleaned_matches_df)} with {len(cleaned_matches_df.columns)} cols")


def _extract_score_junk(raw_scores: pd.Series) -> pd.Series:
    """Return the first run of non-score text in each value, stripped; NaN when there is none."""
    junk = raw_scores.str.extract(JUNK_PATTERN_CAPTURE, expand=False).str.strip()
    # A whitespace-only capture strips down to "", which is not a DNF marker: treat it as missing.
    junk.loc[junk == ""] = pd.NA
    return junk


# Text found inside the score columns (anything that is not a digit, comma or hyphen).
dnf_from_overallScores = _extract_score_junk(cleaned_matches_df["overallScores"])
dnf_from_resultoverallScores = _extract_score_junk(cleaned_matches_df["resultOverallScores"])
# The two game-score extractions do not feed the dnf column below; they are kept as-is.
dnf_from_gameScores = _extract_score_junk(cleaned_matches_df["gameScores"])
dnf_from_resultsGameScores = _extract_score_junk(cleaned_matches_df["resultsGameScores"])

# Remove that text from the score columns themselves.
for raw_score_col in ("overallScores", "resultOverallScores", "gameScores", "resultsGameScores"):
    cleaned_matches_df[raw_score_col] = cleaned_matches_df[raw_score_col].str.replace(JUNK_PATTERN_CLEAN, '', regex=True)

# dnf holds the extracted text from overallScores, then resultOverallScores, else False.
cleaned_matches_df["dnf"] = dnf_from_overallScores
cleaned_matches_df["dnf"] = cleaned_matches_df["dnf"].fillna(dnf_from_resultoverallScores)
cleaned_matches_df["dnf"] = cleaned_matches_df["dnf"].fillna(False)

SCORE_RENAME_DICT = {
    # rename columns to A and B for better clarity
    "gameScores": "A_rawGameScores",
    "resultsGameScores": "B_rawGameScores",
    "overallScores": "A_rawOverallScore",
    "resultOverallScores": "B_rawOverallScore"
}

cleaned_matches_df = cleaned_matches_df.rename(columns=SCORE_RENAME_DICT)

print("‚úÖ 'dnf' column updated by sequentially filling missing values from 'overallScores' THEN 'resultOverallScores'.")






cleaned_matches_df before  dropna and dropping irrelevant columns: 24920 with 25 cols
cleaned_matches_df after dropna and dropping irrelevant columns: 24920 with 21 cols
‚úÖ 'dnf' column updated by sequentially filling missing values from 'overallScores' THEN 'resultOverallScores'.


In [99]:
print(f"cleaned_df before dropping names with teams_parent_data: {len(cleaned_matches_df)}")

# Rows that carry teamParentData are split off; everything else stays in the working frame.
team_parent_filter = cleaned_matches_df["teamParentData"].notna()
team_parent_df = cleaned_matches_df.loc[team_parent_filter].copy()
cleaned_matches_df = cleaned_matches_df.loc[~team_parent_filter].copy()

# Also drop every column whose name mentions "team" (case-insensitive).
team_columns = [name for name in cleaned_matches_df.columns if "team" in name.lower()]
cleaned_matches_df = cleaned_matches_df.drop(columns=team_columns)

print(f"cleaned_df after dropping names with teams_parent_data: {len(cleaned_matches_df)}")

cleaned_df before dropping names with teams_parent_data: 24920
cleaned_df after dropping names with teams_parent_data: 24105


In [100]:
print(f"cleaned_df before dropping para and age limit matches: {len(cleaned_matches_df)}")

# Flag sub-events whose name contains "U" followed by two digits, or the word "class".
sub_event_names = cleaned_matches_df['subEventName']
age_limit_mask = sub_event_names.str.contains(r"U\d{2}", case=False, na=False)
para_class_mask = sub_event_names.str.contains("class", case=False, na=False)
age_para_filter = age_limit_mask | para_class_mask

cleaned_matches_df = cleaned_matches_df.loc[~age_para_filter].copy()

print(f"cleaned_df after dropping para and age limit matches: {len(cleaned_matches_df)}")



cleaned_df before dropping para and age limit matches: 24105


cleaned_df after dropping para and age limit matches: 23945


In [101]:
# Before filtering, extract key information from the 'competitiors' column.
# The player name field can contain team names.
# The 'competitiors' column holds either the single player of a singles listing
# or the multiple players of a team listing.
# Some doubles matches may be left over from payload filtering - these need to be filtered out.
# Some doubles matches may appear inside team events - these need to be filtered out.

def extract_competitor_details(competitor_list):
    """
    Flatten one match's competitor list into home*/away* fields.

    For every entry whose 'competitorType' is 'H' (home) or 'A' (away), the
    competitor id, organisation, list of player names and raw 'scores' value
    are stored under keys prefixed with 'home' / 'away'. Entries with any
    other type are skipped.

    Args:
        competitor_list: Value of the 'competitiors' column for one row.

    Returns:
        pd.Series: The extracted fields. Empty when the input is not a list of
        at least two entries. If an entry cannot be processed, the error is
        printed and whatever was extracted up to that point is returned.
    """
    data = {}

    # Nothing to extract unless both sides are present.
    if not isinstance(competitor_list, list) or len(competitor_list) < 2:
        return pd.Series(data)

    try:
        for comp in competitor_list:
            comp_type = comp.get('competitorType')

            if comp_type == 'H':
                prefix = 'home'
            elif comp_type == 'A':
                prefix = 'away'
            else:
                continue

            # The misspelt keys ('competitiorId', 'competitiorOrg') are the ones read from the payload.
            data[f'{prefix}CompetitorId'] = comp.get('competitiorId', pd.NA)
            data[f'{prefix}CompetitorOrg'] = comp.get('competitiorOrg', pd.NA)

            # A missing or null 'players' entry used to be iterated as pd.NA, which raised
            # a TypeError and discarded the player and score fields for this side.
            players = comp.get('players') or []
            data[f'{prefix}Player'] = [
                player.get('playerName', pd.NA) if isinstance(player, dict) else pd.NA
                for player in players
            ]

            data[f'{prefix}NestedGameScores'] = comp.get('scores', pd.NA)

    except Exception as e:
        # Best effort: report the row and keep whatever was extracted so far.
        print(f"Error processing row: {e} | Data: {competitor_list}")

    return pd.Series(data)

#
print("--- üöÄ Getting competitor details üöÄ ---")

# Flatten the nested competitor payload into home*/away* columns, one row per match.
competitor_details_df = cleaned_matches_df['competitiors'].apply(extract_competitor_details)

# Attach the new columns and discard the nested source column.
cleaned_matches_df = pd.concat([cleaned_matches_df, competitor_details_df], axis=1).drop(
    columns=["competitiors"], errors='ignore'
)

# Keep only the first listed player name on each side.
# NOTE(review): a doubles pairing is reduced to its first player here - confirm doubles rows are removed elsewhere.
for player_col in ("homePlayer", "awayPlayer"):
    cleaned_matches_df[player_col] = cleaned_matches_df[player_col].str[0]

print("‚úÖ Competitor details extracted and added to cleaned_matches_df and competitiors column dropped.")


--- üöÄ Getting competitor details üöÄ ---


‚úÖ Competitor details extracted and added to cleaned_matches_df and competitiors column dropped.


In [102]:
# All teams matches should be removed by now 

def extract_format(config):
    """
    Pull the 'bestOfXGames' and 'tTRReview' settings out of one 'matchConfig' value.

    Args:
        config: Value of the 'matchConfig' column for one row; only dicts are read.

    Returns:
        pd.Series: Indexed by 'bestOf' and 'ttrReview'. Both stay pd.NA when
        `config` is not a dict.
    """
    fields = {"bestOf": pd.NA, "ttrReview": pd.NA}

    if isinstance(config, dict):
        try:
            # Output key -> key read from the config payload.
            for out_key, source_key in (("bestOf", "bestOfXGames"), ("ttrReview", "tTRReview")):
                fields[out_key] = config.get(source_key)
        except Exception as e:
            print(f"Error processing row: {e} | Data: {config}")

    return pd.Series(fields)
#
print("--- üöÄ Getting Match Config deatils üöÄ ---")

# Expand each matchConfig payload into the bestOf / ttrReview columns.
match_config_df = cleaned_matches_df['matchConfig'].apply(extract_format)

# Attach the new columns and discard the nested source column.
cleaned_matches_df = pd.concat([cleaned_matches_df, match_config_df], axis=1).drop(
    columns=["matchConfig"], errors='ignore'
)

print("‚úÖ Match config extracted and added to cleaned_matches_df,matchConfig column dropped.")


--- üöÄ Getting Match Config deatils üöÄ ---
‚úÖ Match config extracted and added to cleaned_matches_df,matchConfig column dropped.


In [103]:
# serverNext = NEXT SERVER AFTER THE MATCH POINT WAS DONE
# (even if point would not be played as match was over)

def extract_next_server(action):
    """
    Extract the 'serverNext' and 'actionType' fields from one 'action' value.

    Args:
        action: Value of the 'action' column for one row; only dicts are read.

    Returns:
        pd.Series: Always indexed by 'serverNext' and 'actionType'. Both are
        pd.NA when `action` is not a dict.
    """
    data = {"serverNext": pd.NA, "actionType": pd.NA}

    if not isinstance(action, dict):
        # Return the NA template, not the raw value: pd.Series(action) has a different
        # index (e.g. a single label 0 for NaN), which adds a stray column after .apply().
        return pd.Series(data)

    try:
        data["serverNext"] = action.get("serverNext")
        data["actionType"] = action.get("actionType")
    except Exception as e:
        print(f"Error processing row: {e} | Data: {action}")

    return pd.Series(data)
#
# The log messages were copied from the matchConfig cell; they now describe this step.
print("--- üöÄ Getting next-server details üöÄ ---")

# apply the function to the action column from the main df
last_server_df = cleaned_matches_df['action'].apply(extract_next_server)


cleaned_matches_df = pd.concat([cleaned_matches_df, last_server_df], axis=1)
cleaned_matches_df.drop(columns=["action"],inplace=True, errors='ignore')

print("‚úÖ serverNext / actionType extracted and added to cleaned_matches_df, action column dropped.")

--- üöÄ Getting Match Config deatils üöÄ ---
‚úÖ Match lastServer extracted and added to cleaned_matches_df,matchConfig column dropped.


In [104]:
def extract_times(matchTime):
    """
    Extract the duration and start timestamps from one 'matchDateTime' value.

    Args:
        matchTime: Value of the 'matchDateTime' column for one row; only dicts are read.

    Returns:
        pd.Series: Always indexed by 'duration (unreliable)', 'startDateLocal'
        and 'startDateUTC'. All three are pd.NA when `matchTime` is not a dict.
    """
    data = {"duration (unreliable)": pd.NA, "startDateLocal": pd.NA, "startDateUTC": pd.NA}

    if not isinstance(matchTime, dict):
        # Return the NA template, not the raw value: pd.Series(matchTime) has a different
        # index (e.g. a single label 0 for NaN), which adds a stray column after .apply().
        return pd.Series(data)

    try:
        data['duration (unreliable)'] = matchTime.get('duration')
        data['startDateLocal'] = matchTime.get('startDateLocal')
        data['startDateUTC'] = matchTime.get('startDateUTC')
    except Exception as e:
        print(f"Error processing row: {e} | Data: {matchTime}")

    return pd.Series(data)

# Expand each matchDateTime payload into duration / start-date columns.
times_df = cleaned_matches_df['matchDateTime'].apply(extract_times)
cleaned_matches_df = pd.concat([cleaned_matches_df, times_df], axis=1)

# Parse both start-date columns; unparseable values become NaT.
# Only startDateUTC is parsed with utc=True.
for date_col, as_utc in (('startDateLocal', False), ('startDateUTC', True)):
    cleaned_matches_df[date_col] = pd.to_datetime(
        cleaned_matches_df[date_col], errors='coerce', utc=as_utc
    )

cleaned_matches_df = cleaned_matches_df.drop(columns=["matchDateTime"], errors='ignore')

In [105]:
events_df = get_latest_master_events(MASTER_EVENTS_DIR, MASTER_EVENTS_REGEX)

# When no master file can be read, get_latest_master_events returns an empty frame that
# only has the MINIMAL_EVENT_COLUMNS columns. Add the other columns used below so this
# cell degrades to "no event metadata" instead of raising a KeyError.
for required_col in ("EventName", "StartDate"):
    if required_col not in events_df.columns:
        events_df[required_col] = pd.NA
if events_df.empty:
    # The empty fallback frame has an object-dtype eventId, which cannot be merged with integer keys.
    events_df["eventId"] = events_df["eventId"].astype("int64")
events_df["StartDate"] = pd.to_datetime(events_df["StartDate"], errors='coerce', utc=True)

event_dates_df = events_df[["EventName", "eventId", "StartDate"]]
event_dates_df = event_dates_df.rename(columns={"StartDate": "EventStartDate"})

cleaned_matches_df["eventId"] = cleaned_matches_df["eventId"].astype(int)

# Drop the merged columns left by a previous run of this cell, so that re-running it
# does not produce EventName_x / EventName_y duplicates.
cleaned_matches_df = cleaned_matches_df.drop(columns=["EventName", "EventStartDate"], errors="ignore")

# Note: merging on a column discards the existing index; the result is numbered 0..n-1.
# validate='m:1' raises if the master file lists an eventId more than once.
cleaned_matches_df = cleaned_matches_df.merge(
    event_dates_df,
    on='eventId',
    how='left',
    validate='m:1'
)

‚úÖ 188 events found in latest MASTER: ../Data/Master/Events/20251119_master_events.csv 


In [106]:

DATE_COLS = [
    "matchStartTimeUTC", # Already exists, just needs re-conversion for safety
    "startDateLocal",    # Needs conversion (was in nested dict)
    "startDateUTC",      # Needs conversion (was in nested dict)
    "EventStartDate"     # The merged date (likely a date object or string)
]

# Convert every date column that exists to a timezone-aware UTC datetime.
# errors='coerce' turns unparseable values into NaT.
# NOTE(review): utc=True labels offset-less values as UTC, which includes the local
# wall-clock times in startDateLocal - confirm this is acceptable.
for col in DATE_COLS:
    if col in cleaned_matches_df.columns:
        cleaned_matches_df[col] = pd.to_datetime(
            cleaned_matches_df[col],
            errors='coerce',
            utc=True
        )

# Sort only AFTER the conversion. Sorting first ordered matchStartTimeUTC as text, which
# is not chronological when the timestamp strings differ in format (for example with and
# without fractional seconds).
cleaned_matches_df = cleaned_matches_df.sort_values(by = ["EventStartDate", "startDateLocal", "matchStartTimeUTC"])

print("‚úÖ All  date / timecolumns standardized to UTC datetime dtype.")

# Check the dtypes to confirm the conversion



‚úÖ All  date / timecolumns standardized to UTC datetime dtype.


In [107]:
# One entry for Muscat 2025 has a clearly erroneous date.
# For now, blank its start time and let the date fill hierarchy supply a fallback.
#
# The mask must be built from cleaned_matches_df itself. Its index no longer lines up with
# all_matches_df: rows were filtered out and the events merge renumbered the index, so a
# mask taken from all_matches_df is aligned by label onto unrelated rows.
# eventId is compared as text so the check works whether the column holds strings or ints.
target_event = cleaned_matches_df["eventId"].astype(str) == "3084"
target_match = cleaned_matches_df["documentCode"] == "TTEWSINGLES-----------GP11000400----------"
target_mask = target_event & target_match
cleaned_matches_df.loc[target_mask,"matchStartTimeUTC"] = pd.NaT

In [108]:
# Candidate date columns in order of preference: the first non-missing one wins.
dates_hierarchy = ['matchStartTimeUTC', 'startDateUTC', 'startDateLocal', 'EventStartDate']

preferred, *fallbacks = dates_hierarchy
match_date = cleaned_matches_df[preferred]
for fallback in fallbacks:
    # Only rows that are still missing a date take a value from the next column.
    match_date = match_date.fillna(cleaned_matches_df[fallback])
cleaned_matches_df['matchDate'] = match_date

# The three match-level source columns are no longer needed; EventStartDate is kept.
columns_to_drop = dates_hierarchy[:3]
cleaned_matches_df = cleaned_matches_df.drop(columns=columns_to_drop, errors='ignore')


### Some match start times are hours before their event start times - that can't be fixed here.
## It won't break anything - there is no way to confirm which value is correct for each match / event.

In [109]:
def calculate_nested_scores(home_points_str, away_points_str):
    """
    Derive per-game scores and the overall match score from two point strings.

    Each argument is a comma-separated list of points, one entry per game.
    Games are paired position by position up to the shorter list; 0-0 games
    are ignored.

    Returns:
        pd.Series: [game scores such as '11-9,9-11', overall score such as '1-1'],
        or [pd.NA, pd.NA] when either input is missing or holds a non-integer entry.
    """
    if pd.isna(home_points_str) or pd.isna(away_points_str):
        return pd.Series([pd.NA, pd.NA])

    def _to_points(raw):
        # Split on commas, ignore blank entries, convert the rest to int.
        return [int(token.strip()) for token in str(raw).split(',') if token.strip()]

    try:
        home_points = _to_points(home_points_str)
        away_points = _to_points(away_points_str)
    except ValueError:
        return pd.Series([pd.NA, pd.NA])

    # zip stops at the shorter list; drop games nobody scored in.
    played_games = [
        (home, away)
        for home, away in zip(home_points, away_points)
        if not (home == 0 and away == 0)
    ]

    calc_game_scores_str = ','.join(f'{home}-{away}' for home, away in played_games)

    # The higher points tally takes the game; level games count for neither side.
    home_games_won = sum(1 for home, away in played_games if home > away)
    away_games_won = sum(1 for home, away in played_games if away > home)
    calc_overall_scores_str = f'{home_games_won}-{away_games_won}'

    return pd.Series([calc_game_scores_str, calc_overall_scores_str])

print("--- üöÄ Calculating Game and Overall Scores from nested Home and Away game scores üöÄ ---")

# Run calculate_nested_scores over every row, expanding its two results into columns.
new_score_cols = cleaned_matches_df.apply(
    lambda match_row: calculate_nested_scores(
        match_row['homeNestedGameScores'], match_row['awayNestedGameScores']
    ),
    axis=1,
    result_type='expand',
)

# Name the two result columns.
new_score_cols.columns = ['calcNestedGameScores', 'calcNestedOverallScores']

# Append the calculated columns to the working frame.
cleaned_matches_df = pd.concat([cleaned_matches_df, new_score_cols], axis=1)

print("‚úÖ Game scores and overall match scores calculated and added.")


--- üöÄ Calculating Game and Overall Scores from nested Home and Away game scores üöÄ ---


‚úÖ Game scores and overall match scores calculated and added.


In [110]:
# Normalise the raw game scores by removing ",0-0" entries (unplayed games).
for raw_game_col in ("A_rawGameScores", "B_rawGameScores"):
    cleaned_matches_df[raw_game_col] = cleaned_matches_df[raw_game_col].str.replace(',0-0', '')

# clean all scores by removing entries that don't have non-0 digits



def clean_zero_scores(df: pd.DataFrame, score_column: str) -> pd.DataFrame:
    """
    Blank out score strings that contain no digit from 1 to 9.

    Such entries are usually 0-0 fillers. They, and values that are already
    missing, are set to pd.NA. The frame is modified in place and a one-line
    summary of the number of replaced entries is printed.

    Args:
        df: The DataFrame to clean.
        score_column: Name of the column holding the score strings.

    Returns:
        The same DataFrame object, with the column cleaned.
    """
    # True wherever the value has no non-zero digit (missing values count as such).
    zero_only = ~df[score_column].str.contains(r'[1-9]', regex=True, na=False)
    replaced_count = zero_only.sum()

    df.loc[zero_only, score_column] = pd.NA

    print(f"‚úÖ Replaced {replaced_count} zero score strings in '{score_column}' with pd.NA.")

    return df

# Clean every column whose name contains "score" (case-insensitive).
scores_columns = [col for col in cleaned_matches_df.columns if "score" in col.lower()]
for column in scores_columns:
    cleaned_matches_df = clean_zero_scores(cleaned_matches_df, column)
    


‚úÖ Replaced 589 zero score strings in 'A_rawGameScores' with pd.NA.
‚úÖ Replaced 7 zero score strings in 'B_rawGameScores' with pd.NA.
‚úÖ Replaced 6 zero score strings in 'A_rawOverallScore' with pd.NA.
‚úÖ Replaced 6 zero score strings in 'B_rawOverallScore' with pd.NA.
‚úÖ Replaced 694 zero score strings in 'homeNestedGameScores' with pd.NA.
‚úÖ Replaced 723 zero score strings in 'awayNestedGameScores' with pd.NA.
‚úÖ Replaced 596 zero score strings in 'calcNestedGameScores' with pd.NA.
‚úÖ Replaced 596 zero score strings in 'calcNestedOverallScores' with pd.NA.


In [111]:
#### Triangulate and reconcile inconsistent overall scores ####


print("--- üöÄ reconciling overall scores üöÄ ---")


# Remove any leftover non-score text before the columns are compared.
for junk_col in (
    "calcNestedGameScores",
    "homeNestedGameScores",
    "awayNestedGameScores",
    "A_rawOverallScore",
    "B_rawOverallScore",
):
    cleaned_matches_df[junk_col] = cleaned_matches_df[junk_col].str.replace(JUNK_PATTERN_CLEAN, '', regex=True)

# Normalise the three overall-score sources to plain strings, with "" standing for a
# missing value, so they can be compared directly. s1, s2, s3 = scores to be reconciled.
missing_tokens = {'nan': '', 'NaT': '', '<NA>': ''}
s1, s2, s3 = (
    cleaned_matches_df[source_col].astype(str).str.strip().replace(missing_tokens)
    for source_col in ('calcNestedOverallScores', 'B_rawOverallScore', 'A_rawOverallScore')
)

# Temporary frame holding only the three normalised score series.
temp_scores_df = pd.DataFrame({'s1': s1, 's2': s2, 's3': s3}, index=cleaned_matches_df.index)

 
def check_overall_score_consistency(row):
    """
    Check whether all *present* scores in the row are identical.

    Empty strings mark missing scores and are ignored.

    Returns:
        bool: True when the present scores all agree, including the case where
        no score is present at all; False when at least two of them differ.
    """
    # The row only contains scores, so comparing its distinct values is enough.
    valid_scores = set([s for s in row if s != ""])

    # Zero or one distinct value means nothing disagrees. The previous `== 1` check
    # contradicted this documented rule and flagged rows with no scores as inconsistent.
    return len(valid_scores) <= 1

# Run the consistency check on every row of the temporary score frame.
scoreConsistenct = temp_scores_df.apply(check_overall_score_consistency, axis=1)


# DNF matches are a separate case: any row whose dnf value is not False is
# treated as consistent regardless of its scores.
is_dnf = cleaned_matches_df['dnf'] != False 

# Store the flag on the main frame so inconsistent scores can be found later if needed.
cleaned_matches_df['scoreConsistent'] = scoreConsistenct | is_dnf


# compare all scores - if 2 or more agree - that value is used
# otherwise scores are taken in order of the s1,s2,s3
def get_democratic_overall_score(row):
    """
    Pick one overall score from the candidates in the row by majority vote.

    Empty strings mark missing scores and are ignored.

    Returns:
        str: The value that occurs most often. When no value has a majority,
        the first present candidate in row order (s1, then s2, then s3).
        "" when no candidate is present.
    """
    valid_scores = [s for s in row if s != ""]
    if not valid_scores:
        return ""

    counts = {}
    for score in valid_scores:
        counts[score] = counts.get(score, 0) + 1
    top_count = max(counts.values())

    # Walk the candidates in their original order so ties resolve to s1, s2, s3 priority.
    # Series.mode() returned tied values sorted, so its first entry was the
    # lexicographically smallest score rather than the preferred source.
    return next(score for score in valid_scores if counts[score] == top_count)

# Store the reconciled overall score for every row on the main frame.
cleaned_matches_df['reconciledOverallScore'] = temp_scores_df.apply(get_democratic_overall_score, axis=1)
# cleaned_matches_df.drop(columns=["resultOverallScores","overallScores", "calcOverallScores"],inplace=True, errors="ignore")

print("--- ‚úîÔ∏è Scores Triangulated / reconciliated ‚úîÔ∏è ---")

--- üöÄ reconciling overall scores üöÄ ---


--- ‚úîÔ∏è Scores Triangulated / reconciliated ‚úîÔ∏è ---


In [112]:
# Fill missing entries across the whole frame with pd.NA, in place.
cleaned_matches_df.fillna(value=pd.NA,inplace=True)

In [113]:
# Count the non-DNF matches that lack each kind of game score.
not_dnf = cleaned_matches_df["dnf"] == False

missing_A_gameScores = cleaned_matches_df["A_rawGameScores"].isnull()
missing_B_gameScores = cleaned_matches_df["B_rawGameScores"].isnull()
missing_calcNestedGameScores = cleaned_matches_df["calcNestedGameScores"].isnull()

missing_A_gameScores_df = cleaned_matches_df.loc[missing_A_gameScores & not_dnf]
missing_B_gameScores_df = cleaned_matches_df.loc[missing_B_gameScores & not_dnf]
missing_calcGameScores_df = cleaned_matches_df.loc[missing_calcNestedGameScores & not_dnf]

for score_label, missing_rows in (
    ("A_gameScores", missing_A_gameScores_df),
    ("B_gameScores", missing_B_gameScores_df),
    ("calcNestedGameScores", missing_calcGameScores_df),
):
    print(f"Missing {score_label}: {len(missing_rows)}")


Missing A_gameScores: 579
Missing B_gameScores: 0
Missing calcNestedGameScores: 579


In [114]:
# Display the cleaned frame for a visual check.
cleaned_matches_df

Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,venueName,tableNumber,tableName,currentGameNumber,A_rawGameScores,B_rawGameScores,...,serverNext,actionType,duration (unreliable),EventName,EventStartDate,matchDate,calcNestedGameScores,calcNestedOverallScores,scoreConsistent,reconciledOverallScore
18430,2410,TTEMSINGLES-----------RND1000700--,Men Singles,Men's Singles - Preliminary Round 1 - Match 7,Lusail Sports Arena,T06,Table 6,,,"13-11,12-10,11-8",...,,,00:15:15,WTT Contender Doha 2021,2021-02-28 00:00:00+00:00,2021-02-28 07:40:00+00:00,,,True,3-0
18515,2410,TTEMSINGLES-----------RND1002600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 26,Lusail Sports Arena,T04,Table 4,,,"3-11,10-12,4-11",...,,,00:00:21,WTT Contender Doha 2021,2021-02-28 00:00:00+00:00,2021-02-28 07:40:00+00:00,,,True,0-3
18522,2410,TTEMSINGLES-----------RND1004600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 46,Lusail Sports Arena,T03,Table 3,,,"11-9,11-9,11-8",...,,,00:00:00,WTT Contender Doha 2021,2021-02-28 00:00:00+00:00,2021-02-28 07:40:00+00:00,,,True,3-0
18544,2410,TTEMSINGLES-----------RND1001900--,Men Singles,Men's Singles - Preliminary Round 1 - Match 19,Lusail Sports Arena,T05,Table 5,,,"11-6,11-9,15-13",...,,,00:27:19,WTT Contender Doha 2021,2021-02-28 00:00:00+00:00,2021-02-28 07:40:00+00:00,,,True,3-0
18553,2410,TTEMSINGLES-----------RND1001000--,Men Singles,Men's Singles - Preliminary Round 1 - Match 10,Lusail Sports Arena,T07,Table 7,,,"12-14,7-11,6-11",...,,,00:13:28,WTT Contender Doha 2021,2021-02-28 00:00:00+00:00,2021-02-28 07:40:00+00:00,,,True,0-3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22262,3065,TTEMSINGLES-----------R64-002300----------,Men's Singles,Men's Singles - Round of 64 - Match 23,Deutsches Tischtennis- Zentrum,T06,Table 6,4.0,"13-15,11-4,3-11,8-11","13-15,11-4,3-11,8-11",...,114718,OF,00:29:56,,NaT,2025-11-25 12:20:20.057000+00:00,"13-15,11-4,3-11,8-11",1-3,True,1-3
22256,3065,TTEMSINGLES-----------R64-001500----------,Men's Singles,Men's Singles - Round of 64 - Match 15,Deutsches Tischtennis- Zentrum,T02,Table 2,4.0,"8-11,11-8,2-11,10-12","8-11,11-8,2-11,10-12",...,120845,OF,00:28:58,,NaT,2025-11-25 12:33:16.070000+00:00,"8-11,11-8,2-11,10-12",1-3,True,1-3
22285,3065,TTEMSINGLES-----------R64-001800----------,Men's Singles,Men's Singles - Round of 64 - Match 18,Deutsches Tischtennis- Zentrum,T03,Table 3,3.0,"11-7,11-6,13-11","11-7,11-6,13-11",...,134246,OF,00:25:56,,NaT,2025-11-25 11:55:00+00:00,"11-7,11-6,13-11",3-0,True,3-0
22292,3065,TTEMSINGLES-----------R64-000700----------,Men's Singles,Men's Singles - Round of 64 - Match 7,Deutsches Tischtennis- Zentrum,T05,Table 5,3.0,"9-11,3-11,8-11","9-11,3-11,8-11",...,124192,OF,00:21:33,,NaT,2025-11-25 11:55:00+00:00,"9-11,3-11,8-11",0-3,True,0-3


In [115]:
def get_overall_score_winner(row):
    """
    Determine the match winner from the row's 'reconciledOverallScore'.

    The score is expected in 'home-away' form, e.g. '3-1'.

    Returns:
        'home', 'away' or 'tie'; pd.NA when the score is missing or cannot be
        parsed as two integers separated by a hyphen.
    """
    try:
        scores = row["reconciledOverallScore"]
        score_split = scores.split("-")
        home_score = int(score_split[0])
        away_score = int(score_split[1])
    # Only the failures a bad or missing score can cause are swallowed. A bare `except:`
    # also hid KeyboardInterrupt / SystemExit, making a long .apply() impossible to interrupt.
    except (KeyError, AttributeError, ValueError, IndexError, TypeError):
        return pd.NA

    if home_score > away_score:
        return "home"
    elif away_score > home_score:
        return "away"
    else:
        return "tie"
        


# Derive the winner ('home' / 'away' / 'tie' / NA) for every row from its reconciled overall score.
cleaned_matches_df["reconciledOverallScoreWinner"] = cleaned_matches_df.apply(get_overall_score_winner, axis=1)


In [116]:
### Triangulate and reconcile inconsistent game scores ####


print("--- üöÄ reconciling game scores s üöÄ ---")





# Normalise the three game-score sources to plain strings, with "" standing for a
# missing value, so they can be compared directly.
missing_tokens = {'nan': '', 'NaT': '', '<NA>': ''}
s1, s2, s3 = (
    cleaned_matches_df[source_col].astype(str).str.strip().replace(missing_tokens)
    for source_col in ('calcNestedGameScores', 'B_rawGameScores', 'A_rawGameScores')
)

# Temporary frame holding only the three normalised score series.
temp_scores_df = pd.DataFrame({'s1': s1, 's2': s2, 's3': s3}, index=cleaned_matches_df.index)

 
def check_score_consistency(row):
    """
    Check whether all available scores in the row are identical.

    Empty strings mark missing scores and are ignored.

    Returns:
        bool: True when the available scores all agree, including the case where
        no score is available at all; False when at least two of them differ.
    """
    # The row only contains scores, so comparing its distinct values is enough.
    valid_scores = set([s for s in row if s != ""])

    # Zero or one distinct value means nothing disagrees. The previous `== 1` check
    # contradicted this documented rule and flagged rows with no scores as inconsistent.
    return len(valid_scores) <= 1

# Run the consistency check on every row of the temporary score frame.
scoreConsistenct = temp_scores_df.apply(check_score_consistency, axis=1)


# DNF matches are a separate case: any row whose dnf value is not False is
# treated as consistent regardless of its scores.
is_dnf = cleaned_matches_df['dnf'] != False 

# Store the flag on the main frame so inconsistent game scores can be found later if needed.
cleaned_matches_df['gameScoreConsistent'] = scoreConsistenct | is_dnf


# compare all scores - if 2 or more agree - that value is used
# otherwise scores are taken in order of the s1,s2,s3
def get_democratic_game_score(row):
    """
    Pick one game-score string from the candidates in the row by majority vote.

    Empty strings mark missing scores and are ignored.

    Returns:
        str: The value that occurs most often. When no value has a majority,
        the first available candidate in row order (s1, then s2, then s3).
        "" when no candidate is available.
    """
    valid_scores = [s for s in row if s != ""]
    if not valid_scores:
        return ""

    counts = {}
    for score in valid_scores:
        counts[score] = counts.get(score, 0) + 1
    top_count = max(counts.values())

    # Walk the candidates in their original order so ties resolve to s1, s2, s3 priority.
    # Series.mode() returned tied values sorted, so its first entry was the
    # lexicographically smallest string rather than the preferred source.
    return next(score for score in valid_scores if counts[score] == top_count)

# Store the reconciled game-score string for every row on the main frame.
cleaned_matches_df['reconciledGameScore'] = temp_scores_df.apply(get_democratic_game_score, axis=1)
# cleaned_matches_df.drop(columns=["resultOverallScores","overallScores", "calcOverallScores"],inplace=True, errors="ignore")

print("--- ‚úîÔ∏è Scores Triangulated / reconciliated ‚úîÔ∏è ---")

--- üöÄ reconciling game scores s üöÄ ---
--- ‚úîÔ∏è Scores Triangulated / reconciliated ‚úîÔ∏è ---


In [117]:
def check_game_score_validity(score_str: str) -> str:
    """
    Check whether a single game score (e.g. '11-8' or '13-11') is a possible final score.

    A game is won at 11 points with a lead of at least 2. Once the winner has
    more than 11 points the game must have gone through 10-10, so the lead is
    exactly 2.

    Returns one of:
        'Valid'             - a possible final score
        'Incomplete'        - 0-0, the game was not played
        'Invalid_Format'    - not two integers separated by one hyphen
        'Invalid_Too_Low'   - neither side reached 11
        'Invalid_Too_Close' - the lead is smaller than 2
        'Invalid_Margin'    - the winner has more than 11 but the lead is not exactly 2
    """
    try:
        home, away = map(int, score_str.split('-'))
    except (ValueError, AttributeError):
        # AttributeError covers non-string input such as None or NaN.
        return "Invalid_Format"

    if home == 0 and away == 0:
        return "Incomplete"

    score_diff = abs(home - away)
    winner_score = max(home, away)

    if winner_score < 11:
        return "Invalid_Too_Low"
    if score_diff < 2:
        return "Invalid_Too_Close"
    if winner_score > 11 and score_diff != 2:
        # e.g. 15-3: play would have stopped at 11-3.
        return "Invalid_Margin"
    return "Valid"
def calculate_reconciled_game_scores_winner(row) -> pd.Series:
    """
    Work out the match winner from the per-game scores of one match row.

    Args:
        row: Row exposing ``.get``; its "reconciledGameScore" entry is a
            comma-separated list of 'home-away' game scores
            (e.g. '11-8,11-9,10-12').

    Returns:
        pd.Series with two entries:
        'calculatedGameScoreWinner' - 'home', 'away' or 'tie' depending on who
            won more games; 'Error_Parsing' if a game score could not be read
            as two integers; pd.NA when there is no score string (missing,
            empty, or not a string).
        'gameScoreFlags' - '; '-joined '<score>:<status>' entries for every
            game that check_game_score_validity did not rate 'Valid'
            (empty string when there are none).
    """
    score_string = row.get("reconciledGameScore")
    game_status_flags = []

    # NaN / pd.NA / None would otherwise slip past a plain truthiness test
    # (NaN is truthy, pd.NA refuses to be tested) and fail on .split below.
    if not isinstance(score_string, str) or not score_string:
        winner_result = pd.NA
    else:
        winner_result = None
        home_tally = 0
        away_tally = 0

        for raw_pair in score_string.split(","):
            game_score_pair = raw_pair.strip()
            if not game_score_pair:
                continue

            # Diagnostic only: record every game that breaks the scoring rules.
            game_status = check_game_score_validity(game_score_pair)
            if game_status != "Valid":
                game_status_flags.append(f"{game_score_pair}:{game_status}")

            try:
                home_score, away_score = map(int, game_score_pair.split('-'))
            except ValueError:
                # An unreadable game makes the tally meaningless, so stop here.
                winner_result = "Error_Parsing"
                break

            # A drawn game counts for neither side.
            if home_score > away_score:
                home_tally += 1
            elif away_score > home_score:
                away_tally += 1

        # Decide the match only when every game could be parsed.
        if winner_result != "Error_Parsing":
            if home_tally > away_tally:
                winner_result = "home"
            elif away_tally > home_tally:
                winner_result = "away"
            else:
                winner_result = "tie"

    return pd.Series({
        'calculatedGameScoreWinner': winner_result,
        'gameScoreFlags': "; ".join(game_status_flags)
    })

# Derive the game-score winner and the diagnostic flags for every match.
new_cols_df = cleaned_matches_df.apply(calculate_reconciled_game_scores_winner, axis=1)

# Attach the derived columns to the original frame. Copies left behind by an
# earlier run of this cell are dropped first; without that, re-running the cell
# would append a second pair of columns with the same labels.
cleaned_matches_df = pd.concat(
    [
        cleaned_matches_df.drop(
            columns=["calculatedGameScoreWinner", "gameScoreFlags"],
            errors="ignore",
        ),
        new_cols_df,
    ],
    axis=1,
)



In [118]:
# A match is reported as a score error when one of its games carries a validity
# flag, or when the winner derived from the game scores disagrees with the winner
# derived from the overall score. DNF matches are left out of the report.
flagged_matches = cleaned_matches_df["gameScoreFlags"].ne("")
not_dnf = cleaned_matches_df["dnf"].eq(False)
winner_mismatch = cleaned_matches_df["calculatedGameScoreWinner"].ne(
    cleaned_matches_df["reconciledOverallScoreWinner"]
)
score_errors_df = cleaned_matches_df.loc[(flagged_matches | winner_mismatch) & not_dnf]

In [119]:
# Export the suspect rows (row index included) for manual review.
# NOTE(review): the following cell reads score_errors_AMENDED.csv from the same folder;
# presumably that file is a hand-corrected copy of this export - confirm.
score_errors_df.to_csv("../Data/Processed/Matches/score_errors.csv")


In [120]:
# Load the amended rows; the first CSV column is used as the row index.
score_errors_fixed_df = pd.read_csv("../Data/Processed/Matches/score_errors_AMENDED.csv", index_col=0)
# Columns used to line up amended rows with the main frame.
STABLE_KEY = ['eventId', 'documentCode']

# Index both frames by the key so update() matches rows by label, not by position.
cleaned_matches_df = cleaned_matches_df.set_index(STABLE_KEY)
score_errors_fixed_df = score_errors_fixed_df.set_index(STABLE_KEY)
# DataFrame.update works in place and, by default, only copies non-NA values across.
cleaned_matches_df.update(score_errors_fixed_df)
# Turn the key back into ordinary columns; this also replaces the row index with a fresh 0..n-1 range.
cleaned_matches_df = cleaned_matches_df.reset_index()



In [121]:
# NOTE(review): winner_mismatch was computed before the amended scores were merged in,
# so this shows the rows that were mismatched at that point, with their current values.
# It also assumes the row labels still line up after the reset_index above - confirm.
cleaned_matches_df[winner_mismatch]

  cleaned_matches_df[winner_mismatch]


Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,venueName,tableNumber,tableName,currentGameNumber,A_rawGameScores,B_rawGameScores,...,matchDate,calcNestedGameScores,calcNestedOverallScores,scoreConsistent,reconciledOverallScore,reconciledOverallScoreWinner,gameScoreConsistent,reconciledGameScore,calculatedGameScoreWinner,gameScoreFlags
1429,2503,TTEMSINGLES-----------RND4000200----------,Men's Singles,Men's Singles - Qualifying Round 4 - Match 2,Sports Hall Tri Lilije,T02,Table 2,5.0,"9-11,11-6,11-7,1-11,11-9","9-11,11-6,11-7,1-11,11-9",...,2021-11-03 13:32:11+00:00,"9-11,11-6,11-7,1-11,11-9",3-2,True,3-2,home,True,"9-11,11-6,11-7,1-11,11-9",home,
3851,2539,TTEMSINGLES-----------RND1002200----------,Men's Singles,Men's Singles - Qualifying Round 1 - Match 22,Dom Sportova,T05,Table 5,4.0,"11-6,11-8,9-11,12-10","11-6,11-8,9-11,12-10",...,2022-06-13 09:18:06+00:00,"11-6,11-8,9-11,12-10",3-1,True,3-1,home,True,"11-6,11-8,9-11,12-10",home,
4908,2568,TTEWSINGLES-----------R32-000300----------,Women's Singles,Women's Singles - Round of 32 - Match 3,Salle Omnisport de Rades,T02,Table 2,3.0,"16-14,11-7,11-3","16-14,11-7,11-3",...,2022-08-03 14:29:32+00:00,"16-14,11-7,11-3",3-0,True,3-0,home,True,"16-14,11-7,11-3",home,
5555,2615,TTEWSINGLES-----------GP02000600----------,Women's Singles,Women's Singles - Group 2 - Match 6,Eastland Shopping Centre,T04,Table 4,3.0,"11-5,11-9,11-9","11-5,11-9,11-9",...,2022-09-08 04:39:45+00:00,"11-5,11-9,11-9",3-0,True,3-0,home,True,"11-5,11-9,11-9",home,
9679,2722,TTEMSINGLES-----------RND1001100----------,Men's Singles,Men's Singles - Qualifying Round 1 - Match 11,Arena Carioca 1,T03,Table 3,5.0,"9-11,11-9,13-11,1-11,11-9","9-11,11-9,13-11,1-11,11-9",...,2023-08-07 19:05:00+00:00,"9-11,11-9,13-11,1-11,11-9",3-2,True,3-2,home,True,"9-11,11-9,13-11,1-11,11-9",home,
11407,2733,TTEWSINGLES-----------R32-000400----------,Women's Singles,Women's Singles - Round of 32 - Match 4,Sport Centre Otocec,T04,Table 4,5.0,"11-8,9-11,6-11,11-9,8-11","11-8,9-11,6-11,11-9,8-11",...,2023-11-03 09:35:00+00:00,"11-8,9-11,6-11,11-9,8-11",2-3,True,2-3,away,True,"11-8,9-11,6-11,11-9,8-11",away,
11477,2794,TTEMSINGLES-----------R32-000900----------,Men's Singles,Men's Singles - Round of 32 - Match 9,Taiyuan Binhe Sports Center,T03,Table 3,3.0,"3-11,8-11,10-12","3-11,8-11,10-12",...,2023-11-09 02:00:00+00:00,"3-11,8-11,10-12",0-3,True,0-3,away,True,"3-11,8-11,10-12",away,
13271,2894,TTEWSINGLES-----------8FNL000200----------,Women's Singles,Women's Singles - Round of 16 - Match 2,Al Kawthar Secondary School,T02,Table 2,3.0,"4-11,4-11,10-12","4-11,4-11,10-12",...,2024-03-23 09:00:00+00:00,"4-11,4-11,10-12",0-3,True,0-3,away,True,"4-11,4-11,10-12",away,
13569,2882,TTEMSINGLES-----------RND1000200----------,Men's Singles,Men's Singles - Qualifying Round 1 - Match 2,ARENA VARA?DIN,T01,Table 1,5.0,"11-9,8-11,10-12,11-6,11-4","11-9,8-11,10-12,11-6,11-4",...,2024-04-02 13:00:00+00:00,"11-9,8-11,10-12,11-6,11-4",3-2,True,3-2,home,True,"11-9,8-11,10-12,11-6,11-4",home,
18911,3024,TTEMSINGLES-----------RND1003000----------,Men's Singles,Men's Singles - Qualifying Round 1 - Match 30,Sport Centre Otocec,T01,Table 1,3.0,"11-8,11-5,11-5","11-8,11-5,11-5",...,2025-03-25 17:10:00+00:00,"11-8,11-5,11-5",3-0,True,3-0,home,True,"11-8,11-5,11-5",home,


In [122]:
# Throw away the derived winner / flag columns and rebuild them from the
# (possibly amended) scores, then count how many problem rows are still left.
cols_to_remake = ["gameScoreFlags", "calculatedGameScoreWinner","reconciledOverallScoreWinner"]
cleaned_matches_df.drop(columns=cols_to_remake, inplace=True, errors="ignore")

cleaned_matches_df["reconciledOverallScoreWinner"] = cleaned_matches_df.apply(
    get_overall_score_winner, axis=1
)
new_cols_df = cleaned_matches_df.apply(calculate_reconciled_game_scores_winner, axis=1)
cleaned_matches_df = pd.concat([cleaned_matches_df, new_cols_df], axis=1)

# Same error definition as before: flagged game OR winner disagreement, DNF rows excluded.
flagged_matches = cleaned_matches_df["gameScoreFlags"].ne("")
not_dnf = cleaned_matches_df["dnf"].eq(False)
winner_mismatch = cleaned_matches_df["calculatedGameScoreWinner"].ne(
    cleaned_matches_df["reconciledOverallScoreWinner"]
)
score_errors_persistent_df = cleaned_matches_df.loc[(flagged_matches | winner_mismatch) & not_dnf]

# The remaining count is always printed; the success banner only when it is zero.
if len(score_errors_persistent_df) == 0:
    print("‚úÖ All Errors Resolved")
print(f"{len(score_errors_persistent_df)} Errors Remaining")


‚úÖ All Errors Resolved
0 Errors Remaining


In [123]:
def get_best_of(row):
    """
    Infer the match format ("best of N games") from the reconciled overall score.

    Args:
        row: Mapping-like row whose "reconciledOverallScore" entry holds the
            games won by each side as 'home-away' (e.g. '4-2').

    Returns:
        7 when the higher tally is 4 games, 5 when it is 3 games, 0 for any
        other tally, and pd.NA when the score is missing or cannot be parsed.
    """
    try:
        scores = row["reconciledOverallScore"].split("-")
        max_games_won = max(int(scores[0]), int(scores[1]))
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Only the expected "missing / malformed score" failures are treated as
        # unknown. A bare `except:` would also hide KeyboardInterrupt and
        # genuine programming errors.
        return pd.NA

    # Games needed to win -> match length; anything else is reported as 0.
    return {4: 7, 3: 5}.get(max_games_won, 0)
    

cleaned_matches_df["calcBestOf"] = cleaned_matches_df.apply(get_best_of, axis=1)

# pd.to_numeric returns a new Series and leaves its input untouched, so the
# converted values have to be kept and compared; calling it without using the
# result would leave the comparison running on the unconverted columns.
# The stored columns themselves are deliberately not modified here.
calc_best_of_numeric = pd.to_numeric(cleaned_matches_df["calcBestOf"], errors="coerce")
best_of_numeric = pd.to_numeric(cleaned_matches_df["bestOf"], errors="coerce")

# Missing values never compare equal, so rows without a bestOf also count as mismatches.
best_of_mismatch = calc_best_of_numeric != best_of_numeric
bestOf_missing = cleaned_matches_df["bestOf"].isnull()



# Findings from inspecting the mismatches:
# - The only mismatches are the Paris Olympics 2024 (data error) and the Macau World Cup
#   2024 and 2025 (special cases).
# - calcBestOf appears trustworthy and is taken as the source.
# - Macau group games, however, shall be set to best of 0.
# - Macau World Cup event ids: 2024 = 2937, 2025 = 3109.
# - Paris Olympics event id = 2603.

In [124]:
# Start from the score-derived format, then overwrite the known special cases below.
cleaned_matches_df["trueBestOf"]= cleaned_matches_df["calcBestOf"]





# Event ids 2937 and 3109 are hard-coded; the notes in this notebook identify them as
# the Macau World Cup 2024 and 2025.
world_cup_filter = cleaned_matches_df["eventId"].isin([2937,3109])
# Rows whose documentCode contains "GP" - presumably the group-stage matches; confirm.
world_cup_group_filter = cleaned_matches_df["documentCode"].str.contains("GP")
# NOTE(review): not_dnf is not defined in this cell; it is whatever mask an earlier cell
# left behind, so it must still line up with the current rows - confirm.
cleaned_matches_df.loc[world_cup_filter & world_cup_group_filter & not_dnf, "trueBestOf"] = 0

# Event id 2603 is hard-coded; the notes in this notebook identify it as the Paris Olympics.
paris_olympics_filter = cleaned_matches_df["eventId"] == 2603
paris_olympics_singles_filter = cleaned_matches_df["subEventName"].str.contains("Singles")
# Non-DNF singles rows of that event are forced to best of 7.
cleaned_matches_df.loc[paris_olympics_filter & paris_olympics_singles_filter & not_dnf, "trueBestOf"] = int(7)



In [125]:
# Make both columns numeric so they can be compared value-for-value.
cleaned_matches_df["trueBestOf"] = pd.to_numeric(cleaned_matches_df["trueBestOf"], errors="coerce")
cleaned_matches_df["bestOf"] = pd.to_numeric(cleaned_matches_df["bestOf"], errors="coerce")

# Missing values never compare equal, so rows without a bestOf show up as mismatches too.
best_of_mismatch = cleaned_matches_df["bestOf"] != cleaned_matches_df["trueBestOf"]

# Despite its name this mask marks rows where trueBestOf is missing. It is built
# on the full frame and the DNF exclusion is applied explicitly below. Building
# it from a pre-filtered frame would give a shorter Series, and the final mask
# would then depend on implicit index alignment to drop the DNF rows.
bestOf_missing = cleaned_matches_df["trueBestOf"].isnull()

# Non-DNF rows outside the World Cup events where the recorded and derived formats differ.
cleaned_matches_df.loc[
    best_of_mismatch & ~bestOf_missing & not_dnf & ~world_cup_filter,
    ["bestOf", "trueBestOf", "EventName"],
]


Unnamed: 0,bestOf,trueBestOf,EventName
60,,5.0,WTT Contender Doha 2021
415,,7.0,Tokyo 2020 Olympic Games
416,,7.0,Tokyo 2020 Olympic Games
417,,7.0,Tokyo 2020 Olympic Games
418,,7.0,Tokyo 2020 Olympic Games
...,...,...,...
15898,5.0,7.0,Paris 2024 Olympic Games
15899,5.0,7.0,Paris 2024 Olympic Games
15900,5.0,7.0,Paris 2024 Olympic Games
15901,5.0,7.0,Paris 2024 Olympic Games


In [126]:
# Cross-fill the two winner columns: first the overall-score winner borrows from
# the game-score winner where it is empty, then the other way round.
for target_col, source_col in (
    ('reconciledOverallScoreWinner', 'calculatedGameScoreWinner'),
    ('calculatedGameScoreWinner', 'reconciledOverallScoreWinner'),
):
    cleaned_matches_df[target_col] = cleaned_matches_df[target_col].fillna(
        cleaned_matches_df[source_col]
    )

# Rows where the two winners still disagree after the cross-fill.
winner_mismatch = cleaned_matches_df["reconciledOverallScoreWinner"].ne(
    cleaned_matches_df["calculatedGameScoreWinner"]
)
winner_mismatch_df = cleaned_matches_df.loc[winner_mismatch]
num_mismatches = winner_mismatch_df.shape[0]

if num_mismatches == 0:
    print("‚úÖ No mismatches between reconciledOverallScoreWinner and calculatedGameScoreWinner")
else:
    print(f"‚ùå {num_mismatches} matches have a mismatch between reconciledOverallScoreWinner and calculatedGameScoreWinner")


‚ùå 8 matches have a mismatch between reconciledOverallScoreWinner and calculatedGameScoreWinner


In [132]:
# Last step: repair the player ids used for the Tokyo Olympic Games.
# Those ids are 7 digits long rather than the standard WTT / ITTF ids,
# and the player_details api returns no data for them.

# One lookup row per distinct (id, name, country, event), taken from each side of the match.
home_players_df = (
    cleaned_matches_df[["homeCompetitorId", "homePlayer", "homeCompetitorOrg", "EventName"]]
    .drop_duplicates()
    .rename(columns={
        "homeCompetitorId": "playerId",
        "homePlayer": "playerName",
        "homeCompetitorOrg": "playerCountry",
    })
)
away_players_df = (
    cleaned_matches_df[["awayCompetitorId", "awayPlayer", "awayCompetitorOrg", "EventName"]]
    .drop_duplicates()
    .rename(columns={
        "awayCompetitorId": "playerId",
        "awayPlayer": "playerName",
        "awayCompetitorOrg": "playerCountry",
    })
)

# Stack both sides and remove players that appear on both.
players_lookup_df = pd.concat([home_players_df, away_players_df], axis=0).drop_duplicates()

# Split the lookup into Tokyo rows (ids to repair) and everything else (reference ids).
players_lookup_tokyo_filter = players_lookup_df["EventName"].str.contains("Tokyo", case=False, na=False)
tokyo_players_lookup_df = players_lookup_df.loc[players_lookup_tokyo_filter]
non_tokyo_players_lookup_df = players_lookup_df.loc[~players_lookup_tokyo_filter]

# Candidate names the Tokyo players are matched against.
non_tokyo_players_names = non_tokyo_players_lookup_df["playerName"].unique()

def get_fuzzy_name_matches(row, non_tokyo_players_names_list, non_tokyo_df, score_threshold=60, high_score_threshold=90):
    """
    Propose a standard player id for one Tokyo lookup row via fuzzy name matching.

    The row's player name is compared with every name in
    ``non_tokyo_players_names_list`` using ``fuzz.token_sort_ratio``; the best
    candidate is then looked up in ``non_tokyo_df`` (first row with that name)
    to obtain its id and country.

    Args:
        row: Lookup row with "playerId", "playerName" and "playerCountry".
        non_tokyo_players_names_list: Candidate names to match against.
        non_tokyo_df: Frame with "playerName", "playerId" and "playerCountry"
            for the candidates.
        score_threshold: Minimum score accepted when the countries also match.
        high_score_threshold: Minimum score accepted regardless of country.

    Returns:
        pd.Series with tokyo_id, tokyo_name, tokyo_country, matched_id,
        matched_name, matched_country, country_match and match_score.
        matched_id is pd.NA unless the match is accepted by the thresholds
        above; when no candidate is returned at all, the matched_* fields are
        None, country_match is False and match_score is 0.
        Accepted matches are still meant to be checked by hand.
    """
    tokyo_id = row["playerId"]
    tokyo_name = row["playerName"]
    tokyo_country = row["playerCountry"]

    # Defaults describe the "no candidate found" outcome.
    record = {
        'tokyo_id': tokyo_id,
        'tokyo_name': tokyo_name,
        'tokyo_country': tokyo_country,
        'matched_id': pd.NA,
        'matched_name': None,
        'matched_country': None,
        'country_match': False,
        'match_score': 0,
    }

    candidate = process.extractOne(
        tokyo_name,
        non_tokyo_players_names_list,
        scorer=fuzz.token_sort_ratio
    )
    if not candidate:
        return pd.Series(record)

    candidate_name, candidate_score = candidate[0], candidate[1]

    # First reference row carrying the matched name supplies the id and country.
    reference_row = non_tokyo_df[non_tokyo_df["playerName"] == candidate_name].iloc[0]
    candidate_id = reference_row["playerId"]
    candidate_country = reference_row["playerCountry"]
    same_country = (tokyo_country == candidate_country)

    record['matched_name'] = candidate_name
    record['matched_country'] = candidate_country
    record['country_match'] = same_country
    record['match_score'] = candidate_score

    # Accept a high score outright, or a medium score backed by a matching country.
    accepted = (candidate_score >= high_score_threshold) or (candidate_score >= score_threshold and same_country)
    if accepted:
        record['matched_id'] = candidate_id

    return pd.Series(record)



# Propose a standard id for every Tokyo player.
fuzzy_matches_df = tokyo_players_lookup_df.apply(
    get_fuzzy_name_matches, 
    args=(non_tokyo_players_names, non_tokyo_players_lookup_df), 
    axis=1)

# Lowest scores first, so the least certain proposals are at the top for manual review.
fuzzy_matches_df = fuzzy_matches_df.sort_values(by="match_score", axis=0, ascending=True)
fuzzy_matches_df["playerId"] = fuzzy_matches_df["matched_id"]
fuzzy_matches_df.to_csv("../Data/Processed/Matches/fuzzy_matches.csv")

# Read the fixed mapping with both id columns as text. Left to type inference,
# a single blank playerId turns the whole column into floats, so real ids would
# become strings such as "105472.0" and the blank itself would become "nan".
# NOTE(review): fuzzy_matches_fixed.csv is presumably a hand-corrected copy of
# the export above - confirm.
fixed_ids_df = pd.read_csv(
    "../Data/Processed/Matches/fuzzy_matches_fixed.csv",
    dtype={"tokyo_id": str, "playerId": str},
)

# Rows without both ids cannot contribute a replacement.
resolved_ids_df = fixed_ids_df.dropna(subset=["tokyo_id", "playerId"])

# Tokyo id -> standard id, both as strings.
fixed_id_map = pd.Series(
    resolved_ids_df['playerId'].astype(str).values,
    index=resolved_ids_df['tokyo_id'].astype(str)
)

# Swap the Tokyo ids for the standard ids on both sides of every match.
# NOTE(review): the map keys are strings, so this only has an effect if the
# competitor id columns hold strings as well - confirm.
cleaned_matches_df["homeCompetitorId"] = cleaned_matches_df["homeCompetitorId"].replace(fixed_id_map)
cleaned_matches_df["awayCompetitorId"] = cleaned_matches_df["awayCompetitorId"].replace(fixed_id_map)



In [133]:
## In the Tokyo Olympics 2020 the American woman Juan Liu had her ID entered wrongly.
# The id used for her matches is the id of the American man Dan Liu.
# Dan Liu has ID: 121226
# Juan Liu has ID: 105472

# na=False keeps the mask strictly boolean even if an EventName is missing.
tokyo_filter = cleaned_matches_df["EventName"].str.contains("Tokyo", na=False)
juan_liu_filter_tokyo_id_home = cleaned_matches_df["homeCompetitorId"] == "121226"
juan_liu_filter_tokyo_id_away = cleaned_matches_df["awayCompetitorId"] == "121226"


# The ids are matched as strings above, so the corrected id is written as a
# string too; assigning an int here would leave mixed str/int values in the column.
cleaned_matches_df.loc[tokyo_filter & juan_liu_filter_tokyo_id_home, "homeCompetitorId"] = "105472"
cleaned_matches_df.loc[tokyo_filter & juan_liu_filter_tokyo_id_away, "awayCompetitorId"] = "105472"


In [134]:
from datetime import date


# Prefix the output file with today's date as yyyymmdd.
date_string = date.today().strftime("%Y%m%d")
file_name = f"{date_string}_cleaned_matches.csv"
file_path = os.path.join(CLEANED_MATCHES_DIR, file_name)

# index=False: the row index is not written to the file.
cleaned_matches_df.to_csv(file_path, index=False)


In [135]:
# Display the final frame as a visual check of the saved result.
cleaned_matches_df

Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,venueName,tableNumber,tableName,currentGameNumber,A_rawGameScores,B_rawGameScores,...,calcNestedOverallScores,scoreConsistent,reconciledOverallScore,gameScoreConsistent,reconciledGameScore,reconciledOverallScoreWinner (more reliable),calculatedGameScoreWinner,gameScoreFlags,calcBestOf,trueBestOf
0,2410,TTEMSINGLES-----------RND1000700--,Men Singles,Men's Singles - Preliminary Round 1 - Match 7,Lusail Sports Arena,T06,Table 6,,,"13-11,12-10,11-8",...,,True,3-0,True,"13-11,12-10,11-8",home,home,,5,5.0
1,2410,TTEMSINGLES-----------RND1002600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 26,Lusail Sports Arena,T04,Table 4,,,"3-11,10-12,4-11",...,,True,0-3,True,"3-11,10-12,4-11",away,away,,5,5.0
2,2410,TTEMSINGLES-----------RND1004600--,Men Singles,Men's Singles - Preliminary Round 1 - Match 46,Lusail Sports Arena,T03,Table 3,,,"11-9,11-9,11-8",...,,True,3-0,True,"11-9,11-9,11-8",home,home,,5,5.0
3,2410,TTEMSINGLES-----------RND1001900--,Men Singles,Men's Singles - Preliminary Round 1 - Match 19,Lusail Sports Arena,T05,Table 5,,,"11-6,11-9,15-13",...,,True,3-0,True,"11-6,11-9,15-13",home,home,,5,5.0
4,2410,TTEMSINGLES-----------RND1001000--,Men Singles,Men's Singles - Preliminary Round 1 - Match 10,Lusail Sports Arena,T07,Table 7,,,"12-14,7-11,6-11",...,,True,0-3,True,"12-14,7-11,6-11",away,away,,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23940,3065,TTEMSINGLES-----------R64-002300----------,Men's Singles,Men's Singles - Round of 64 - Match 23,Deutsches Tischtennis- Zentrum,T06,Table 6,4.0,"13-15,11-4,3-11,8-11","13-15,11-4,3-11,8-11",...,1-3,True,1-3,True,"13-15,11-4,3-11,8-11",away,away,,5,5.0
23941,3065,TTEMSINGLES-----------R64-001500----------,Men's Singles,Men's Singles - Round of 64 - Match 15,Deutsches Tischtennis- Zentrum,T02,Table 2,4.0,"8-11,11-8,2-11,10-12","8-11,11-8,2-11,10-12",...,1-3,True,1-3,True,"8-11,11-8,2-11,10-12",away,away,,5,5.0
23942,3065,TTEMSINGLES-----------R64-001800----------,Men's Singles,Men's Singles - Round of 64 - Match 18,Deutsches Tischtennis- Zentrum,T03,Table 3,3.0,"11-7,11-6,13-11","11-7,11-6,13-11",...,3-0,True,3-0,True,"11-7,11-6,13-11",home,home,,5,5.0
23943,3065,TTEMSINGLES-----------R64-000700----------,Men's Singles,Men's Singles - Round of 64 - Match 7,Deutsches Tischtennis- Zentrum,T05,Table 5,3.0,"9-11,3-11,8-11","9-11,3-11,8-11",...,0-3,True,0-3,True,"9-11,3-11,8-11",away,away,,5,5.0
