In [None]:
import glob 
import pandas as pd
import pandas as pd
import numpy as np
import os 
import json

from stat import mode 

In [516]:
# Input locations (relative paths).
RAW_MATCH_DETAILS_DIR = "../Data/Raw/Match_details"
EVENTS_FILE = "../Data/Processed/Events/shortlist_events.csv"

# Keywords searched for in the overall-score columns to flag matches that
# did not finish normally (presumably walkover / injury / retired /
# disqualified / did not start - confirm against the data source).
DNF_KEYWORDS = ["WO", "INJ", "RET", "DSQ", "DNS"]
DNF_PATTERN = "|".join(DNF_KEYWORDS)
# Same alternation wrapped in a capture group so it can be used with str.extract.
DNF_PATTERN_CAPTURE = f"({DNF_PATTERN})"

# Columns that are not currently of interest for the project.
# resultStatus is "official" for all entries, so there is no need to keep it.
DROP_COLUMNS_START = ["resultStatus", "playByPlaySequenceNumber"]


In [517]:
# Load every match-details JSON file found in RAW_MATCH_DETAILS_DIR and build
# all_matches_df, which later cells filter down.
# Many of these records are match-ups between teams rather than players and
# need to be filtered out; singles matches that come from team matches and
# events can be kept.

# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps
# the row order (and therefore the index labels) of all_matches_df identical
# from one run to the next.
all_match_details_files = sorted(glob.glob(os.path.join(RAW_MATCH_DETAILS_DIR, "*.json")))

all_matches = []
for file in all_match_details_files:
    with open(file, 'r', encoding='utf-8') as f:
        matches_data = json.load(f)

    # assumes each file holds a JSON list of match records - TODO confirm
    all_matches.extend(matches_data)

# One row per parsed match record.
all_matches_df = pd.DataFrame(all_matches)







In [518]:
# Rebuild the frame from the already-parsed records so this cell can be re-run
# without repeating the file parsing and extraction.
all_matches_df = pd.DataFrame(all_matches)

print(f"cleaned_matches_df before  dropna and dropping irrelevant columns: {len(all_matches_df)} with {len(all_matches_df.columns)} cols")

# Derive cleaned_matches_df as a NEW frame: drop fully-empty rows and columns,
# then the columns that are not of interest. The previous version assigned
# all_matches_df to cleaned_matches_df and used inplace=True, which mutated
# all_matches_df through the shared reference.
cleaned_matches_df = (
    all_matches_df
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    .drop(columns=DROP_COLUMNS_START)
)
print(f"cleaned_matches_df after dropna and dropping irrelevant columns: {len(cleaned_matches_df)} with {len(cleaned_matches_df.columns)} cols")

# Pull the first DNF keyword (if any) out of each of the two overall-score columns.
dnf_from_overallScores = cleaned_matches_df["overallScores"].str.extract(DNF_PATTERN_CAPTURE, expand=False).str.strip()
dnf_from_resultoverallScores = cleaned_matches_df["resultOverallScores"].str.extract(DNF_PATTERN_CAPTURE, expand=False).str.strip()

# 'overallScores' takes precedence; 'resultOverallScores' only fills the gaps.
cleaned_matches_df["dnf"] = dnf_from_overallScores.fillna(dnf_from_resultoverallScores)
print("âœ… 'dnf' column updated by sequentially filling missing values from 'overallScores' THEN 'resultOverallScores'.")




cleaned_matches_df before  dropna and dropping irrelevant columns: 24542 with 25 cols
cleaned_matches_df after dropna and dropping irrelevant columns: 24542 with 21 cols
âœ… 'dnf' column updated by sequentially filling missing values from 'overallScores' THEN 'resultOverallScores'.


In [519]:
print(f"cleaned_df before dropping names with teams_parent_data: {len(cleaned_matches_df)}")

# Rows that carry teamParentData are set aside in team_parent_df; every other
# row stays in cleaned_matches_df.
team_parent_filter = cleaned_matches_df["teamParentData"].notna()
team_parent_df = cleaned_matches_df.loc[team_parent_filter].copy()
cleaned_matches_df = cleaned_matches_df.loc[~team_parent_filter].copy()

# Also drop the team summary columns: any column whose name contains "team"
# (case-insensitive), removed in a single call.
cleaned_matches_df = cleaned_matches_df.drop(
    columns=[col for col in cleaned_matches_df.columns if "team" in col.lower()]
)

print(f"cleaned_df after dropping names with teams_parent_data: {len(cleaned_matches_df)}")

cleaned_df before dropping names with teams_parent_data: 24542
cleaned_df after dropping names with teams_parent_data: 23727


In [520]:
print(f"cleaned_df before dropping para and age limit matches: {len(cleaned_matches_df)}")

# Sub-events whose name contains "U" followed by two digits are treated as
# age-limited; names containing "class" are treated as para classes.
# Both checks ignore case and count a missing name as "no match".
age_limit_mask = cleaned_matches_df['subEventName'].str.contains(r"U\d{2}", case=False, na=False)
para_class_mask = cleaned_matches_df['subEventName'].str.contains("class", case=False, na=False)
age_para_filter = age_limit_mask | para_class_mask

# Keep only the rows flagged by neither mask.
cleaned_matches_df = cleaned_matches_df.loc[~age_para_filter].copy()

print(f"cleaned_df after dropping para and age limit matches: {len(cleaned_matches_df)}")



cleaned_df before dropping para and age limit matches: 23727
cleaned_df after dropping para and age limit matches: 23567


In [521]:
# Before filtering, extract the key information from the 'competitiors' (sic) column.
# The player name column can contain team names.
# The competitors column holds either the single player for a singles listing
# or the multiple players for a team listing.
# Some doubles matches may be left over from payload filtering - need to filter out.
# Some doubles matches may be here inside team events - need to filter out.

def extract_competitor_details(competitor_list):
    """
    Flatten one row's competitor list into a Series of home/away fields.

    For every entry whose 'competitorType' is 'H' (home) or 'A' (away) the
    following keys are produced, prefixed with 'home' or 'away':

    * CompetitorId / CompetitorName / CompetitorOrg - top-level details
      (pd.NA when the key is absent).
    * Player(s) - list of the 'playerName' values found under 'players'
      (an empty list when 'players' is missing or not a list).
    * PlayerGameScores - the raw 'scores' value (pd.NA when absent).

    Entries with any other competitorType, or that are not dicts, are skipped.

    Returns an empty Series when competitor_list is not a list or holds fewer
    than two entries.
    """
    data = {}

    # Nothing to extract without both a home and an away entry.
    if not isinstance(competitor_list, list) or len(competitor_list) < 2:
        return pd.Series(data, dtype=object)

    for comp in competitor_list:
        if not isinstance(comp, dict):
            continue

        comp_type = comp.get('competitorType')
        if comp_type == 'H':
            prefix = 'home'
        elif comp_type == 'A':
            prefix = 'away'
        else:
            continue

        # The 'competitior*' spelling is the one used by the payload keys.
        data[f'{prefix}CompetitorId'] = comp.get('competitiorId', pd.NA)
        data[f'{prefix}CompetitorName'] = comp.get('competitiorName', pd.NA)
        data[f'{prefix}CompetitorOrg'] = comp.get('competitiorOrg', pd.NA)

        # A missing 'players' entry used to default to pd.NA, which cannot be
        # iterated: the resulting TypeError was swallowed and this competitor's
        # scores (plus every later competitor) were silently lost.
        players = comp.get('players')
        if not isinstance(players, (list, tuple)):
            players = []
        data[f'{prefix}Player(s)'] = [
            player.get('playerName', pd.NA) if isinstance(player, dict) else pd.NA
            for player in players
        ]

        data[f'{prefix}PlayerGameScores'] = comp.get('scores', pd.NA)

    return pd.Series(data, dtype=object)

#
print("--- ðŸš€ Getting competitor details ðŸš€ ---")

# Expand each row's 'competitiors' value into home/away columns.
competitor_details_df = cleaned_matches_df['competitiors'].apply(extract_competitor_details)

# Attach the new columns and retire the nested source column in one step.
cleaned_matches_df = pd.concat(
    [cleaned_matches_df, competitor_details_df], axis=1
).drop(columns=["competitiors"], errors='ignore')

# Keep only the first listed player on each side.
cleaned_matches_df["homePlayer(s)"] = cleaned_matches_df["homePlayer(s)"].str[0]
cleaned_matches_df["awayPlayer(s)"] = cleaned_matches_df["awayPlayer(s)"].str[0]

print("âœ… Competitor details extracted and added to cleaned_matches_df and competitiors column dropped.")


--- ðŸš€ Getting competitor details ðŸš€ ---
âœ… Competitor details extracted and added to cleaned_matches_df and competitiors column dropped.


In [522]:
# All teams matches should be removed by now 

def extract_format(config):
    """
    Extract the match format fields from one row's 'matchConfig' value.

    Parameters
    ----------
    config : dict or any
        Expected to be a dict; 'bestOfXGames' and 'tTRReview' are read from it.

    Returns
    -------
    pd.Series
        Always indexed by 'bestOf' and 'ttrReview'. A value is pd.NA when
        config is not a dict or the corresponding key is absent.
    """
    data = {"bestOf": pd.NA, "ttrReview": pd.NA}
    if isinstance(config, dict):
        # Default to pd.NA (not None) so a missing key looks the same as a
        # missing config. dict.get cannot raise here, so the former
        # try/except was dead code and has been removed.
        data['bestOf'] = config.get('bestOfXGames', pd.NA)
        data['ttrReview'] = config.get('tTRReview', pd.NA)
    return pd.Series(data)
#
print("--- ðŸš€ Getting Match Config deatils ðŸš€ ---")

# Expand each row's 'matchConfig' value into the bestOf / ttrReview columns.
match_config_df = cleaned_matches_df['matchConfig'].apply(extract_format)

# Attach the new columns, then retire the nested source column.
cleaned_matches_df = pd.concat(
    [cleaned_matches_df, match_config_df], axis=1
).drop(columns=["matchConfig"], errors='ignore')

print("âœ… Match config extracted and added to cleaned_matches_df,matchConfig column dropped.")


--- ðŸš€ Getting Match Config deatils ðŸš€ ---
âœ… Match config extracted and added to cleaned_matches_df,matchConfig column dropped.


In [523]:
# Assess the issue of the same player (same ID) being given multiple names.
print("--- ðŸŸ¢ Generating Name-to-ID Mapping for Review ðŸŸ¢---")

# 1. Put the home and away (ID, name) pairs under shared column names.
home_map = pd.DataFrame({
    'competitor_id': cleaned_matches_df['homeCompetitorId'],
    'competitor_name': cleaned_matches_df['homeCompetitorName'],
})
away_map = pd.DataFrame({
    'competitor_id': cleaned_matches_df['awayCompetitorId'],
    'competitor_name': cleaned_matches_df['awayCompetitorName'],
})

# 2. Stack both sides, discard incomplete pairs, keep each pair once.
all_id_name_pairs = pd.concat([home_map, away_map]).dropna().drop_duplicates()

# 3. For every ID, collect the distinct names seen with it.
id_to_names_map = all_id_name_pairs.groupby('competitor_id')['competitor_name'].unique()

print("âœ… Mapped all names to their unique competitor IDs.")
print(f"Total unique competitor IDs found: {len(id_to_names_map)}")

# 4. The problem cases are the IDs associated with more than one name.
names_per_id = id_to_names_map.apply(len)
discrepancy_map = id_to_names_map[names_per_id > 1]

print(f"\nDiscrepancy Report: Found {len(discrepancy_map)} IDs with multiple names.")
print("You must inspect and choose a canonical name for these IDs:")
print(discrepancy_map.head(10))

--- ðŸŸ¢ Generating Name-to-ID Mapping for Review ðŸŸ¢---
âœ… Mapped all names to their unique competitor IDs.
Total unique competitor IDs found: 2808

Discrepancy Report: Found 357 IDs with multiple names.
You must inspect and choose a canonical name for these IDs:
competitor_id
100001                   [ANTHONY Amalraj, Amalraj ANTHONY]
100032                 [ABDEL-AZIZ Farah, Farah ABDEL-AZIZ]
100189                     [ALAWLAQI Ahmed, Ahmed ALAWLAQI]
100439                           [SALEH Ahmed, Ahmed SALEH]
100486                           [ALTO Gaston, Gaston ALTO]
100621                     [Tiago APOLONIA, APOLONIA Tiago]
100696                             [Omar ASSAR, ASSAR Omar]
100868    [BALAZOVA Barbora, VARADY Barbora, Barbora BAL...
101192                     [BOBOCICA Mihai, Mihai BOBOCICA]
101480                       [CANTERO Jesus, Jesus CANTERO]
Name: competitor_name, dtype: object


In [524]:
# serverNext = NEXT SERVER AFTER THE MATCH POINT WAS DONE
# (even if point would not be played as match was over)

def extract_next_server(action):
    """
    Extract 'serverNext' and 'actionType' from one row's 'action' value.

    Parameters
    ----------
    action : dict or any
        Expected to be a dict holding the keys 'serverNext' and 'actionType'.

    Returns
    -------
    pd.Series
        Always indexed by 'serverNext' and 'actionType'. A value is pd.NA when
        action is not a dict or the corresponding key is absent.
    """
    data = {"serverNext": pd.NA, "actionType": pd.NA}
    # A non-dict input used to be returned as pd.Series(action), which produced
    # stray positional columns (or an empty row) after .apply instead of the
    # two NA defaults. Every row now has the same two keys.
    if isinstance(action, dict):
        data["serverNext"] = action.get("serverNext", pd.NA)
        data["actionType"] = action.get("actionType", pd.NA)
    return pd.Series(data)
#
# The two log messages below previously described the matchConfig step
# (copy-paste leftovers); they now describe what this cell actually does.
print("--- ðŸš€ Getting next-server details ðŸš€ ---")

# Expand each row's 'action' value into the serverNext / actionType columns.
last_server_df = cleaned_matches_df['action'].apply(extract_next_server)


cleaned_matches_df = pd.concat([cleaned_matches_df, last_server_df], axis=1)
cleaned_matches_df.drop(columns=["action"],inplace=True, errors='ignore')

print("âœ… serverNext and actionType extracted and added to cleaned_matches_df, action column dropped.")

--- ðŸš€ Getting Match Config deatils ðŸš€ ---
âœ… Match lastServer extracted and added to cleaned_matches_df,matchConfig column dropped.


In [525]:
def extract_times(matchTime):
    """
    Extract the duration and start timestamps from one row's 'matchDateTime' value.

    Parameters
    ----------
    matchTime : dict or any
        Expected to be a dict holding 'duration', 'startDateLocal' and
        'startDateUTC'.

    Returns
    -------
    pd.Series
        Always indexed by 'duration (unreliable)', 'startDateLocal' and
        'startDateUTC'. A value is pd.NA when matchTime is not a dict or the
        corresponding key is absent.
    """
    data = {"duration (unreliable)": pd.NA, "startDateLocal": pd.NA, "startDateUTC": pd.NA}
    # A non-dict input used to be returned as pd.Series(matchTime), which
    # produced stray positional columns (or an empty row) after .apply
    # instead of the three NA defaults.
    if isinstance(matchTime, dict):
        data['duration (unreliable)'] = matchTime.get('duration', pd.NA)
        data['startDateLocal'] = matchTime.get('startDateLocal', pd.NA)
        data['startDateUTC'] = matchTime.get('startDateUTC', pd.NA)
    return pd.Series(data)

# Expand each row's 'matchDateTime' value into duration / start-date columns
# and attach them to the main frame.
times_df = cleaned_matches_df['matchDateTime'].apply(extract_times)
cleaned_matches_df = pd.concat([cleaned_matches_df, times_df], axis=1)

# Parse the two start timestamps; values that cannot be parsed become NaT.
# Only the UTC column is parsed with utc=True.
cleaned_matches_df['startDateLocal'] = pd.to_datetime(
    cleaned_matches_df['startDateLocal'], errors='coerce', utc=False
)
cleaned_matches_df['startDateUTC'] = pd.to_datetime(
    cleaned_matches_df['startDateUTC'], errors='coerce', utc=True
)

# The nested source column is no longer needed.
cleaned_matches_df = cleaned_matches_df.drop(columns=["matchDateTime"], errors='ignore')

In [526]:
# Load the shortlisted events and parse their FromStartDate column.
events_df = pd.read_csv(EVENTS_FILE)
events_df["FromStartDate"] = pd.to_datetime(events_df["FromStartDate"])

# Per-event lookup: name, id and start date (StartDateTime renamed to EventStartDate).
# NOTE(review): FromStartDate is parsed above but StartDateTime is the column
# carried into the merge - confirm this is the intended date column.
event_dates_df = events_df[["EventName", "eventId", "StartDateTime"]].rename(
    columns={"StartDateTime": "EventStartDate"}
)

# Cast the match-side key to int before joining on eventId.
cleaned_matches_df["eventId"] = cleaned_matches_df["eventId"].astype(int)

# Left join keeps every match; validate='m:1' makes pandas raise if an
# eventId appears more than once in event_dates_df.
cleaned_matches_df = pd.merge(
    cleaned_matches_df,
    event_dates_df,
    on='eventId',
    how='left',
    validate='m:1',
)

In [527]:
times_columns = ["EventName","eventId", "documentCode","matchStartTimeUTC","startDateLocal", "startDateUTC",  "EventStartDate"]

DATE_COLS = [
    "matchStartTimeUTC", # Already exists, just needs re-conversion for safety
    "startDateLocal",    # Needs conversion (was in nested dict)
    "startDateUTC",      # Needs conversion (was in nested dict)
    "EventStartDate"     # The merged date (likely a date object or string)
]

# Convert FIRST, sort SECOND. The sort used to run before this loop, i.e. on
# whatever raw values the columns held at that point, so the resulting order
# was not guaranteed to be chronological.
for col in DATE_COLS:
    # Skip columns that are not present in the DataFrame.
    if col in cleaned_matches_df.columns:
        # errors='coerce' turns unparseable values into NaT;
        # utc=True makes the result timezone-aware (UTC).
        # NOTE(review): this also labels startDateLocal as UTC - confirm that
        # is acceptable for local wall-clock values.
        cleaned_matches_df[col] = pd.to_datetime(
            cleaned_matches_df[col],
            errors='coerce',
            utc=True
        )

# Order matches by event start, then by match start (NaT values sort last).
cleaned_matches_df = cleaned_matches_df.sort_values(by = ["EventStartDate", "startDateLocal", "matchStartTimeUTC"])

print("âœ… All  date / timecolumns standardized to UTC datetime dtype.")

# Check the dtypes to confirm the conversion



âœ… All  date / timecolumns standardized to UTC datetime dtype.


In [528]:
# One entry for muscat 2025 has a clearly erroneous date.
# For now, manually blank that timestamp and let the fill-NA hierarchy handle it.
#
# The mask must be built from cleaned_matches_df itself. It used to come from
# all_matches_df, but the earlier merge gave cleaned_matches_df a fresh
# 0..n-1 index, so the raw frame's labels no longer identify the same rows
# and .loc would silently target the wrong match (or none at all).
# eventId is compared as text so the test works whether the column currently
# holds ints (after the astype(int) cast) or the raw strings.
target_event = cleaned_matches_df["eventId"].astype(str) == "3084"
target_match = cleaned_matches_df["documentCode"] == "TTEWSINGLES-----------GP11000400----------"
target_mask = target_event & target_match
cleaned_matches_df.loc[target_mask,"matchStartTimeUTC"] = pd.NaT

In [529]:
# Build a single 'matchDate' column from the available timestamps, preferring
# the earlier entries of this list and falling back to the later ones.
dates_hierarchy = ['matchStartTimeUTC',
                   'startDateUTC',
                   'startDateLocal',
                   'EventStartDate']

cleaned_matches_df['matchDate'] = cleaned_matches_df[dates_hierarchy[0]]
for col in dates_hierarchy[1:]:
    # Only rows where 'matchDate' is still missing are filled from this column.
    cleaned_matches_df['matchDate'] = cleaned_matches_df['matchDate'].fillna(
        cleaned_matches_df[col]
    )

# The per-source timestamp columns are no longer needed; EventStartDate is kept.
columns_to_drop = ['matchStartTimeUTC',
                   'startDateUTC',
                   'startDateLocal']
cleaned_matches_df = cleaned_matches_df.drop(columns=columns_to_drop, errors='ignore')

# Show the columns of the manually corrected row(s) as the cell output.
# This line was previously indented at top level, which is an IndentationError;
# the identical no-op expression at the top of the cell has been removed.
cleaned_matches_df[target_mask].columns


In [None]:
# Columns needed to cross-check the different score representations:
# identifiers first, then the score fields, then dnf flag and competitor names.
score_df_columns = [
    "eventId", "documentCode", "EventName",
    "gameScores", "resultsGameScores",
    "overallScores", "resultOverallScores",
    "homePlayerGameScores", "awayPlayerGameScores",
    "dnf",
    "homeCompetitorName", "awayCompetitorName",
]

# Independent copy so the score checks cannot alter cleaned_matches_df.
score_test_df = cleaned_matches_df.loc[:, score_df_columns].copy()


In [None]:
def calculate_scores(home_points_str, away_points_str):
    """
    Derive per-game scores and the overall match score from two
    comma-separated point strings (one per side).

    Returns a two-element pd.Series: the game scores as 'h-a' pairs joined by
    commas, and the overall score as '<home games won>-<away games won>'.
    Both elements are pd.NA when either input is missing or contains a token
    that is not an integer. Games scored 0-0 are left out entirely, and only
    as many games as the shorter side lists are considered.
    """
    # Missing input on either side: nothing can be calculated.
    if pd.isna(home_points_str) or pd.isna(away_points_str):
        return pd.Series([pd.NA, pd.NA])

    def _to_points(raw):
        # Split on commas, ignore blank tokens, convert the rest to int.
        stripped = (token.strip() for token in str(raw).split(','))
        return [int(token) for token in stripped if token]

    try:
        home_points = _to_points(home_points_str)
        away_points = _to_points(away_points_str)
    except ValueError:
        return pd.Series([pd.NA, pd.NA])

    # zip stops at the shorter list; 0-0 pairs are discarded.
    played = [
        (h_pts, a_pts)
        for h_pts, a_pts in zip(home_points, away_points)
        if (h_pts, a_pts) != (0, 0)
    ]

    # The higher points tally wins the game; a tie counts for neither side.
    home_games_won = sum(1 for h_pts, a_pts in played if h_pts > a_pts)
    away_games_won = sum(1 for h_pts, a_pts in played if a_pts > h_pts)

    calc_game_scores_str = ','.join(f'{h_pts}-{a_pts}' for h_pts, a_pts in played)
    calc_overall_scores_str = f'{home_games_won}-{away_games_won}'

    return pd.Series([calc_game_scores_str, calc_overall_scores_str])

print("--- ðŸŸ¢ Stage 4: Calculating Game and Overall Match Scores on score_test_df ðŸŸ¢---")

# Run calculate_scores row by row on the two per-side point columns;
# result_type='expand' spreads the returned pair over two columns.
new_score_cols = score_test_df.apply(
    lambda match: calculate_scores(
        match['homePlayerGameScores'],
        match['awayPlayerGameScores'],
    ),
    axis=1,
    result_type='expand',
)

# Name the two calculated columns.
new_score_cols.columns = ['calcGameScores', 'calcOverallScores']

# Append them to the right of the existing columns.
score_test_df = pd.concat([score_test_df, new_score_cols], axis=1)

print("âœ… Game scores and overall match scores calculated and added to score_test_df.")


--- ðŸŸ¢ Stage 4: Calculating Game and Overall Match Scores on score_test_df ðŸŸ¢---
âœ… Game scores and overall match scores calculated and added to score_test_df.


In [None]:
# Normalise scores by removing ',0-0' entries (unplayed games listed after the
# played ones). regex=False makes the literal match explicit: the default of
# `regex` in Series.str.replace changed between pandas 1.x and 2.0, so this no
# longer depends on the installed version.
score_test_df["resultsGameScores"] = score_test_df["resultsGameScores"].str.replace(',0-0', '', regex=False)
score_test_df["gameScores"] = score_test_df["gameScores"].str.replace(',0-0', '', regex=False)

In [593]:
# --- Disagreement between the overall-score sources ---
# calculated vs. overallScores, and overallScores vs. resultOverallScores.
game_calc_mimsmatch_filter = score_test_df["calcOverallScores"] != score_test_df["overallScores"]
result_game_mismatch_filter = score_test_df["overallScores"] != score_test_df["resultOverallScores"]
games_mismatch_mask = game_calc_mimsmatch_filter | result_game_mismatch_filter

# --- Rows where any of the three game-score columns is missing ---
missing_calc_filter = score_test_df["calcGameScores"].isna()
missing_game_filer = score_test_df["gameScores"].isna()
missing_result_game_filter = score_test_df["resultsGameScores"].isna()
missing_data_mask = missing_calc_filter | missing_game_filer | missing_result_game_filter

# True where NO dnf keyword was found, i.e. the rows to keep.
dnf_filter = score_test_df["dnf"].isna()

# Mismatching rows that have all three game-score columns and no dnf flag.
games_mismatch_df = score_test_df.loc[games_mismatch_mask & ~missing_data_mask & dnf_filter]


In [594]:
# Display the rows whose overall scores disagree between sources.
games_mismatch_df

Unnamed: 0,eventId,documentCode,EventName,gameScores,resultsGameScores,overallScores,resultOverallScores,homePlayerGameScores,awayPlayerGameScores,dnf,homeCompetitorName,awayCompetitorName,calcGameScores,calcOverallScores
9562,2480,TTEMSINGLES-----------R32-001200----------,2021 ITTF Czech International Open,"13-11,11-9,10-12,8-11,11-8,11-2","11-13,11-9,10-12,8-11,11-8,9-11",2-4,2-4,131110811110,1191211820.0,,CHANDRA Jeet,ROBINOT Alexandre,"13-11,11-9,10-12,8-11,11-8,11-2",4-2
7942,2502,TTEMSINGLES-----------RND2001000----------,WTT Contender Tunis 2021,"9-11,6-11,8-11","9-9,6-11,8-11",0-3,0-2,96800,11111100.0,,PINTO Daniele,SHIBAEV Alexander,"9-11,6-11,8-11",0-3
19818,2234,TTEWSINGLES-----------R32-000400----------,2021 ITTF Pan American Championships,"9-11,10-12,14-16,3-11","9-11,10-12,14-13,3-11",0-4,1-3,910143000,11121611000.0,,KE Tiffany,ORTEGA Daniela,"9-11,10-12,14-16,3-11",0-4
11294,2346,TTEMSINGLES-----------R128000500----------,2021 World Table Tennis Championships Finals,"13-11,11-8,8-11,6-11,11-5,13-11","13-11,11-8,8-11,6-11,11-5,8-11,13-11",4-3,4-3,13118611130,11811115110.0,,GARDOS Robert,LIND Anders,"13-11,11-8,8-11,6-11,11-5,13-11",4-2
11360,2346,TTEWSINGLES-----------R128001300----------,2021 World Table Tennis Championships Finals,"3-11,0-11,0-11,0-11","11-8,11-5,11-5,11-7",4-0,4-0,3000000,11111111000.0,,PESOTSKA Margaryta,HELMY Yousra,"3-11,0-11,0-11,0-11",0-4
11290,2346,TTEWSINGLES-----------R128000200----------,2021 World Table Tennis Championships Finals,"11-5,11-0,11-0,11-0","11-9,12-10,6-11,9-11,7-11,11-6,5-11",3-4,3-4,11111111000,5000000.0,,PARANANG Orawan,LIN Ye,"11-5,11-0,11-0,11-0",4-0
11287,2346,TTEWSINGLES-----------R128006300----------,2021 World Table Tennis Championships Finals,"5-11,11-9,3-11,11-13,11-13","5-11,11-9,3-11,11-13,11-10",1-4,2-3,5113111100,11911131300.0,,KUMAHARA Caroline,NG Wing Nam,"5-11,11-9,3-11,11-13,11-13",1-4
3870,2479,TTEWSINGLES-----------R64-002300----------,2021 ITTF Finlandia International Open,"11-9,5-11,11-0,11-0","11-9,11-5,11-9",3-0,3-0,11511110,911000.0,,MIASHCHANSKAYA Ulyana,MIKALAUSKYTE Marija,"11-9,5-11,11-0,11-0",3-1
3851,2479,TTEMSINGLES-----------R64-001400----------,2021 ITTF Finlandia International Open,"11-8,7-11,5-11,11-9,0-11","8-11,11-7,11-5,11-9",3-1,3-1,1175110,81111911.0,,SUNDIN Isak Theodor Mikael,RASANEN Aleksi,"11-8,7-11,5-11,11-9,0-11",2-3
21873,2519,TTEWSINGLES-----------R32-000800----------,WTT Feeder DÃ¼sseldorf,"7-11,2-11,11-6,6-11,10-12","1-11,2-11,6-6,6-11,10-12",1-4,0-4,721161000,11116111200.0,,STRAZAR Katarina,WAN Yuan,"7-11,2-11,11-6,6-11,10-12",1-4


In [596]:
def _normalise_score_text(scores):
    """
    Return `scores` as whitespace-stripped text with missing values kept as pd.NA.

    astype(str) alone turns pd.NA / None into the literal strings '<NA>' /
    'None', which the 'nan' replacement does not catch. The final .where()
    restores pd.NA wherever the original value was missing.
    """
    text = scores.astype(str).str.strip().replace('nan', pd.NA)
    return text.where(scores.notna(), pd.NA)


s1 = _normalise_score_text(score_test_df['resultOverallScores'])
s2 = _normalise_score_text(score_test_df['calcOverallScores'])
s3 = _normalise_score_text(score_test_df['overallScores'])
def triangulate_scores(row):
    """
    when game scores disagree - triangulate scores between:
        1. calcGameScores
        2. gameScores
        3. resultsGameScores
    """
