In [310]:
import glob
import json
import os

import numpy as np
import pandas as pd
import pycountry
from rapidfuzz import fuzz


In [321]:
# Directory (relative to the notebook) that holds the raw match-detail JSON files.
RAW_MATCH_DETAILS_DIR = "../Data/Raw/Match_details"

# Keywords searched for in the 'overallScores' column to flag matches that did not finish.
DNF_KEYWORDS = ["WO", "INJ", "RET", "DSQ"]
# Plain alternation of the keywords, and the same alternation wrapped in a
# capture group (the form required by `Series.str.extract`).
DNF_PATTERN = "|".join(DNF_KEYWORDS)
DNF_PATTERN_CAPTURE = f"({DNF_PATTERN})"

# Alternation of words that indicate a team-style name.
TEAM_NAME_PATTERN = "|".join(["team", "federacion", "federation", "table"])

# Columns dropped right after loading because they are not needed for the project.
DROP_COLUMNS_START = [
    "resultStatus",
    "tableName",
    "tableNumber",
    "venueName",
]

In [322]:
# Load every match-details JSON file found in RAW_MATCH_DETAILS_DIR and build
# all_matches_df, the unfiltered frame that later cells narrow down.
# Many of these records are match-ups between teams rather than players and
# need to be filtered out; singles matches that belong to team events are kept.

# glob.glob() returns paths in arbitrary, filesystem-dependent order. Sorting
# makes the row order (and therefore the row index) reproducible across runs.
all_match_details_files = sorted(glob.glob(os.path.join(RAW_MATCH_DETAILS_DIR, "*.json")))

# Each file is expected to hold a JSON list of match records; all of them are
# collected into one flat list.
all_matches = []
for file in all_match_details_files:
    with open(file, 'r', encoding='utf-8') as f:
        matches_data = json.load(f)

    all_matches.extend(matches_data)

# One row per match record.
all_matches_df = pd.DataFrame(all_matches)







In [342]:
# Rebuild the frame from the already-parsed `all_matches` list so this cell can
# be re-run without repeating the file parsing, and drop the columns that are
# not currently of interest for the project (e.g. table number and venue).
# drop() is used without inplace=True so nothing is mutated behind the scenes.
all_matches_df = pd.DataFrame(all_matches).drop(columns=DROP_COLUMNS_START)

# Initialise the cleaned frame: remove rows that are entirely NA, then columns
# that are entirely NA. The second dropna must use axis=1 - repeating the
# row-wise call would leave the empty columns in place.
print(f"all_matches_df before dropping empty rows and columns: {len(all_matches_df)}")
cleaned_matches_df = (
    all_matches_df
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    .copy()  # independent frame, so the column assignment below is unambiguous
)
print(f"cleaned_matches_df after dropping empty rows and columns: {len(cleaned_matches_df)}")

# Extract the DNF keyword from the overallScores column.
# Matches without any of the keywords get NA in the new 'dnf' column.
cleaned_matches_df["dnf"] = (
    cleaned_matches_df["overallScores"]
    .str.extract(DNF_PATTERN_CAPTURE, expand=False)
    .str.strip()
)



all_matches_df before dropping empty rows and columns: 24469
cleaned_matches_df after dropping empty rows and columns: 24469


In [343]:
# Before filtering - extract key information from the 'competitiors' column.
# The competitor name can be a team name rather than a player name.
# Each competitor entry lists either the one player of a singles listing
# or the multiple players of a team listing.
# Some doubles matches may be left over from payload filtering - need to filter out.
# Some doubles matches may appear inside team events - need to filter out.

def extract_competitor_details(competitor_list):
    """
    Extract the top-level details of the Home (H) and Away (A) competitors.

    Parameters
    ----------
    competitor_list : list of dict
        Raw competitor entries of one match. Each entry is read for
        'competitorType' ('H' or 'A'), 'competitiorId', 'competitiorName',
        'competitiorOrg' and 'players' (a list of dicts with 'playerName').
        The 'competitior...' spelling is the one used by the lookups below.

    Returns
    -------
    pd.Series
        Always indexed by the same eight keys:
        '<side>CompetitorId', '<side>CompetitorName', '<side>CompetitorOrg'
        and '<side>Player(s)' for side in ('home', 'away'). Scalar fields
        default to pd.NA and the player list defaults to [] when the input is
        not a list of at least two entries, or when a field is absent.
    """
    # Start from a full set of defaults so every row yields identical keys and
    # the player columns always hold a list (safe for len() downstream).
    data = {}
    for side in ('home', 'away'):
        data[f'{side}CompetitorId'] = pd.NA
        data[f'{side}CompetitorName'] = pd.NA
        data[f'{side}CompetitorOrg'] = pd.NA
        data[f'{side}Player(s)'] = []

    # Anything that is not a list with (at least) two entries is left at defaults.
    if not isinstance(competitor_list, list) or len(competitor_list) < 2:
        return pd.Series(data, dtype=object)

    for comp in competitor_list:
        # Skip malformed entries instead of aborting the whole row.
        if not isinstance(comp, dict):
            continue

        comp_type = comp.get('competitorType')
        if comp_type == 'H':
            prefix = 'home'
        elif comp_type == 'A':
            prefix = 'away'
        else:
            continue

        data[f'{prefix}CompetitorId'] = comp.get('competitiorId', pd.NA)
        data[f'{prefix}CompetitorName'] = comp.get('competitiorName', pd.NA)
        data[f'{prefix}CompetitorOrg'] = comp.get('competitiorOrg', pd.NA)

        # 'players' may be absent or not a list; iterating pd.NA/None would
        # raise, so fall back to an empty list in that case.
        players = comp.get('players')
        if not isinstance(players, list):
            players = []
        data[f'{prefix}Player(s)'] = [
            player.get('playerName', pd.NA) if isinstance(player, dict) else pd.NA
            for player in players
        ]

    # dtype=object keeps the mixed scalar/list values as they are.
    return pd.Series(data, dtype=object)

#
print("--- ðŸš€ Getting competitor details ðŸš€ ---")

# Parse the raw 'competitiors' payload of every match into flat home*/away* columns.
competitor_details_df = cleaned_matches_df['competitiors'].apply(extract_competitor_details)

# Drop any previously attached copies of these columns before concatenating,
# so re-running this cell does not append a second, duplicate set of columns.
cleaned_matches_df = pd.concat(
    [
        cleaned_matches_df.drop(columns=competitor_details_df.columns, errors='ignore'),
        competitor_details_df,
    ],
    axis=1,
)

print("âœ… Competitor details extracted and added to  cleaned_matches_df.")


--- ðŸš€ Getting competitor details ðŸš€ ---
âœ… Competitor details extracted and added to  cleaned_matches_df.


In [344]:
print(f"cleaned_df before dropping para and age limit matches: {len(cleaned_matches_df)}")

# Both masks are built from the sub-event name: "U" followed by two digits
# marks an age-limited event, the word "class" marks a para class event.
sub_event_names = cleaned_matches_df['subEventName']
age_limit_mask = sub_event_names.str.contains(r"U\d{2}", case=False, na=False)
para_class_mask = sub_event_names.str.contains("class", case=False, na=False)

# A row is removed when either mask matches.
age_para_filter = age_limit_mask | para_class_mask
cleaned_matches_df = cleaned_matches_df.loc[~age_para_filter].copy()

print(f"cleaned_df after dropping para and age limit matches: {len(cleaned_matches_df)}")



cleaned_df before dropping para and age limit matches: 24469
cleaned_df after dropping para and age limit matches: 24295


In [345]:
print(f"cleaned_df before dropping names with numbers (teams): {len(cleaned_matches_df)}")

# A digit anywhere in a competitor name is treated as the mark of a team entry.
digit_pattern = r"[0-9]"
home_contains_digit = cleaned_matches_df["homeCompetitorName"].str.contains(digit_pattern, na=False)
away_contains_digit = cleaned_matches_df["awayCompetitorName"].str.contains(digit_pattern, na=False)

# Keep a match only when neither side's name contains a digit.
keep_filter = ~home_contains_digit & ~away_contains_digit
cleaned_matches_df = cleaned_matches_df.loc[keep_filter].copy()

print(f"cleaned_df after dropping names with numbers (teams): {len(cleaned_matches_df)}")

cleaned_df before dropping names with numbers (teams): 24295
cleaned_df after dropping names with numbers (teams): 23634


In [346]:
print(f"cleaned_df before dropping matches with multiple players (teams): {len(cleaned_matches_df)}")

# A side that lists more than one player is a doubles pair or a team.
# Count defensively: rows whose competitor payload could not be parsed hold
# NaN instead of a list, and a bare len() would raise TypeError on them.
home_multiple_players = cleaned_matches_df["homePlayer(s)"].map(
    lambda players: len(players) if isinstance(players, list) else 0
) > 1
away_multiple_players = cleaned_matches_df["awayPlayer(s)"].map(
    lambda players: len(players) if isinstance(players, list) else 0
) > 1

# remove_filter and keep_filter are exact complements of each other.
remove_filter = home_multiple_players | away_multiple_players
keep_filter = ~remove_filter

# Keep the removed rows in remove_df for inspection.
remove_df = cleaned_matches_df[remove_filter].copy()
cleaned_matches_df = cleaned_matches_df[keep_filter].copy()
print(f"cleaned_df after dropping matches with multiple players (teams): {len(cleaned_matches_df)}")

cleaned_df before dropping matches with multiple players (teams): 23634
cleaned_df after dropping matches with multiple players (teams): 23495


In [347]:
def build_country_name_list():
    """
    Build the list of country/territory names used for fuzzy matching.

    Combines the `name`, `official_name` and `common_name` attributes of every
    entry in `pycountry.countries` (where present) with a hand-maintained list
    of variants used for sports teams.

    Returns
    -------
    list of str
        Unique, whitespace-stripped, non-empty names in sorted order.
    """
    country_names = set()

    # Not every pycountry record carries all three attributes, so each one is
    # looked up with a default and skipped when missing.
    for country in pycountry.countries:
        for attribute in ('name', 'official_name', 'common_name'):
            value = getattr(country, attribute, None)
            if value:
                country_names.add(value)

    # Other possible names for sports teams across tournaments and regions.
    # Generated using Google Gemini - may be superfluous or missing some options.
    # Every line ends with a comma: a missing one silently concatenates two
    # adjacent string literals into a single bogus name.
    sports_variants = [
        # China/Taiwan/HK
        'Chinese Taipei', 'Taiwan', 'Hong Kong, China', 'Hong Kong', 'Macau, China', 'Macao',
        # N/S Korea
        'Republic of Korea', 'North Korea', 'South Korea', 'DPR Korea',
        # Former Czech/Slovak
        'Czechia', 'Czech Republic', 'Slovakia',
        # Common English/French alternatives
        'Ivory Coast', 'Cote d\'Ivoire', 'Cape Verde', 'Cabo Verde',
        # Common abbreviations that might appear unparsed
        'DR Congo', 'ROC', 'PRC', 'USA', 'UK', 'UAE',
    ]

    country_names.update(sports_variants)

    # Strip first, then de-duplicate: two entries can differ only by
    # surrounding whitespace. Sorting makes the result deterministic.
    cleaned_names = {
        name.strip()
        for name in country_names
        if isinstance(name, str) and name.strip()
    }

    return sorted(cleaned_names)

# Build the master list of country/territory names once; it is passed to
# fuzz_match_check as the keyword list when the competitor names are screened.
list_of_countries = build_country_name_list()

print(f"Generated a master list of {len(list_of_countries)} country/territory names and variants for fuzzy matching.")

def fuzz_match_check(player_name, keywords_list, threshold):
    """
    Report whether a name closely matches any keyword in a list.

    The name and each keyword are lower-cased and compared as whole strings
    with `fuzz.ratio` (not a partial/substring comparison).

    Parameters
    ----------
    player_name : object
        Name to test; it is converted with str(). Missing values (per
        `pd.isna`) never match.
    keywords_list : iterable of str
        Keywords to compare against.
    threshold : number
        Minimum `fuzz.ratio` score that counts as a match.

    Returns
    -------
    bool
        True as soon as one keyword scores at or above `threshold`,
        otherwise False.
    """
    # Missing names cannot match anything.
    if pd.isna(player_name):
        return False

    candidate = str(player_name).lower()

    # any() stops at the first keyword that reaches the threshold.
    return any(
        fuzz.ratio(candidate, keyword.lower()) >= threshold
        for keyword in keywords_list
    )

# The "before" message names this filter (country-named competitors); reusing
# the multi-player filter's message would make the two logs indistinguishable.
print(f"cleaned_df before dropping competitors named after a country (teams): {len(cleaned_matches_df)}")

# Minimum fuzz.ratio score for a competitor name to be treated as a country name.
threshold = 90

home_is_fuzzy = cleaned_matches_df['homeCompetitorName'].apply(fuzz_match_check, args=(list_of_countries, threshold))
away_is_fuzzy = cleaned_matches_df['awayCompetitorName'].apply(fuzz_match_check, args=(list_of_countries, threshold))

# Flag a match when either side's name looks like a country name.
remove_check_filter = home_is_fuzzy | away_is_fuzzy

# Keep the flagged rows in remove_check_df for manual review.
remove_check_df = cleaned_matches_df[remove_check_filter].copy()
# .copy() (as in the other filter cells) gives an independent frame, so later
# column assignments do not trigger SettingWithCopyWarning.
cleaned_matches_df = cleaned_matches_df[~remove_check_filter].copy()
print(f"after: {len(cleaned_matches_df)}")





Generated a master list of 439 country/territory names and variants for fuzzy matching.
cleaned_df before dropping matches with multiple players (teams): 23495
after: 23494


In [252]:


print("--- ðŸŸ¢ Generating Name-to-ID Mapping for Review ðŸŸ¢---")

# 1. Build one (competitor_id, competitor_name) frame per side with shared
#    column names so the two can be stacked.
home_map, away_map = (
    cleaned_matches_df[[f'{side}CompetitorId', f'{side}CompetitorName']].rename(
        columns={
            f'{side}CompetitorId': 'competitor_id',
            f'{side}CompetitorName': 'competitor_name',
        }
    )
    for side in ('home', 'away')
)

# 2. Stack both sides, discard incomplete pairs, and keep each distinct
#    ID-name pair once.
stacked_pairs = pd.concat([home_map, away_map])
all_id_name_pairs = stacked_pairs.dropna().drop_duplicates()

# 3. For every competitor ID, collect the distinct names seen with it.
names_by_id = all_id_name_pairs.groupby('competitor_id')['competitor_name']
id_to_names_map = names_by_id.unique()

print("âœ… Mapped all names to their unique competitor IDs.")
print(f"Total unique competitor IDs found: {len(id_to_names_map)}")

# 4. The problem cases are the IDs associated with more than one name.
has_multiple_names = id_to_names_map.apply(len).gt(1)
discrepancy_map = id_to_names_map.loc[has_multiple_names]

print(f"\nDiscrepancy Report: Found {len(discrepancy_map)} IDs with multiple names.")
print("You must inspect and choose a canonical name for these IDs:")
print(discrepancy_map.head(10))

--- ðŸŸ¢ Generating Name-to-ID Mapping for Review ðŸŸ¢---
âœ… Mapped all names to their unique competitor IDs.
Total unique competitor IDs found: 2909

Discrepancy Report: Found 388 IDs with multiple names.
You must inspect and choose a canonical name for these IDs:
competitor_id
100001               [ANTHONY Amalraj, Amalraj ANTHONY]
100032             [ABDEL-AZIZ Farah, Farah ABDEL-AZIZ]
100116237      [Abdelbasset CHAICHI, Maheidine A BELLA]
100116238               [Amir Atanda ADOU, Jimoh AMUSA]
100116239      [Ange-Aime KOUASSI, Don Ange Cedric OBA]
100116240           [Simon EBODE MVILONGO, Ylane BATIX]
100116241       [Mohamed EL-BEIALI, Youssef ABDEL-AZIZ]
100116242          [Aden FARIS, Demesse MESFIN BIRHANU]
100116244           [Habeb ALAJAEBI, Alhusayn HAMRANAH]
100116245    [Antoine RAZAFINARIVO, Setra RAKOTOARISOA]
Name: competitor_name, dtype: object


In [348]:
# Display the matches that the country-name fuzzy check removed, for manual review.
remove_check_df

Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,matchConfig,competitiors,currentGameNumber,full_msg,gameScores,resultsGameScores,...,matchStartTimeUTC,dnf,homeCompetitorId,homeCompetitorName,homeCompetitorOrg,homePlayer(s),awayCompetitorId,awayCompetitorName,awayCompetitorOrg,awayPlayer(s)
15887,2751,TTEMTEAM--------------GP0500010000--------,Men's Teams,Men's Teams - Group 5 - Match 1,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '100...",1.0,,"1-0,2-0,3-0,3-0,3-0",,...,,WO,100119299,Japan,JPN,[Japan],100119160,Nigeria,NGR,[Nigeria]


In [253]:
# Flag matches in which either competitor id is not exactly six characters long.
home_id_len = cleaned_matches_df["homeCompetitorId"].str.len().ne(6)
away_id_len = cleaned_matches_df["awayCompetitorId"].str.len().ne(6)
id_len_filter = home_id_len | away_id_len

# Collect the flagged rows for inspection.
id_len_check = cleaned_matches_df.loc[id_len_filter]





In [None]:
# Preview the matches whose competitor ids are not six characters long.
# (A dangling `id_len_check.` is a SyntaxError and stops Restart & Run All.)
id_len_check.head()

In [258]:
# Export the identifying columns and the raw competitor payload of the
# flagged rows to id_check.csv for review outside the notebook.
id_check_columns = ["eventId", "documentCode", "subEventName", "competitiors"]
id_len_check[id_check_columns].to_csv("id_check.csv", index=False)

In [259]:
def is_individual_match(competitor_list):
    """
    Check whether a match looks like a singles match.

    Parameters
    ----------
    competitor_list : list of dict
        Raw competitor entries of one match; each entry is read for
        'competitorType' and 'players'.

    Returns
    -------
    bool
        True only when both a Home ('H') and an Away ('A') competitor are
        present and every such competitor has a 'players' list with exactly
        one entry. False for anything else, including input that is not a
        list of at least two entries.
    """
    # Invalid data structure - treat as not a singles match so it is removed.
    if not isinstance(competitor_list, list) or len(competitor_list) < 2:
        return False

    sides_seen = set()
    for comp in competitor_list:
        # Malformed entries carry no usable side information.
        if not isinstance(comp, dict):
            continue

        side = comp.get('competitorType')
        if side not in ('H', 'A'):
            continue

        # A missing, empty, non-list or multi-player 'players' value means the
        # side is not a single player.
        players_list = comp.get('players')
        if not isinstance(players_list, list) or len(players_list) != 1:
            return False

        sides_seen.add(side)

    # Both sides must actually be present; otherwise a payload without any
    # H/A entry (or with only one side) would be accepted as a singles match.
    return sides_seen == {'H', 'A'}

# Filter out team/doubles matches using the 'players' list of each competitor.

print(f"Total matches before individual player filter: {len(cleaned_matches_df)}")

# 1. Boolean Series: True where is_individual_match accepts the row's competitors.
individual_match_filter = cleaned_matches_df['competitiors'].apply(is_individual_match)

# 2. Split on the mask. Rejected rows are kept in their own frame so the
#    removals can be audited.
is_rejected = ~individual_match_filter
removed_team_matches_df = cleaned_matches_df.loc[is_rejected].copy()
print(f"Identified {len(removed_team_matches_df)} team/doubles matches for removal.")

cleaned_matches_df = cleaned_matches_df.loc[individual_match_filter].copy()

print(f"Total individual matches remaining: {len(cleaned_matches_df)}")
print("âœ… Successfully filtered out team/doubles matches by checking the 'players' list length.")

Total matches before individual player filter: 23514
Identified 107 team/doubles matches for removal.
Total individual matches remaining: 23407
âœ… Successfully filtered out team/doubles matches by checking the 'players' list length.


In [260]:
# Export the identifying columns and the raw competitor payload of the
# removed team/doubles rows to team_check.csv for review outside the notebook.
team_check_columns = ["eventId", "documentCode", "subEventName", "competitiors"]
removed_team_matches_df[team_check_columns].to_csv("team_check.csv", index=False)

In [278]:
# Display the cleaned matches frame as it stands after the filters above.
cleaned_matches_df

Unnamed: 0,eventId,documentCode,subEventName,subEventDescription,matchConfig,competitiors,currentGameNumber,gameScores,resultsGameScores,overallScores,...,teamMatchScores,teamMatchScoresSummary,matchStartTimeUTC,dnf,homeCompetitorId,homeCompetitorName,homeCompetitorOrg,awayCompetitorId,awayCompetitorName,awayCompetitorOrg
0,2603,TTEWSINGLES-----------R64-002900----------,Women's Singles,Women's Singles Round of 64,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '112...",7.0,"13-15,2-11,11-8,11-6,5-11,7-11,0-0","13-15,2-11,11-8,11-6,5-11,7-11",2-4,...,"0-0,0-0,0-0,0-0,0-0",0,,,112868,Xiaoxin YANG,MON,105913,Hana MATELOVA,CZE
1,2603,TTEWSINGLES-----------R64-003200----------,Women's Singles,Women's Singles Round of 64,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '117...",5.0,"2-11,4-11,5-11,2-11,0-0,0-0,0-0","2-11,4-11,5-11,2-11",0-4,...,"0-0,0-0,0-0,0-0,0-0",0,,,117142,Lynda LOGHRAIBI,ALG,112019,Meng CHEN,CHN
3,2603,TTEMSINGLES-----------R128005800----------,Men's Singles,Men's Singles Preliminary Round,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '116...",5.0,"11-5,11-6,11-5,11-3,0-0,0-0,0-0","11-5,11-6,11-5,11-3",4-0,...,"0-0,0-0,0-0,0-0,0-0",0,,,116021,Kanak JHA,USA,122320,Vladislav URSU,MDA
4,2603,TTEMSINGLES-----------R64-002500----------,Men's Singles,Men's Singles Round of 64,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '123...",5.0,"11-3,11-2,11-3,11-5,0-0,0-0,0-0","11-3,11-2,11-3,11-5",4-0,...,"0-0,0-0,0-0,0-0,0-0",0,,,123980,Tomokazu HARIMOTO,JPN,114848,Martin ALLEGRO,BEL
5,2603,TTEMSINGLES-----------8FNL000500----------,Men's Singles,Men's Singles Round of 16,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '135...",8.0,"11-9,15-13,12-10,8-11,3-11,8-11,11-7","11-9,15-13,12-10,8-11,3-11,8-11,11-7",4-3,...,"0-0,0-0,0-0,0-0,0-0",0,,,135977,Felix LEBRUN,FRA,107028,Dimitrij OVTCHAROV,GER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24464,2503,TTEWSINGLES-----------R32-001400----------,Women's Singles,Women's Singles - Round of 32 - Match 14,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '133...",4.0,"11-6,11-5,10-12,11-9,0-0","11-6,11-5,10-12,11-9,0-0",3-1,...,"0-0,0-0,0-0,0-0,0-0",0,,,133894,LIU Weishan,CHN,120187,LUPULESKU Izabela,SRB
24465,2503,TTEWSINGLES-----------R32-000600----------,Women's Singles,Women's Singles - Round of 32 - Match 6,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '112...",4.0,"6-11,11-6,5-11,5-11,0-0","6-11,11-6,5-11,5-11,0-0",1-3,...,"0-0,0-0,0-0,0-0,0-0",0,,,112453,JEGER Mateja,CRO,120108,GAUTHIER Lucie,FRA
24466,2503,TTEMSINGLES-----------RND3001400----------,Men's Singles,Men's Singles - Qualifying Round 3 - Match 14,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '120...",3.0,"3-11,5-11,7-11,0-0,0-0","3-11,5-11,7-11,0-0,0-0",0-3,...,"0-0,0-0,0-0,0-0,0-0",0,,,120074,MLADENOVIC Luka,LUX,107456,PLATONOV Pavel,BLR
24467,2503,TTEMSINGLES-----------RND1001800----------,Men's Singles,Men's Singles - Qualifying Round 1 - Match 18,"{'bestOfXGames': 5, 'maxPointsPerGame': 11, 'a...","[{'competitorType': 'H', 'competitiorId': '119...",3.0,"11-5,11-8,11-8,0-0,0-0","11-5,11-8,11-8,0-0,0-0",3-0,...,"0-0,0-0,0-0,0-0,0-0",0,,,119092,BERTRAND Irvin,FRA,116831,HELMY Mahmoud,EGY
