In [517]:
import requests
import json
import pandas as pd
import os
from datetime import datetime, timezone
import re
import random 
import time 
import glob
import asyncio
from typing import Tuple, Optional, List, Dict, Any, Set
from pydantic import BaseModel, ValidationError, Field
import aiohttp
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
import country_converter as coco

In [518]:
MASTER_MATCHES_DIR = "../Data/Master/Matches"
MASTER_MATCHES_SUFFIX = "master_matches.csv"
MASTER_MATCHES_REGEX = rf"^\d{{8}}{re.escape('_')}{re.escape(MASTER_MATCHES_SUFFIX)}$"



MASTER_PLAYERS_DIR = "../Data/Master/Players"
MASTER_PLAYERS_SUFFIX = "_master_players.csv" 
now_date = datetime.now()
date_string = now_date.strftime("%Y%m%d")
MASTER_PLAYERS_OUTPUT_PATH = os.path.join(MASTER_PLAYERS_DIR, f"{date_string}{MASTER_PLAYERS_SUFFIX}")
os.makedirs(MASTER_PLAYERS_DIR, exist_ok=True)
MASTER_PLAYERS_REGEX = rf"^\d{{8}}{re.escape(MASTER_PLAYERS_SUFFIX)}$"



MIN_PAUSE = 1
MAX_PAUSE = 10

MAX_RETRIES = 2

FAILURE_LOG_PATH = "../Data/Raw/failure_log_match_details.csv"


# only used in get_latest_master_events to return blank df if no files found and further checks are required
MINIMAL_MATCH_COLUMNS = ["documentCode"]

MINIMAL_EVENT_COLUMNS = ["eventId"]

MINIMAL_PLAYER_COLUMNS = ["IttfId"]

In [519]:
def get_latest_master_matches(master_dir: str, master_regex) -> pd.DataFrame:
    """
    Parses specified directory for matches files in format yyyy_mm_dd. 
    Attempts to read latest file in this format. 

    Args:
        master_dir (str): The folder where the master files are stored (e.g., '../Data/Master/Matches').
        master_regex (str): The pattern to match (e.g., '^\\d{8}_master_matches.csv$').

    Returns:
        pd.DataFrame: returns DF with data if available or blank df if data unavailable
    """
    
    
    if not os.path.isdir(master_dir):
        print(f"‚ùå {master_dir} does not exist as a directory")
        return pd.DataFrame(columns=MINIMAL_MATCH_COLUMNS) 
    #
    files = glob.glob(f"{master_dir}/*.csv")
    
    master_files = []
    
    if not files:
        print(f"‚ùå No existing *.csv files found in MASTER Matches Directory: {master_dir} ") 
        return pd.DataFrame(columns=MINIMAL_MATCH_COLUMNS) 

   
    for file in files:
        filename = os.path.basename(file)     
        
        if re.match(master_regex, filename):
            master_files.append(file)

    if not master_files:
        print(f"‚ùå No existing MASTER files in format: {master_regex} in {master_dir}")
        return pd.DataFrame(columns=MINIMAL_MATCH_COLUMNS) 
  
    master_files.sort()     
    latest_master = master_files[-1]

    try: 
        latest_master_df = pd.read_csv(latest_master)
        print(f"‚úÖ {len(latest_master_df)} matches found in latest MASTER: {latest_master} ") 
        return latest_master_df
        
    except Exception as e:
        print(f"‚ùå Error reading lastest MASTER, {latest_master}: {e}")
        return pd.DataFrame(columns=MINIMAL_MATCH_COLUMNS) 

In [520]:
def get_latest_master_events(master_dir:str, master_regex) -> Tuple[pd.DataFrame,Optional[str]]:
    """
    Parses specified directory for events files in format yyyy_mm_dd. 
    Attempts to read latest file in this format. 

    Args:
        directory (str): The folder where the master files are stored (e.g., '../Data/Events/Intermediate').
        filename_pattern (str): The pattern to match (e.g., '*_events_intermediate.csv').

    Returns:
        Tuple[pd.DataFrame,Optional]: returns DF with data if available or blank df if data unavailable
    """
    if not os.path.isdir(master_dir):
        print (f"‚ùå{master_dir} does not exist as a directory")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None    
    
    # Get csv files in 
    files = glob.glob(f"{master_dir}/*.csv")
   

    master_files = []
    
    if not files:
        print(f"‚ùå No existing *.csv files found in MASTER Events Directory: {master_dir} ")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None 

    for file in files:
        filename = os.path.basename(file)    
       
        if re.match(master_regex,filename):
          master_files.append(file)

    if not master_files:
        print(f"‚ùå No existing MASTER files in format: {master_regex} in {master_dir}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None 
    master_files.sort()    
    latest_master = master_files[-1]

    try: 
        latest_master_df = pd.read_csv(latest_master)
        print(f"‚úÖ {len(latest_master_df)} events found in latest MASTER: {latest_master} ")
        return latest_master_df, latest_master
        
    except Exception as e:
        print (f"‚ùå Error reading lastest MASTER, {latest_master}: {e}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None 

In [521]:
async def get_player_details(session: aiohttp.ClientSession, 
                            player_id: int, 
                            min_pause: float, 
                            max_pause: float
                           ) -> Tuple[bool, Optional[Dict[str, Any]], Tuple[int, str], str]:
    """
    Takes input of player_id and scrapes wtt/getplayers endpoint to get player details.

    Args:
        session: An aiohttp.ClientSession object.
        player_id: The ID of the player to get details for.
        min_pause: The minimum pause time in seconds.
        max_pause: The maximum pause time in seconds.

    Returns:    
        status: A boolean indicating success (True) or failure (False).
        dict: The FULL RAW JSON response as a Python dictionary, or None if an error occurs.
        player_id: The original input player ID.
    
    """
    # pause before running for api politeness
    await asyncio.sleep(random.uniform(min_pause,max_pause))     


    url = 'https://wtt-ttu-connect-frontdoor-g6gwg6e2bgc6gdfm.a01.azurefd.net/Players/GetPlayers'
    
    # Generate a current timestamp in the required format to avoid cached results.
    current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'

    params = {
        'IttfId': player_id,
        'q': current_timestamp
    }

    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-GB,en;q=0.9,es;q=0.8',
        'apikey': '2bf8b222-532c-4c60-8ebe-eb6fdfebe84a',
        'cache-control': 'no-cache',
        'dnt': '1',
        'origin': 'https://www.worldtabletennis.com',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'referer': 'https://www.worldtabletennis.com/',
        'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'cross-site',
        'secapimkey': 'S_WTT_882jjh7basdj91834783mds8j2jsd81',
        'user-agent': 'Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36'
    }

    
    
    try:
        async with session.get(url,params=params,headers=headers,timeout = 20) as response:
            
            # Raise an error for bad status codes
            response.raise_for_status()       
            raw_json_response = await response.json()                

            if raw_json_response:          
                status_msg = "Success"
               
               
                return True, player_id, raw_json_response
               
            else:
                
                return False,player_id, None
    except Exception as e:        
        return False,  player_id, str(e)

In [522]:

async def main_scraper(player_ids: List[Tuple[int, str]]) -> Tuple[List[Dict[str, Any]], List[Tuple[int, str, str]]]:
    """
    Runs the simulation and collects results into success and failure lists.
    (Docstring remains the same)
    """
    
    successful_players: List[Dict[str, Any]] = []
    # This list will hold the (eventId, matchCode) tuples of failed tasks
    failed_players_log: List[Tuple[int, str, str]] = []
    total_tasks = len(player_ids)
    
    #start session for concurrent requests
    async with aiohttp.ClientSession() as session:
    
       
        coroutines = [get_player_details(session, task, MIN_PAUSE, MAX_PAUSE) for task in player_ids]

        print(f"--- üöÄ Launching {total_tasks} tasks concurrently... ---")
        start_time = time.time()
        processed_count = 0

        # process results as coroutines are completed.
        for future in asyncio.as_completed(coroutines):
            processed_count += 1
            
            # await result tuple - helper function returns and catches errors itself
            # including data validation using pydantic. 
            # here function returns the input tuple as an output for logging.
            status, player_id, result = await future

            # check status of function result
            
            if status:
                if (int(result.get("StatusCode")) == 200) & (result.get("Result") !=[]):
                # on successfully getting expected api response:
                    successful_players.append(result)                    
                         
                    
                else:
                    # add the input tuple AND error message to the FAILED log
                                    
                    failed_players_log.append((player_id, result))
                # print every 200 tasks to keep track of progress
            if processed_count % 500 == 0:
                elapsed = time.time() - start_time      
                log_line = f"--- üü† [{processed_count}/{total_tasks}] Time elapsed: {elapsed:.2f}s. üü† ---"
                print(log_line.ljust(80), end='\r')
                # print every 10 tasks to keep track of pro

        # Final print summary outside the loop.
        print(" " * 80, end='\r') 
        print("\n" + "=" * 50)        
        print(f"üü¢ Successfully fetched: {len(successful_players)} tasks üü¢")
        print(f"Failed to fetch (ready for retry): {len(failed_players_log)} tasks")
        
        return successful_players, failed_players_log


In [523]:

def get_latest_master_players(master_dir: str, master_regex: str) -> Tuple[pd.DataFrame, str]:
    """
    Parses the specified directory for player master files in the format yyyy_mm_dd_master_players.csv.
    Attempts to read the latest file in this format. 

    Args:
        master_dir (str): The folder where the master files are stored (e.g., '../Data/Master/Players').
        master_regex (str): The pattern to match (e.g., '^\\d{8}_master_players.csv$').

    Returns:
        Tuple[pd.DataFrame, str]: A tuple containing the DataFrame and the path to the file found, 
                                or a blank DataFrame and an empty string on failure.
    """
    
    # 1. Directory Existence Check
    if not os.path.isdir(master_dir):
        print(f"‚ùå {master_dir} does not exist as a directory.")
        return pd.DataFrame(columns=MINIMAL_PLAYER_COLUMNS), ""
    
    # 2. Find all CSV files
    files = glob.glob(f"{master_dir}/*.csv")
    master_players_files = []
    
    if not files:
        print(f"‚ùå No existing *.csv files found in MASTER Players Directory: {master_dir}") 
        return pd.DataFrame(columns=MINIMAL_PLAYER_COLUMNS), ""

    # 3. Filter files by regex pattern
    for file in files:
        filename = os.path.basename(file)    
        
        if re.match(master_regex, filename):
            master_players_files.append(file)

    if not master_players_files:
        print(f"‚ùå No existing MASTER files in format: {master_regex} in {master_dir}")
        return pd.DataFrame(columns=MINIMAL_PLAYER_COLUMNS), ""
    
    # 4. Find the latest file (sorting by name/date)
    master_players_files.sort()     
    latest_master_players_file_path = master_players_files[-1]

    # 5. Read the file
    try: 
        latest_master_players_df = pd.read_csv(latest_master_players_file_path)
        print(f"‚úÖ {len(latest_master_players_df)} players found in latest MASTER: {latest_master_players_file_path}") 
        return latest_master_players_df, latest_master_players_file_path
        
    except Exception as e:
        print(f"‚ùå Error reading latest MASTER player file, {latest_master_players_file_path}: {e}")
        return pd.DataFrame(columns=MINIMAL_PLAYER_COLUMNS), ""




In [524]:
if __name__ == "__main__": 

    master_matches_df = get_latest_master_matches(MASTER_MATCHES_DIR, MASTER_MATCHES_REGEX)
    if master_matches_df.empty:
        print(f"‚ùå Exiting: No existing Master Matches File available")
        sys.exit(1)

    # 2. Calculate ALL potential player IDs from the matches (Winners + Losers)
    # We use a set immediately to handle duplicates automatically
    all_match_player_ids = set(master_matches_df["winnerId"].dropna().unique()) | \
                           set(master_matches_df["loserId"].dropna().unique())

    # 3. Get the list of players we have ALREADY scraped
    latest_master_player_df, master_file_path = get_latest_master_players(MASTER_PLAYERS_DIR, MASTER_PLAYERS_REGEX)
    
    already_scraped_ids = set()
    
    if not latest_master_player_df.empty:
        # Assuming the column in your master player file is 'IttfId' (based on previous prompts)
        # If it is 'playerId', change "IttfId" to "playerId" below.
        if "IttfId" in latest_master_player_df.columns:
             already_scraped_ids = set(latest_master_player_df["IttfId"].unique())
        elif "playerId" in latest_master_player_df.columns:
             already_scraped_ids = set(latest_master_player_df["playerId"].unique())

    # 4. Determine which players are left to scrape (Set Subtraction)
    # Logic: (All Players in Matches) MINUS (Players already in Master File)
    ids_to_scrape_set = all_match_player_ids - already_scraped_ids
    
    # Convert back to list for processing
    player_ids = list(ids_to_scrape_set)

    print(f"\nüèì Total unique players in matches: {len(all_match_player_ids)}")
    print(f"üèì Already scraped: {len(already_scraped_ids)}")
    print(f"üèì New players to scrape: {len(player_ids)} üèì")
    
  
    # initialise variables for main player scraping
    all_successful_data = []    
    failures = [] 
    # set retries count to 0 
    retries = 0
    # Start total timer
    global_start_time = time.time() 


    ############################ START OF MAIN ASYNC LOOP  #####################################

    # while loop - keep trying if there are still matches to get or until max retries count is reached
    while (retries < MAX_RETRIES) and bool(player_ids):
        retries += 1 # Increment attempt counter (Attempt 1, 2, ...)
        
        # print intial scraping started
        if retries == 1:
            print(f"--- üöÄ Starting initial scrape for {len(player_ids)} matches... ---")
        else:
            # log if a retry has started
            print(f"\n--- üîÑ Starting Retry {retries-1}/{MAX_RETRIES-1} for {len(player_ids)} remaining matches... ---")
        
        # start scraping and log retry start time (useful if one retry goes awry)
        attempt_start_time = time.time()
        
        # run the async fetching 
        successes, failures = await (main_scraper(player_ids=player_ids))
        
        
        # process results as they finish
        
        # Add new successes to the successes list 
        all_successful_data.extend(successes)
        
        # after each retry - update new matches to scrape list for next loop

        player_ids = [player_id for player_id,e in failures]
        
        # log results of the retry attempt dependent on success / fails
        attempt_duration = time.time() - attempt_start_time
        if successes: 
            print(f"--- Attempt {retries} finished in {attempt_duration:.2f}s. Got {len(successes)} new results. ---")
        if failures: 
            print(f"--- {len(failures)} tasks failed and will be retried. ---")

    ################################### END OF MAIN ASYNC LOOP #####################################

    # Final Printing summary (OUTSIDE OF MAIN LOOP) 
    print("\n" + "=" * 50)
    
    # if no matches left to scrape, then ¬†log success :) 
    if not player_ids: 
        print("--- ‚úÖüèìüü¢ Scraping Complete. All tasks finished successfully. üü¢üèì‚úÖ---")
    else:
        # 'failures' holds the leftover failures ¬†from the LAST attempt
        print(f"--- ‚ö†Ô∏è Scraping Complete. {len(failures)} tasks permanently failed after {MAX_RETRIES} attempts. ---")
        
    print(f"Total successful players collected: {len(all_successful_data)}")
    total_duration = time.time() - global_start_time
    print(f"Total Wall Clock Time: {total_duration:.2f} seconds.")      


‚úÖ 23814 matches found in latest MASTER: ../Data/Master/Matches/20251114_master_matches.csv 
‚úÖ 2697 players found in latest MASTER: ../Data/Master/Players/20251115_master_players.csv

üèì Total unique players in matches: 2697
üèì Already scraped: 2697
üèì New players to scrape: 0 üèì

--- ‚úÖüèìüü¢ Scraping Complete. All tasks finished successfully. üü¢üèì‚úÖ---
Total successful players collected: 0
Total Wall Clock Time: 0.00 seconds.


  latest_master_df = pd.read_csv(latest_master)


In [525]:
failures_ids = [fail[0] for fail in failures]

winners_df = master_matches_df[["winnerId", "winnerName","winnerCountry"]].drop_duplicates()
winners_df = winners_df.rename(columns={"winnerId": "playerId", "winnerName": "playerName", "winnerCountry": "playerCountry"})
losers_df = master_matches_df[["loserId", "loserName","loserCountry"]].drop_duplicates()
losers_df = losers_df.rename(columns={"loserId": "playerId", "loserName": "playerName", "loserCountry": "playerCountry"})
players_lookup_df = pd.concat([winners_df, losers_df], axis=0).drop_duplicates()

In [526]:
failed_players_filter = players_lookup_df["playerId"].isin(failures_ids)
failed_players_df = players_lookup_df[failed_players_filter]
failed_players_df.rename(columns={"playerId": "IttfId"}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_players_df.rename(columns={"playerId": "IttfId"}, inplace=True)


In [527]:
players_details_list = [result.get("Result")[0] for result in all_successful_data]
players_details_df = pd.DataFrame(players_details_list)
players_details_df

# ids_to_enrich = players_details_df["IttfId"]

In [528]:
def get_player_style_simple(player_id):
    """
    Scrapes a player's profile page and searches the raw HTML for style and grip keywords.
    Returns a dictionary of {'style': style, 'grip': grip}.
    """
    url = 'https://results.ittf.link/index.php/player-profile/list/60'
    params = {'vw_profiles___player_id_raw': player_id}
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
    }

    sleep_duration = random.uniform(0.01, 0.03)
    time.sleep(sleep_duration)
    
    # Initialize variables
    style = pd.NA
    grip = pd.NA
    hand = pd.NA

    try:
        # Get the entire HTML content as a single string
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        html_text = response.text
        

       
       
        # checking 'Defence' first ensures correct categorization.
        if 'Defence' in html_text:
            style = 'Defender'
        elif 'Attack' in html_text:
            style = 'Attacker'
        # Else: style remains 'Style not found'
        
       
        if 'penhold' in html_text.lower():
            grip = 'Penhold'
        elif 'shakehand' in html_text.lower():
            grip = 'Shakehand'


        if "Left-Hand" in html_text:
            hand = "Left"
        elif "Right-Hand" in html_text:
            hand = "Right"      
      
        
        # Return both results as a dictionary
        return {'IttfId': player_id, 'hand': hand, 'style': style, 'grip': grip}
        
    except Exception as e:
        print(f"An error occurred for player {player_id}: {e}")
        # Return a dictionary indicating failure for both fields
        return {'IttfId': player_id,'hand': pd.NA, 'style': pd.NA, 'grip': pd.NA, "error": str(e)}

In [529]:
import pandas as pd
import country_converter as coco
import numpy as np

# --- GLOBAL CONSTANTS (Defined by the user) ---
FLAG_CDN_BASE_URL = "https://flagcdn.com/"
FLAG_CDN_SUFFIX = ".svg"
cc = coco.CountryConverter()

# --- CUSTOM OVERRIDE MAPS ---
# Non-sovereign entities or known sports codes that must be handled manually.
CUSTOM_ISO2_OVERRIDES = {
    'WALES': 'gb-wls',           
    'SCOTLAND': 'gb-sct', 
    'NORTHERN IRELAND': 'gb-nir',
    'ENGLAND': 'gb-eng',        
    'TAHITI': 'PYF', 
    'CENTRAL AFRICA': 'CF',  
    'REFUGEE': 'REF',
    'AIN': 'AIN'                
}

# Special URL Overrides (for restricted or non-flag sources)
CUSTOM_FLAG_URLS = {
    'REF': 'https://upload.wikimedia.org/wikipedia/commons/a/a7/Olympic_flag.svg',
    'REFUGEE': 'https://upload.wikimedia.org/wikipedia/commons/a/a7/Olympic_flag.svg',
    'RU': 'https://upload.wikimedia.org/wikipedia/commons/6/62/F1_white_flag.svg', # Russia
    'BY': 'https://upload.wikimedia.org/wikipedia/commons/6/62/F1_white_flag.svg', # Belarus
    'AIN': 'https://upload.wikimedia.org/wikipedia/commons/6/62/F1_white_flag.svg' # AIN
}

# --- 1. ISO2 CODE GENERATOR (Modified from user's input) ---

def convert_org_name_to_iso2(org_name):
    """
    Converts a full organizational name to its 2-letter ISO2 code or a custom regional code.
    Prioritizes the manual override map.
    """
    if pd.isna(org_name) or not str(org_name).strip():
        return pd.NA
        
    upper_name = str(org_name).strip().upper()

    # 1. MANUAL OVERRIDE CHECK
    if upper_name in CUSTOM_ISO3_OVERRIDES:
        return CUSTOM_ISO3_OVERRIDES[upper_name]

    # 2. GENERAL CONVERSION (Regex/Fuzzy Match)
    try:
        # Use src='regex' for fuzzy matching against the full name
        iso2_code = cc.convert(
            names=upper_name, 
            src='regex', 
            to='ISO2', # Target format is 2-letter ISO2
            not_found=None 
        )
        # Return the ISO2 code if found, otherwise return NA
        return iso2_code if iso2_code else pd.NA
    except Exception:
        return pd.NA

# --- 2. FINAL URL COMPOSER (The main function to call) ---

def get_final_flag_url(org_name):
    """
    Master function: Converts organization name to ISO2/Custom code and returns the final URL.
    """
    # 1. Get the clean ISO2/Custom Code (e.g., 'GB-ENG' or 'JP' or 'REF')
    clean_code = convert_org_name_to_iso2(org_name)
    
    if pd.isna(clean_code):
        return pd.NA
    
    code = str(clean_code).upper()

    # 2. Check for Special URL Overrides (e.g., Russia/Refugee)
    if code in CUSTOM_FLAG_URLS:
        return CUSTOM_FLAG_URLS[code]
    
    # 3. Standard CDN Fallback (e.g., JP -> https://flagcdn.com/w2560/jp.svg)
    return f"{FLAG_CDN_BASE_URL}{code.lower()}{FLAG_CDN_SUFFIX}"

# --- Example Application ---
# master_players_df['finalFlagUrl'] = master_players_df['OrganizationName'].apply(get_final_flag_url)

In [None]:


print("--- üü¢ Enriching new player data... ---")
styles_dicts: List[Dict[str, Any]] = []

# Create a DataFrame from the new players you just scraped
# Cell [05678ba7] should create players_details_df from all_successful_data
players_details_list = [result.get("Result")[0] for result in all_successful_data if result.get("Result")]
players_details_df = pd.DataFrame(players_details_list)

if not players_details_df.empty:
    id_list = players_details_df["IttfId"].tolist()
    total_players = len(id_list)
    
    print(f"--- üü¢ Commencing Player Detail Scraper for {total_players} new players (10 Threads) üü¢ ---")
    MAX_WORKERS = 10 
    processed_count = 0
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_id = {executor.submit(get_player_style_simple, player_id): player_id for player_id in id_list}
        
        for future in as_completed(future_to_id):
            processed_count += 1
            try:
                player_result = future.result() 
                styles_dicts.append(player_result)
                progress_message = f"‚úÖ Found Data for ID:{player_result['IttfId']} {processed_count}/{total_players}. Time: {time.time() - start_time:.2f}s..."
                print(progress_message.ljust(80), end='\r', flush=True)
            except Exception as exc:
                player_id_failed = future_to_id[future]
                print(f"\n‚ùå Task failed for ID {player_id_failed}: {exc}")

    
    player_styles_df = pd.DataFrame(styles_dicts)
    total_duration = time.time() - start_time
    print(" " * 80, end='\r') 
    print(f"‚úÖ Finished! \nSuccessfully fetched styles for {len(player_styles_df)}/{total_players} players in {total_duration:.2f} seconds.")

    players_details_df.dropna(axis=0, how='all', inplace=True)
    players_details_df.dropna(axis=1, how='all', inplace=True)
    players_details_df["HeadShot"] = players_details_df["HeadShot"].fillna(players_details_df["HeadshotR"])
    players_details_df["HeadShot"] = players_details_df["HeadShot"].fillna(players_details_df["HeadshotL"])
    players_details_df = players_details_df.drop(columns=["HeadshotR", "HeadshotL"])
    
    enriched_new_players_df = pd.merge(players_details_df, player_styles_df, how='left', on='IttfId')
    enriched_new_players_df["Grip"] = enriched_new_players_df["Grip"].fillna(enriched_new_players_df["grip"])
    enriched_new_players_df["Handedness"] = enriched_new_players_df["Handedness"].fillna(enriched_new_players_df["hand"])
    enriched_new_players_df = enriched_new_players_df.drop(columns=["grip", "hand"])
    enriched_new_players_df = enriched_new_players_df.rename(columns={"Handedness":"Hand", "style":"Style", "IttfId":"playerId"})
    enriched_new_players_df["Hand"] = enriched_new_players_df["Hand"].str.strip(" Hand").str.strip(" h")
else:
    print("--- üü° No new players were successfully scraped to enrich. ---")
    enriched_new_players_df = pd.DataFrame() # Create empty DF


if "IttfId" in latest_master_player_df.columns:
    latest_master_player_df.rename(columns={"IttfId": "playerId"}, inplace=True)
    
if "playerId" in enriched_new_players_df.columns:
    # Ensure new player 'playerId' is int, not float, to match
    enriched_new_players_df['playerId'] = enriched_new_players_df['playerId'].astype(int)

print(f"\n--- üíæ Combining Data ---")
print(f"Old master file has: {len(latest_master_player_df)} players")
print(f"New players scraped:   {len(enriched_new_players_df)} players")

# Combine the old master list with the newly enriched players
master_players_df = pd.concat([latest_master_player_df, enriched_new_players_df], ignore_index=True)



# Drop duplicates, keeping the NEWEST entry (from 'enriched_new_players_df')
master_players_df.drop_duplicates(subset=['playerId'], keep='last', inplace=True)

new_player_column_order = [
    'playerId', 'PlayerName', 'PlayerFamilyName', 'PlayerGivenName', 'PlayerFamilyNameFirst', 
    'Gender', 'Age', 'DOB', 'CountryName', 'CountryCode', 'NationalityName', 
    'NationalityCode', 'OrganizationName', 'OrganizationCode', 
    'Style', 'Grip', 'Hand', 'HeadShot'
]

# Filter for any columns that might be missing (e.g., 'Age' if not in old data)
final_columns = [col for col in new_player_column_order if col in master_players_df.columns]
master_players_df = master_players_df[final_columns]


    
master_players_df = master_players_df.sort_values(by="playerId")


master_players_df['flagUrl'] = master_players_df['OrganizationName'].apply(get_final_flag_url)

men_filter = master_players_df["Gender"] == "M"
women_filter = master_players_df["Gender"] == "F"

def_headshot_filter = master_players_df["HeadShot"].str.contains("default", case=False)




master_players_df.loc[men_filter, "HeadShot"] = master_players_df.loc[men_filter, "HeadShot"].fillna("./imgs/men_headshot_default.svg")
master_players_df.loc[women_filter, "HeadShot"] = master_players_df.loc[women_filter, "HeadShot"].fillna("./imgs/women_headshot_default.svg")






# Save and log 
master_players_df.to_csv(MASTER_PLAYERS_OUTPUT_PATH, index=False)
print(f"\n--- ‚úÖ Success! ---")
print(f"Updated master file saved with {len(master_players_df)} total players to:")
print(f"{MASTER_PLAYERS_OUTPUT_PATH}") 

--- üü¢ Enriching new player data... ---
--- üü° No new players were successfully scraped to enrich. ---

--- üíæ Combining Data ---
Old master file has: 2697 players
New players scraped:   0 players


ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [531]:
master_players_df

Unnamed: 0,playerId,PlayerName,PlayerFamilyName,PlayerGivenName,PlayerFamilyNameFirst,Gender,Age,DOB,CountryName,CountryCode,NationalityName,NationalityCode,OrganizationName,OrganizationCode,Style,Grip,Hand,HeadShot,flagUrl
0,90051,Francisco LOPEZ,LOPEZ,Francisco,LOPEZ Francisco,M,63.0,09/19/1962 00:00:00,Venezuela,VEN,Venezuela,VEN,VENEZUELA,VEN,Attacker,Shakehand,Right,./imgs/men_headshot_default.svg,https://flagcdn.com/ve.svg
1,100001,Amalraj ANTHONY,ANTHONY,Amalraj,ANTHONY Amalraj,M,39.0,01/24/1986 00:00:00,India,IND,India,IND,INDIA,IND,Attacker,Shakehand,Right,https://wttsimfiles.blob.core.windows.net/wtt-...,https://flagcdn.com/in.svg
2,100032,Farah ABDELAZIZ,ABDELAZIZ,Farah,ABDELAZIZ Farah,F,33.0,09/01/1992 00:00:00,Egypt,EGY,Egypt,EGY,EGYPT,EGY,Attacker,Shakehand,Right,,https://flagcdn.com/eg.svg
3,100079,Bode ABIODUN,ABIODUN,Bode,ABIODUN Bode,M,45.0,09/10/1980 00:00:00,Nigeria,NGR,Nigeria,NGR,NIGERIA,NGR,Attacker,Shakehand,Right,https://wttsimfiles.blob.core.windows.net/wtt-...,https://flagcdn.com/ng.svg
4,100089,Luke ABRAHAMS,ABRAHAMS,Luke,ABRAHAMS Luke,M,37.0,07/04/1988 00:00:00,South Africa,RSA,South Africa,RSA,SOUTH AFRICA,RSA,Attacker,Shakehand,Right,./imgs/men_headshot_default.svg,https://flagcdn.com/za.svg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2692,222286,Ali Abbas SYED,SYED,Ali Abbas,SYED Ali Abbas,M,23.0,07/07/2002 00:00:00,India,IND,India,IND,INDIA,IND,Attacker,Shakehand,Right,./imgs/men_headshot_default.svg,https://flagcdn.com/in.svg
2693,222365,Doniyor MAKHKAMOV,MAKHKAMOV,Doniyor,MAKHKAMOV Doniyor,M,25.0,09/04/2000 00:00:00,Uzbekistan,UZB,Uzbekistan,UZB,UZBEKISTAN,UZB,Attacker,Shakehand,Right,https://wttsimfiles.blob.core.windows.net/wtt-...,https://flagcdn.com/uz.svg
2694,222391,Daria SPASOVA,SPASOVA,Daria,SPASOVA Daria,F,17.0,11/16/2007 00:00:00,Bulgaria,BUL,Bulgaria,BUL,BULGARIA,BUL,Attacker,Shakehand,Right,,https://flagcdn.com/bg.svg
2695,222625,Natalia GAJEWSKA,GAJEWSKA,Natalia,GAJEWSKA Natalia,F,22.0,05/18/2003 00:00:00,Poland,POL,Poland,POL,POLAND,POL,Attacker,Shakehand,Right,,https://flagcdn.com/pl.svg
