In [1]:
import os 
import json
import glob 
import pandas as pd
from datetime import datetime, timezone, date
import requests
from IPython.display import clear_output
import random 
import time 
import numpy as np

In [2]:
# Recursively collect every match-detail JSON file below the raw-data folder.
# The pattern ends in "*.json" (with the dot) so that only files carrying the
# .json extension are matched, not every name that merely ends in "json".
match_files_path = "Data/Raw/Match_details/**/*.json"
# Sorted so the processing order is the same on every run and platform.
all_match_files = sorted(glob.glob(match_files_path, recursive=True))
total_files = len(all_match_files)
print(f"{total_files} matches found")

if total_files == 0:
    # Nothing matched: report the pattern and working directory so the cause
    # (e.g. a relative path resolved from an unexpected folder) is visible
    # instead of silently producing an empty player list downstream.
    print(f"WARNING: no files matched '{match_files_path}' (cwd: {os.getcwd()})")

# Files that the following cells will process (currently all of them).
target_files = all_match_files

0 matches found


In [3]:
# Unique competitor IDs found across all match files.
player_ids = set()

for file in target_files:

    try:
        with open(file, 'r', encoding="utf-8") as f:
            match_data = json.load(f)

        # NOTE: "competitiors" / "competitiorId" are misspelled in the source
        # data itself, so the keys below must keep that exact spelling.
        competitors = match_data.get("competitiors")

        if competitors and isinstance(competitors, list):
            # Only the first two entries are considered (as before), but each
            # one is handled independently: a list with a single entry, or an
            # entry that is not a dict, no longer discards the other ID.
            for competitor in competitors[:2]:
                if not isinstance(competitor, dict):
                    continue
                player_id = competitor.get("competitiorId")
                if player_id:
                    player_ids.add(player_id)

    except json.JSONDecodeError:
        print(f"ERROR: Could not decode JSON for file:{file}")
    except Exception as e:
        # Best-effort: keep going with the remaining files, but show what
        # actually went wrong instead of hiding the exception.
        print(f"ERROR: Could not process file {file}: {e}")

In [4]:
# Materialise the set of IDs as a list so it can be ordered and indexed.
player_ids_list = [*player_ids]

In [5]:
# Put the IDs in ascending order (in place) and report how many there are.
player_ids_list[:] = sorted(player_ids_list)
print(len(player_ids_list))

0


In [14]:
def get_player_details(player_id, timeout=30):
    """
    Fetches detailed information for a single player using their ITTF ID.

    Args:
        player_id (int or str): The ITTF ID of the player.
        timeout (float or tuple, optional): Seconds to wait for the server
            before giving up, passed straight to ``requests.get``. Without a
            timeout a stalled connection would block the calling loop forever.

    Returns:
        dict: The JSON response data as a Python dictionary, or None if an
        error occurs (HTTP error, network error/timeout, or a response body
        that is not valid JSON). Errors are printed, not raised.
    """
    url = 'https://wtt-ttu-connect-frontdoor-g6gwg6e2bgc6gdfm.a01.azurefd.net/Players/GetPlayers'

    # Generate a current timestamp (millisecond precision, UTC, 'Z' suffix)
    # and send it as the 'q' parameter to avoid cached results.
    current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'

    params = {
        'IttfId': player_id,
        'q': current_timestamp
    }

    # NOTE(review): 'apikey' and 'secapimkey' are hard-coded credentials.
    # Consider loading them from the environment before sharing this notebook.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-GB,en;q=0.9,es;q=0.8',
        'apikey': '2bf8b222-532c-4c60-8ebe-eb6fdfebe84a',
        'cache-control': 'no-cache',
        'dnt': '1',
        'origin': 'https://www.worldtabletennis.com',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'referer': 'https://www.worldtabletennis.com/',
        'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'cross-site',
        'secapimkey': 'S_WTT_882jjh7basdj91834783mds8j2jsd81',
        'user-agent': 'Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error occurred for player {player_id}: {err}")
    except requests.exceptions.RequestException as err:
        # Covers connection problems and timeouts.
        print(f"An error occurred for player {player_id}: {err}")
    except ValueError as err:
        # Body was not valid JSON (older versions of requests raise a plain
        # ValueError subclass here rather than a RequestException).
        print(f"Invalid JSON received for player {player_id}: {err}")

    return None




In [None]:
# Fetch the detail record of every collected player ID and gather the
# results into `players_df`.
players_list = []
failed_ids = []  # IDs for which no usable record was returned
ids = player_ids_list
total_players = len(ids)
banner = f"--- 🟢 Commencing Player Detail Scraper for {total_players} players 🟢 ---"
print(banner)

for number, player_id in enumerate(ids):

    # Short random pause between consecutive requests.
    sleep_duration = random.uniform(0.02, 0.1)

    player_data_response = get_player_details(player_id)

    # A record counts as found only when the response is a dict whose
    # "Result" entry is a non-empty list; the first element is kept.
    found = False
    if isinstance(player_data_response, dict):
        result_list = player_data_response.get("Result")
        if result_list and isinstance(result_list, list):
            player_data = result_list[0]
            players_list.append(player_data)
            found = True

    if not found:
        failed_ids.append(player_id)

    # Redraw the progress display; the status reflects the actual outcome.
    clear_output(wait=True)
    print(banner)
    status = "✅ Found data" if found else "❌ No data"
    print(f"{status} for ID: {player_id} ({number + 1}/{total_players}). Pausing for {sleep_duration:.2f}s...")

    time.sleep(sleep_duration)


clear_output(wait=True)
print(f"✅ Finished! \nSuccessfully fetched details for {len(players_list)}/{total_players} players.")
players_df = pd.DataFrame(players_list)

✅ Finished! 
Successfully fetched details for 2087/2439 players.


In [19]:
# Columns removed from the player table when present; names that do not
# exist in the frame are skipped (errors="ignore").
columns_to_drop = [
    'PlayerGivenName',
    'PlayerFamilyName',
    'PlayerFamilyNameFirst',
    'CountryName',
    'NationalityCode',
    'NationalityName',
    'OrganizationCode',
    'OrganizationName',
    'HeadshotR',
    'HeadshotL',
]

# First discard every column that contains no values at all ...
players_df = players_df.dropna(axis="columns", how="all")

# ... then drop the listed columns into a separate, cleaned frame.
cleaned_players_df = players_df.drop(columns=columns_to_drop, errors="ignore")

display(cleaned_players_df.head())

Unnamed: 0,IttfId,PlayerName,CountryCode,Gender,Age,DOB,Handedness,Grip,HeadShot
0,100032,Farah ABDELAZIZ,EGY,F,33,09/01/1992 00:00:00,Right Hand,Shakehand,https://wttsimfiles.blob.core.windows.net/wtt-...
1,100079,Bode ABIODUN,NGR,M,45,09/10/1980 00:00:00,Right Hand,Shakehand,https://wttsimfiles.blob.core.windows.net/wtt-...
2,100089,Luke ABRAHAMS,RSA,M,37,07/04/1988 00:00:00,,,
3,100189,Ahmed ALAWLAQI,QAT,M,34,01/23/1991 00:00:00,,,https://wttsimfiles.blob.core.windows.net/wtt-...
4,100196,Moosa AHMED,MDV,M,38,07/10/1987 00:00:00,,,


In [15]:
def get_player_style(player_id, timeout=30):
    """
    Scrapes a player's profile page and searches the raw HTML for style keywords.
    This is very rudimentary but usually works! Data is taken from the ITTF website.

    Args:
        player_id (int or str): Player ID sent as the
            'vw_profiles___player_id_raw' query parameter.
        timeout (float or tuple, optional): Seconds to wait for the server
            before giving up, passed straight to ``requests.get``. Without a
            timeout a stalled connection would block the calling loop forever.

    Returns:
        str or None: 'Attack' if that word occurs anywhere in the page,
        otherwise 'Defence' if that word occurs, otherwise the sentinel
        'Style not found'. None if the request fails (the error is printed).
    """
    url = 'https://results.ittf.link/index.php/player-profile/list/60'
    params = {'vw_profiles___player_id_raw': player_id}
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
    }

    try:
        # Get the entire HTML content as a single string
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        html_text = response.text

        # Plain substring check over the whole page; 'Attack' takes
        # precedence when both keywords are present.
        if 'Attack' in html_text:
            return 'Attack'
        elif 'Defence' in html_text:
            return 'Defence'
        else:
            return 'Style not found'

    except Exception as e:
        # Deliberately broad: one bad page must not abort a long scraping run.
        print(f"An error occurred for player {player_id}: {e}")
        return None

In [16]:
# Try the style scraper on a single player ID before running it for everyone.
# The variable is named `sample_player_id` rather than `id` so the built-in
# id() function is not shadowed for the rest of the kernel session.
sample_player_id = 113931
style = get_player_style(sample_player_id)
print(style)

Attack


In [None]:
# Look up the playing style of every player in the cleaned table and collect
# one {"IttfId", "style"} record per player in `style_info`.
style_info = []
styles_found = 0  # players for which 'Attack' or 'Defence' was actually detected

id_list = cleaned_players_df["IttfId"]
total_players = len(id_list)

banner = f"--- 🟢 Commencing Player Style Scraper for {total_players} players 🟢 ---"
print(banner)

for number, player_id in enumerate(id_list):

    # Short random pause between consecutive requests.
    sleep_duration = random.uniform(0.01, 0.03)

    style = get_player_style(player_id)
    player_style = {
        "IttfId" : player_id,
        "style" : style
    }
    # Every player gets a record, including failures (None) and the
    # 'Style not found' sentinel, so the later merge keeps all rows.
    style_info.append(player_style)

    found = style is not None and style != 'Style not found'
    if found:
        styles_found += 1

    # Redraw the progress display; the status reflects the actual outcome.
    clear_output(wait=True)
    print(banner)
    status = "✅ Found style" if found else "❌ No style"
    print(f"{status} for ID: {player_id} ({number + 1}/{total_players}). Pausing for {sleep_duration:.2f}s...")
    time.sleep(sleep_duration)


clear_output(wait=True)
# Report the number of styles actually detected, not the number of IDs tried.
print(f"✅ Finished! \nSuccessfully fetched styles for {styles_found}/{total_players} players.")


✅ Finished! 
Successfully fetched styles for 2087/2087 players.


In [23]:
# Turn the scraped style records into a frame and attach them to the player
# table. The join key is stated explicitly: without `on`, pandas joins on
# every column name the two frames happen to share, which would silently
# change the result if the player data ever gained a column named "style".
styles_df = pd.DataFrame(style_info)
final_players_df = pd.merge(cleaned_players_df, styles_df, on="IttfId")

In [24]:
# Normalise missing values to None: first the 'Style not found' sentinel,
# then every NaN.
#
# The dict form of replace() is used on purpose. With a scalar `to_replace`
# and `value=None`, older pandas versions treat the value as "not given" and
# fall back to pad-filling from the previous row instead of writing None.
# The result is assigned to the name instead of using inplace=True, so
# re-running the cell is safe.
final_players_df = (
    final_players_df
    .replace({"Style not found": None})
    .replace({np.nan: None})
)




In [25]:
# Write the final player table to a CSV whose name starts with today's date
# (YYYYMMDD), creating the output folder first if it does not exist yet.
output_dir = "Data/Processed/Players"
os.makedirs(output_dir, exist_ok=True)

now = f"{date.today():%Y%m%d}"
file_name = now + "_players.csv"
file_path = "/".join((output_dir, file_name))

final_players_df.to_csv(file_path, index=False)

In [26]:
# Show the last row of the final table as a quick visual check.
final_players_df.iloc[-1, :]

IttfId                       90051
PlayerName         Francisco LOPEZ
CountryCode                    VEN
Gender                           M
Age                             63
DOB            09/19/1962 00:00:00
Handedness                    None
Grip                          None
HeadShot                      None
Style                       Attack
Name: 2086, dtype: object