In [172]:
import os 
import json
import glob 
import pandas as pd
from datetime import datetime, timezone, date
import requests
from IPython.display import clear_output
import random 
import time 
import numpy as np

In [173]:
def determine_winner(match_json):
    """
    Parses the match JSON to determine the winner ID, game scores, and a specific match status.
    Handles both normal scores ("4-2") and special scores ("3-WO", "0-RET", etc.).
    Returns a tuple: (winner_id, player_a_score, player_b_score, match_status)
    """
    player_a_score, player_b_score = (None, None)
    winner_id = None
    match_status = "Completed"

    try:
        score_str = match_json.get('overallScores')
        competitors = match_json.get('competitiors')

        if not (score_str and '-' in score_str and competitors and len(competitors) >= 2):
            return None, None, None, "Unknown"

        player_a_id = competitors[0].get('competitiorId')
        player_b_id = competitors[1].get('competitiorId')

        try:
            player_a_score, player_b_score = map(int, score_str.split('-'))
            if player_a_score > player_b_score:
                winner_id = player_a_id
            else:
                winner_id = player_b_id
        except ValueError:
            # --- THIS IS THE CORRECTED LOGIC ---
            match_status = "Special" # Default if we can't find a code
            # Find the part of the score string that is text (e.g., 'WO' from '0-3 WO')
            for part in score_str.replace('-', ' ').split():
                if part.isalpha():
                    match_status = part
                    break
            
            # Determine the winner based on which side of the score contains the letters
            score_parts = score_str.split('-')
            if any(c.isalpha() for c in score_parts[0]):
                winner_id = player_b_id
            elif any(c.isalpha() for c in score_parts[1]):
                winner_id = player_a_id
        
        return winner_id, player_a_score, player_b_score, match_status
    except:
        return None, None, None, "Error"

        
def calculate_total_points(scores_str):
    """
    Calculates the total points scored by a player from their scores string (e.g., "11,8,7").
    """
    if not scores_str or not isinstance(scores_str, str):
        return None
    try:
        return sum(int(p) for p in scores_str.split(',') if p.isdigit())
    except:
        return None

In [174]:
match_files_path = "Data/Raw/Match_details/**/*.json"
all_match_files = glob.glob(match_files_path, recursive=True)


all_match_files = all_match_files

total_files = len(all_match_files)
print(f"{total_files} match files found")

events_dir = "Data/Processed/Events"
pattern = os.path.join(events_dir, "[0-9]*_events.csv")
dated_files = glob.glob(pattern)

if date_stamped_files:
    latest_events_file = max(date_stamped_files)
    print(f"Found latest events file: {latest_events_file}")
else:
    print("No date-stamped event files found.")
events_df = pd.read_csv(latest_events_file)
event_name_map = pd.Series(events_df.EventName.values, index=events_df.EventId).to_dict()



21614 match files found
Found latest events file: Data/Processed/Events/20251007_events.csv


In [175]:
UPDATE_INTERVAL = 50

print(f"--- 🟢 Commencing processing for {total_files} match files 🟢 ---")

all_matches_list = []
failed_match_list = []

for number, file_path in enumerate(all_match_files):
    if number % UPDATE_INTERVAL == 0:
        clear_output(wait=True)
        print(f"--- 🟢 Commencing processing for {total_files} match files 🟢 ---")
        print(f"Processing file {number + 1}/{total_files}: {file_path}")
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            match_data = json.load(f)

        competitors = match_data.get('competitiors')
        if not (competitors and isinstance(competitors, list) and len(competitors) >= 2):
            continue

       
        winner_id, player_a_score, player_b_score, match_status = determine_winner(match_data)
        
        player_a_id = competitors[0].get('players', [{}])[0].get('playerId')
        player_b_id = competitors[1].get('players', [{}])[0].get('playerId')
        player_a_name = competitors[0].get('competitiorName')
        player_b_name = competitors[1].get('competitiorName')

        winner_name = None
        if winner_id:
            if winner_id == player_a_id:
                winner_name = player_a_name
            else:
                winner_name = player_b_name
        
        event_id = match_data.get('eventId')

        match_dict = {
            'MatchID': match_data.get('documentCode'),
            'EventID': event_id,
            'EventName': event_name_map.get(int(event_id)) if event_id else None,
            'MatchDate': match_data.get('matchDateTime', {}).get('startDateLocal'),
            'BestOf': match_data.get('matchConfig', {}).get('bestOfXGames'),
            'MatchStatus': match_status,
            'MatchDuration': match_data.get('matchDateTime', {}).get('duration'),
            'PlayerA_ID': player_a_id,
            'PlayerA_Name': player_a_name,
            'PlayerA_Country': competitors[0].get('competitiorOrg'),
            'PlayerB_ID': player_b_id,
            'PlayerB_Name': player_b_name,
            'PlayerB_Country': competitors[1].get('competitiorOrg'),
            'PlayerA_GamesWon': player_a_score,
            'PlayerB_GamesWon': player_b_score,
            'PlayerA_TotalPoints': calculate_total_points(competitors[0].get('scores')),
            'PlayerB_TotalPoints': calculate_total_points(competitors[1].get('scores')),
            'WinnerID': winner_id,
            'WinnerName': winner_name,
            'GameScores': match_data.get('gameScores'),
            'RawScoreString': match_data.get('overallScores')
        }
        
        all_matches_list.append(match_dict)

    except Exception as e:
        fail_dict = {"file": file_path, "error": str(e)}
        failed_match_list.append(fail_dict)

clear_output(wait=True)
print(f"✅ Finished! Processed {total_files} files.")

if all_matches_list:
    today_str = datetime.now().strftime('%Y%m%d')
   
    output_path_csv = f"Data/Processed/{today_str}_master_match_list.csv" 

    master_df = pd.DataFrame(all_matches_list)
   
    master_df['MatchDate'] = pd.to_datetime(master_df['MatchDate'])    
    master_df['MatchDate'] = master_df['MatchDate'].dt.strftime('%d/%m/%Y')

    master_df.to_csv(output_path_csv, index=False)
    
    print(f"Saved master_df '{output_path_csv}'")
    
if failed_match_list:
    print(f"\nEncountered {len(failed_match_list)} errors during processing.")
    pd.DataFrame(failed_match_list).to_csv('failed_files.csv', index=False)

✅ Finished! Processed 21614 files.
Saved master_df 'Data/Processed/20251009_master_match_list.csv'

Encountered 2 errors during processing.


In [177]:
failed_match_list

[{'file': 'Data/Raw/Match_details/2345_match_details/2345_TTEMSINGLES-----------R128001100----------.json',
  'error': "'NoneType' object has no attribute 'get'"},
 {'file': 'Data/Raw/Match_details/2603_match_details/2603_TTEMSINGLES-----------R32-000500----------.json',
  'error': "'NoneType' object has no attribute 'get'"}]

In [182]:
file_path = failed_match_list[0].get("file")
with open(file_path, 'r', encoding='utf-8') as f:
            match_data = json.load(f)

In [184]:
competitors = match_data.get('competitiors')

In [186]:
winner_id, player_a_score, player_b_score, match_status = determine_winner(match_data)

In [190]:
match_status

'DNS'