In [None]:
import requests
import json
import pandas as pd
import os
from datetime import datetime
import re
import random 
import time 
import glob
import asyncio
from typing import Tuple, Optional, List, Dict, Any, Set
from pydantic import BaseModel, ValidationError, Field

In [57]:
# --- Master events files -------------------------------------------------
MASTER_EVENTS_DIR = "../Data/Master/Events"
MASTER_EVENTS_SUFFIX = "_master_events.csv"
# A master file name is: 8 digits, a literal "_", then MASTER_EVENTS_SUFFIX.
# The suffix itself starts with "_", so real names carry a double underscore,
# e.g. 20251030__master_events.csv.
MASTER_EVENTS_REGEX = "".join(
    (
        r"^\d{8}",
        re.escape("_"),
        re.escape(MASTER_EVENTS_SUFFIX),
        "$",
    )
)

# --- Input: processed singles match payload CSVs -------------------------
SINGLES_PAYLOADS_DIR = "../Data/Processed/Singles_match_payloads"

# --- Output: raw match detail JSON files (folder is created on first run) -
RAW_MATCH_DETAILS_DIR = "../Data/Raw/Match_details"
os.makedirs(RAW_MATCH_DETAILS_DIR, exist_ok=True)

# Upper bound on scrape attempts made by the retry loop.
MAX_RETRIES = 2

In [58]:
def get_latest_master_events(master_dir:str, master_regex) -> Tuple[pd.DataFrame,Optional[str]]:
    """
    Load the most recent master events CSV found in ``master_dir``.

    Every ``*.csv`` file in the directory whose basename matches ``master_regex``
    is a candidate. Candidates are ordered by basename and the last one is read,
    so digit-prefixed names such as ``20251030__master_events.csv`` resolve to
    the highest prefix.

    Args:
        master_dir (str): Folder that holds the master events files.
        master_regex: Pattern (string or compiled regex) applied with ``re.match``
            to each CSV basename.

    Returns:
        Tuple[pd.DataFrame, Optional[str]]: ``(events, path)`` for the file that
        was read. When the directory is missing, holds no matching file, or the
        file cannot be read, an empty DataFrame and ``None`` are returned instead.
    """
    def _empty_result() -> Tuple[pd.DataFrame, None]:
        # MINIMAL_EVENT_COLUMNS is expected as a notebook-level constant, but it may
        # not be defined in the running session. Looking it up lazily keeps every
        # failure path returning an empty frame instead of raising NameError.
        return pd.DataFrame(columns=globals().get("MINIMAL_EVENT_COLUMNS")), None

    if not os.path.isdir(master_dir):
        print (f"‚ùå{master_dir} does not exist as a directory")
        return _empty_result()

    files = glob.glob(f"{master_dir}/*.csv")
    if not files:
        print(f"‚ùå No existing *.csv files found in MASTER Events Directory: {master_dir} ")
        return _empty_result()

    # Keep only the CSVs whose file name (not full path) matches the master pattern.
    master_files = [
        file for file in files if re.match(master_regex, os.path.basename(file))
    ]
    if not master_files:
        print(f"‚ùå No existing MASTER files in format: {master_regex} in {master_dir}")
        return _empty_result()

    # Order by file name so the comparison is driven by the digit prefix only.
    master_files.sort(key=os.path.basename)
    latest_master = master_files[-1]

    try:
        latest_master_df = pd.read_csv(latest_master)
    except Exception as e:
        # Best effort: an unreadable master is reported and treated as "no master".
        print (f"‚ùå Error reading lastest MASTER, {latest_master}: {e}")
        return _empty_result()

    print(f"‚úÖ {len(latest_master_df)} events found in latest MASTER: {latest_master} ")
    return latest_master_df, latest_master

In [59]:
def get_all_payloads(singles_payloads_dir:str) -> List[Tuple[int, str]]:
    """
    Collect every unique (eventId, documentCode) pair from the singles payload CSVs.

    Each ``*.csv`` in ``singles_payloads_dir`` is read for its ``eventId`` and
    ``documentCode`` columns only. Files that are empty, lack those columns, or
    hold values that cannot be cast are reported with a warning and skipped, so
    one bad file does not abort the scan.

    Args:
        singles_payloads_dir (str): Folder containing the payload CSV files.

    Returns:
        List[Tuple[int, str]]: De-duplicated pairs in sorted order. Always a list;
        it is empty when no files or no rows are found.
    """
    print(f"--- üü† Finding  all event ids and payloads from {singles_payloads_dir} ---")

    all_csv_files = glob.glob(f"{singles_payloads_dir}/*.csv")
    if not all_csv_files:
        print(f"--- ‚ùå ERROR: No CSV files found in {singles_payloads_dir}. ---")
        return []

    unique_payloads: Set[Tuple[int, str]] = set()

    for file_path in all_csv_files:
        filename = os.path.basename(file_path)

        try:
            # ONLY read the two columns needed to identify a match.
            payload_df = pd.read_csv(file_path, usecols=['eventId', 'documentCode'])
            if payload_df.empty:
                continue

            event_ids = payload_df['eventId'].astype(int)
            document_codes = payload_df['documentCode'].astype(str)
        except (pd.errors.EmptyDataError, KeyError, ValueError, OSError) as e:
            # ValueError covers both "usecols do not match columns" from read_csv
            # and non-integer / missing eventId values from astype(int).
            print(f"WARN: Could not read payload file {filename}: {e}")
            continue

        # .tolist() yields plain Python ints/strs, so tuples hash and compare
        # the same way as those built from the JSON detail files.
        unique_payloads.update(zip(event_ids.tolist(), document_codes.tolist()))

    # Sorted output makes slices of the result reproducible between runs.
    all_payloads_list = sorted(unique_payloads)
    if all_payloads_list:
        print(f"--- ‚úÖ All desired payloads: Found {len(all_payloads_list)} total unique matches to scrape. ---")
    else:
        print(f"‚ùå No Match payloads found")
    return all_payloads_list
 
                   

In [60]:
def get_obtained_match_details(raw_match_details_dir:str) -> List[Tuple[int, str]]:
    """
    List the (eventId, documentCode) pairs already present in the match-details files.

    Every ``*match_details.json`` file in ``raw_match_details_dir`` is expected to
    hold a JSON list of match objects. Unreadable files, files whose top level is
    not a list, and individual records that are malformed are reported and skipped
    without discarding the other records of the same file.

    Args:
        raw_match_details_dir (str): Folder containing the match-details JSON files.

    Returns:
        List[Tuple[int, str]]: Unique pairs in arbitrary order. ``documentCode``
        is normalised to ``str`` so the pairs compare equal to those produced
        from the payload CSVs.
    """
    print(f"--- üü† Finding already obtained match details from {raw_match_details_dir} ---")

    obtained: Set[Tuple[int, str]] = set()

    all_details_files = glob.glob(f"{raw_match_details_dir}/*match_details.json")
    files_processed_count = 0

    for file_path in all_details_files:
        files_processed_count += 1
        filename = os.path.basename(file_path)

        # Only the read/parse step is guarded here, so a bad record further down
        # can no longer be mistaken for a bad file.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                matches_list = json.load(f)
        except json.JSONDecodeError:
            print(f"‚ùå ERROR: Failed to read JSON in {filename}. ")
            continue
        except Exception as e:
            print(f"‚ùå ERROR: Unexpected error reading {filename}: {type(e).__name__}")
            continue

        if not isinstance(matches_list, list):
            print(f"WARN: Skipping file {filename}: Content is not a list.")
            continue

        for match in matches_list:
            # A stray non-object entry must not cost us the rest of the file.
            if not isinstance(match, dict):
                print(f"WARN: Skipping record in {filename}: entry is not an object.")
                continue

            event_id_raw = match.get("eventId")
            match_code = match.get("documentCode")

            # Records missing either field are silently ignored.
            if not (event_id_raw and match_code):
                continue

            try:
                event_id = int(event_id_raw)  # Ensure ID is an integer
            except (ValueError, TypeError):
                print(f"WARN: Skipping record in {filename} due to bad data eventId.")
                continue

            obtained.add((event_id, str(match_code)))

    final_unique_list = list(obtained)

    # Number of distinct events that have at least one obtained match.
    unique_events_obtained = len({event_id for event_id, _ in final_unique_list})

    print(f"--- ‚úÖ Found {len(final_unique_list)} total unique match details across {files_processed_count} files. ---")
    print(f"--- ‚úÖ Unique Events with Data: {unique_events_obtained} ---")

    return final_unique_list

In [61]:
if __name__ == "__main__":

    # Load the newest master events file; the run stops here if none is available.
    latest_master_df, latest_master_file = get_latest_master_events(MASTER_EVENTS_DIR, MASTER_EVENTS_REGEX)
    if latest_master_df.empty:
        print(f"‚ùå Exiting: No existing Master Events File available")
        # `sys` is not among this notebook's imports, so `sys.exit(1)` raised
        # NameError here. Raising SystemExit directly is what sys.exit does.
        raise SystemExit(1)

    # `or []` keeps the set arithmetic below safe if no payload list comes back.
    all_payloads = get_all_payloads(SINGLES_PAYLOADS_DIR) or []
    already_obtained_matches = get_obtained_match_details(RAW_MATCH_DETAILS_DIR)

    # Work list = wanted matches minus those already stored on disk.
    intitial_matches_to_scrape = list(set(all_payloads) - set(already_obtained_matches))
    intitial_matches_to_scrape_count = len(intitial_matches_to_scrape)
    intitial_events_to_scrape_count = len({match_tuple[0] for match_tuple in intitial_matches_to_scrape})
    print(f"\nüèì Matches to scrape: {intitial_matches_to_scrape_count} across {intitial_events_to_scrape_count} events üèì")



‚úÖ 184 events found in latest MASTER: ../Data/Master/Events/20251030__master_events.csv 
--- üü† Finding  all event ids and payloads from ../Data/Processed/Singles_match_payloads ---
--- ‚úÖ All desired payloads: Found 24359 total unique matches to scrape. ---
--- üü† Finding already obtained match details from ../Data/Raw/Match_details ---
--- ‚úÖ Found 4 total unique match details across 2 files. ---
--- ‚úÖ Unique Events with Data: 2 ---

üèì Matches to scrape: 24355 across 184 events üèì


In [None]:
async def dummy_fetch_detail(match_tuple: Tuple[int, str]) -> Tuple[bool, Optional[Dict[str, Any]], Tuple[int, str], str]:
    """
    Stand-in for the fetch_match_detail_json API call.

    Sleeps for a random 1-3 seconds via asyncio.sleep to mimic network latency,
    then fails with a simulated TimeoutError roughly one time in five.

    Args:
        match_tuple: The (eventId, matchCode) pair to "fetch".

    Returns:
        (ok, data, match_tuple, message): on success ``ok`` is True and ``data`` is
        the fake result dict; on failure ``ok`` is False, ``data`` is None and
        ``message`` names the exception type. The input tuple is always echoed
        back so the caller can log or retry it.
    """
    event_id, match_code = match_tuple
    delay = random.uniform(1, 3)

    try:
        # Simulated network I/O.
        await asyncio.sleep(delay)

        # Simulated intermittent failure.
        if random.random() < 0.2:
            raise TimeoutError(f"Simulated ReadTimeout for {match_code}")
    except Exception as exc:
        # Failure: no data, the original input tuple, and the error type.
        return False, None, match_tuple, f"Task failed with error: {type(exc).__name__}"

    shown_delay = round(delay, 2)
    payload = {
        "eventId": event_id,
        "matchCode": match_code,
        "simulatedDuration": shown_delay,
        "status": "OFFICIAL",
    }
    return True, payload, match_tuple, f"Simulated time: {shown_delay}s."



async def main_scraper_simulation(tuples_to_scrape: List[Tuple[int, str]]) -> Tuple[List[Dict[str, Any]], List[Tuple[int, str, str]]]:
    """
    Run one simulated scrape pass over all given matches concurrently.

    One dummy_fetch_detail coroutine is created per input tuple and results are
    handled in completion order. Successes update a single progress line in
    place; failures are printed on their own line so they stay visible.

    Args:
        tuples_to_scrape: (eventId, matchCode) pairs to fetch.

    Returns:
        (successes, failures): ``successes`` holds the result dicts of completed
        fetches; ``failures`` holds (eventId, matchCode, error message) for each
        task that failed.
    """
    fetched: List[Dict[str, Any]] = []
    failed: List[Tuple[int, str, str]] = []
    n_total = len(tuples_to_scrape)

    # One coroutine per (event_id, match_code) pair.
    pending = [dummy_fetch_detail(pair) for pair in tuples_to_scrape]

    print(f"--- üöÄ Launching {n_total} tasks concurrently... ---")
    t0 = time.time()

    # Handle each task as soon as it finishes, whatever its submission position.
    for n_done, next_finished in enumerate(asyncio.as_completed(pending), start=1):
        # The helper traps its own errors and echoes the input tuple back for logging.
        ok, payload, source_pair, note = await next_finished

        if not ok:
            event_id, match_code = source_pair
            failed.append((event_id, match_code, note))
            # Full line (no carriage return) so the failure is not overwritten.
            print(f"[{n_done}/{n_total}] ‚ùå Task failed {event_id}:{match_code} with error: {note}".ljust(100))
            continue

        fetched.append(payload)
        event_id, match_code = source_pair
        elapsed = time.time() - t0
        progress = f"[{n_done}/{n_total}] ‚úÖ Finished {event_id}:{match_code}. {note} Total elapsed: {elapsed:.2f}s."
        # Carriage return keeps successive successes on one refreshing line.
        print(progress.ljust(80), end='\r')

    # Wipe the progress line, then print the pass summary.
    print(" " * 80, end='\r')
    print("\n" + "=" * 50)
    print("--- üü¢ Simulation Complete ---")
    print(f"Successfully fetched: {len(fetched)} tasks")
    print(f"Failed to fetch (ready for retry): {len(failed)} tasks")

    return fetched, failed


In [63]:
if __name__ == "__main__":
    
    # Get the initial list of tasks / matches to be scraped
    matches_to_scrape = intitial_matches_to_scrape[0:20] # Using your test slice    
    # List to hold ALL successful results from all attempts
    all_successful_data = []     
    retries = 0
    global_start_time = time.time() # Start total timer

    # Loop as long as we have retries left AND matches to scrape
    while (retries < MAX_RETRIES) and bool(matches_to_scrape):
        retries += 1 # Increment attempt counter (Attempt 1, 2, ...)
        
        # --- Logging (As Requested) ---
        if retries == 1:
            print(f"--- üöÄ Starting initial scrape for {len(matches_to_scrape)} matches... ---")
        else:
            # \n adds a newline for readability between attempts
            print(f"\n--- üîÑ Starting Retry {retries-1}/{MAX_RETRIES-1} for {len(matches_to_scrape)} remaining matches... ---")
        
        # --- Run the Scrape ---
        attempt_start_time = time.time()
        
        # Run the simulation on the current list of tasks
        # successes = list of dicts [{...}, {...}]
        # failures = list of tuples [(id, code), (id, code)]
        successes, failures = await (main_scraper_simulation(tuples_to_scrape=matches_to_scrape))
        
        # --- Process Results ---
        
        # Add new successes to the total collection
        all_successful_data.extend(successes)
        
        # CRITICAL FIX: The list for the *next* iteration is only the tasks that just failed
        matches_to_scrape = [(event_id, match_code)for event_id, match_code, error_msg in failures]
        # --- Log Results of This Attempt ---
        attempt_duration = time.time() - attempt_start_time
        if successes: # Only print if some were found
            print(f"--- Attempt {retries} finished in {attempt_duration:.2f}s. Got {len(successes)} new results. ---")
        if failures: # Only print if some remain
            print(f"--- {len(failures)} tasks failed and will be retried. ---")


    # --- Final Summary (Outside the loop) ---
    print("\n" + "=" * 50)
    
    if not matches_to_scrape: # If the final 'failures' list is empty
        print("--- üü¢ Simulation Complete. All tasks finished successfully. üü¢---")
    else:
        print(f"--- ‚ö†Ô∏è Simulation Complete. {len(matches_to_scrape)} tasks permanently failed after {MAX_RETRIES} attempts. ---")
        
    print(f"Total successful matches collected: {len(all_successful_data)}")
    total_duration = time.time() - global_start_time
    print(f"Total Wall Clock Time: {total_duration:.2f} seconds.")

--- üöÄ Starting initial scrape for 20 matches... ---
--- üöÄ Launching 20 tasks concurrently... ---
[6/20] ‚ùå Task failed 2932:TTEWSINGLES-----------R64-002900---------- with error: Task failed with error: TimeoutError
[14/20] ‚ùå Task failed 2867:TTEWSINGLES-----------QFNL000300---------- with error: Task failed with error: TimeoutError
[19/20] ‚ùå Task failed 3108:TTEWSINGLES-----------R128005300---------- with error: Task failed with error: TimeoutError
[20/20] ‚ùå Task failed 2693:TTEWSINGLES-----------RND1002300---------- with error: Task failed with error: TimeoutError
                                                                                
--- üü¢ Simulation Complete ---
Successfully fetched: 16 tasks
Failed to fetch (ready for retry): 4 tasks
--- Attempt 1 finished in 2.97s. Got 16 new results. ---
--- 4 tasks failed and will be retried. ---

--- üîÑ Starting Retry 1/1 for 4 remaining matches... ---
--- üöÄ Launching 4 tasks concurrently... ---
                  

In [64]:
# Debug check: show the failure list returned by the most recent scrape attempt.
failures

[]

False