In [1]:
import pandas as pd 
import glob 
from typing import Optional
import os 
import re
import numpy as np
from datetime import datetime
from thefuzz import fuzz, process
import sys


project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

if project_root not in sys.path:
    sys.path.append(project_root)
    
from utils.getLatestFiles import get_latest_cleaned_matches

In [None]:
CLEANED_MATCHES_DIR = "../Data/Processed/Matches/"
CLEANED_MATCHES_SUFFIX = "cleaned_matches.csv"
CLEANED_MATCHES_REGEX = rf"^\d{{8}}{re.escape(CLEANED_MATCHES_SUFFIX)}$"
START_DROP_COLS = [
    "A_rawGameScores",
    "A_rawOverallScore",
    "B_rawGameScores",
    "B_rawOverallScore",
    "actionType",
    "awayNestedGameScores",
    "bestOf",
    "calcBestOf",
    "calcNestedGameScores",
    "calcNestedOverallScores",   
    "currentGameNumber",
    "gameScoreConsistent",
    "gameScoreFlags",
    "homeNestedGameScores",
    "scoreConsistent",
    "tableNumber",
    'calculatedGameScoreWinner'
    
]

now_date = datetime.now()
date_string = now_date.strftime("%Y%m%d") 

MASTER_MATCHES_DIR = "../Data/Master/Matches"
os.makedirs(MASTER_MATCHES_DIR, exist_ok=True)
MASTER_MATCHES_SUFFIX = "_master_matches.csv"
MASTER_MATCHES_OUTPUT_PATH = os.path.join(MASTER_MATCHES_DIR,f"{date_string}{MASTER_MATCHES_SUFFIX}")



In [3]:
cleaned_matches_df = get_latest_cleaned_matches(CLEANED_MATCHES_DIR, CLEANED_MATCHES_REGEX)
cleaned_matches_df.drop(columns=START_DROP_COLS, inplace=True, errors='ignore')
cleaned_matches_df.rename(columns={'reconciledOverallScoreWinner (more reliable)':"Winner"}, inplace=True) 


❌ No existing cleaned_matches files in format: ^\d{8}cleaned_matches\.csv$ in ../Data/Processed/Matches/


In [4]:
"""
Shortened code for mapping home and away to winner and loser 
dependent on who won - thus removing home and away terms
MAY need to restore home and away terms later if further analysis is done from other match data sources
e.g match logs 
"""
is_away_winner = cleaned_matches_df["Winner"] == "away"

column_map = {
    
    'winnerName':    ('homePlayer',         'awayPlayer'),
    'winnerId':      ('homeCompetitorId',   'awayCompetitorId'),
    'winnerCountry': ('homeCompetitorOrg',  'awayCompetitorOrg'),
    
    
    'loserName':     ('awayPlayer',         'homePlayer'),
    'loserId':       ('awayCompetitorId',   'homeCompetitorId'),
    'loserCountry':  ('awayCompetitorOrg',  'homeCompetitorOrg'),
}

for new_col, (away_case, home_case) in column_map.items():
    cleaned_matches_df[new_col] = np.where(
        is_away_winner,
        cleaned_matches_df[home_case],  # Value if True
        cleaned_matches_df[away_case]  # Value if False
    )

cols_to_drop = ["homePlayer", "awayPlayer", "homeCompetitorId", "awayCompetitorId", "homeCompetitorOrg", "awayCompetitorOrg"]
integer_columns = ["trueBestOf", "winnerId", "loserId"]

cleaned_matches_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
for col in integer_columns:
    cleaned_matches_df[col] = cleaned_matches_df[col].astype("Int64")


KeyError: 'Winner'

In [None]:
cleaned_matches_df.rename(columns={
    "reconciledOverallScore": "overallScore",
    "reconciledGameScore": "gameScore",
    "trueBestOf": "bestOf",
}, inplace=True)

In [None]:
score_splits = cleaned_matches_df["overallScore"].str.split("-", expand=True)
home_score_numeric = pd.to_numeric(score_splits[0], errors='coerce')
away_score_numeric = pd.to_numeric(score_splits[1], errors='coerce')
winner_correct = (home_score_numeric >= away_score_numeric)

In [None]:
def switch_overall_score(score_str):
    """
    Safely switches a single score string like "2-4" to "4-2".
    Uses regex to be robust and handle DNF strings (like 'WO') safely.
    """
    if pd.isna(score_str):
        return pd.NA
    return re.sub(r'(\d+)-(\d+)', r'\2-\1', str(score_str))


def switch_game_score(score_str):
    """
    Switches a comma-separated list of scores.
    (This is your function, slightly hardened with a try-except block)
    """
    if pd.isna(score_str) or not score_str:
        return pd.NA
    try:
        
        return re.sub(r'(\d+)-(\d+)', r'\2-\1', str(score_str))
    except Exception:
        return pd.NA 
    
rows_to_switch = (winner_correct == False)
print(f"--- ⚙️ Standardizing Scores to 'Winner-First' Format ---")
print(f"Identifying {rows_to_switch.sum()} rows to switch...")
cleaned_matches_df.loc[rows_to_switch, 'overallScore'] =  cleaned_matches_df.loc[rows_to_switch, 'overallScore'].apply(switch_overall_score)

# Apply the switch to 'gameScore'
cleaned_matches_df.loc[rows_to_switch, 'gameScore'] = cleaned_matches_df.loc[rows_to_switch, 'gameScore'].apply(switch_game_score)

print("✅ Scores successfully standardized.")

In [None]:
score_splits = cleaned_matches_df["overallScore"].str.split("-", expand=True)
home_score_numeric = pd.to_numeric(score_splits[0], errors='coerce')
away_score_numeric = pd.to_numeric(score_splits[1], errors='coerce')
winner_correct = (home_score_numeric >= away_score_numeric)

In [None]:
cleaned_matches_df.sample(5)

In [None]:
#final amendments 
cleaned_matches_df.loc[~winner_correct, "Winner"] = pd.NA

In [None]:
score_parts = cleaned_matches_df['overallScore'].str.split('-', expand=True)
winner_sets = pd.to_numeric(score_parts[0], errors='coerce').astype('Int64')
loser_sets = pd.to_numeric(score_parts[1], errors='coerce').astype('Int64')
cleaned_matches_df['winnerSets'] = winner_sets
cleaned_matches_df['loserSets'] = loser_sets
cleaned_matches_df['totalSets'] = cleaned_matches_df['winnerSets'] + cleaned_matches_df['loserSets']
cleaned_matches_df['isStraightSets'] = (cleaned_matches_df['loserSets'] == 0)
cleaned_matches_df["isDecider"] = (
    cleaned_matches_df['totalSets'] == cleaned_matches_df['bestOf']
)



In [None]:
def engineer_game_score_stats(row):
    game_score = row["gameScore"]
    
    
    output_cols = [
        "winnerTotalPoints", "loserTotalPoints", "totalPoints", 
        "winnerPointsRatio", "loserPointsRatio", "winnerMaxScore", 
        "loserMaxScore", "maxScore", "numberDeuceGames", 
        "winnerDeuceWon", "loserDeuceWon", "winnerPointsDifference",
        "pointsPerSet","winnerDroppedFirstSet",
        "winner0Wins", "winner1Wins", "winner4Wins",
        "loser0Wins", "loser1Wins", "loser4Wins",
        "winnerAvgWinningMargin", "loserAvgWinningMargin",
        "comebackBy2", "comebackBy3",
        "winnerAvgPointsPerSet", "loserAvgPointsPerSet"
    ]
    
    if pd.isna(game_score) or not str(game_score).strip():
        return pd.Series(pd.NA, index=output_cols)
        
    try:
        game_scores_split = [s for s in game_score.split(",") if s.strip()]

        if not game_scores_split:
            return pd.Series(pd.NA, index=output_cols)
            
        winner_scores = [int(score.split("-")[0]) for score in game_scores_split]
        loser_scores = [int(score.split("-")[1]) for score in game_scores_split]
        
        winner_total = sum(winner_scores)
        loser_total = sum(loser_scores)
        total_points = winner_total + loser_total
        total_sets = len(game_scores_split)

        if total_points > 0:
            winner_ratio = round(winner_total / total_points, 3)
            loser_ratio = round(loser_total / total_points, 3)
        else:
            winner_ratio = np.nan
            loser_ratio = np.nan

        winner_max = max(winner_scores) if winner_scores else 0
        loser_max = max(loser_scores) if loser_scores else 0
        max_score = max(winner_max, loser_max)
        
        num_deuce = sum(1 for w_score, l_score in zip(winner_scores, loser_scores) if w_score >= 10 and l_score >= 10)
        winner_deuce_won = sum(1 for w_score, l_score in zip(winner_scores, loser_scores) if w_score > l_score and w_score >= 12)
        loser_deuce_won = sum(1 for w_score, l_score in zip(winner_scores, loser_scores) if l_score > w_score and l_score >= 12)

        winner_points_difference = winner_total - loser_total
        points_per_set = round(total_points / total_sets, 2)
        winner_avg_points_per_set = round(winner_total / total_sets, 2)
        loser_avg_points_per_set = round(loser_total / total_sets, 2)

    
        winner_dropped_first_set = (winner_scores[0] < loser_scores[0])
        
        
        winner_wins_to_love_count = 0
        winner_wins_to_1_count = 0
        winner_wins_to_4_count = 0
        loser_wins_to_love_count = 0
        loser_wins_to_1_count = 0
        loser_wins_to_4_count = 0

        win_margins = []
        lose_margins = []
        
        winner_set_score = 0
        loser_set_score = 0
        comeback_by_2 = False
        comeback_by_3 = False
        
        for w_pts, l_pts in zip(winner_scores, loser_scores):
            if w_pts > l_pts:
                winner_set_score += 1
                win_margins.append(w_pts - l_pts)
                if l_pts == 0:
                    winner_wins_to_love_count += 1
                if l_pts == 1:
                    winner_wins_to_1_count += 1
                if l_pts <= 4:
                    winner_wins_to_4_count += 1
            elif l_pts > w_pts:
                loser_set_score += 1
                lose_margins.append(l_pts - w_pts)
                if w_pts == 0:
                    loser_wins_to_love_count += 1
                if w_pts == 1:
                    loser_wins_to_1_count += 1
                if w_pts <= 4:
                    loser_wins_to_4_count += 1
            
            deficit = loser_set_score - winner_set_score
            if deficit == 2:
                comeback_by_2 = True
            if deficit == 3:
                comeback_by_3 = True
        
        winner_average_win_margin = 0.0
        if win_margins:
            winner_average_win_margin = round(sum(win_margins) / len(win_margins), 2)
            
        loser_average_win_margin = 0.0
        if lose_margins:
            loser_average_win_margin = round(sum(lose_margins) / len(lose_margins), 2)

        return pd.Series({
            "winnerTotalPoints": winner_total, 
            "loserTotalPoints": loser_total,
            "totalPoints": total_points,
            "winnerPointsRatio": winner_ratio,
            "loserPointsRatio": loser_ratio,
            "winnerMaxScore": winner_max,
            "loserMaxScore": loser_max,
            "maxScore": max_score,
            "numberDeuceGames": num_deuce,
            "winnerDeuceWon": winner_deuce_won,
            "loserDeuceWon": loser_deuce_won,
            "winnerPointsDifference": winner_points_difference,
            "pointsPerSet": points_per_set,          
            "winnerDroppedFirstSet": winner_dropped_first_set,
            "winner0Wins": winner_wins_to_love_count,
            "winner1Wins": winner_wins_to_1_count,
            "winner4Wins" : winner_wins_to_4_count,
            "loser0Wins": loser_wins_to_love_count,
            "loser1Wins": loser_wins_to_1_count,
            "loser4Wins" : loser_wins_to_4_count,
            "winnerAvgWinningMargin": winner_average_win_margin,
            "loserAvgWinningMargin": loser_average_win_margin,
            "comebackBy2": comeback_by_2,
            "comebackBy3": comeback_by_3,
            "winnerAvgPointsPerSet": winner_avg_points_per_set,
            "loserAvgPointsPerSet": loser_avg_points_per_set
        })
        
    except (ValueError, IndexError, TypeError) as e:
        
        return pd.Series(pd.NA, index=output_cols)

In [None]:
new_cols=  cleaned_matches_df.apply(engineer_game_score_stats, axis=1, result_type='expand')

In [None]:
cleaned_matches_df = pd.concat([cleaned_matches_df, new_cols], axis=1)

In [None]:
tie_filter = cleaned_matches_df["Winner"]=="tie"
len(cleaned_matches_df[tie_filter])
cleaned_matches_df[tie_filter][["EventName","winnerName", "loserName", "Winner","overallScore","gameScore"]]


In [None]:
final_desc_filter = cleaned_matches_df["subEventDescription"].str.contains(" Final",case=False)
final_code_filter = cleaned_matches_df["documentCode"].str.contains("-FNL",case=False)

final_mismatch_1 = final_desc_filter & ~final_code_filter
final_mismatch_2 = ~final_desc_filter & final_code_filter

final_mismatch_filter = final_mismatch_1 | final_mismatch_2
cleaned_matches_df[final_mismatch_filter]

In [None]:
non_consolation_filter1 = ~cleaned_matches_df["subEventDescription"].str.contains("Consolation",case=False)
non_consolation_filter2 = ~cleaned_matches_df["documentCode"].str.contains("-CON",case=False)

non_consolation_filter = non_consolation_filter1 & non_consolation_filter2

finals_regex = " final|gold"
finals_code = "-FNL"
finals_desc_filter = cleaned_matches_df["subEventDescription"].str.contains(finals_regex,regex=True,na=False,case=False)
finals_code_filter = cleaned_matches_df["documentCode"].str.contains(finals_code,na=False,case=False)

finals_mask = finals_desc_filter & finals_code_filter & non_consolation_filter


semis_regex = "semi"
semis_code = "-SFNL"
semis_desc_filter = cleaned_matches_df["subEventDescription"].str.contains(semis_regex,regex=True,na=False,case=False)
semis_code_filter = cleaned_matches_df["documentCode"].str.contains(semis_code,na=False,case=False)

semis_mask = semis_desc_filter & semis_code_filter & non_consolation_filter


quarts_regex = "quarter"
quarts_code = "-QFNL|-8FNL"
quarts_desc_filter = cleaned_matches_df["subEventDescription"].str.contains(quarts_regex,regex=True,na=False,case=False)
quarts_code_filter = cleaned_matches_df["documentCode"].str.contains(quarts_code,na=False,case=False)

quarts_mask = quarts_desc_filter & quarts_code_filter & non_consolation_filter


cleaned_matches_df.loc[finals_mask, "Round"] = "Final"
cleaned_matches_df.loc[semis_mask, "Round"] = "Semifinal"
cleaned_matches_df.loc[quarts_mask, "Round"] = "Quarterfinal"
cleaned_matches_df["Round"] = cleaned_matches_df["Round"].fillna("Other")


In [None]:
cleaned_matches_df.to_csv(MASTER_MATCHES_OUTPUT_PATH, index=False)

In [None]:
cleaned_matches_df