### Model Comparison Visualisation

We compare and visualise the performance of the bookmaker odds model in this repository against three alternatives:

- Bradley-Terry
- ELO
- Random Forest (CatBoost)

Note: Before running this notebook, run the alternative models using either `run_alternative_models.ps1` (Windows) or `run_alternative_models.sh` (Bash, for macOS or Linux), so that the prediction files read below exist.

In [31]:
import numpy as np
import pandas as pd

### Define Parameters

In [32]:
# Folder name of each model -> label used in the result tables
NEAT_MODEL_NAME_MAP = {
    'bradleyterrymodel': 'Bradley-Terry',
    'elomodel': 'ELO',
    'catboosttennismodel': 'Random Forest',
    'bookmakeroddsmodel': 'Bookmaker Model',
}
# The folders are evaluated in the order in which they are listed in the map
MODEL_FOLDERS = list(NEAT_MODEL_NAME_MAP)

# Tournament name as used in the prediction file names -> label used in the result tables
NEAT_TOURNAMENT_NAME_MAP = {
    'australian open': 'Australian Open',
    'french open': 'French Open',
    'wimbledon': 'Wimbledon',
    'us open': 'US Open',
}
TOURNAMENTS = list(NEAT_TOURNAMENT_NAME_MAP)

YEARS = [2024, 2025]
GENDERS = ['male', 'female']

# One prediction file per gender/year/tournament; the 2025 US Open is excluded
FILE_NAMES = [
    f"{gender}_{tournament}_{year}.csv"
    for gender in GENDERS
    for year in YEARS
    for tournament in TOURNAMENTS
    if (tournament, year) != ('us open', 2025)
]

### Determine Matches to be Removed

In [33]:
from datetime import datetime
from collections import defaultdict

# Stack the men's and women's result files of every season from 2019 to 2025
combined_data = pd.concat(
    [
        pd.read_csv(f"../main_package/data/{year}_{gender}.csv")
        for year in range(2019, 2026)
        for gender in ('men', 'women')
    ]
)

# Parse the "month/day/year" strings and put the matches in chronological order
combined_data['match_date'] = combined_data['Date'].map(
    lambda date_text: datetime.strptime(date_text, "%m/%d/%Y")
)
combined_data = combined_data.sort_values(by='match_date')

# Flag the matches that belong to one of the four evaluated tournaments
combined_data['tournament_filter'] = combined_data['Tournament'].isin(['Australian Open','French Open','Wimbledon','US Open'])

# Running number of earlier matches per player. Only matches outside the four
# flagged tournaments increase a player's count; the count is read before it is
# updated, so each value describes what was known before that match.
player_match_map = defaultdict(int)
winner_cumulative_matches:list[int] = []
loser_cumulative_matches:list[int] = []

for winner, loser, in_flagged_tournament in zip(
    combined_data['Winner'], combined_data['Loser'], combined_data['tournament_filter']
):
    winner_cumulative_matches.append(player_match_map[winner])
    loser_cumulative_matches.append(player_match_map[loser])
    if in_flagged_tournament:
        continue
    player_match_map[winner] += 1
    player_match_map[loser] += 1

combined_data['cumulative_winner_matches'] = winner_cumulative_matches
combined_data['cumulative_loser_matches'] = loser_cumulative_matches

# A flagged-tournament match is removed when at least one of its two players has
# five or fewer earlier counted matches (i.e. the smaller of the two counts is <= 5)
fewest_prior_matches = combined_data[['cumulative_winner_matches','cumulative_loser_matches']].min(axis=1)
matches_removed = combined_data[combined_data['tournament_filter'] & (fewest_prior_matches <= 5)]

### Calculate Model Performance

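We calculate the accuracy, log loss and Brier score of each model for every tournament, together with bootstrap confidence intervals for each model's overall averages.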

In [34]:
def bootstrap_model_level_cis(
    df: pd.DataFrame,
    metrics: list[str],
    n_bootstrap: int = 10_000,
    alpha: float = 0.05,
    random_state: int = 42
) -> pd.DataFrame:
    """
    Estimate percentile-bootstrap confidence intervals for the mean of each metric, per model.

    The rows of ``df`` are grouped by ``model_name``. Within each group, every
    metric column is resampled with replacement ``n_bootstrap`` times (each
    resample has as many values as the group has rows) and the mean of each
    resample is recorded. The interval bounds are the ``alpha / 2`` and
    ``1 - alpha / 2`` percentiles of those resampled means.

    Args:
        df: DataFrame with a ``model_name`` column and one column per metric;
            every row is one observation of the metrics.
        metrics: Names of the metric columns to bootstrap (e.g. ['brier_score', 'log_loss']).
        n_bootstrap: Number of bootstrap resamples drawn per model and metric.
        alpha: Significance level (0.05 gives 95% CIs).
        random_state: Seed of the single random generator shared by all models and metrics.

    Returns:
        DataFrame with one row per model: a ``model_name`` column followed, for each
        metric, by <metric>_mean, <metric>_ci_lower and <metric>_ci_upper.
    """
    rng = np.random.default_rng(random_state)
    lower_percentile = 100 * alpha / 2
    upper_percentile = 100 * (1 - alpha / 2)

    summaries: list[dict[str, object]] = []
    for model_name, model_rows in df.groupby("model_name"):
        summary: dict[str, object] = {"model_name": model_name}

        for metric in metrics:
            observed = model_rows[metric].to_numpy()
            sample_size = len(observed)

            # Mean of each resample; the draws happen in the same order for
            # every run, so a given seed always yields the same intervals.
            resampled_means = np.array(
                [
                    np.mean(rng.choice(observed, size=sample_size, replace=True))
                    for _ in range(n_bootstrap)
                ],
                dtype=float,
            )

            summary[f"{metric}_mean"] = np.mean(observed)
            summary[f"{metric}_ci_lower"] = np.percentile(resampled_means, lower_percentile)
            summary[f"{metric}_ci_upper"] = np.percentile(resampled_means, upper_percentile)

        summaries.append(summary)

    return pd.DataFrame(summaries)


In [None]:
# Initialise the model performance data:
#   model_performance_data - one Series of tournament-level metrics per model and prediction file
#   all_model_ci_data      - one bootstrap CI table per model
model_performance_data:list[pd.Series] = []
all_model_ci_data:list[pd.DataFrame] = []
for model in MODEL_FOLDERS:
    # Match-level results of every tournament for this model (input to the bootstrap)
    overall_model_dataframes:list[pd.DataFrame] = []
    total_length = 0
    for file_name in FILE_NAMES:

        # Parse "<gender>_<tournament>_<year>.csv" once and reuse the parts below
        name_parts = file_name.split('_')
        male_data = name_parts[0] == 'male'
        tournament = name_parts[1]
        year = int(name_parts[-1].replace('.csv',''))

        # Load the data
        model_tournament_data = pd.read_csv(f"../../{model}/{file_name}")

        # Check all matches represented
        # (127 is presumably the number of matches in a 128-player knockout draw - TODO confirm)
        assert len(model_tournament_data) == 127, (
            f"Expected 127 matches in {model}/{file_name}, found {len(model_tournament_data)}"
        )

        # Remove missing data matches: rows of matches_removed for this tournament and year
        # are tagged and left-joined on the player pair; tagged predictions are dropped
        removed_match_years = matches_removed['Date'].apply(lambda x:int(x.split('/')[-1]))
        potential_missing_matches = matches_removed[
            (matches_removed['Tournament'] == NEAT_TOURNAMENT_NAME_MAP[tournament])
            & (removed_match_years == year)
        ].reset_index(drop=True)
        potential_missing_matches['remove'] = True
        model_tournament_data = pd.merge(model_tournament_data, potential_missing_matches[['Winner','Loser','remove']], on = ['Winner','Loser'], how = 'left')
        model_tournament_data = model_tournament_data[model_tournament_data['remove'].isna()].reset_index(drop=True)

        # Calculate the metrics, adding them to the dataframe on a by-match level and calculating the tournament averages
        predicted_prob_winner = model_tournament_data['predicted_prob_winner']

        # Store the per-match log loss as -log(p) so that it is non-negative and the
        # bootstrap CIs computed from this column have the same sign as the
        # tournament-level log loss
        model_tournament_data['log_loss'] = -np.log(predicted_prob_winner)
        log_loss = np.mean(model_tournament_data['log_loss'])
        model_tournament_data['brier_score'] = np.square(1 - predicted_prob_winner)
        brier_score = np.mean(model_tournament_data['brier_score'])
        # A prediction of exactly 0.5 counts as half correct
        model_tournament_data['accuracy'] = (predicted_prob_winner>0.5) + 0.5*(predicted_prob_winner == 0.5)
        accuracy = np.mean(model_tournament_data['accuracy'])

        # Add the df to the overall model dataframes
        model_tournament_data['model_name'] = NEAT_MODEL_NAME_MAP[model]
        overall_model_dataframes.append(model_tournament_data)

        # Create the series
        model_performance_data.append(
            pd.Series(
                {
                    'male_data':male_data,
                    'tournament':NEAT_TOURNAMENT_NAME_MAP[tournament],
                    'year':year,
                    'model_name':NEAT_MODEL_NAME_MAP[model],
                    'accuracy':accuracy,
                    'brier_score':brier_score,
                    'log_loss':log_loss
                }
            )
        )
        total_length += len(model_tournament_data)
    # Number of matches kept for this model across all tournaments
    print(total_length)
    # Calculate the CIs for the model
    model_ci_data = bootstrap_model_level_cis(pd.concat(overall_model_dataframes),['accuracy','brier_score','log_loss'])
    all_model_ci_data.append(model_ci_data)

        


In [36]:
# Extract the bookmaker odds and score them like a model, under the name "Actual Odds"
overall_model_dataframes:list[pd.DataFrame] = []
for file_name in FILE_NAMES:

    # Get the tournament info from the name ("<gender>_<tournament>_<year>.csv")
    name_parts = file_name.split('_')
    male_data = name_parts[0] == 'male'
    tournament = name_parts[1]
    year = int(name_parts[-1].replace('.csv',''))

    # Create the gender suffix
    gender_suffix = "men" if male_data else "women"
    # NOTE(review): the removal-filter cell reads the same files via "../main_package/data/";
    # confirm that both relative paths resolve to the same directory
    tournament_data = pd.read_csv(f"../../src/main_package/data/{year}_{gender_suffix}.csv")
    tournament_data = tournament_data[tournament_data['Tournament'] == NEAT_TOURNAMENT_NAME_MAP[tournament]]

    # Remove missing data matches: rows of matches_removed for this tournament and year
    # are tagged and left-joined on the player pair; tagged matches are dropped
    removed_match_years = matches_removed['Date'].apply(lambda x:int(x.split('/')[-1]))
    potential_missing_matches = matches_removed[
        (matches_removed['Tournament'] == NEAT_TOURNAMENT_NAME_MAP[tournament])
        & (removed_match_years == year)
    ].reset_index(drop=True)
    potential_missing_matches['remove'] = True
    tournament_data = pd.merge(tournament_data, potential_missing_matches[['Winner','Loser','remove']], on = ['Winner','Loser'], how = 'left')
    tournament_data = tournament_data[tournament_data['remove'].isna()].reset_index(drop=True)

    # Get the win probabilities: the inverse odds of the winner divided by the sum of
    # both inverse odds, so the two players' probabilities add up to 1
    # NOTE(review): a match with missing AvgW/AvgL gets a NaN probability, which the
    # accuracy expression below scores as 0 - confirm the odds columns are complete
    inverse_winner_odds = 1/tournament_data['AvgW']
    inverse_loser_odds = 1/tournament_data['AvgL']
    win_probabilities = inverse_winner_odds/(inverse_winner_odds + inverse_loser_odds)

    # Calculate the metrics, adding them to the dataframe on a by-match level and calculating the tournament averages
    # Store the per-match log loss as -log(p) so that it is non-negative and the
    # bootstrap CIs computed from this column have the same sign as the
    # tournament-level log loss
    tournament_data['log_loss'] = -np.log(win_probabilities)
    log_loss = np.mean(tournament_data['log_loss'])
    tournament_data['brier_score'] = np.square(1 - win_probabilities)
    brier_score = np.mean(tournament_data['brier_score'])
    # A probability of exactly 0.5 counts as half correct
    tournament_data['accuracy'] = (win_probabilities>0.5) + 0.5*(win_probabilities == 0.5)
    accuracy = np.mean(tournament_data['accuracy'])

    # Add the df to the overall model dataframes
    tournament_data['model_name'] = "Actual Odds"
    overall_model_dataframes.append(tournament_data)

    # Create the series
    model_performance_data.append(
        pd.Series(
            {
                'male_data':male_data,
                'tournament':NEAT_TOURNAMENT_NAME_MAP[tournament],
                'year':year,
                'model_name': "Actual Odds",
                'accuracy':accuracy,
                'brier_score':brier_score,
                'log_loss':log_loss
            }
        )
    )
# Calculate the CIs for the model
model_ci_data = bootstrap_model_level_cis(pd.concat(overall_model_dataframes),['accuracy','brier_score','log_loss'])
all_model_ci_data.append(model_ci_data)
    

In [None]:
# Display the CI table returned by the most recent bootstrap_model_level_cis call
# (when the cells are run in order, this is the "Actual Odds" table)
model_ci_data

In [None]:
print('Overall Model Performance:')
# One row per model and prediction file. Transposing the concatenated mixed-type
# Series leaves every column with object dtype, so the metric columns are cast
# back to float before they are averaged and sorted.
overall_model_performance_data = pd.concat(model_performance_data,axis=1).T.astype(
    {'accuracy': float, 'brier_score': float, 'log_loss': float}
)

# Average each metric over all tournaments per model; lowest (best) Brier score first
overall_model_performance_data.groupby('model_name')[['accuracy','brier_score','log_loss']].mean().sort_values(by='brier_score',ascending=True)

In [None]:
print('CI Model Performance')
# Every per-model CI table is indexed from 0, so renumber the rows when stacking
# them; otherwise each row of the combined table carries the label 0
print(pd.concat(all_model_ci_data, ignore_index=True))