### Model Comparison Visualisation

We compare and visualise the performance of the bookmaker odds model in this repository and three alternatives:
    - Bradley-Terry
    - ELO
    - Random Forest (catboost)

In [126]:
import numpy as np
import pandas as pd

### Define Parameters

In [127]:
MODEL_FOLDERS = ['bradleyterrymodel','elomodel','catboosttennismodel','bookmakeroddsmodel']
NEAT_MODEL_NAME_MAP = {'bradleyterrymodel':'Bradley-Terry','elomodel':'ELO','catboosttennismodel':'Random Forest','bookmakeroddsmodel':'Bookmaker Model'}
TOURNAMENTS = ['australian open','french open','wimbledon','us open']
NEAT_TOURNAMENT_NAME_MAP = {'australian open':'Australian Open','french open':'French Open','wimbledon':'Wimbledon','us open':'US Open'}

YEARS = [2024,2025]
GENDERS = ['male','female']

FILE_NAMES = [f"{gender}_{tournament}_{year}.csv" for gender in GENDERS for year in YEARS for tournament in TOURNAMENTS if not (tournament=='us open' and year == 2025) ]

### Calculate Model Performance

We calculate the accuracy, log-loss and Brier score of the models

In [128]:
def bootstrap_model_level_cis(
    df: pd.DataFrame,
    metrics: list[str],
    n_bootstrap: int = 10_000,
    alpha: float = 0.1,
    random_state: int = 42
) -> pd.DataFrame:
    """
    Compute model-level bootstrap confidence intervals for multiple metrics.

    Args:
        df: DataFrame containing model-level performance per tournament/year.
        metrics: List of metric columns to bootstrap (e.g. ['brier_score', 'log_loss']).
        n_bootstrap: Number of bootstrap resamples.
        alpha: Significance level (0.05 gives 95% CIs).
        random_state: Random seed for reproducibility.

    Returns:
        DataFrame with one row per model and columns for each metric's mean and CI bounds.
        Column names are of the form <metric>_mean, <metric>_ci_lower, <metric>_ci_upper.
    """
    rng = np.random.default_rng(random_state)
    results: list[dict[str, float]] = []

    for model_name, df_model in df.groupby("model_name"):
        row_dict = {"model_name": model_name}

        for metric in metrics:
            values = df_model[metric].to_numpy()
            n = len(values)

            boot_means = np.empty(n_bootstrap)
            for i in range(n_bootstrap):
                sample = rng.choice(values, size=n, replace=True)
                boot_means[i] = np.mean(sample)

            row_dict[f"{metric}_mean"] = np.mean(values)
            row_dict[f"{metric}_ci_lower"] = np.percentile(boot_means, 100 * alpha / 2)
            row_dict[f"{metric}_ci_upper"] = np.percentile(boot_means, 100 * (1 - alpha / 2))

        results.append(row_dict)

    return pd.DataFrame(results)


In [129]:
# Initialise the model performance data
model_performance_data:list[pd.Series] = []
all_model_ci_data:list[pd.DataFrame] = []
for model in MODEL_FOLDERS:
    # Create an overall model dataframe
    overall_model_dataframes:list[pd.DataFrame] = []
    for file_name in FILE_NAMES:

        # Load the data
        model_tournament_data = pd.read_csv(f"../../{model}/{file_name}")

        # Check all matches represented
        assert len(model_tournament_data) == 127

        # Calculate the metrics, adding them to the dataframe on a by-match level and calculating the tournament averages
        model_tournament_data['log_loss'] = np.log(model_tournament_data['predicted_prob_winner'])
        log_loss = (-1)*np.mean(model_tournament_data['log_loss'])
        model_tournament_data['brier_score'] = np.square(1 - model_tournament_data['predicted_prob_winner'])
        brier_score = np.mean(model_tournament_data['brier_score'])
        model_tournament_data['accuracy'] = (model_tournament_data['predicted_prob_winner']>0.5) + 0.5*(model_tournament_data['predicted_prob_winner'] == 0.5)
        accuracy = np.mean(model_tournament_data['accuracy'])

        # Add the df to the overall model dataframes
        model_tournament_data['model_name'] = NEAT_MODEL_NAME_MAP[model]
        overall_model_dataframes.append(model_tournament_data)

        # Create the series
        male_data = file_name.split('_')[0] == 'male'
        tournament = file_name.split('_')[1]
        year = int(file_name.split('_')[-1].replace('.csv',''))

        model_performance_data.append(
            pd.Series(
                {
                    'male_data':male_data,
                    'tournament':NEAT_TOURNAMENT_NAME_MAP[tournament],
                    'year':year,
                    'model_name':NEAT_MODEL_NAME_MAP[model],
                    'accuracy':accuracy,
                    'brier_score':brier_score,
                    'log_loss':log_loss
                }
            )
        )
    # Calculate the CIs for the model
    model_ci_data = bootstrap_model_level_cis(pd.concat(overall_model_dataframes),['accuracy','brier_score','log_loss'])
    all_model_ci_data.append(model_ci_data)

        


In [130]:
# Extract the bookmaker odds
for file_name in FILE_NAMES:

    # Get the tournament info from the name
    male_data = file_name.split('_')[0] == 'male'
    tournament = file_name.split('_')[1]
    year = int(file_name.split('_')[-1].replace('.csv',''))
    
    # Create the gender suffix
    gender_suffix = "men" if male_data else "women"
    tournament_data = pd.read_csv(f"../../src/main_package/data/{year}_{gender_suffix}.csv")
    tournament_data = tournament_data[tournament_data['Tournament'] == NEAT_TOURNAMENT_NAME_MAP[tournament]]

    # Get the win probabilities
    win_probabilities = (1/tournament_data['AvgW'])/((1/tournament_data['AvgW']) + (1/tournament_data['AvgL']))
    

    # Calculate the metrics, adding them to the dataframe on a by-match level and calculating the tournament averages
    tournament_data['log_loss'] = np.log(win_probabilities)
    log_loss = (-1)*np.mean(tournament_data['log_loss'])
    tournament_data['brier_score'] = np.square(1 - win_probabilities)
    brier_score = np.mean(tournament_data['brier_score'])
    tournament_data['accuracy'] = (win_probabilities>0.5) + 0.5*(win_probabilities == 0.5)
    accuracy = np.mean(tournament_data['accuracy'])

    # Add the df to the overall model dataframes
    tournament_data['model_name'] = "Actual Odds"
    overall_model_dataframes.append(tournament_data)

    # Create the series
    male_data = file_name.split('_')[0] == 'male'
    tournament = file_name.split('_')[1]
    year = int(file_name.split('_')[-1].replace('.csv',''))

    model_performance_data.append(
        pd.Series(
            {
                'male_data':male_data,
                'tournament':NEAT_TOURNAMENT_NAME_MAP[tournament],
                'year':year,
                'model_name': "Actual Odds",
                'accuracy':accuracy,
                'brier_score':brier_score,
                'log_loss':log_loss
            }
        )
    )
# Calculate the CIs for the model
model_ci_data = bootstrap_model_level_cis(pd.concat(overall_model_dataframes),['accuracy','brier_score','log_loss'])
all_model_ci_data.append(model_ci_data)
    

In [131]:
print('Overall Model Performance:')
overall_model_performance_data = pd.concat(model_performance_data,axis=1).T

overall_model_performance_data.groupby('model_name')[['accuracy','brier_score','log_loss']].mean().sort_values(by='brier_score',ascending=True)

Overall Model Performance:


Unnamed: 0_level_0,accuracy,brier_score,log_loss
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Actual Odds,0.739314,0.174689,0.52342
Bookmaker Model,0.723847,0.187689,0.562003
Random Forest,0.704724,0.193158,0.566656
ELO,0.681102,0.198489,0.578891
Bradley-Terry,0.702475,0.204035,0.623661


In [132]:
print('CI Model Performance')
print(pd.concat(all_model_ci_data))

CI Model Performance
        model_name  accuracy_mean  accuracy_ci_lower  accuracy_ci_upper  \
0    Bradley-Terry       0.702475           0.685039           0.719910   
0              ELO       0.681102           0.663105           0.698819   
0    Random Forest       0.704724           0.686727           0.722160   
0  Bookmaker Model       0.723847           0.706412           0.741282   
0      Actual Odds       0.739314           0.721879           0.756187   
1  Bookmaker Model       0.723847           0.706412           0.741282   

   brier_score_mean  brier_score_ci_lower  brier_score_ci_upper  \
0          0.204035              0.194903              0.213237   
0          0.198489              0.192542              0.204550   
0          0.193158              0.186501              0.199771   
0          0.187689              0.180256              0.195274   
0          0.174689              0.167488              0.181829   
1          0.187689              0.180073          