## Load the data:

In [1]:
import warnings
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt

# Suppress specific UserWarnings from openpyxl
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# Define the directory where your files are located
# data_dir = '.'  
data_dir = os.path.join(os.path.pardir) 

# List to hold the dataframes
dataframes = []

# Loop through the years and load the files
for year in range(2005, 2020):
    if year <= 2012:
        file_path = os.path.join(data_dir, f'{year}.xls')
    else:
        file_path = os.path.join(data_dir, f'{year}.xlsx')

    # Load the file into a dataframe
    df = pd.read_excel(file_path)

    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all the dataframes into one
betting_data = pd.concat(dataframes, ignore_index=True)


## Fixing Anomalies

In [2]:
def is_column_numeric(df, column_name):
    # Check if the column contains only numeric values
    return df[column_name].apply(lambda x: str(x).isnumeric()).all()

# Check if columns are numeric before converting
anomaly_column = ['WRank', 'LRank', 'EXW']
for column in anomaly_column:
    if is_column_numeric(betting_data, column):
        print(f"Column '{column}' is numeric.\n")
    else:
        print(f"Column '{column}' is not numeric.\n")

def find_non_numeric_values(df, column_name):
    # Function to check if a value is numeric
    def is_numeric(value):
        try:
            float(value)
            return True
        except ValueError:
            return False

    # Apply the function to the column and filter non-numeric values
    non_numeric_values = df[~df[column_name].apply(is_numeric)]

    # Display the non-numeric values
    print(f"Non-numeric values in {column_name}:")
    print(non_numeric_values[[column_name]], "\n")

# WRank column
find_non_numeric_values(betting_data, 'WRank')

# LRank column
find_non_numeric_values(betting_data, 'LRank')

# EXW column
find_non_numeric_values(betting_data, 'EXW')

Column 'WRank' is not numeric.

Column 'LRank' is not numeric.

Column 'EXW' is not numeric.

Non-numeric values in WRank:
Empty DataFrame
Columns: [WRank]
Index: [] 

Non-numeric values in LRank:
Empty DataFrame
Columns: [LRank]
Index: [] 

Non-numeric values in EXW:
        EXW
23776  2.,3 



In [3]:
# Convert WRank and LRank to numeric, coercing errors
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce')
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce')

# Fill NaN values with a high number
betting_data['WRank'].fillna(100000, inplace=True)
betting_data['LRank'].fillna(100000, inplace=True)


# Correct the typo in row 38294, column 'EXW'
if betting_data.at[38294, 'EXW'] == '2.,3':
    betting_data.at[38294, 'EXW'] = '2.3'


## Feature Engineering:

In [4]:
# Now perform the calculations
betting_data['higher_rank_won'] = (betting_data['WRank'] < betting_data['LRank']).astype(int)
betting_data['higher_rank_points'] = betting_data['higher_rank_won'] * betting_data['WPts'] + betting_data['LPts'] * (1 - betting_data['higher_rank_won'])
betting_data['lower_rank_points'] = (1 - betting_data['higher_rank_won']) * betting_data['WPts'] + betting_data['LPts'] * betting_data['higher_rank_won']
betting_data['higher_rank_points'].fillna(0, inplace=True)
betting_data['lower_rank_points'].fillna(0, inplace=True)

all_matches_538 = betting_data.copy()
# Columns to drop
columns_to_drop = [
    'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment',
    'CBW', 'CBL', 'IWW', 'IWL', 'B365W', 'B365L', 
    'EXW', 'EXL', 'PSW', 'PSL', 'WPts', 'LPts', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL',
    'MaxW', 'MaxL', 'AvgW', 'AvgL'
]

# Drop the columns
all_matches_538 = all_matches_538.drop(columns=columns_to_drop)

## ELO Setup:

## 538 Model

In [5]:
import pandas as pd

# Initialize Elo ratings
initial_elo = 1500
players_elo = {}
surface_elo = {'Hard': {}, 'Clay': {}, 'Grass': {}, 'Carpet': {}}

def win_probability(E_i, E_j):
    return 1 / (1 + 10 ** ((E_j - E_i) / 400))

def get_player_elo(player_name, surface=None):
    if surface:
        if player_name not in surface_elo[surface]:
            surface_elo[surface][player_name] = initial_elo
        return surface_elo[surface][player_name]
    if player_name not in players_elo:
        players_elo[player_name] = initial_elo
    return players_elo[player_name]

def set_player_elo(player_name, elo, surface=None):
    if surface:
        surface_elo[surface][player_name] = elo
    else:
        players_elo[player_name] = elo

players_games_played = {}

def get_games_played(player_name):
    if player_name not in players_games_played:
        players_games_played[player_name] = 0
    return players_games_played[player_name]

def increment_games_played(player_name):
    players_games_played[player_name] = players_games_played.get(player_name, 0) + 1

def update_elo_538(E_i, E_j, outcome, games_played_i):
    K = 100 / (games_played_i + 10) ** 0.15
    pi_j = win_probability(E_i, E_j)  # Calculate win probability for player i against player j
    if outcome == 'win':
        delta_E_i = K * (1 - pi_j)  # Elo rating change for a win
        new_E_i = E_i + delta_E_i
    elif outcome == 'loss':
        delta_E_i = K * (pi_j - 1)  # Elo rating change for a loss
        new_E_i = E_j + delta_E_i
    else:
        raise ValueError("Outcome must be 'win' or 'loss'")
    return new_E_i

def update_elo_538_param(E_i, E_j, outcome, games_played_i, delta, nu, sigma):
    K = delta / (games_played_i + nu) ** sigma
    pi_j = win_probability(E_i, E_j)  # Calculate win probability for player i against player j
    if outcome == 'win':
        delta_E_i = K * (1 - pi_j)  # Elo rating change for a win
        new_E_i = E_i + delta_E_i
    elif outcome == 'loss':
        delta_E_i = K * (pi_j - 1)  # Elo rating change for a loss
        new_E_i = E_j + delta_E_i
    else:
        raise ValueError("Outcome must be 'win' or 'loss'")
    return new_E_i

def update_elo_and_probabilities_538_param(df, delta, nu, sigma):
    for index, match in df.iterrows():
        winner_name, loser_name = match['Winner'], match['Loser']
        surface = match['Surface']

        # Retrieve current Elo ratings overall and for the surface
        winner_elo_overall = get_player_elo(winner_name)
        loser_elo_overall = get_player_elo(loser_name)
        winner_elo_surface = get_player_elo(winner_name, surface)
        loser_elo_surface = get_player_elo(loser_name, surface)

        # Store initial Elo ratings
        df.at[index, 'winner_initial_elo_overall'] = winner_elo_overall
        df.at[index, 'loser_initial_elo_overall'] = loser_elo_overall
        df.at[index, 'winner_initial_elo_surface'] = winner_elo_surface
        df.at[index, 'loser_initial_elo_surface'] = loser_elo_surface

        # Get the number of games played by each player
        games_played_winner = get_games_played(winner_name)
        games_played_loser = get_games_played(loser_name)

        # Calculate win probabilities
        df.at[index, 'prob_winner_overall'] = win_probability(winner_elo_overall, loser_elo_overall)
        df.at[index, 'prob_winner_surface'] = win_probability(winner_elo_surface, loser_elo_surface)

        # Determine match outcomes based on probability and who was expected to win (overall)
        if match['higher_rank_won']:
            df.at[index, 'match_outcome_overall'] = int(df.at[index, 'prob_winner_overall'] > 0.5)
            df.at[index, 'prob_high_ranked_overall'] = df.at[index, 'prob_winner_overall']
        else:
            df.at[index, 'match_outcome_overall'] = int((1 - df.at[index, 'prob_winner_overall']) > 0.5)
            df.at[index, 'prob_high_ranked_overall'] = 1 - df.at[index, 'prob_winner_overall']

        # Determine match outcomes based on probability and who was expected to win (surface)
        if match['higher_rank_won']:
            df.at[index, 'match_outcome_surface'] = int(df.at[index, 'prob_winner_surface'] > 0.5)
            df.at[index, 'prob_high_ranked_surface'] = df.at[index, 'prob_winner_surface']
        else:
            df.at[index, 'match_outcome_surface'] = int((1 - df.at[index, 'prob_winner_surface']) > 0.5)
            df.at[index, 'prob_high_ranked_surface'] = 1 - df.at[index, 'prob_winner_surface']

        # Update Elo ratings overall
        new_winner_elo_overall = update_elo_538_param(winner_elo_overall, loser_elo_overall, 'win', games_played_winner, delta, nu, sigma)
        new_loser_elo_overall = update_elo_538_param(winner_elo_overall, loser_elo_overall, 'loss', games_played_loser, delta, nu, sigma)
        set_player_elo(winner_name, new_winner_elo_overall)
        set_player_elo(loser_name, new_loser_elo_overall)

        # Update Elo ratings for the surface
        new_winner_elo_surface = update_elo_538_param(winner_elo_surface, loser_elo_surface, 'win', games_played_winner, delta, nu, sigma)
        new_loser_elo_surface = update_elo_538_param(winner_elo_surface, loser_elo_surface, 'loss', games_played_loser, delta, nu, sigma)
        set_player_elo(winner_name, new_winner_elo_surface, surface)
        set_player_elo(loser_name, new_loser_elo_surface, surface)

        # Store new Elo ratings
        df.at[index, 'winner_new_elo_overall'] = new_winner_elo_overall
        df.at[index, 'loser_new_elo_overall'] = new_loser_elo_overall
        df.at[index, 'winner_new_elo_surface'] = new_winner_elo_surface
        df.at[index, 'loser_new_elo_surface'] = new_loser_elo_surface

        # Add Elo ratings for all surfaces
        for surf in ['Hard', 'Clay', 'Grass', 'Carpet']:
            df.at[index, f'winner_elo_{surf}'] = get_player_elo(winner_name, surf)
            df.at[index, f'loser_elo_{surf}'] = get_player_elo(loser_name, surf)

        # Increment the number of games played
        increment_games_played(winner_name)
        increment_games_played(loser_name)

# Update Elo ratings based on the selected best parameters
players_elo = {}
surface_elo = {'Hard': {}, 'Clay': {}, 'Grass': {}, 'Carpet': {}}
players_games_played = {}

update_elo_and_probabilities_538_param(all_matches_538, 120, 25, 0.35)


## Split Dataset:

In [6]:
all_matches_538['Date'] = pd.to_datetime(all_matches_538['Date'], format='%Y-%m-%d')
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')
all_matches_538_train = all_matches_538[all_matches_538['Date'] < split_time]
all_matches_538_validation = all_matches_538[all_matches_538['Date'] >= split_time]


In [9]:
all_matches_538_validation

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,winner_new_elo_surface,loser_new_elo_surface,winner_elo_Hard,loser_elo_Hard,winner_elo_Clay,loser_elo_Clay,winner_elo_Grass,loser_elo_Grass,winner_elo_Carpet,loser_elo_Carpet
37785,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kudla D.,...,1475.079278,1585.979391,1475.079278,1585.979391,1454.214112,1517.926081,1519.283762,1438.680112,1500.000000,1500.0
37786,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Chardy J.,...,1604.415088,1605.362726,1604.415088,1605.362726,1584.931325,1493.590081,1531.862248,1425.752134,1504.068234,1500.0
37787,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Murray A.,...,1972.934009,1478.817245,1972.934009,1478.817245,1821.674934,1465.390817,1792.340833,1451.134231,1604.593435,1500.0
37788,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kyrgios N.,...,1772.979267,1618.278177,1772.979267,1618.278177,1630.638119,1458.046079,1608.193785,1439.424557,1500.000000,1500.0
37789,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Tsonga J.W.,...,1789.179466,1511.519150,1789.179466,1511.519150,1704.193192,1471.714568,1675.966296,1488.689637,1584.911628,1500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,1520.776411,1517.589064,1520.776411,1517.589064,1500.000000,1500.000000,1500.000000,1500.000000,1500.000000,1500.0
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,1517.085622,1444.548903,1517.085622,1444.548903,1500.000000,1500.000000,1500.000000,1500.000000,1500.000000,1500.0
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,1539.159052,1551.156930,1539.159052,1551.156930,1500.000000,1500.000000,1500.000000,1500.000000,1500.000000,1500.0
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,1541.747292,1498.733451,1541.747292,1498.733451,1500.000000,1500.000000,1500.000000,1500.000000,1500.000000,1500.0


## Evaluate Model Performance


In [10]:
def calculate_metrics(data, prob_col, outcome_col, actual_col):
    # Calculate accuracy
    accuracy = np.mean(data[outcome_col] == data[actual_col])
    print(f'Accuracy: {accuracy:.4f}')

    # Calculate calibration
    calibration = np.sum(data[prob_col]) / np.sum(data[actual_col])
    print(f'Calibration: {calibration:.4f}')

    # Define log loss function
    def logloss(actual, predictions):
        epsilon = 1e-15
        predictions = np.clip(predictions, epsilon, 1 - epsilon)
        
        log_loss_value = -(1 / len(actual)) * np.sum(
            actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
        return log_loss_value

    # Calculate log loss
    log_loss_value = logloss(data[actual_col], data[prob_col])
    print(f'Log Loss: {log_loss_value:.4f}')

    return accuracy, calibration, log_loss_value

# Example usage with a dataset
print("Validation Stats for Overall Elo:")
accuracy_overall, calibration_overall, log_loss_overall = calculate_metrics(
    all_matches_538_validation, 'prob_high_ranked_overall', 'match_outcome_overall', 'higher_rank_won')

print("\nValidation Stats for Surface-Specific Elo:")
accuracy_surface, calibration_surface, log_loss_surface = calculate_metrics(
    all_matches_538_validation, 'prob_high_ranked_surface', 'match_outcome_surface', 'higher_rank_won')


Validation Stats for Overall Elo:
Accuracy: 0.6302
Calibration: 1.0324
Log Loss: 0.6330

Validation Stats for Surface-Specific Elo:
Accuracy: 0.6309
Calibration: 0.9753
Log Loss: 0.6323


In [12]:
# Create a DataFrame to store the validation statistics
validation_stats = pd.DataFrame({
    'model': [
        'Overall Elo', 'Surface-Specific Elo'
    ],
    'accuracy': [
        accuracy_overall, accuracy_surface
    ],
    'log_loss': [
        log_loss_overall, log_loss_surface
    ],
    'calibration': [
        calibration_overall, calibration_surface
    ]
})

# Print the validation statistics DataFrame
print(validation_stats)

                  model  accuracy  log_loss  calibration
0           Overall Elo  0.630158  0.633003     1.032392
1  Surface-Specific Elo  0.630929  0.632274     0.975266


## Optimisation: 