## Import Libraries:

In [1]:
import warnings
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import os  # For interacting with the operating system
import matplotlib.pyplot as plt  # For plotting graphs
from sklearn.model_selection import ParameterGrid  # For generating parameter grid for hyperparameter tuning
from sklearn.metrics import log_loss  # For calculating log loss metric

# Suppress specific UserWarnings from the 'openpyxl' module to prevent cluttering the output
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')## Load the data:

## Load the data:

In [2]:
# Define the directory where your files are located
# data_dir = '.'  
data_dir = os.path.join(os.path.pardir) 

# List to hold the dataframes
dataframes = []

# Loop through the years and load the files
for year in range(2005, 2020):
    if year <= 2012:
        file_path = os.path.join(data_dir, f'{year}.xls')
    else:
        file_path = os.path.join(data_dir, f'{year}.xlsx')
    
    # Load the file into a dataframe
    df = pd.read_excel(file_path)
    
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all the dataframes into one
betting_data = pd.concat(dataframes, ignore_index=True)## Load the data:

## Fixing Anomalies

In [3]:
def is_column_numeric(df, column_name):
    # Check if the column contains only numeric values
    return df[column_name].apply(lambda x: str(x).isnumeric()).all()

In [4]:
# Check if columns are numeric before converting
anomaly_column = ['WRank', 'LRank', 'EXW']
for column in anomaly_column:
    if is_column_numeric(betting_data, column):
        print(f"Column '{column}' is numeric.\n")
    else:
        print(f"Column '{column}' is not numeric.\n")

Column 'WRank' is not numeric.

Column 'LRank' is not numeric.

Column 'EXW' is not numeric.



In [5]:
def find_non_numeric_values(df, column_name):
    # Function to check if a value is numeric
    def is_numeric(value):
        try:
            float(value)
            return True
        except ValueError:
            return False

    # Apply the function to the column and filter non-numeric values
    non_numeric_values = df[~df[column_name].apply(is_numeric)]

    # Display the non-numeric values
    print(f"Non-numeric values in {column_name}:")
    print(non_numeric_values[[column_name]])
    print()

In [6]:
# WRank column
find_non_numeric_values(betting_data, 'Wsets')

Non-numeric values in Wsets:
Empty DataFrame
Columns: [Wsets]
Index: []



In [7]:
# LRank column
find_non_numeric_values(betting_data, 'LRank')

Non-numeric values in LRank:
Empty DataFrame
Columns: [LRank]
Index: []



In [8]:
# EXW column
find_non_numeric_values(betting_data, 'EXW')

Non-numeric values in EXW:
        EXW
23776  2.,3



In [9]:
# Convert WRank and LRank to numeric, coercing errors
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce')
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce')

# Fill NaN values with a high number
betting_data['WRank'].fillna(100000, inplace=True)
betting_data['LRank'].fillna(100000, inplace=True)


# Correct the typo in row 38294, column 'EXW'
if betting_data.at[38294, 'EXW'] == '2.,3':
    betting_data.at[38294, 'EXW'] = '2.3'

## Preprocess Dataset:

In [10]:
# Create a new column 'higher_rank_won' to indicate if the higher-ranked player won the match
betting_data['higher_rank_won'] = (betting_data['WRank'] < betting_data['LRank']).astype(int)

# Calculate the points for the higher-ranked player:
# If the higher-ranked player won, use their points ('WPts');
# Otherwise, use the opponent's points ('LPts')
betting_data['higher_rank_points'] = (
    betting_data['higher_rank_won'] * betting_data['WPts'] + 
    betting_data['LPts'] * (1 - betting_data['higher_rank_won'])
)

# Calculate the points for the lower-ranked player:
# If the higher-ranked player lost, use their points ('WPts');
# Otherwise, use the opponent's points ('LPts')
betting_data['lower_rank_points'] = (
    (1 - betting_data['higher_rank_won']) * betting_data['WPts'] + 
    betting_data['LPts'] * betting_data['higher_rank_won']
)

# Fill any missing values in 'higher_rank_points' with 0 to avoid issues in further calculations
betting_data['higher_rank_points'].fillna(0, inplace=True)

# Fill any missing values in 'lower_rank_points' with 0 to ensure consistency in the dataset
betting_data['lower_rank_points'].fillna(0, inplace=True)

# Calculate the difference in sets won between the winner and the loser
betting_data['sets_difference'] = betting_data['Wsets'] - betting_data['Lsets']

# Filter the DataFrame to include only rows where the match status is 'Completed'
# This ensures that only fully played matches are considered in the analysis
betting_data = betting_data.loc[betting_data['Comment'] == 'Completed']

# Fill missing values in 'sets_difference' with the mean of the column
mean_value = betting_data['sets_difference'].mean()
betting_data['sets_difference'].fillna(mean_value, inplace=True)

# Create a copy of the betting_data DataFrame, named 'all_matches_k', for further processing
all_matches_k = betting_data.copy()

# List of columns to drop from the dataset as they are not needed for the analysis
columns_to_drop = [
    'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment',
    'CBW', 'CBL', 'IWW', 'IWL', 'B365W', 'B365L', 
    'EXW', 'EXL', 'PSW', 'PSL', 'WPts', 'LPts', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL',
    'MaxW', 'MaxL', 'AvgW', 'AvgL'
]

# Drop the unnecessary columns from 'all_matches_k' to simplify the dataset for analysis
all_matches_k = all_matches_k.drop(columns=columns_to_drop)

## Linear MOV model:

## Function Definitions and Setup:

In [11]:
def win_probability(E_i, E_j):
    """
    Calculate the probability of Player i winning against Player j based on their ratings.

    Parameters:
    E_i (float): The rating of Player i.
    E_j (float): The rating of Player j.

    Returns:
    float: The probability of Player i winning.
    """
    # Calculate and return the win probability using the Elo rating formula
    return 1 / (1 + 10 ** ((E_j - E_i) / 400))

In [12]:
def get_player_elo(player_name, surface=None, blend=False):
    """
    Retrieve the Elo rating of a player. Optionally, the Elo rating can be surface-specific
    or a blend of surface and general Elo ratings.

    Parameters:
    player_name (str): The name of the player.
    surface (str, optional): The surface type (e.g., 'clay', 'grass', 'hard'). Defaults to None.
    blend (bool, optional): Whether to use a blend of surface-specific and general Elo ratings. Defaults to False.

    Returns:
    float: The Elo rating of the player, either surface-specific, blended, or general.
    """
    # Check if surface-specific and blended Elo ratings should be used
    if surface and blend:
        # If the player is not in the blended surface-specific Elo dictionary, initialize with the initial Elo
        if player_name not in surface_elo_blend[surface]:
            surface_elo_blend[surface][player_name] = initial_elo
        return surface_elo_blend[surface][player_name]
    
    # Check if surface-specific Elo ratings should be used
    elif surface:
        # If the player is not in the surface-specific Elo dictionary, initialize with the initial Elo
        if player_name not in surface_elo[surface]:
            surface_elo[surface][player_name] = initial_elo
        return surface_elo[surface][player_name]
    
    # If no surface is specified, return the general Elo rating
    if player_name not in players_elo:
        # If the player is not in the general Elo dictionary, initialize with the initial Elo
        players_elo[player_name] = initial_elo
    
    return players_elo[player_name]

In [13]:
def set_player_elo(player_name, elo, surface=None, blend=False):
    """
    Set the Elo rating for a player. The rating can be set as surface-specific, 
    a blend of surface-specific and general, or general based on the parameters.

    Parameters:
    player_name (str): The name of the player whose Elo rating is being set.
    elo (float): The Elo rating to assign to the player.
    surface (str, optional): The surface type (e.g., 'clay', 'grass', 'hard'). Defaults to None.
    blend (bool, optional): Whether to set a blend of surface-specific and general Elo ratings. Defaults to False.

    Returns:
    None: The function updates the relevant Elo dictionary in place.
    """
    # Set the Elo rating for the player in the blended surface-specific Elo dictionary
    if surface and blend:
        surface_elo_blend[surface][player_name] = elo
    
    # Set the Elo rating for the player in the surface-specific Elo dictionary
    elif surface:
        surface_elo[surface][player_name] = elo
    
    # Set the Elo rating for the player in the general Elo dictionary
    else:
        players_elo[player_name] = elo

In [14]:
def expected_margin(E_i, E_j, sigma):
    """
    Calculate the expected margin of victory based on the Elo ratings of two players.

    Parameters:
    E_i (float): The Elo rating of Player i.
    E_j (float): The Elo rating of Player j.
    sigma (float): The scaling parameter that controls the spread of expected margins.

    Returns:
    float: The expected margin of victory for Player i over Player j.
    """
    return (E_i - E_j) / sigma

In [15]:
def update_elo_mov(E_i, E_j, K, actual_margin, sigma):
    """
    Update the Elo rating for a player based on the match's margin of victory.

    Parameters:
    E_i (float): The current Elo rating of player i.
    E_j (float): The Elo rating of the opponent, player j.
    K (float): The K-factor, which determines how much the Elo rating changes after a match.
    actual_margin (int): The actual margin of victory (e.g., difference in sets won).
    sigma (float): The scaling parameter for the expected margin.

    Returns:
    float: The updated Elo rating of player i.
    """
    expected_m = expected_margin(E_i, E_j, sigma)
    delta_E_i = K * (actual_margin - expected_m)
    return E_i + delta_E_i

In [16]:
def update_elo_and_probabilities_mov(df, K, sigma):
    """
    Update the Elo ratings and win probabilities for each match in the dataset based on the MOV model.

    Parameters:
    df (pandas.DataFrame): DataFrame containing match data. Expected columns:
                           - 'Winner': Name of the player who won the match.
                           - 'Loser': Name of the player who lost the match.
                           - 'sets_difference': The margin of victory in sets (e.g., 2-0 -> 2).
    K (float): The K-factor, which determines how much the Elo rating changes after a match.
    sigma (float): The scaling parameter for the expected margin.

    Returns:
    None: The DataFrame is modified in place with updated Elo ratings and probabilities.
    """
    for index, match in df.iterrows():
        winner_name, loser_name = match['Winner'], match['Loser']
        margin_of_victory = match['sets_difference']
        surface = match['Surface']
     
        # Retrieve current Elo ratings overall, for the surface, and the blended surface
        winner_elo_overall = get_player_elo(winner_name)
        loser_elo_overall = get_player_elo(loser_name)
        winner_elo_surface = get_player_elo(winner_name, surface)
        loser_elo_surface = get_player_elo(loser_name, surface)
        winner_elo_blend = get_player_elo(winner_name, surface, blend=True)
        loser_elo_blend = get_player_elo(loser_name, surface, blend=True)

        # Store initial Elo ratings
        df.at[index, 'winner_initial_elo_overall'] = winner_elo_overall
        df.at[index, 'loser_initial_elo_overall'] = loser_elo_overall
        df.at[index, 'winner_initial_elo_surface'] = winner_elo_surface
        df.at[index, 'loser_initial_elo_surface'] = loser_elo_surface
        df.at[index, 'winner_initial_elo_blend'] = winner_elo_blend
        df.at[index, 'loser_initial_elo_blend'] = loser_elo_blend
        
        # Calculate win probabilities
        df.at[index, 'prob_winner_overall'] = win_probability(winner_elo_overall, loser_elo_overall)
        df.at[index, 'prob_winner_surface'] = win_probability(winner_elo_surface, loser_elo_surface)
        df.at[index, 'prob_winner_blend'] = win_probability(winner_elo_blend, loser_elo_blend)
        
        # Determine match outcomes based on probability and who was expected to win (overall)
        if match['higher_rank_won']:
            df.at[index, 'match_outcome_overall'] = int(df.at[index, 'prob_winner_overall'] > 0.5)
            df.at[index, 'prob_high_ranked_overall'] = df.at[index, 'prob_winner_overall']
        else:
            df.at[index, 'match_outcome_overall'] = int((1 - df.at[index, 'prob_winner_overall']) > 0.5)
            df.at[index, 'prob_high_ranked_overall'] = 1 - df.at[index, 'prob_winner_overall']

        # Determine match outcomes based on probability and who was expected to win (surface)
        if match['higher_rank_won']:
            df.at[index, 'match_outcome_surface'] = int(df.at[index, 'prob_winner_surface'] > 0.5)
            df.at[index, 'prob_high_ranked_surface'] = df.at[index, 'prob_winner_surface']
        else:
            df.at[index, 'match_outcome_surface'] = int((1 - df.at[index, 'prob_winner_surface']) > 0.5)
            df.at[index, 'prob_high_ranked_surface'] = 1 - df.at[index, 'prob_winner_surface']

        # Determine match outcomes based on probability and who was expected to win (blend)
        if match['higher_rank_won']:
            df.at[index, 'match_outcome_blend'] = int(df.at[index, 'prob_winner_blend'] > 0.5)
            df.at[index, 'prob_high_ranked_blend'] = df.at[index, 'prob_winner_blend']
        else:
            df.at[index, 'match_outcome_blend'] = int((1 - df.at[index, 'prob_winner_blend']) > 0.5)
            df.at[index, 'prob_high_ranked_blend'] = 1 - df.at[index, 'prob_winner_blend']
       
        # Update Elo ratings overall
        new_winner_elo_overall = update_elo_mov(winner_elo_overall, loser_elo_overall, K, margin_of_victory, sigma)
        new_loser_elo_overall = update_elo_mov(loser_elo_overall, winner_elo_overall, K, -margin_of_victory, sigma)  # Negative margin for the loser
        set_player_elo(winner_name, new_winner_elo_overall)
        set_player_elo(loser_name, new_loser_elo_overall)
        
        # Update Elo ratings for the surface
        new_winner_elo_surface = update_elo_mov(winner_elo_surface, loser_elo_surface, K, margin_of_victory, sigma)
        new_loser_elo_surface = update_elo_mov(loser_elo_surface, winner_elo_surface, K, -margin_of_victory, sigma)  # Negative margin for the loser
        set_player_elo(winner_name, new_winner_elo_surface, surface)
        set_player_elo(loser_name, new_loser_elo_surface, surface)
        
        # Blended Elo ratings after the update
        new_winner_elo_blend = (new_winner_elo_overall + new_winner_elo_surface) / 2
        new_loser_elo_blend = (new_loser_elo_overall + new_loser_elo_surface) / 2

        # Update the blended Elo ratings in the surface_elo_blend dictionary
        set_player_elo(winner_name, new_winner_elo_blend, surface, blend=True)
        set_player_elo(loser_name, new_loser_elo_blend, surface, blend=True)

        # Store new Elo ratings
        df.at[index, 'winner_new_elo_overall'] = new_winner_elo_overall
        df.at[index, 'loser_new_elo_overall'] = new_loser_elo_overall
        df.at[index, 'winner_new_elo_surface'] = new_winner_elo_surface
        df.at[index, 'loser_new_elo_surface'] = new_loser_elo_surface
        df.at[index, 'winner_new_elo_blend'] = new_winner_elo_blend
        df.at[index, 'loser_new_elo_blend'] = new_loser_elo_blend

In [17]:
def evaluate_mov_model(df):
    """
    Evaluate the performance of the Linear MOV model by calculating log loss, accuracy, and calibration.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the actual outcomes and predicted probabilities.
                           Expected columns:
                           - 'higher_rank_won': Actual outcome (1 if higher-ranked player won, 0 otherwise).
                           - 'prob_high_ranked': Predicted probability of the higher-ranked player winning.
                           - 'match_outcome': Actual match outcome (can be used to calculate accuracy).

    Returns:
    tuple: A tuple containing:
           - logloss_value (float): The log loss of the predictions.
           - accuracy_value (float): The accuracy of the model.
           - calibration_value (float): The calibration metric for the predictions.
    """
    # Calculate log loss between the actual outcomes and predicted probabilities
    logloss_value = log_loss(df.higher_rank_won, df.prob_high_ranked)
    
    # Calculate accuracy by comparing predicted match outcome with the actual outcome
    accuracy_value = np.mean(df.match_outcome == df.higher_rank_won)
    
    # Calculate calibration by dividing the sum of predicted probabilities by the sum of actual outcomes
    calibration_value = np.sum(df.prob_high_ranked) / np.sum(df.higher_rank_won)
    
    # Return the calculated metrics
    return accuracy_value, calibration_value, logloss_value

## Implementation of best parameters:

In [18]:
# Initialize the initial Elo rating for all players
initial_elo = 1500

# Initialize an empty dictionary to store general Elo ratings for all players
players_elo = {}

# Initialize dictionaries to store surface-specific Elo ratings for different surfaces
surface_elo = {
    'Hard': {},  # Elo ratings for matches played on Hard courts
    'Clay': {},  # Elo ratings for matches played on Clay courts
    'Grass': {},  # Elo ratings for matches played on Grass courts
    'Carpet': {}  # Elo ratings for matches played on Carpet courts
}

# Initialize dictionaries to store blended Elo ratings (general + surface-specific) for different surfaces
surface_elo_blend = {
    'Hard': {},  # Blended Elo ratings for Hard courts
    'Clay': {},  # Blended Elo ratings for Clay courts
    'Grass': {},  # Blended Elo ratings for Grass courts
    'Carpet': {}  # Blended Elo ratings for Carpet courts
}

update_elo_and_probabilities_mov(all_matches_k, K=6, sigma=165)

## Split Dataset:

In [19]:
all_matches_k['Date'] = pd.to_datetime(all_matches_k['Date'], format='%Y-%m-%d')
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')
all_matches_k_train = all_matches_k[all_matches_k['Date'] < split_time]
all_matches_k_validation = all_matches_k[all_matches_k['Date'] >= split_time]

## Evaluate Model Performance:

In [20]:
def calculate_metrics(data, prob_col, outcome_col, actual_col):
    # Calculate accuracy
    accuracy = np.mean(data[outcome_col] == data[actual_col])
    print(f'Accuracy: {accuracy:.4f}')

    # Calculate calibration
    calibration = np.sum(data[prob_col]) / np.sum(data[actual_col])
    print(f'Calibration: {calibration:.4f}')

    # Define log loss function
    def logloss(actual, predictions):
        epsilon = 1e-15
        predictions = np.clip(predictions, epsilon, 1 - epsilon)
        
        log_loss_value = -(1 / len(actual)) * np.sum(
            actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
        return log_loss_value

    # Calculate log loss
    log_loss_value = logloss(data[actual_col], data[prob_col])
    print(f'Log Loss: {log_loss_value:.4f}')

    return accuracy, calibration, log_loss_value

In [21]:
print("\033[1mValidation Stats for Overall Elo:\033[0m")
accuracy_overall, calibration_overall, log_loss_overall = calculate_metrics(
    all_matches_k_validation, 'prob_high_ranked_overall', 'match_outcome_overall', 'higher_rank_won')

[1mValidation Stats for Overall Elo:[0m
Accuracy: 0.6319
Calibration: 1.0024
Log Loss: 0.6288


In [22]:
print("\033[1mValidation Stats for Surface-Specific Elo:\033[0m")
accuracy_surface, calibration_surface, log_loss_surface = calculate_metrics(
    all_matches_k_validation, 'prob_high_ranked_surface', 'match_outcome_surface', 'higher_rank_won')

[1mValidation Stats for Surface-Specific Elo:[0m
Accuracy: 0.6363
Calibration: 0.9646
Log Loss: 0.6272


In [23]:
print("\033[1mValidation Stats for Surface-Specific Elo - (Blend):\033[0m") 
accuracy_surface_blend, calibration_surface_blend, log_loss_surface_blend = calculate_metrics(
    all_matches_k_validation, 'prob_high_ranked_blend', 'match_outcome_blend', 'higher_rank_won')

[1mValidation Stats for Surface-Specific Elo - (Blend):[0m
Accuracy: 0.6407
Calibration: 0.9834
Log Loss: 0.6237


## Filtering Top 50 and Top 100 Ranking players from the dataset:

In [24]:
# Load your dataset
df = all_matches_k_validation

In [25]:
# Define function to filter dataset for top N players
def filter_top_players(df, top_n):
    df_top = df[(df['WRank'] <= top_n) | (df['LRank'] <= top_n)]
    return df_top

In [26]:
# Filter the dataset for top 50 and top 100 players
df_top_50 = filter_top_players(df, 50)
df_top_100 = filter_top_players(df, 100)

## Metrics - Top 50 & Top 100:

In [27]:
print("\033[1mMetrics - Top 50\033[0m")
accuracy_surface_blend_50, calibration_surface_blend_50, log_loss_surface_blend_50 = calculate_metrics(
    df_top_50, 'prob_high_ranked_blend', 'match_outcome_blend', 'higher_rank_won')

[1mMetrics - Top 50[0m
Accuracy: 0.6591
Calibration: 0.9944
Log Loss: 0.6054


In [28]:
print("\033[1mMetrics - Top 100\033[0m")
accuracy_surface_blend_100, calibration_surface_blend_100, log_loss_surface_blend_100 = calculate_metrics(
    df_top_100, 'prob_high_ranked_blend', 'match_outcome_blend', 'higher_rank_won')

[1mMetrics - Top 100[0m
Accuracy: 0.6426
Calibration: 0.9894
Log Loss: 0.6220


In [29]:
# Create a DataFrame to store the validation statistics
validation_stats = pd.DataFrame({
    'Model': [
        'Whole Dataset', 'Top 50', 'Top 100'
    ],
    'Accuracy': [
        accuracy_surface_blend, accuracy_surface_blend_50, accuracy_surface_blend_100
    ],
    'Log_Loss': [
        log_loss_surface_blend, log_loss_surface_blend_50, log_loss_surface_blend_100
    ],
    'Calibration': [
        calibration_surface_blend, calibration_surface_blend_50, calibration_surface_blend_100
    ]
})

# Print the validation statistics DataFrame
validation_stats

Unnamed: 0,Model,Accuracy,Log_Loss,Calibration
0,Whole Dataset,0.640669,0.623686,0.983432
1,Top 50,0.659117,0.605368,0.994424
2,Top 100,0.64256,0.622027,0.989449
