## Import Libraries:

In [1]:
import warnings
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import os  # For interacting with the operating system

# Suppress specific UserWarnings from openpyxl
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

## Load the data:

In [2]:
# Define the directory where your files are located
# data_dir = '.' 
data_dir = os.path.join(os.path.pardir)  

In [3]:
# List to hold the dataframes
dataframes = []

# Loop through the years and load the files
for year in range(2005, 2020):
    if year <= 2012:
        file_path = os.path.join(data_dir, f'{year}.xls')
    else:
        file_path = os.path.join(data_dir, f'{year}.xlsx')
    
    # Load the file into a dataframe
    df = pd.read_excel(file_path)
    
    # Append the dataframe to the list
    dataframes.append(df)

In [4]:
# Concatenate all the dataframes into one
betting_data = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined dataframe
betting_data.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,...,,,,,,,,,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,...,,,,,,,,,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,...,,,,,,,,,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,...,,,,,,,,,,


## Preprocess dataset:

In [5]:
# Convert WRank and LRank to numeric, coercing errors
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce')
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce')

# Fill NaN values in WRank and LRank with a high number to represent unranked players
betting_data['WRank'].fillna(100000, inplace=True)
betting_data['LRank'].fillna(100000, inplace=True)

# Create a new column 'higher_rank_won' where 1 indicates the higher-ranked player won, and 0 otherwise
betting_data['higher_rank_won'] = (betting_data['WRank'] < betting_data['LRank']).astype(int)

# Filter the betting_data to include only rows where the match status is 'Completed'
betting_data = betting_data.loc[betting_data['Comment'] == 'Completed']

## Split the dataset:

In [6]:
# Convert 'tourney_date' to datetime format 
betting_data['Date'] = pd.to_datetime(betting_data['Date'], format='%Y-%m-%d')

# Define the split date for January 1, 2019
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')

In [7]:
# Splitting the dataset into training and validation (test) sets
betting_data_train = betting_data[betting_data['Date'] < split_time]
betting_data_test = betting_data[betting_data['Date'] >= split_time]

## Naive Model:

## Function Definitions and Setup:

In [8]:
def calculate_metrics(betting_data, naive_accuracy=True, train_accuracy=None):
    """
    Calculate metrics (naive accuracy, log loss, and calibration) for the dataset.
    
    Parameters:
    betting_data (pd.DataFrame): A DataFrame containing the dataset.
    naive_accuracy (bool): If True, calculate naive accuracy for the provided dataset.
                           If False, use the accuracy provided in `train_accuracy` for calibration.
    train_accuracy (float): The accuracy from the training set to be used for calibration.
                            Required if `naive_accuracy` is False.

    Returns:
    float: Naive accuracy (only if naive_accuracy=True).
    float: Log loss for the dataset.
    float: Calibration for the dataset.
    """
    # Number of observations
    N = len(betting_data)
    w = betting_data['higher_rank_won']

    if naive_accuracy:
        # Calculate naive accuracy
        accuracy_naive = w.mean()
        # Calculate log loss
        log_loss_naive = -1 / N * np.sum(w * np.log(accuracy_naive) + (1 - w) * np.log(1 - accuracy_naive))
        # Calculate calibration
        calibration_naive = accuracy_naive * N / np.sum(w)
        return accuracy_naive, log_loss_naive, calibration_naive
    else:
        if train_accuracy is None:
            raise ValueError("train_accuracy must be provided when naive_accuracy is set to False.")
        
        # Calculate naive accuracy
        accuracy_naive = w.mean()
        # Calculate log loss
        log_loss_naive = -1 / N * np.sum(w * np.log(accuracy_naive) + (1 - w) * np.log(1 - accuracy_naive))
        # Calculate calibration
        calibration_naive = train_accuracy * N / np.sum(w)
        return accuracy_naive, log_loss_naive, calibration_naive

In [9]:
def calculate_naive_metrics(betting_data, naive_accuracy=True):
    """
    Calculate naive accuracy, log loss, and calibration on the entire dataset.

    Parameters:
    betting_data (pd.DataFrame): A DataFrame containing the dataset.

    Returns:
    naive_accuracy (float): The naive accuracy of the dataset.
    log_loss_naive (float): The log loss for the naive model.
    calibration_naive (float): The calibration for the naive model.
    """
    # Calculate naive accuracy
    N = len(betting_data)
    naive_accuracy = betting_data['higher_rank_won'].mean()
    w = betting_data['higher_rank_won']
    pi_naive = naive_accuracy

    # Calculate log loss
    log_loss_naive = -1 / N * np.sum(w * np.log(pi_naive) + (1 - w) * np.log(1 - pi_naive))

    # Calculate calibration
    calibration_naive = pi_naive * N / np.sum(w)

    # Return the results
    return naive_accuracy, log_loss_naive, calibration_naive

## Naive Model - Training dataset:

In [10]:
# Calculate metrics for the training set
naive_accuracy_train, log_loss_naive_train, calibration_naive_train = calculate_metrics(betting_data_train, naive_accuracy=True)

# Store the results
validation_stats_train = pd.DataFrame({
    'Model': ['Naive_Train'],
    'Accuracy': [naive_accuracy_train],
    'Log_Loss': [log_loss_naive_train],
    'Calibration': [calibration_naive_train]
})

validation_stats_train

Unnamed: 0,Model,Accuracy,Log_Loss,Calibration
0,Naive_Train,0.668079,0.63553,1.0


## Naive Model - Validation dataset:

In [11]:
# Calculate metrics for the testing set
naive_accuracy_test, log_loss_naive_test, calibration_naive_test = calculate_metrics(betting_data_test, naive_accuracy=False, train_accuracy=naive_accuracy_train)

# Store the results
validation_stats_test = pd.DataFrame({
    'Model': ['Naiv_Test'],
    'Accuracy': [naive_accuracy_test],
    'Log_Loss': [log_loss_naive_test],
    'Calibration': [calibration_naive_test]
})

validation_stats_test

Unnamed: 0,Model,Accuracy,Log_Loss,Calibration
0,Naiv_Test,0.613609,0.667106,1.08877


## Filtering Top 50 and Top 100 Ranking players from the dataset

In [12]:
# Load your dataset
df = betting_data_test

# Define function to filter dataset for top N players
def filter_top_players(df, top_n):
    df_top = df[(df['WRank'] <= top_n) | (df['LRank'] <= top_n)]
    return df_top

# Filter the dataset for top 50 and top 100 players
df_top_50 = filter_top_players(df, 50)
df_top_100 = filter_top_players(df, 100)

In [13]:
# Calculate metrics for top 50 players
top_50_accuracy, top_50_log_loss, top_50_calibration = calculate_metrics(df_top_50, naive_accuracy=False, train_accuracy=naive_accuracy_train)

In [14]:
# Calculate metrics for top 100 players
top_100_accuracy, top_100_log_loss, top_100_calibration = calculate_metrics(df_top_100, naive_accuracy=False, train_accuracy=naive_accuracy_train)

## Metrics - Top 50 & Top 100:

In [15]:
# Store the results for the testing set, top 50, and top 100
validation_stats_test = pd.DataFrame({
    'Model': ['Naive_Test', 'Naive_Top_50', 'Naive_Top_100'],
    'Accuracy': [naive_accuracy_test, top_50_accuracy, top_100_accuracy],
    'Log_Loss': [log_loss_naive_test, top_50_log_loss, top_100_log_loss],
    'Calibration': [calibration_naive_test, top_50_calibration, top_100_calibration]
})

validation_stats_test

Unnamed: 0,Model,Accuracy,Log_Loss,Calibration
0,Naive_Test,0.613609,0.667106,1.08877
1,Naive_Top_50,0.636467,0.655424,1.049669
2,Naive_Top_100,0.614713,0.666593,1.086815
