## Load the data:

In [1]:
import warnings
import numpy as np
import pandas as pd
import os

# openpyxl emits UserWarnings while reading the workbooks; silence only those
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# The yearly workbooks live one directory above the notebook
# (point this at '.' instead if they sit next to the notebook)
data_dir = os.path.pardir

# One dataframe per season, collected in chronological order
dataframes = []

for year in range(2005, 2020):
    # Seasons up to and including 2012 are stored as .xls, later ones as .xlsx
    file_name = f'{year}.xls' if year <= 2012 else f'{year}.xlsx'
    file_path = os.path.join(data_dir, file_name)

    # Read the season's workbook and keep it for the final concatenation
    df = pd.read_excel(file_path)
    dataframes.append(df)

# Stack every season into a single frame with a fresh 0..n-1 index
betting_data = pd.concat(dataframes, ignore_index=True)

# Preview the combined frame
betting_data.head()


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,...,,,,,,,,,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,...,,,,,,,,,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,...,,,,,,,,,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,...,,,,,,,,,,


## Preprocess dataset

In [2]:
# Rank assigned to players whose ranking is missing or unparseable, so that
# they always compare as lower-ranked than any player with a real ranking.
UNRANKED_RANK = 100000

# Convert WRank and LRank to numeric (unparseable entries become NaN), then
# replace the NaNs with the placeholder rank. The result is assigned back to
# the column instead of calling fillna(inplace=True) on the selected Series:
# the in-place form is chained assignment, which warns on pandas 2.x and no
# longer updates the parent frame once Copy-on-Write is enabled.
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce').fillna(UNRANKED_RANK)
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce').fillna(UNRANKED_RANK)

# 1 when the winner had the strictly better (numerically smaller) rank, else 0.
# Equal ranks, including two unranked players, therefore count as 0.
betting_data['higher_rank_won'] = (betting_data['WRank'] < betting_data['LRank']).astype(int)


## Split the dataset

In [3]:
# Parse the 'Date' column into datetimes (values are expected as YYYY-MM-DD)
betting_data['Date'] = pd.to_datetime(betting_data['Date'], format='%Y-%m-%d')

# Cut-off: matches strictly before 1 January 2019 are used for training,
# matches on or after that date are held out for testing
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')

# Chronological train / test split
match_dates = betting_data['Date']
betting_data_train = betting_data.loc[match_dates < split_time]
betting_data_test = betting_data.loc[match_dates >= split_time]


## Naive Model

In [4]:
# Define function to calculate accuracy, log loss, and calibration
def calculate_metrics(df, training_set_accuracy):
    """Score the naive "higher-ranked player wins" model on a set of matches.

    The naive model assigns the same win probability to the higher-ranked
    player in every match; that constant is ``training_set_accuracy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matches to evaluate. Must contain a 0/1 column ``'higher_rank_won'``.
    training_set_accuracy : float
        Constant predicted probability (pi) that the higher-ranked player wins.

    Returns
    -------
    tuple
        ``(accuracy, log_loss, calibration)`` where

        * accuracy    is the mean of ``higher_rank_won`` in ``df``,
        * log_loss    is the mean binary cross-entropy of the constant
          prediction against ``higher_rank_won``,
        * calibration is expected wins divided by observed wins,
          ``pi * N / sum(higher_rank_won)``.

        All three are NaN when ``df`` has no rows.
    """
    N_test = len(df)
    if N_test == 0:
        # Nothing to score; the original formula would divide by zero here.
        return float('nan'), float('nan'), float('nan')

    w_test = df['higher_rank_won']
    naive_accuracy_test = w_test.mean()

    # For the naive model, pi is constant and equal to the training set accuracy
    pi_naive = training_set_accuracy

    # Keep pi strictly inside (0, 1) for the log terms only, so a degenerate
    # probability of exactly 0 or 1 yields a large finite loss instead of
    # inf/NaN from log(0). Ordinary probabilities are left untouched.
    eps = np.finfo(float).eps
    pi_for_log = np.clip(pi_naive, eps, 1 - eps)

    # Calculate log loss for the naive model on the testing set
    log_loss_naive = -1 / N_test * np.sum(
        w_test * np.log(pi_for_log) + (1 - w_test) * np.log(1 - pi_for_log)
    )

    # Calculate calibration for the naive model on the testing set
    # (uses the unclipped probability)
    calibration_naive = pi_naive * N_test / np.sum(w_test)

    return naive_accuracy_test, log_loss_naive, calibration_naive


In [5]:
# In-sample baseline on the entire dataset: the constant predicted probability
# is the dataset's own naive accuracy.
N = len(betting_data)
w = betting_data['higher_rank_won']
naive_accuracy = w.mean()
pi_naive = naive_accuracy

# Reuse calculate_metrics rather than repeating the log-loss and calibration
# formulas here, so both are defined in exactly one place. Its first return
# value (the accuracy) equals naive_accuracy and is dropped.
log_loss_naive, calibration_naive = calculate_metrics(betting_data, pi_naive)[1:]

# Store the results
validation_stats = pd.DataFrame({
    'model': ['naive'],
    'pred_acc': [naive_accuracy],
    'log_loss': [log_loss_naive],
    'calibration': [calibration_naive]
})


In [6]:
# Training split: its size and the share of matches won by the higher-ranked player
naive_accuracy_train = betting_data_train['higher_rank_won'].mean()
N_train = len(betting_data_train)

# Evaluate on the test split, using the training accuracy as the constant prediction
(naive_accuracy_test,
 log_loss_naive_test,
 calibration_naive_test) = calculate_metrics(betting_data_test, naive_accuracy_train)

# Single-row summary of the naive model's train and test performance
validation_stats_test = pd.DataFrame(dict([
    ('model', ['naive']),
    ('pred_acc_train', [naive_accuracy_train]),
    ('pred_acc_test', [naive_accuracy_test]),
    ('log_loss', [log_loss_naive_test]),
    ('calibration', [calibration_naive_test]),
]))

print(validation_stats_test)


   model  pred_acc_train  pred_acc_test  log_loss  calibration
0  naive        0.662804       0.613961  0.672167     1.079554


## Extension 1: Naive model on matches involving top-50 and top-100 players

In [7]:
# Work on the held-out test split (matches dated on or after the split date).
# Note: this rebinds the name `df`, which the loading loop also used.
df = betting_data_test

# Keep only matches that involve at least one player ranked inside the top N
def filter_top_players(df, top_n):
    """Return the rows of ``df`` where the winner or the loser is ranked <= ``top_n``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matches with numeric ``'WRank'`` and ``'LRank'`` columns.
    top_n : int
        Rank cut-off (inclusive).

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    winner_in_top = df['WRank'].le(top_n)
    loser_in_top = df['LRank'].le(top_n)
    return df.loc[winner_in_top | loser_in_top]

# Subsets of the test split with at least one top-50 / top-100 player per match
df_top_50 = filter_top_players(df, 50)
df_top_100 = filter_top_players(df, 100)

# Score the naive model on each subset; the constant prediction is still the
# accuracy measured on the training split
top_50_accuracy, top_50_log_loss, top_50_calibration = calculate_metrics(
    df_top_50, naive_accuracy_train
)
top_100_accuracy, top_100_log_loss, top_100_calibration = calculate_metrics(
    df_top_100, naive_accuracy_train
)

# One row per evaluation set: full test split, top 50, top 100
validation_stats_test = pd.DataFrame(
    [
        ('naive_test', naive_accuracy_test, log_loss_naive_test, calibration_naive_test),
        ('naive_top_50', top_50_accuracy, top_50_log_loss, top_50_calibration),
        ('naive_top_100', top_100_accuracy, top_100_log_loss, top_100_calibration),
    ],
    columns=['model', 'pred_acc_test', 'log_loss', 'calibration'],
)

# Show the in-sample baseline first, then the test-set comparison
print("Validation Stats (Entire Dataset):")
print(validation_stats)

print("\nValidation Stats (Test, Top 50, Top 100):")
print(validation_stats_test)


Validation Stats (Entire Dataset):
   model  pred_acc  log_loss  calibration
0  naive  0.659668  0.641255          1.0

Validation Stats (Test, Top 50, Top 100):
           model  pred_acc_test  log_loss  calibration
0     naive_test       0.613961  0.672167     1.079554
1   naive_top_50       0.634773  0.658102     1.044159
2  naive_top_100       0.615044  0.671435     1.077652
