## Load the data:

In [1]:
import warnings
import numpy as np
import pandas as pd
import os

# Suppress specific UserWarnings from openpyxl
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# Define the directory where your files are located
# data_dir = '.'  
data_dir = os.path.join(os.path.pardir)  

# List to hold the dataframes
dataframes = []

# Loop through the years and load the files
for year in range(2005, 2020):
    if year <= 2012:
        file_path = os.path.join(data_dir, f'{year}.xls')
    else:
        file_path = os.path.join(data_dir, f'{year}.xlsx')
    
    # Load the file into a dataframe
    df = pd.read_excel(file_path)
    
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all the dataframes into one
betting_data = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined dataframe
betting_data.head()


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,...,,,,,,,,,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,...,,,,,,,,,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,...,,,,,,,,,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,...,,,,,,,,,,


## Fixing Anomalies

In [2]:
def is_column_numeric(df, column_name):
    # Check if the column contains only numeric values
    return df[column_name].apply(lambda x: str(x).isnumeric()).all()

# Check if columns are numeric before converting
anomaly_column = ['WRank', 'LRank', 'EXW']
for column in anomaly_column:
    if is_column_numeric(betting_data, column):
        print(f"Column '{column}' is numeric.\n")
    else:
        print(f"Column '{column}' is not numeric.\n")

def find_non_numeric_values(df, column_name):
    # Function to check if a value is numeric
    def is_numeric(value):
        try:
            float(value)
            return True
        except ValueError:
            return False

    # Apply the function to the column and filter non-numeric values
    non_numeric_values = df[~df[column_name].apply(is_numeric)]

    # Display the non-numeric values
    print(f"Non-numeric values in {column_name}:")
    print(non_numeric_values[[column_name]])

# WRank column
find_non_numeric_values(betting_data, 'WRank')

# LRank column
find_non_numeric_values(betting_data, 'LRank')

# EXW column
find_non_numeric_values(betting_data, 'EXW')

Column 'WRank' is not numeric.

Column 'LRank' is not numeric.

Column 'EXW' is not numeric.

Non-numeric values in WRank:
Empty DataFrame
Columns: [WRank]
Index: []
Non-numeric values in LRank:
Empty DataFrame
Columns: [LRank]
Index: []
Non-numeric values in EXW:
        EXW
23776  2.,3


In [3]:
# Convert WRank and LRank to numeric, coercing errors
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce')
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce')

# Fill NaN values with a high number
betting_data['WRank'].fillna(100000, inplace=True)
betting_data['LRank'].fillna(100000, inplace=True)

# Correct the typo in row 38294, column 'EXW'
if betting_data.at[38294, 'EXW'] == '2.,3':
    betting_data.at[38294, 'EXW'] = '2.3'


## Feature Engineering:

In [4]:
# Now perform the calculations
betting_data['higher_rank_won'] = (betting_data['WRank'] < betting_data['LRank']).astype(int)
betting_data['higher_rank_points'] = betting_data['higher_rank_won'] * betting_data['WPts'] + betting_data['LPts'] * (1 - betting_data['higher_rank_won'])
betting_data['lower_rank_points'] = (1 - betting_data['higher_rank_won']) * betting_data['WPts'] + betting_data['LPts'] * betting_data['higher_rank_won']


In [5]:
# Ensure all columns are displayed
pd.set_option('display.max_columns', None)

# Display the DataFrame (or any part of it)
betting_data


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,CBW,CBL,EXW,EXL,IWW,IWL,PSW,PSL,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,Baccanello P.,53.0,324.0,,,6.0,2.0,7,6,,,,,,,2.0,0.0,Completed,1.286,3.250,1.25,3.7,1.3,3.35,1.30,2.70,1.305,3.780,,,,,,,,,,,1,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Sluiter R.,72.0,82.0,,,6.0,3.0,6,1,,,,,,,2.0,0.0,Completed,1.833,1.833,1.85,1.9,1.8,1.95,1.75,1.75,1.990,1.840,,,,,,,,,,,1,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,Berdych T.,39.0,45.0,,,6.0,4.0,4,6,7,6,,,,,2.0,1.0,Completed,1.800,1.909,1.75,2.0,1.9,1.85,1.85,1.65,1.901,1.917,,,,,,,,,,,1,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,Dupuis A.,66.0,79.0,,,6.0,3.0,3,6,6,1,,,,,2.0,1.0,Completed,1.667,2.100,1.58,2.3,1.6,2.25,1.55,2.00,1.621,2.410,,,,,,,,,,,1,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,Arthurs W.,35.0,101.0,,,6.0,4.0,3,6,7,5,,,,,2.0,1.0,Completed,1.615,2.200,1.75,2.0,1.8,1.95,1.55,2.00,1.787,2.070,,,,,,,,,,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,Tsitsipas S.,1.0,6.0,9585.0,4000.0,6.0,7.0,6.0,4.0,7.0,5.0,,,,,2.0,1.0,Completed,1.440,2.750,,,,,,,1.390,3.260,,,,,,,1.48,3.30,1.41,2.93,1,9585.0,4000.0
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,Medvedev D.,7.0,4.0,2945.0,5705.0,6.0,4.0,7.0,6.0,,,,,,,2.0,0.0,Completed,1.900,1.900,,,,,,,2.140,1.790,,,,,,,2.24,2.06,1.92,1.90,0,5705.0,2945.0
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,Federer R.,6.0,3.0,4000.0,6190.0,6.0,3.0,6.0,4.0,,,,,,,2.0,0.0,Completed,3.500,1.300,,,,,,,3.750,1.330,,,,,,,3.75,1.40,3.39,1.33,0,6190.0,4000.0
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,Zverev A.,5.0,7.0,5025.0,2945.0,7.0,5.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.800,2.000,,,,,,,1.840,2.100,,,,,,,1.87,2.20,1.78,2.06,1,5025.0,2945.0


## Computing Missing Data using Mean

In [6]:
betting_data

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,CBW,CBL,EXW,EXL,IWW,IWL,PSW,PSL,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,Baccanello P.,53.0,324.0,,,6.0,2.0,7,6,,,,,,,2.0,0.0,Completed,1.286,3.250,1.25,3.7,1.3,3.35,1.30,2.70,1.305,3.780,,,,,,,,,,,1,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Sluiter R.,72.0,82.0,,,6.0,3.0,6,1,,,,,,,2.0,0.0,Completed,1.833,1.833,1.85,1.9,1.8,1.95,1.75,1.75,1.990,1.840,,,,,,,,,,,1,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,Berdych T.,39.0,45.0,,,6.0,4.0,4,6,7,6,,,,,2.0,1.0,Completed,1.800,1.909,1.75,2.0,1.9,1.85,1.85,1.65,1.901,1.917,,,,,,,,,,,1,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,Dupuis A.,66.0,79.0,,,6.0,3.0,3,6,6,1,,,,,2.0,1.0,Completed,1.667,2.100,1.58,2.3,1.6,2.25,1.55,2.00,1.621,2.410,,,,,,,,,,,1,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,Arthurs W.,35.0,101.0,,,6.0,4.0,3,6,7,5,,,,,2.0,1.0,Completed,1.615,2.200,1.75,2.0,1.8,1.95,1.55,2.00,1.787,2.070,,,,,,,,,,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,Tsitsipas S.,1.0,6.0,9585.0,4000.0,6.0,7.0,6.0,4.0,7.0,5.0,,,,,2.0,1.0,Completed,1.440,2.750,,,,,,,1.390,3.260,,,,,,,1.48,3.30,1.41,2.93,1,9585.0,4000.0
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,Medvedev D.,7.0,4.0,2945.0,5705.0,6.0,4.0,7.0,6.0,,,,,,,2.0,0.0,Completed,1.900,1.900,,,,,,,2.140,1.790,,,,,,,2.24,2.06,1.92,1.90,0,5705.0,2945.0
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,Federer R.,6.0,3.0,4000.0,6190.0,6.0,3.0,6.0,4.0,,,,,,,2.0,0.0,Completed,3.500,1.300,,,,,,,3.750,1.330,,,,,,,3.75,1.40,3.39,1.33,0,6190.0,4000.0
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,Zverev A.,5.0,7.0,5025.0,2945.0,7.0,5.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.800,2.000,,,,,,,1.840,2.100,,,,,,,1.87,2.20,1.78,2.06,1,5025.0,2945.0


In [7]:
# Define the column names for betting odds
betting_columns = ['CBW', 'CBL', 'IWW', 'IWL', 
                   'B365W', 'B365L', 'EXW', 'EXL', 
                   'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL']

# Ensure all columns are numeric and convert if necessary
for col in betting_columns:
    if not pd.api.types.is_numeric_dtype(betting_data[col]):
        print(f"Converting column {col} to numeric.\n")
        betting_data[col] = pd.to_numeric(betting_data[col], errors='coerce')

# Display the number of missing values in the betting odds columns
missing_values_count = betting_data[betting_columns].isnull().sum()
print(f'Missing values in betting columns:\n{missing_values_count}\n')

# Calculate the mean of the available betting odds for each column
mean_betting_odds = betting_data[betting_columns].mean()
print(f'Mean of available betting odds:\n{mean_betting_odds}\n')

# Impute the missing values with the mean using .loc
for col in betting_columns:
    betting_data.loc[betting_data[col].isnull(), col] = mean_betting_odds[col]

# Verify that there are no more missing values
missing_values_count_after = betting_data[betting_columns].isnull().sum()
print(f'Missing values in betting columns after imputation:\n{missing_values_count_after}')


Converting column EXW to numeric.

Missing values in betting columns:
CBW      32337
CBL      32337
IWW      37571
IWL      37571
B365W      547
B365L      524
EXW       3611
EXL       3605
PSW       3150
PSL       3150
UBW      29719
UBL      29719
LBW      12259
LBL      12248
SJW      24818
SJL      24811
dtype: int64

Mean of available betting odds:
CBW      1.825494
CBL      3.338149
IWW      1.680738
IWL      2.642355
B365W    1.828470
B365L    3.648420
EXW      1.802534
EXL      3.295159
PSW      1.930257
PSL      4.206489
UBW      1.815867
UBL      3.542479
LBW      1.810226
LBL      3.451461
SJW      1.796538
SJL      3.557943
dtype: float64

Missing values in betting columns after imputation:
CBW      0
CBL      0
IWW      0
IWL      0
B365W    0
B365L    0
EXW      0
EXL      0
PSW      0
PSL      0
UBW      0
UBL      0
LBW      0
LBL      0
SJW      0
SJL      0
dtype: int64


## Split the dataset

In [8]:
# Convert 'tourney_date' to datetime format 
betting_data['Date'] = pd.to_datetime(betting_data['Date'], format='%Y-%m-%d')

# Define the split date for January 1, 2019
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')

# Splitting the dataset into training and validation (test) sets
betting_data_2019 = betting_data[betting_data['Date'] >= split_time]
betting_data = betting_data[betting_data['Date'] < split_time]

# Create a copy of the dataset
betting_data_copy = betting_data.copy()
betting_data_2019_copy = betting_data_2019.copy()

## BCM Model (2000 - 2018)

In [9]:
def process_betting_data(df, betting_columns):
    # Calculate raw implied probabilities
    for col in betting_columns:
        df.loc[:, f'implied_{col}'] = 1 / df[col]

    # Normalize the probabilities for each bookmaker
    for w_col, l_col in zip(betting_columns[::2], betting_columns[1::2]):
        df.loc[:, f'normalized_{w_col}'] = df[f'implied_{w_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
        df.loc[:, f'normalized_{l_col}'] = df[f'implied_{l_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])

    # Calculate logit values for normalized probabilities and then the consensus probability
    logit_cols = []
    for col in betting_columns[::2]:  # Process only the winner columns
        logit_col = f'logit_normalized_{col}'
        df.loc[:, logit_col] = df[f'normalized_{col}'].apply(logit)
        logit_cols.append(logit_col)

    # Calculate the average logit for consensus probability
    df.loc[:, 'consensus_logit_W'] = df[logit_cols].mean(axis=1)
    df.loc[:, 'consensus_prob_W'] = df['consensus_logit_W'].apply(inv_logit)

    # Create the probability of higher-ranked player winning
    df.loc[:, 'prob_higher_rank_winning'] = df.apply(
        lambda row: row['consensus_prob_W'] if row['higher_rank_won'] == 1 else (1 - row['consensus_prob_W']), axis=1
    )

    # Create the outcome column
    df.loc[:, 'outcome'] = df['prob_higher_rank_winning'].apply(lambda x: 1 if x > 0.50 else 0)

    return df

In [10]:
# Function to calculate logit
def logit(p):
    p = np.clip(p, 1e-10, 1 - 1e-10)  # Ensure probabilities are within (0, 1)
    return np.log(p / (1 - p))

# Function to calculate inverse logit
def inv_logit(y):
    return np.exp(y) / (1 + np.exp(y))


In [11]:
betting_data = process_betting_data(betting_data, betting_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'normalized_{l_col}'] = df[f'implied_{l_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'normalized_{w_col}'] = df[f'implied_{w_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

## Evaluate Model Performance
#### Accuracy:  
Calculate the accuracy of your model predictions.

In [12]:
accuracy_bcm = np.mean(betting_data.outcome == betting_data.higher_rank_won)
print(f'Accuracy: {accuracy_bcm}')


Accuracy: 0.8364420456649999


#### Calibration:  
Assess the calibration of your model.

In [13]:
# Calculate calibration
calibration_bcm = np.sum(betting_data.prob_higher_rank_winning) / np.sum(betting_data.higher_rank_won)
print(f'Calibration: {calibration_bcm}')


Calibration: 0.9286653821767712


##### Log-loss:
Compute the log-loss for your predictions.

In [14]:
def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors


In [15]:
logloss_bcm = logloss(betting_data.higher_rank_won, betting_data.prob_higher_rank_winning)
print(f'Logloss: {logloss_bcm}')


Logloss: 0.48797186839899886


In [16]:
betting_data

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,CBW,CBL,EXW,EXL,IWW,IWL,PSW,PSL,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points,implied_CBW,implied_CBL,implied_IWW,implied_IWL,implied_B365W,implied_B365L,implied_EXW,implied_EXL,implied_PSW,implied_PSL,implied_UBW,implied_UBL,implied_LBW,implied_LBL,implied_SJW,implied_SJL,normalized_CBW,normalized_CBL,normalized_IWW,normalized_IWL,normalized_B365W,normalized_B365L,normalized_EXW,normalized_EXL,normalized_PSW,normalized_PSL,normalized_UBW,normalized_UBL,normalized_LBW,normalized_LBL,normalized_SJW,normalized_SJL,logit_normalized_CBW,logit_normalized_IWW,logit_normalized_B365W,logit_normalized_EXW,logit_normalized_PSW,logit_normalized_UBW,logit_normalized_LBW,logit_normalized_SJW,consensus_logit_W,consensus_prob_W,prob_higher_rank_winning,outcome
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,Baccanello P.,53.0,324.0,,,6.0,2.0,7,6,,,,,,,2.0,0.0,Completed,1.286,3.250,1.250000,3.700000,1.300000,3.350000,1.300000,2.700000,1.305,3.780,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.800000,0.270270,0.769231,0.370370,0.777605,0.307692,0.769231,0.298507,0.766284,0.264550,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.747475,0.252525,0.675000,0.325000,0.716490,0.283510,0.720430,0.279570,0.743363,0.256637,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,1.085189,0.730888,0.927118,0.946596,1.063521,0.668264,0.645346,0.683321,0.843780,0.699261,0.699261,1
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Sluiter R.,72.0,82.0,,,6.0,3.0,6,1,,,,,,,2.0,0.0,Completed,1.833,1.833,1.850000,1.900000,1.800000,1.950000,1.750000,1.750000,1.990,1.840,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.540541,0.526316,0.571429,0.571429,0.545554,0.545554,0.555556,0.512821,0.502513,0.543478,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.506667,0.493333,0.500000,0.500000,0.500000,0.500000,0.520000,0.480000,0.480418,0.519582,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.026668,0.000000,0.000000,0.080043,-0.078369,0.668264,0.645346,0.683321,0.253159,0.562954,0.562954,1
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,Berdych T.,39.0,45.0,,,6.0,4.0,4,6,7,6,,,,,2.0,1.0,Completed,1.800,1.909,1.750000,2.000000,1.900000,1.850000,1.850000,1.650000,1.901,1.917,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.571429,0.500000,0.540541,0.606061,0.555556,0.523834,0.526316,0.540541,0.526039,0.521648,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.533333,0.466667,0.471429,0.528571,0.514694,0.485306,0.493333,0.506667,0.502095,0.497905,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.133531,-0.114410,0.058793,-0.026668,0.008381,0.668264,0.645346,0.683321,0.257070,0.563916,0.563916,1
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,Dupuis A.,66.0,79.0,,,6.0,3.0,3,6,6,1,,,,,2.0,1.0,Completed,1.667,2.100,1.580000,2.300000,1.600000,2.250000,1.550000,2.000000,1.621,2.410,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.632911,0.434783,0.645161,0.500000,0.599880,0.476190,0.625000,0.444444,0.616903,0.414938,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.592784,0.407216,0.563380,0.436620,0.557473,0.442527,0.584416,0.415584,0.597867,0.402133,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.375484,0.254892,0.230912,0.340927,0.396584,0.668264,0.645346,0.683321,0.449466,0.610512,0.610512,1
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,Arthurs W.,35.0,101.0,,,6.0,4.0,3,6,7,5,,,,,2.0,1.0,Completed,1.615,2.200,1.750000,2.000000,1.800000,1.950000,1.550000,2.000000,1.787,2.070,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.571429,0.500000,0.645161,0.500000,0.619195,0.454545,0.555556,0.512821,0.559597,0.483092,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.533333,0.466667,0.563380,0.436620,0.576671,0.423329,0.520000,0.480000,0.536687,0.463313,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.133531,0.254892,0.309122,0.080043,0.147010,0.668264,0.645346,0.683321,0.365191,0.590297,0.590297,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Darcis S.,Carballes Baena R.,100000.0,73.0,,714.0,6.0,3.0,6.0,4.0,,,,,,,2.0,0.0,Completed,2.370,1.530,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,2.440,1.610,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,2.47,1.65,2.35,1.59,0,,,0.547797,0.299567,0.594977,0.378450,0.421941,0.653595,0.554774,0.303475,0.409836,0.621118,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.392308,0.607692,0.646402,0.353598,0.397531,0.602469,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,-0.437622,0.603261,-0.415764,0.668264,0.645346,0.683321,0.350351,0.586703,0.413297,0
37839,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Munar J.,Albot R.,81.0,98.0,663.0,592.0,6.0,2.0,7.0,6.0,,,,,,,2.0,0.0,Completed,2.000,1.720,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,1.940,1.940,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,2.08,1.95,1.94,1.86,1,663.0,592.0,0.547797,0.299567,0.594977,0.378450,0.500000,0.581395,0.554774,0.303475,0.515464,0.515464,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.462366,0.537634,0.646402,0.353598,0.500000,0.500000,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,-0.150823,0.603261,0.000000,0.668264,0.645346,0.683321,0.438172,0.607823,0.607823,1
37840,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Donskoy E.,Andujar P.,97.0,82.0,594.0,658.0,6.0,3.0,5.0,7.0,7.0,6.0,,,,,2.0,1.0,Completed,1.500,2.500,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,1.540,2.620,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.57,2.65,1.51,2.53,0,658.0,594.0,0.547797,0.299567,0.594977,0.378450,0.666667,0.400000,0.554774,0.303475,0.649351,0.381679,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.625000,0.375000,0.646402,0.353598,0.629808,0.370192,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,0.510826,0.603261,0.531392,0.668264,0.645346,0.683321,0.587302,0.642746,0.357254,0
37841,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Mmoh M.,Gunneswaran P.,103.0,110.0,563.0,521.0,7.0,5.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.660,2.100,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,1.810,2.100,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.83,2.17,1.74,2.09,1,563.0,521.0,0.547797,0.299567,0.594977,0.378450,0.602410,0.476190,0.554774,0.303475,0.552486,0.476190,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.558511,0.441489,0.646402,0.353598,0.537084,0.462916,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,0.235120,0.603261,0.148610,0.668264,0.645346,0.683321,0.504991,0.623631,0.623631,1


## BCM (2019)

In [17]:
# Define the column names for betting odds
betting_columns = ['B365W', 'B365L','PSW', 'PSL']

betting_data_2019 = process_betting_data(betting_data_2019, betting_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

## Evaluate Model Performance


In [18]:
#Calculate Accuracy
accuracy_2019 = np.mean(betting_data_2019.outcome == betting_data_2019.higher_rank_won)
print(f'Accuracy: {accuracy_2019}')

# Calculate calibration
calibration_2019 = np.sum(betting_data_2019.prob_higher_rank_winning) / np.sum(betting_data_2019.higher_rank_won)
print(f'Calibration: {calibration_2019}')

# Calculate logloss
def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

logloss_2019 = logloss(betting_data_2019.higher_rank_won, betting_data_2019.prob_higher_rank_winning)
print(f'Logloss: {logloss_2019}')


Accuracy: 0.6741226378711916
Calibration: 1.0231932252529874
Logloss: 0.5945646439823006


## Extension 1:

In [19]:
import pandas as pd

# Load your dataset
df = betting_data_copy

# Filter dataset for top 50 and top 100 players
def filter_top_players(df, top_n):
    df_top = df[(df['WRank'] <= top_n) & (df['LRank'] <= top_n)]
    return df_top

df_top_50 = filter_top_players(df, 50)
df_top_100 = filter_top_players(df, 100)


In [20]:
#Calucate BCM
betting_columns = ['CBW', 'CBL', 'IWW', 'IWL', 
                   'B365W', 'B365L', 'EXW', 'EXL', 
                   'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL']

def process_betting_data(df, betting_columns):
    # Calculate raw implied probabilities
    for col in betting_columns:
        df.loc[:, f'implied_{col}'] = 1 / df[col]

    # Normalize the probabilities for each bookmaker
    for w_col, l_col in zip(betting_columns[::2], betting_columns[1::2]):
        df.loc[:, f'normalized_{w_col}'] = df[f'implied_{w_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
        df.loc[:, f'normalized_{l_col}'] = df[f'implied_{l_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])

    # Calculate logit values for normalized probabilities and then the consensus probability
    logit_cols = []
    for col in betting_columns[::2]:  # Process only the winner columns
        logit_col = f'logit_normalized_{col}'
        df.loc[:, logit_col] = df[f'normalized_{col}'].apply(logit)
        logit_cols.append(logit_col)

    # Calculate the average logit for consensus probability
    df.loc[:, 'consensus_logit_W'] = df[logit_cols].mean(axis=1)
    df.loc[:, 'consensus_prob_W'] = df['consensus_logit_W'].apply(inv_logit)

    # Create the probability of higher-ranked player winning
    df.loc[:, 'prob_higher_rank_winning'] = df.apply(
        lambda row: row['consensus_prob_W'] if row['higher_rank_won'] == 1 else (1 - row['consensus_prob_W']), axis=1
    )

    # Create the outcome column
    df.loc[:, 'outcome'] = df['prob_higher_rank_winning'].apply(lambda x: 1 if x > 0.50 else 0)

    return df

In [21]:
df_top_50 = process_betting_data(df_top_50, betting_columns)
df_top_50

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'normalized_{w_col}'] = df[f'implied_{w_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'normalized_{l_col}'] = df[f'implied_{l_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,CBW,CBL,EXW,EXL,IWW,IWL,PSW,PSL,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points,implied_CBW,implied_CBL,implied_IWW,implied_IWL,implied_B365W,implied_B365L,implied_EXW,implied_EXL,implied_PSW,implied_PSL,implied_UBW,implied_UBL,implied_LBW,implied_LBL,implied_SJW,implied_SJL,normalized_CBW,normalized_CBL,normalized_IWW,normalized_IWL,normalized_B365W,normalized_B365L,normalized_EXW,normalized_EXL,normalized_PSW,normalized_PSL,normalized_UBW,normalized_UBL,normalized_LBW,normalized_LBL,normalized_SJW,normalized_SJL,logit_normalized_CBW,logit_normalized_IWW,logit_normalized_B365W,logit_normalized_EXW,logit_normalized_PSW,logit_normalized_UBW,logit_normalized_LBW,logit_normalized_SJW,consensus_logit_W,consensus_prob_W,prob_higher_rank_winning,outcome
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,Berdych T.,39.0,45.0,,,6.0,4.0,4,6,7,6,,,,,2.0,1.0,Completed,1.80,1.909,1.750000,2.000000,1.900000,1.850000,1.850000,1.650000,1.901,1.917,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.571429,0.500000,0.540541,0.606061,0.555556,0.523834,0.526316,0.540541,0.526039,0.521648,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.533333,0.466667,0.471429,0.528571,0.514694,0.485306,0.493333,0.506667,0.502095,0.497905,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.133531,-0.114410,0.058793,-0.026668,0.008381,0.668264,0.645346,0.683321,0.257070,0.563916,0.563916,1
19,1,Adelaide,Next Generation Hardcourts,2005-01-05,International,Outdoor,Hard,2nd Round,3,Johansson J.,Malisse X.,11.0,48.0,,,6.0,7.0,6,3,7,6,,,,,2.0,1.0,Completed,1.30,3.390,1.300000,3.400000,1.260000,3.590000,1.300000,2.700000,1.339,3.550,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.769231,0.294118,0.769231,0.370370,0.769231,0.294985,0.793651,0.278552,0.746826,0.281690,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.723404,0.276596,0.675000,0.325000,0.722814,0.277186,0.740206,0.259794,0.726120,0.273880,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.961411,0.730888,0.958466,1.047040,0.975025,0.668264,0.645346,0.683321,0.833720,0.697141,0.697141,1
26,1,Adelaide,Next Generation Hardcourts,2005-01-07,International,Outdoor,Hard,Quarterfinals,3,Chela J.I.,Melzer J.,26.0,39.0,,,7.0,6.0,5,7,6,2,,,,,2.0,1.0,Completed,1.90,1.800,1.880000,1.880000,1.900000,1.850000,1.900000,1.600000,1.820,2.020,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.531915,0.531915,0.526316,0.625000,0.526316,0.555556,0.526316,0.540541,0.549451,0.495050,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.500000,0.500000,0.457143,0.542857,0.486486,0.513514,0.493333,0.506667,0.526042,0.473958,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.000000,-0.171850,-0.054067,-0.026668,0.104261,0.668264,0.645346,0.683321,0.231076,0.557513,0.557513,1
27,1,Adelaide,Next Generation Hardcourts,2005-01-07,International,Outdoor,Hard,Quarterfinals,3,Dent T.,Hewitt L.,32.0,3.0,,,7.0,6.0,6,3,,,,,,,2.0,0.0,Completed,5.50,1.120,6.250000,1.110000,5.850000,1.100000,3.800000,1.150000,7.100,1.125,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,0,,,0.160000,0.900901,0.263158,0.869565,0.181818,0.892857,0.170940,0.909091,0.140845,0.888889,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.150815,0.849185,0.232323,0.767677,0.169184,0.830816,0.158273,0.841727,0.136778,0.863222,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,-1.728221,-1.195239,-1.591419,-1.671131,-1.842312,0.668264,0.645346,0.683321,-0.753924,0.319967,0.680033,1
29,1,Adelaide,Next Generation Hardcourts,2005-01-08,International,Outdoor,Hard,Semifinals,3,Dent T.,Chela J.I.,32.0,26.0,,,6.0,1.0,6,1,,,,,,,2.0,0.0,Completed,1.57,2.250,1.600000,2.300000,1.600000,2.300000,1.500000,2.200000,1.671,2.290,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,0,,,0.625000,0.434783,0.666667,0.454545,0.636943,0.444444,0.625000,0.434783,0.598444,0.436681,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.589744,0.410256,0.594595,0.405405,0.589005,0.410995,0.589744,0.410256,0.578137,0.421863,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.362905,0.382992,0.359855,0.362905,0.315130,0.668264,0.645346,0.683321,0.472590,0.615997,0.384003,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37776,67,London,Masters Cup,2018-11-16,Masters Cup,Indoor,Hard,Round Robin,3,Djokovic N.,Cilic M.,1.0,7.0,8045.0,4050.0,7.0,6.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.20,4.500,1.825494,3.338149,1.190000,5.500000,1.680738,2.642355,1.200,5.230,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.22,6.03,1.17,5.14,1,8045.0,4050.0,0.547797,0.299567,0.594977,0.378450,0.833333,0.222222,0.840336,0.181818,0.833333,0.191205,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.789474,0.210526,0.822123,0.177877,0.813375,0.186625,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,1.321756,1.530795,1.472090,0.668264,0.645346,0.683321,0.922197,0.715490,0.715490,1
37777,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Zverev A.,Federer R.,5.0,3.0,5085.0,6020.0,7.0,5.0,7.0,6.0,,,,,,,2.0,0.0,Completed,3.20,1.360,1.825494,3.338149,3.200000,1.420000,1.680738,2.642355,3.240,1.410,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,3.40,1.45,3.14,1.38,0,6020.0,5085.0,0.547797,0.299567,0.594977,0.378450,0.312500,0.735294,0.312500,0.704225,0.308642,0.709220,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.298246,0.701754,0.307359,0.692641,0.303226,0.696774,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,-0.855666,-0.812494,-0.831984,0.668264,0.645346,0.683321,0.069099,0.517268,0.482732,0
37778,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Djokovic N.,Anderson K.,1.0,6.0,8045.0,4310.0,6.0,2.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.11,7.000,1.825494,3.338149,1.140000,7.000000,1.680738,2.642355,1.120,7.720,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.15,7.72,1.12,6.52,1,8045.0,4310.0,0.547797,0.299567,0.594977,0.378450,0.900901,0.142857,0.877193,0.142857,0.892857,0.129534,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.863132,0.136868,0.859951,0.140049,0.873303,0.126697,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,1.841550,1.814882,1.930486,0.668264,0.645346,0.683321,1.079982,0.746490,0.746490,1
37779,67,London,Masters Cup,2018-11-18,Masters Cup,Indoor,Hard,The Final,3,Zverev A.,Djokovic N.,5.0,1.0,5085.0,8045.0,6.0,4.0,6.0,3.0,,,,,,,2.0,0.0,Completed,5.50,1.140,1.825494,3.338149,6.000000,1.170000,1.680738,2.642355,6.360,1.160,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,6.36,1.22,5.69,1.15,0,8045.0,5085.0,0.547797,0.299567,0.594977,0.378450,0.181818,0.877193,0.166667,0.854701,0.157233,0.862069,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.171687,0.828313,0.163180,0.836820,0.154255,0.845745,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,-1.573720,-1.634756,-1.701608,0.668264,0.645346,0.683321,-0.232144,0.442223,0.557777,1


In [22]:
#Calculate Accuracy
accuracy_top_50 = np.mean(df_top_50.outcome == df_top_50.higher_rank_won)
print(f'Accuracy: {accuracy_top_50}')

# Calculate calibration
calibration_top_50 = np.sum(df_top_50.prob_higher_rank_winning) / np.sum(df_top_50.higher_rank_won)
print(f'Calibration: {calibration_top_50}')

# Calculate logloss
def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

logloss_top_50 = logloss(df_top_50.higher_rank_won, df_top_50.prob_higher_rank_winning)
print(f'Logloss: {logloss_top_50}')


Accuracy: 0.8326984453824771
Calibration: 0.9261544270653103
Logloss: 0.49301020181382676


In [23]:
df_top_100 = process_betting_data(df_top_100, betting_columns)
df_top_100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'normalized_{l_col}'] = df[f'implied_{l_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'normalized_{w_col}'] = df[f'implied_{w_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,CBW,CBL,EXW,EXL,IWW,IWL,PSW,PSL,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points,implied_CBW,implied_CBL,implied_IWW,implied_IWL,implied_B365W,implied_B365L,implied_EXW,implied_EXL,implied_PSW,implied_PSL,implied_UBW,implied_UBL,implied_LBW,implied_LBL,implied_SJW,implied_SJL,normalized_CBW,normalized_CBL,normalized_IWW,normalized_IWL,normalized_B365W,normalized_B365L,normalized_EXW,normalized_EXL,normalized_PSW,normalized_PSL,normalized_UBW,normalized_UBL,normalized_LBW,normalized_LBL,normalized_SJW,normalized_SJL,logit_normalized_CBW,logit_normalized_IWW,logit_normalized_B365W,logit_normalized_EXW,logit_normalized_PSW,logit_normalized_UBW,logit_normalized_LBW,logit_normalized_SJW,consensus_logit_W,consensus_prob_W,prob_higher_rank_winning,outcome
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Sluiter R.,72.0,82.0,,,6.0,3.0,6,1,,,,,,,2.0,0.0,Completed,1.833,1.833,1.850000,1.900000,1.800000,1.950000,1.750000,1.750000,1.990,1.840,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.540541,0.526316,0.571429,0.571429,0.545554,0.545554,0.555556,0.512821,0.502513,0.543478,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.506667,0.493333,0.500000,0.500000,0.500000,0.500000,0.520000,0.480000,0.480418,0.519582,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.026668,0.000000,0.000000,0.080043,-0.078369,0.668264,0.645346,0.683321,0.253159,0.562954,0.562954,1
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,Berdych T.,39.0,45.0,,,6.0,4.0,4,6,7,6,,,,,2.0,1.0,Completed,1.800,1.909,1.750000,2.000000,1.900000,1.850000,1.850000,1.650000,1.901,1.917,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.571429,0.500000,0.540541,0.606061,0.555556,0.523834,0.526316,0.540541,0.526039,0.521648,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.533333,0.466667,0.471429,0.528571,0.514694,0.485306,0.493333,0.506667,0.502095,0.497905,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.133531,-0.114410,0.058793,-0.026668,0.008381,0.668264,0.645346,0.683321,0.257070,0.563916,0.563916,1
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,Dupuis A.,66.0,79.0,,,6.0,3.0,3,6,6,1,,,,,2.0,1.0,Completed,1.667,2.100,1.580000,2.300000,1.600000,2.250000,1.550000,2.000000,1.621,2.410,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.632911,0.434783,0.645161,0.500000,0.599880,0.476190,0.625000,0.444444,0.616903,0.414938,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.592784,0.407216,0.563380,0.436620,0.557473,0.442527,0.584416,0.415584,0.597867,0.402133,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.375484,0.254892,0.230912,0.340927,0.396584,0.668264,0.645346,0.683321,0.449466,0.610512,0.610512,1
5,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Kiefer N.,Schalken S.,21.0,57.0,,,6.0,2.0,2,1,,,,,,,1.0,0.0,Retired,1.333,3.000,1.360000,3.000000,1.480000,2.530000,1.400000,2.300000,1.408,3.100,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.735294,0.333333,0.714286,0.434783,0.750188,0.333333,0.675676,0.395257,0.710227,0.322581,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.688073,0.311927,0.621622,0.378378,0.692361,0.307639,0.630923,0.369077,0.687666,0.312334,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.791128,0.496437,0.811180,0.536177,0.789232,0.668264,0.645346,0.683321,0.677636,0.663211,0.663211,1
6,1,Adelaide,Next Generation Hardcourts,2005-01-04,International,Outdoor,Hard,1st Round,3,Malisse X.,Martin A.,48.0,68.0,,,6.0,2.0,7,6,,,,,,,2.0,0.0,Completed,1.500,2.500,1.570000,2.350000,1.450000,2.600000,1.450000,2.200000,1.575,2.540,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.636943,0.425532,0.689655,0.454545,0.666667,0.400000,0.689655,0.384615,0.634921,0.393701,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.599490,0.400510,0.602740,0.397260,0.625000,0.375000,0.641975,0.358025,0.617254,0.382746,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.403340,0.416894,0.510826,0.583948,0.477909,0.668264,0.645346,0.683321,0.548731,0.633841,0.633841,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37811,2,Doha,Qatar Exxon Mobil Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Basilashvili N.,Ramos-Vinolas A.,21.0,65.0,1795.0,790.0,6.0,1.0,1.0,6.0,6.0,1.0,,,,,2.0,1.0,Completed,1.400,2.750,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,1.490,2.810,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.50,3.03,1.45,2.75,1,1795.0,790.0,0.547797,0.299567,0.594977,0.378450,0.714286,0.363636,0.554774,0.303475,0.671141,0.355872,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.662651,0.337349,0.646402,0.353598,0.653488,0.346512,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,0.675129,0.603261,0.634408,0.668264,0.645346,0.683321,0.620717,0.650382,0.650382,1
37812,2,Doha,Qatar Exxon Mobil Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Rublev A.,Seppi A.,68.0,37.0,760.0,1106.0,7.0,5.0,6.0,1.0,,,,,,,2.0,0.0,Completed,1.500,2.500,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,1.480,2.830,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.57,2.83,1.50,2.59,0,1106.0,760.0,0.547797,0.299567,0.594977,0.378450,0.666667,0.400000,0.554774,0.303475,0.675676,0.353357,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.625000,0.375000,0.646402,0.353598,0.656613,0.343387,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,0.510826,0.603261,0.648235,0.668264,0.645346,0.683321,0.601907,0.646092,0.353908,0
37813,2,Doha,Qatar Exxon Mobil Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Marterer M.,Gojowczyk P.,74.0,59.0,706.0,855.0,6.0,1.0,6.0,4.0,,,,,,,2.0,0.0,Completed,1.900,1.800,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,2.200,1.740,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,2.23,1.82,2.09,1.74,0,855.0,706.0,0.547797,0.299567,0.594977,0.378450,0.526316,0.555556,0.554774,0.303475,0.454545,0.574713,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.486486,0.513514,0.646402,0.353598,0.441624,0.558376,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,-0.054067,0.603261,-0.234572,0.668264,0.645346,0.683321,0.420945,0.603709,0.396291,0
37839,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Munar J.,Albot R.,81.0,98.0,663.0,592.0,6.0,2.0,7.0,6.0,,,,,,,2.0,0.0,Completed,2.000,1.720,1.825494,3.338149,1.802534,3.295159,1.680738,2.642355,1.940,1.940,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,2.08,1.95,1.94,1.86,1,663.0,592.0,0.547797,0.299567,0.594977,0.378450,0.500000,0.581395,0.554774,0.303475,0.515464,0.515464,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.646472,0.353528,0.611219,0.388781,0.462366,0.537634,0.646402,0.353598,0.500000,0.500000,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.603566,0.452438,-0.150823,0.603261,0.000000,0.668264,0.645346,0.683321,0.438172,0.607823,0.607823,1


In [24]:
#Calculate Accuracy
accuracy_top_100 = np.mean(df_top_100.outcome == df_top_100.higher_rank_won)
print(f'Accuracy: {accuracy_top_100}')

# Calculate calibration
calibration_top_100 = np.sum(df_top_100.prob_higher_rank_winning) / np.sum(df_top_100.higher_rank_won)
print(f'Calibration: {calibration_top_100}')

# Calculate logloss
def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

logloss_top_100 = logloss(df_top_100.higher_rank_won, df_top_100.prob_higher_rank_winning)
print(f'Logloss: {logloss_top_100}')


Accuracy: 0.8325875446446899
Calibration: 0.9340747707662658
Logloss: 0.49362619835701793


### BCM : 2019

In [25]:
import pandas as pd

# Load your dataset
df_2019 = betting_data_2019_copy

# Filter dataset for top 50 and top 100 players
def filter_top_players(df, top_n):
    # Assuming you have a column 'rank' for player rankings
    df_top = df_2019[(df['WRank'] <= top_n) & (df['LRank'] <= top_n)]
    return df_top

df_top_50_2019 = filter_top_players(df_2019, 50)
df_top_100_2019 = filter_top_players(df_2019, 100)


In [26]:
# Define the column names for betting odds
betting_columns = ['B365W', 'B365L','PSW', 'PSL']

df_top_50_2019 = process_betting_data(df_top_50_2019, betting_columns)
df_top_100_2019 = process_betting_data(df_top_100_2019, betting_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'implied_{col}'] = 1 / df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, logit_col] = df[f'normalized_{col}'].apply(logit)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'consensus_logit_W'] = df[logit_cols].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'consensus_prob_W'] = df['consensus_logit_W'].apply(inv_logit)
A value is 

In [27]:
#Calculate Accuracy
accuracy_top_50_2019 = np.mean(df_top_50_2019.outcome == df_top_50_2019.higher_rank_won)
print(f'Accuracy: {accuracy_top_50_2019}')

# Calculate calibration
calibration_top_50_2019 = np.sum(df_top_50_2019.prob_higher_rank_winning) / np.sum(df_top_50_2019.higher_rank_won)
print(f'Calibration: {calibration_top_50_2019}')

# Calculate logloss
def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

logloss_top_50_2019 = logloss(df_top_50_2019.higher_rank_won, df_top_50_2019.prob_higher_rank_winning)
print(f'Logloss: {logloss_top_50_2019}')


Accuracy: 0.650381679389313
Calibration: 1.041422052501375
Logloss: 0.605936386676635


In [28]:
#Calculate Accuracy
accuracy_top_100_2019 = np.mean(df_top_100_2019.outcome == df_top_100_2019.higher_rank_won)
print(f'Accuracy: {accuracy_top_100_2019}')

# Calculate calibration
calibration_top_100_2019 = np.sum(df_top_100_2019.prob_higher_rank_winning) / np.sum(df_top_100_2019.higher_rank_won)
print(f'Calibration: {calibration_top_100_2019}')

# Calculate logloss
def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

logloss_top_100_2019 = logloss(df_top_100_2019.higher_rank_won, df_top_100_2019.prob_higher_rank_winning)
print(f'Logloss: {logloss_top_100_2019}')


Accuracy: 0.6617142857142857
Calibration: 1.034979269655389
Logloss: 0.6065928952027444


In [30]:
# Create a DataFrame to store the validation statistics
validation_stats = pd.DataFrame({
    'model': [
        'BCM(2000-2018)', 'BCM(2019)',
        'BCM(2000-2018) Top 50', 'BCM(2000-2018) Top 100',
        'BCM(2019) Top 50', 'BCM(2019) Top 100'
    ],
    'accuracy': [
        accuracy_bcm, accuracy_2019,
        accuracy_top_50, accuracy_top_100,
        accuracy_top_50_2019, accuracy_top_100_2019
    ],
    'log_loss': [
        logloss_bcm, logloss_2019,
        logloss_top_50, logloss_top_100,
        logloss_top_50_2019, logloss_top_100_2019
    ],
    'calibration': [
        calibration_bcm, calibration_2019,
        calibration_top_50, calibration_top_100,
        calibration_top_50_2019, calibration_top_100_2019
    ]
})

# Print the validation statistics DataFrame
print(validation_stats)


                    model  accuracy  log_loss  calibration
0          BCM(2000-2018)  0.836442  0.487972     0.928665
1               BCM(2019)  0.674123  0.594565     1.023193
2   BCM(2000-2018) Top 50  0.832698  0.493010     0.926154
3  BCM(2000-2018) Top 100  0.832588  0.493626     0.934075
4        BCM(2019) Top 50  0.650382  0.605936     1.041422
5       BCM(2019) Top 100  0.661714  0.606593     1.034979
