## Load the data:

In [1]:
import numpy as np
import pandas as pd
import os

# Folder that holds the yearly Excel exports; change it if the files live elsewhere
data_dir = '.'

# Collects one dataframe per season, in chronological order
dataframes = []

# Seasons 2000-2018: files up to and including 2012 are .xls, later ones are .xlsx
for year in range(2000, 2019):
    file_path = os.path.join(data_dir, f'{year}.xls' if year <= 2012 else f'{year}.xlsx')
    df = pd.read_excel(file_path)
    dataframes.append(df)

# Stack all seasons into a single frame with a fresh 0..n-1 index
betting_data = pd.concat(dataframes, ignore_index=True)

# Preview the first rows of the combined frame
betting_data.head()


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,...,,,,,,,,,,
1,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,...,,,,,,,,,,
3,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Federer R.,...,,,,,,,,,,
4,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,...,,,,,,,,,,


In [2]:
# Summarise column dtypes and non-null counts of the combined frame
betting_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52298 entries, 0 to 52297
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         52298 non-null  int64         
 1   Location    52298 non-null  object        
 2   Tournament  52298 non-null  object        
 3   Date        52298 non-null  datetime64[ns]
 4   Series      52298 non-null  object        
 5   Court       52298 non-null  object        
 6   Surface     52298 non-null  object        
 7   Round       52298 non-null  object        
 8   Best of     52298 non-null  int64         
 9   Winner      52298 non-null  object        
 10  Loser       52298 non-null  object        
 11  WRank       52283 non-null  object        
 12  LRank       52219 non-null  object        
 13  W1          52035 non-null  float64       
 14  L1          52037 non-null  float64       
 15  W2          51526 non-null  object        
 16  L2          51527 non-

## Fixing Anomalies

In [3]:
def is_column_numeric(df, column_name):
    """Return True when every present value in a column can be read as a number.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    column_name : str
        Label of the column to inspect.

    Returns
    -------
    bool
        True if each non-missing entry converts to a number (integers,
        decimals, negatives and numeric strings all qualify); False as soon
        as one present entry cannot be converted. Missing entries are
        ignored, so a float column with gaps still counts as numeric.
    """
    column = df[column_name]
    converted = pd.to_numeric(column, errors='coerce')
    # A value is an anomaly only if it was present but could not be converted;
    # str.isnumeric() would wrongly reject decimals such as '1.38' and NaN gaps.
    return bool((converted.notna() | column.isna()).all())

# Report, for each suspicious column, whether all of its values are numeric
anomaly_column = ['WRank', 'LRank', 'EXW']
for column in anomaly_column:
    if not is_column_numeric(betting_data, column):
        print(f"Column '{column}' is not numeric.\n")
    else:
        print(f"Column '{column}' is numeric.\n")

def find_non_numeric_values(df, column_name):
    """Print the rows of a column whose values cannot be read as numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    column_name : str
        Label of the column to scan.

    Returns
    -------
    None
        The offending rows (index label and value) are printed, preceded by
        a header line naming the column.
    """
    def is_numeric(value):
        # None is a missing entry, not an anomaly (NaN already passes float())
        if value is None:
            return True
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError covers objects float() cannot take at all; catching it
            # keeps one odd cell from aborting the whole scan
            return False
        return True

    # Boolean mask of convertible values; astype(bool) keeps the mask usable
    # for indexing even when the column is empty
    numeric_mask = df[column_name].apply(is_numeric).astype(bool)
    non_numeric_values = df[~numeric_mask]

    # Display the non-numeric values
    print(f"Non-numeric values in {column_name}:")
    print(non_numeric_values[[column_name]])

# List the offending entries of each suspicious column: WRank, LRank, then EXW
for column in ('WRank', 'LRank', 'EXW'):
    find_non_numeric_values(betting_data, column)

Column 'WRank' is not numeric.

Column 'LRank' is not numeric.

Column 'EXW' is not numeric.

Non-numeric values in WRank:
    WRank
744    NR
Non-numeric values in LRank:
     LRank
63      NR
377     NR
560     NR
611     NR
613     NR
618     NR
774     NR
1039    NR
1694    NR
1849    NR
3540    NR
3551    NR
4457    NR
5435    NR
5487    NR
5491    NR
5494    NR
5498    NR
5528    NR
5998    NR
6307    NR
6777    NR
7296    NR
7414    NR
7418    NR
8111    NR
9596    NR
Non-numeric values in EXW:
        EXW
38294  2.,3


In [4]:
# Convert WRank and LRank to numeric; unparseable entries such as 'NR' become NaN
# and are then replaced by a large placeholder rank.
# The filled result is assigned back to the column: calling fillna(inplace=True)
# on a column selection is chained assignment, which recent pandas warns about
# and which leaves the frame unchanged under copy-on-write.
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce').fillna(100000)
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce').fillna(100000)

# Correct the '2.,3' typo in 'EXW' by value rather than by a hard-coded row label,
# so the fix does not depend on the row order produced by the concatenation
betting_data.loc[betting_data['EXW'] == '2.,3', 'EXW'] = '2.3'


## Feature Engineering:

In [5]:
# Flag matches in which the winner had the smaller rank number than the loser
betting_data['higher_rank_won'] = betting_data['WRank'].lt(betting_data['LRank']).astype(int)

# Use the 0/1 flag (and its complement) to route winner/loser points to the
# higher-ranked and lower-ranked side of each match
favourite_won = betting_data['higher_rank_won']
favourite_lost = 1 - favourite_won
betting_data['higher_rank_points'] = favourite_won * betting_data['WPts'] + betting_data['LPts'] * favourite_lost
betting_data['lower_rank_points'] = favourite_lost * betting_data['WPts'] + betting_data['LPts'] * favourite_won


In [6]:
# Remove the column cap so wide frames are rendered with every column
pd.options.display.max_columns = None

# Show the frame as this cell's output
betting_data


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,CBW,CBL,GBW,GBL,IWW,IWL,SBW,SBL,B365W,B365L,B&WW,B&WL,EXW,EXL,PSW,PSL,WPts,LPts,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,63.0,77.0,6.0,4.0,6.0,2.0,,,,,,,2.0,0.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,
1,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Clement A.,5.0,56.0,6.0,3.0,6.0,3.0,,,,,,,2.0,0.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,
2,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,40.0,655.0,6.0,7.0,7.0,5.0,6.0,3.0,,,,,2.0,1.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,
3,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Federer R.,Knippschild J.,65.0,87.0,6.0,1.0,6.0,4.0,,,,,,,2.0,0.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,
4,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,81.0,198.0,7.0,6.0,5.0,7.0,6.0,4.0,,,,,2.0,1.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52293,67,London,Masters Cup,2018-11-16,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,Isner J.,5.0,10.0,7.0,6.0,6.0,3.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.36,3.20,,,1.38,3.40,1.40,3.22,5085.0,3155.0,,,,,,,1.44,3.40,1.38,3.14,1,5085.0,3155.0
52294,67,London,Masters Cup,2018-11-16,Masters Cup,Indoor,Hard,Round Robin,3,Djokovic N.,Cilic M.,1.0,7.0,7.0,6.0,6.0,2.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.20,4.50,,,1.19,5.50,1.20,5.23,8045.0,4050.0,,,,,,,1.22,6.03,1.17,5.14,1,8045.0,4050.0
52295,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Zverev A.,Federer R.,5.0,3.0,7.0,5.0,7.0,6.0,,,,,,,2.0,0.0,Completed,,,,,,,,,3.20,1.36,,,3.2,1.42,3.24,1.41,5085.0,6020.0,,,,,,,3.40,1.45,3.14,1.38,0,6020.0,5085.0
52296,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Djokovic N.,Anderson K.,1.0,6.0,6.0,2.0,6.0,2.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.11,7.00,,,1.14,7.00,1.12,7.72,8045.0,4310.0,,,,,,,1.15,7.72,1.12,6.52,1,8045.0,4310.0


In [7]:
# Re-check dtypes and non-null counts after the rank fixes and the new feature columns
betting_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52298 entries, 0 to 52297
Data columns (total 57 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ATP                 52298 non-null  int64         
 1   Location            52298 non-null  object        
 2   Tournament          52298 non-null  object        
 3   Date                52298 non-null  datetime64[ns]
 4   Series              52298 non-null  object        
 5   Court               52298 non-null  object        
 6   Surface             52298 non-null  object        
 7   Round               52298 non-null  object        
 8   Best of             52298 non-null  int64         
 9   Winner              52298 non-null  object        
 10  Loser               52298 non-null  object        
 11  WRank               52298 non-null  float64       
 12  LRank               52298 non-null  float64       
 13  W1                  52035 non-null  float64   

## Imputing Missing Data Using the Mean

In [8]:
# Bookmaker odds columns, listed as consecutive winner/loser pairs
betting_columns = [
    'CBW', 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL',
    'B365W', 'B365L', 'B&WW', 'B&WL', 'EXW', 'EXL',
    'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL',
]

# Coerce every odds column that is not already numeric (bad entries become NaN)
for col in betting_columns:
    if pd.api.types.is_numeric_dtype(betting_data[col]):
        continue
    print(f"Converting column {col} to numeric.\n")
    betting_data[col] = pd.to_numeric(betting_data[col], errors='coerce')

# Report how many odds are missing per column
missing_values_count = betting_data[betting_columns].isna().sum()
print(f'Missing values in betting columns:\n{missing_values_count}\n')

# Per-column mean over the odds that are present
mean_betting_odds = betting_data[betting_columns].mean()
print(f'Mean of available betting odds:\n{mean_betting_odds}\n')

# Replace each missing quote with its column mean.
# NOTE(review): rows without any quote from a bookmaker all receive the same
# averaged odds for that bookmaker - confirm this is the intended treatment.
for col in betting_columns:
    betting_data[col] = betting_data[col].fillna(mean_betting_odds[col])

# Confirm that no gaps remain
missing_values_count_after = betting_data[betting_columns].isna().sum()
print(f'Missing values in betting columns after imputation:\n{missing_values_count_after}')


Converting column EXW to numeric.

Missing values in betting columns:
CBW      34792
CBL      34792
GBW      47243
GBL      47243
IWW      38940
IWL      38940
SBW      46874
SBL      46874
B365W     8655
B365L     8632
B&WW     51201
B&WL     51201
EXW      12887
EXL      12882
PSW      14959
PSL      14959
UBW      41627
UBL      41627
LBW      24167
LBL      24156
SJW      36726
SJL      36719
dtype: int64

Mean of available betting odds:
CBW      1.812080
CBL      3.000658
GBW      1.738742
GBL      2.401363
IWW      1.671585
IWL      2.387132
SBW      1.793035
SBL      2.672064
B365W    1.823768
B365L    3.551805
B&WW     1.728633
B&WL     2.547347
EXW      1.802502
EXL      3.262180
PSW      1.926742
PSL      4.186818
UBW      1.815867
UBL      3.542479
LBW      1.810226
LBL      3.451461
SJW      1.796538
SJL      3.557943
dtype: float64

Missing values in betting columns after imputation:
CBW      0
CBL      0
GBW      0
GBL      0
IWW      0
IWL      0
SBW      0
SBL      0
B3

In [9]:
# Show the frame after imputation as this cell's output
betting_data

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,CBW,CBL,GBW,GBL,IWW,IWL,SBW,SBL,B365W,B365L,B&WW,B&WL,EXW,EXL,PSW,PSL,WPts,LPts,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,63.0,77.0,6.0,4.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,
1,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Clement A.,5.0,56.0,6.0,3.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,
2,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,40.0,655.0,6.0,7.0,7.0,5.0,6.0,3.0,,,,,2.0,1.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,
3,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Federer R.,Knippschild J.,65.0,87.0,6.0,1.0,6.0,4.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,
4,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,81.0,198.0,7.0,6.0,5.0,7.0,6.0,4.0,,,,,2.0,1.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52293,67,London,Masters Cup,2018-11-16,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,Isner J.,5.0,10.0,7.0,6.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.360000,3.200000,1.728633,2.547347,1.380000,3.40000,1.400000,3.220000,5085.0,3155.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.44,3.40,1.38,3.14,1,5085.0,3155.0
52294,67,London,Masters Cup,2018-11-16,Masters Cup,Indoor,Hard,Round Robin,3,Djokovic N.,Cilic M.,1.0,7.0,7.0,6.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.200000,4.500000,1.728633,2.547347,1.190000,5.50000,1.200000,5.230000,8045.0,4050.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.22,6.03,1.17,5.14,1,8045.0,4050.0
52295,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Zverev A.,Federer R.,5.0,3.0,7.0,5.0,7.0,6.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,3.200000,1.360000,1.728633,2.547347,3.200000,1.42000,3.240000,1.410000,5085.0,6020.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,3.40,1.45,3.14,1.38,0,6020.0,5085.0
52296,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Djokovic N.,Anderson K.,1.0,6.0,6.0,2.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.110000,7.000000,1.728633,2.547347,1.140000,7.00000,1.120000,7.720000,8045.0,4310.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.15,7.72,1.12,6.52,1,8045.0,4310.0


## BCM Model (2000 - 2018)

In [10]:
# Raw implied probability of each quote: the reciprocal of the odds
for col in betting_columns:
    quoted_odds = betting_data[col]
    betting_data[f'implied_{col}'] = 1 / quoted_odds


In [11]:
# Rescale each bookmaker's winner/loser implied probabilities so the pair sums to 1
for w_col, l_col in zip(betting_columns[::2], betting_columns[1::2]):
    implied_w = betting_data[f'implied_{w_col}']
    implied_l = betting_data[f'implied_{l_col}']
    pair_total = implied_w + implied_l
    betting_data[f'normalized_{w_col}'] = implied_w / pair_total
    betting_data[f'normalized_{l_col}'] = implied_l / pair_total


In [12]:
def logit(p):
    """Return the log-odds log(p / (1 - p)) of a probability.

    The input is first clipped to [1e-10, 1 - 1e-10] so that exact 0 or 1
    yield large finite values instead of infinities.
    """
    lower = 1e-10
    bounded = np.clip(p, lower, 1 - lower)
    odds = bounded / (1 - bounded)
    return np.log(odds)

def inv_logit(y):
    """Map log-odds back to a probability: 1 / (1 + exp(-y)).

    Evaluated in a numerically stable form. The direct expression
    exp(y) / (1 + exp(y)) overflows to inf / inf = NaN for large positive y;
    here only exp(-|y|) is computed, which is always within (0, 1].

    Parameters
    ----------
    y : scalar or array-like
        Log-odds value(s).

    Returns
    -------
    Probability value(s) in [0, 1]; NaN inputs stay NaN.
    """
    decay = np.exp(-np.abs(y))
    # y >= 0: 1 / (1 + e^-y);  y < 0: e^y / (1 + e^y) - both share the denominator
    numerator = np.where(np.greater_equal(y, 0), 1.0, decay)
    return numerator / (1.0 + decay)

# Logit-transform each bookmaker's normalized winner probability
# (every second entry of betting_columns is a winner column)
logit_cols = [f'logit_normalized_{col}' for col in betting_columns[::2]]
for col, logit_col in zip(betting_columns[::2], logit_cols):
    betting_data[logit_col] = betting_data[f'normalized_{col}'].apply(logit)

# Consensus: average the logits across bookmakers, then map back to a probability
betting_data['consensus_logit_W'] = betting_data[logit_cols].mean(axis=1)
betting_data['consensus_prob_W'] = betting_data['consensus_logit_W'].apply(inv_logit)


In [43]:
# Number of non-null entries in the higher_rank_won column
betting_data.higher_rank_won.count()

52298

In [13]:
# Show the frame with the implied, normalized, logit and consensus columns added
betting_data

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,CBW,CBL,GBW,GBL,IWW,IWL,SBW,SBL,B365W,B365L,B&WW,B&WL,EXW,EXL,PSW,PSL,WPts,LPts,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points,implied_CBW,implied_CBL,implied_GBW,implied_GBL,implied_IWW,implied_IWL,implied_SBW,implied_SBL,implied_B365W,implied_B365L,implied_B&WW,implied_B&WL,implied_EXW,implied_EXL,implied_PSW,implied_PSL,implied_UBW,implied_UBL,implied_LBW,implied_LBL,implied_SJW,implied_SJL,normalized_CBW,normalized_CBL,normalized_GBW,normalized_GBL,normalized_IWW,normalized_IWL,normalized_SBW,normalized_SBL,normalized_B365W,normalized_B365L,normalized_B&WW,normalized_B&WL,normalized_EXW,normalized_EXL,normalized_PSW,normalized_PSL,normalized_UBW,normalized_UBL,normalized_LBW,normalized_LBL,normalized_SJW,normalized_SJL,logit_normalized_CBW,logit_normalized_GBW,logit_normalized_IWW,logit_normalized_SBW,logit_normalized_B365W,logit_normalized_B&WW,logit_normalized_EXW,logit_normalized_PSW,logit_normalized_UBW,logit_normalized_LBW,logit_normalized_SJW,consensus_logit_W,consensus_prob_W
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,63.0,77.0,6.0,4.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.660730,0.339270,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.593220,0.776110,0.668264,0.645346,0.683321,0.545730,0.633144
1,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Clement A.,5.0,56.0,6.0,3.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.660730,0.339270,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.593220,0.776110,0.668264,0.645346,0.683321,0.545730,0.633144
2,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,40.0,655.0,6.0,7.0,7.0,5.0,6.0,3.0,,,,,2.0,1.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.660730,0.339270,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.593220,0.776110,0.668264,0.645346,0.683321,0.545730,0.633144
3,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Federer R.,Knippschild J.,65.0,87.0,6.0,1.0,6.0,4.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.660730,0.339270,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.593220,0.776110,0.668264,0.645346,0.683321,0.545730,0.633144
4,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,81.0,198.0,7.0,6.0,5.0,7.0,6.0,4.0,,,,,2.0,1.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.660730,0.339270,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.593220,0.776110,0.668264,0.645346,0.683321,0.545730,0.633144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52293,67,London,Masters Cup,2018-11-16,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,Isner J.,5.0,10.0,7.0,6.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.360000,3.200000,1.728633,2.547347,1.380000,3.40000,1.400000,3.220000,5085.0,3155.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.44,3.40,1.38,3.14,1,5085.0,3155.0,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.735294,0.312500,0.578492,0.392565,0.724638,0.294118,0.714286,0.310559,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.701754,0.298246,0.595734,0.404266,0.711297,0.288703,0.696970,0.303030,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.855666,0.387722,0.901692,0.832909,0.668264,0.645346,0.683321,0.596128,0.644770
52294,67,London,Masters Cup,2018-11-16,Masters Cup,Indoor,Hard,Round Robin,3,Djokovic N.,Cilic M.,1.0,7.0,7.0,6.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.200000,4.500000,1.728633,2.547347,1.190000,5.50000,1.200000,5.230000,8045.0,4050.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.22,6.03,1.17,5.14,1,8045.0,4050.0,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.833333,0.222222,0.578492,0.392565,0.840336,0.181818,0.833333,0.191205,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.789474,0.210526,0.595734,0.404266,0.822123,0.177877,0.813375,0.186625,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,1.321756,0.387722,1.530795,1.472090,0.668264,0.645346,0.683321,0.753799,0.680006
52295,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Zverev A.,Federer R.,5.0,3.0,7.0,5.0,7.0,6.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,3.200000,1.360000,1.728633,2.547347,3.200000,1.42000,3.240000,1.410000,5085.0,6020.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,3.40,1.45,3.14,1.38,0,6020.0,5085.0,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.312500,0.735294,0.578492,0.392565,0.312500,0.704225,0.308642,0.709220,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.298246,0.701754,0.595734,0.404266,0.307359,0.692641,0.303226,0.696774,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,-0.855666,0.387722,-0.812494,-0.831984,0.668264,0.645346,0.683321,0.133364,0.533292
52296,67,London,Masters Cup,2018-11-17,Masters Cup,Indoor,Hard,Semifinals,3,Djokovic N.,Anderson K.,1.0,6.0,6.0,2.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.110000,7.000000,1.728633,2.547347,1.140000,7.00000,1.120000,7.720000,8045.0,4310.0,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.15,7.72,1.12,6.52,1,8045.0,4310.0,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.900901,0.142857,0.578492,0.392565,0.877193,0.142857,0.892857,0.129534,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.863132,0.136868,0.595734,0.404266,0.859951,0.140049,0.873303,0.126697,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,1.841550,0.387722,1.814882,1.930486,0.668264,0.645346,0.683321,0.868551,0.704444


In [24]:
# Probability assigned to the higher-ranked player: the winner's consensus
# probability when the higher-ranked player is the winner, otherwise its complement.
# Vectorised with np.where rather than a row-wise apply, which would build a
# Series for every row just to pick between two values.
betting_data['prob_higher_rank_winning'] = np.where(
    betting_data['higher_rank_won'] == 1,
    betting_data['consensus_prob_W'],
    1 - betting_data['consensus_prob_W'],
)

# Predicted outcome: 1 when the higher-ranked player is given more than a 50% chance
betting_data['outcome'] = (betting_data['prob_higher_rank_winning'] > 0.50).astype(int)

# Display the first few rows to check the results
betting_data.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,CBW,CBL,GBW,GBL,IWW,IWL,SBW,SBL,B365W,B365L,B&WW,B&WL,EXW,EXL,PSW,PSL,WPts,LPts,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points,implied_CBW,implied_CBL,implied_GBW,implied_GBL,implied_IWW,implied_IWL,implied_SBW,implied_SBL,implied_B365W,implied_B365L,implied_B&WW,implied_B&WL,implied_EXW,implied_EXL,implied_PSW,implied_PSL,implied_UBW,implied_UBL,implied_LBW,implied_LBL,implied_SJW,implied_SJL,normalized_CBW,normalized_CBL,normalized_GBW,normalized_GBL,normalized_IWW,normalized_IWL,normalized_SBW,normalized_SBL,normalized_B365W,normalized_B365L,normalized_B&WW,normalized_B&WL,normalized_EXW,normalized_EXL,normalized_PSW,normalized_PSL,normalized_UBW,normalized_UBL,normalized_LBW,normalized_LBL,normalized_SJW,normalized_SJL,logit_normalized_CBW,logit_normalized_GBW,logit_normalized_IWW,logit_normalized_SBW,logit_normalized_B365W,logit_normalized_B&WW,logit_normalized_EXW,logit_normalized_PSW,logit_normalized_UBW,logit_normalized_LBW,logit_normalized_SJW,consensus_logit_W,consensus_prob_W,prob_higher_rank_winning,outcome
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,63.0,77.0,6.0,4.0,6.0,2.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.66073,0.33927,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.59322,0.77611,0.668264,0.645346,0.683321,0.54573,0.633144,0.633144,1
1,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Clement A.,5.0,56.0,6.0,3.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.66073,0.33927,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.59322,0.77611,0.668264,0.645346,0.683321,0.54573,0.633144,0.633144,1
2,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,40.0,655.0,6.0,7.0,7.0,5.0,6.0,3.0,,,,,2.0,1.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.66073,0.33927,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.59322,0.77611,0.668264,0.645346,0.683321,0.54573,0.633144,0.633144,1
3,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Federer R.,Knippschild J.,65.0,87.0,6.0,1.0,6.0,4.0,,,,,,,2.0,0.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.66073,0.33927,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.59322,0.77611,0.668264,0.645346,0.683321,0.54573,0.633144,0.633144,1
4,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,81.0,198.0,7.0,6.0,5.0,7.0,6.0,4.0,,,,,2.0,1.0,Completed,1.81208,3.000658,1.738742,2.401363,1.671585,2.387132,1.793035,2.672064,1.823768,3.551805,1.728633,2.547347,1.802502,3.26218,1.926742,4.186818,,,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,,,,,1,,,0.551852,0.33326,0.575129,0.41643,0.598235,0.418913,0.557714,0.374243,0.548315,0.281547,0.578492,0.392565,0.554784,0.306544,0.519011,0.238845,0.550701,0.282288,0.552417,0.289732,0.556626,0.281061,0.623483,0.376517,0.580025,0.419975,0.588149,0.411851,0.598433,0.401567,0.66073,0.33927,0.595734,0.404266,0.644104,0.355896,0.684841,0.315159,0.661114,0.338886,0.655961,0.344039,0.66448,0.33552,0.504356,0.322875,0.35632,0.398941,0.666551,0.387722,0.59322,0.77611,0.668264,0.645346,0.683321,0.54573,0.633144,0.633144,1


## Evaluate Model Performance
#### Accuracy:  
Calculate the accuracy of your model predictions.

In [15]:
# Share of matches in which the predicted outcome equals the higher_rank_won flag
prediction_matches = betting_data.outcome == betting_data.higher_rank_won
Accuracy = prediction_matches.mean()
print(f'Accuracy: {Accuracy}')


Accuracy: 0.909174347011358


#### Calibration:  
Assess the calibration of your model.

In [16]:
# Calibration: sum of predicted higher-rank win probabilities divided by the
# number of matches the higher-ranked player actually won
total_predicted = betting_data.prob_higher_rank_winning.sum()
total_observed = betting_data.higher_rank_won.sum()
calibration = total_predicted / total_observed
print(f'Calibration: {calibration}')


Calibration: 0.8922163686060297


#### Log-loss:
Compute the log-loss for your predictions.

In [38]:
def logloss(actual, predictions):
    """Return the mean binary log-loss (cross-entropy) of `predictions`.

    Parameters
    ----------
    actual : array-like of 0/1
        Observed outcomes. Lists, NumPy arrays and pandas Series are accepted.
    predictions : array-like or scalar of float
        Predicted probability of the outcome coded as 1. A scalar is broadcast
        against `actual` (constant-probability model).

    Returns
    -------
    numpy.float64
        -(1/n) * sum(actual * log(p) + (1 - actual) * log(1 - p)), where n is
        the number of observations in `actual`.

    Raises
    ------
    ValueError
        If `actual` is empty (the mean would be undefined).
    """
    epsilon = 1e-15

    # Work on plain float arrays: this accepts lists as well as Series, and it
    # pairs values by position instead of by pandas index label (label
    # alignment of differently-indexed Series would silently produce NaNs).
    actual = np.asarray(actual, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    if actual.size == 0:
        raise ValueError("logloss requires at least one observation")

    # Keep probabilities strictly inside (0, 1) so neither log() returns -inf.
    predictions = np.clip(predictions, epsilon, 1 - epsilon)

    logr_logloss_all_predictors = -(1 / actual.size) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors


In [37]:
# Store the score under its own name. Binding the result to `logloss` would
# replace the function with a float, so re-running this cell (or calling
# `logloss` again later) would fail with "object is not callable".
logloss_value = logloss(betting_data.higher_rank_won, betting_data.prob_higher_rank_winning)
print(f'Logloss: {logloss_value}')


Logloss: 0.4909952927081262


## BCM (2019)

In [27]:
## Load the data:

# Define the directory where your files are located
data_dir = '.'  # Adjust the path according to your file location

# Load the file for 2019
file_path = os.path.join(data_dir, '2019.xlsx')
betting_data_2019 = pd.read_excel(file_path)

# Display the first few rows of the dataframe
# (only the last expression of a cell is rendered, so this call has no visible
# effect in the middle of the cell; `info()` below prints explicitly)
betting_data_2019.head()

betting_data_2019.info()

## Fixing Anomalies

# Check if columns are numeric before converting
anomaly_column = ['WRank', 'LRank']
for column in anomaly_column:
    if is_column_numeric(betting_data_2019, column):
        print(f"Column '{column}' is numeric.\n")
    else:
        print(f"Column '{column}' is not numeric.\n")

# WRank column
find_non_numeric_values(betting_data_2019, 'WRank')

# LRank column
find_non_numeric_values(betting_data_2019, 'LRank')

# Placeholder rank given to players whose rank is missing or non-numeric, so
# they always compare as lower-ranked than any ranked opponent.
UNRANKED_RANK = 100000

# Convert WRank and LRank to numeric (unparseable entries become NaN), then
# replace NaN with the placeholder. The result is assigned back to the column:
# `df[col].fillna(..., inplace=True)` acts on a temporary Series, which is
# deprecated chained assignment and does not update the frame under pandas
# Copy-on-Write.
for column in anomaly_column:
    betting_data_2019[column] = pd.to_numeric(
        betting_data_2019[column], errors='coerce'
    ).fillna(UNRANKED_RANK)

## Feature Engineering:

# 1 when the match winner held the better (numerically smaller) rank, else 0.
betting_data_2019['higher_rank_won'] = (
    betting_data_2019['WRank'].lt(betting_data_2019['LRank']).astype(int)
)

# Route winner/loser points to the higher- and lower-ranked player by using the
# 0/1 indicator as an arithmetic mask.
# NOTE(review): because NaN * 0 is NaN, a missing value in either WPts or LPts
# makes BOTH derived columns NaN for that match, even when the relevant
# player's points are known.
betting_data_2019['higher_rank_points'] = (
    betting_data_2019['higher_rank_won'].mul(betting_data_2019['WPts'])
    .add(betting_data_2019['LPts'].mul(1 - betting_data_2019['higher_rank_won']))
)
betting_data_2019['lower_rank_points'] = (
    (1 - betting_data_2019['higher_rank_won']).mul(betting_data_2019['WPts'])
    .add(betting_data_2019['LPts'].mul(betting_data_2019['higher_rank_won']))
)

# Display the DataFrame (or any part of it)
betting_data_2019

## Computing Missing Data using Mean

# Odds columns used by the consensus model, listed as (winner, loser) pairs
# per bookmaker - later cells rely on this alternating order.
betting_columns = ['B365W', 'B365L','PSW', 'PSL']

# Coerce any non-numeric odds column to numeric; unparseable entries become NaN.
for col in betting_columns:
    if pd.api.types.is_numeric_dtype(betting_data_2019[col]):
        continue
    print(f"Converting column {col} to numeric.\n")
    betting_data_2019[col] = pd.to_numeric(betting_data_2019[col], errors='coerce')

# Report how many odds are missing before imputation.
missing_values_count = betting_data_2019[betting_columns].isna().sum()
print(f'Missing values in betting columns:\n{missing_values_count}\n')

# Column-wise mean of the odds that are present.
mean_betting_odds = betting_data_2019[betting_columns].mean()
print(f'Mean of available betting odds:\n{mean_betting_odds}\n')

# Replace each missing price with its own column's mean.
# NOTE(review): these columns are keyed by match result (W = winner,
# L = loser), so an imputed row receives the average winner price and the
# average loser price. That presumably leaks the outcome into the derived
# probabilities; consider dropping rows with missing odds instead - TODO confirm.
for col in betting_columns:
    betting_data_2019[col] = betting_data_2019[col].fillna(mean_betting_odds[col])

# Confirm that nothing is left to impute.
missing_values_count_after = betting_data_2019[betting_columns].isna().sum()
print(f'Missing values in betting columns after imputation:\n{missing_values_count_after}')

betting_data_2019

## BCM Model (2019)

# Raw implied probability of every quoted price: the reciprocal of the odds.
for col in betting_columns:
    betting_data_2019[f'implied_{col}'] = betting_data_2019[col].rdiv(1)

# `betting_columns` alternates winner/loser, so even positions hold the winner
# prices and odd positions the matching loser prices.
winner_columns = betting_columns[::2]
loser_columns = betting_columns[1::2]

# Per bookmaker, rescale the two implied probabilities so they sum to one.
for w_col, l_col in zip(winner_columns, loser_columns):
    booksum = betting_data_2019[f'implied_{w_col}'] + betting_data_2019[f'implied_{l_col}']
    betting_data_2019[f'normalized_{w_col}'] = betting_data_2019[f'implied_{w_col}'] / booksum
    betting_data_2019[f'normalized_{l_col}'] = betting_data_2019[f'implied_{l_col}'] / booksum

# Apply `logit` to each bookmaker's normalized winner probability.
logit_cols = [f'logit_normalized_{col}' for col in winner_columns]
for col, logit_col in zip(winner_columns, logit_cols):
    betting_data_2019[logit_col] = betting_data_2019[f'normalized_{col}'].apply(logit)

# Consensus: average the per-bookmaker values row by row, then map the average
# back through `inv_logit`.
betting_data_2019['consensus_logit_W'] = betting_data_2019[logit_cols].mean(axis='columns')
betting_data_2019['consensus_prob_W'] = betting_data_2019['consensus_logit_W'].apply(inv_logit)

betting_data_2019

# Create the probability of higher-ranked player winning.
# `consensus_prob_W` is the consensus probability for the match winner. When
# the winner was the higher-ranked player it is used as is; otherwise the
# higher-ranked player is the loser, so the complement is taken.
# Vectorized with np.where instead of a row-wise DataFrame.apply: same values,
# no per-row Python call, and it also works on an empty frame.
betting_data_2019['prob_higher_rank_winning'] = np.where(
    betting_data_2019['higher_rank_won'] == 1,
    betting_data_2019['consensus_prob_W'],
    1 - betting_data_2019['consensus_prob_W'],
)

# Create the outcome column: predict a higher-rank win (1) when its probability
# exceeds 0.50, else 0. NaN probabilities compare False and therefore map to 0.
betting_data_2019['outcome'] = (betting_data_2019['prob_higher_rank_winning'] > 0.50).astype(int)

# Display the first few rows to check the results
betting_data_2019.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2610 entries, 0 to 2609
Data columns (total 36 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         2610 non-null   int64         
 1   Location    2610 non-null   object        
 2   Tournament  2610 non-null   object        
 3   Date        2610 non-null   datetime64[ns]
 4   Series      2610 non-null   object        
 5   Court       2610 non-null   object        
 6   Surface     2610 non-null   object        
 7   Round       2610 non-null   object        
 8   Best of     2610 non-null   int64         
 9   Winner      2610 non-null   object        
 10  Loser       2610 non-null   object        
 11  WRank       2606 non-null   float64       
 12  LRank       2597 non-null   float64       
 13  WPts        2607 non-null   float64       
 14  LPts        2597 non-null   float64       
 15  W1          2589 non-null   float64       
 16  L1          2589 non-nul

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points,implied_B365W,implied_B365L,implied_PSW,implied_PSL,normalized_B365W,normalized_B365L,normalized_PSW,normalized_PSL,logit_normalized_B365W,logit_normalized_PSW,consensus_logit_W,consensus_prob_W,prob_higher_rank_winning,outcome
0,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Dimitrov G.,Nishioka Y.,19.0,75.0,1835.0,701.0,6.0,3.0,6.0,4.0,,,,,,,2.0,0.0,Completed,1.36,3.0,1.36,3.37,1.42,3.6,1.35,3.18,1,1835.0,701.0,0.735294,0.333333,0.735294,0.296736,0.688073,0.311927,0.712474,0.287526,0.791128,0.907428,0.849278,0.700416,0.700416,1
1,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Raonic M.,Bedene A.,18.0,67.0,1855.0,780.0,6.0,0.0,6.0,3.0,,,,,,,2.0,0.0,Completed,1.18,4.5,1.23,4.68,1.27,4.84,1.22,4.26,1,1855.0,780.0,0.847458,0.222222,0.813008,0.213675,0.792254,0.207746,0.791878,0.208122,1.338563,1.336284,1.337423,0.792066,0.792066,1
2,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Kecmanovic M.,Mayer L.,131.0,56.0,433.0,895.0,6.0,3.0,6.0,1.0,,,,,,,2.0,0.0,Completed,1.57,2.25,1.67,2.32,1.71,2.4,1.63,2.28,0,895.0,433.0,0.636943,0.444444,0.598802,0.431034,0.589005,0.410995,0.581454,0.418546,0.359855,0.328744,0.344299,0.585234,0.414766,0
3,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Millman J.,Sandgren T.,38.0,61.0,1083.0,814.0,7.0,6.0,6.0,7.0,6.0,0.0,,,,,2.0,1.0,Completed,1.4,2.75,1.41,3.13,1.45,3.2,1.4,2.95,1,1083.0,814.0,0.714286,0.363636,0.70922,0.319489,0.662651,0.337349,0.689427,0.310573,0.675129,0.797443,0.736286,0.676183,0.676183,1
4,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Uchiyama Y.,Humbert U.,185.0,102.0,275.0,572.0,6.0,4.0,7.0,6.0,,,,,,,2.0,0.0,Completed,2.62,1.44,2.73,1.51,3.26,1.53,2.69,1.47,0,572.0,275.0,0.381679,0.694444,0.3663,0.662252,0.35468,0.64532,0.356132,0.643868,-0.598531,-0.592192,-0.595362,0.355406,0.644594,1


## Evaluate Model Performance


In [39]:
# Accuracy: share of 2019 matches where the thresholded prediction (`outcome`)
# agrees with the observed result (`higher_rank_won`).
accuracy_2019 = (betting_data_2019['outcome'] == betting_data_2019['higher_rank_won']).mean()
print(f'Accuracy: {accuracy_2019}')

# Calibration: total predicted probability of the higher-ranked player winning
# divided by the number of matches the higher-ranked player actually won.
calibration_2019 = (
    betting_data_2019['prob_higher_rank_winning'].sum()
    / betting_data_2019['higher_rank_won'].sum()
)
print(f'Calibration: {calibration_2019}')

# Calculate logloss
def logloss(actual, predictions):
    """Return the mean binary log-loss (cross-entropy) of `predictions`.

    Parameters
    ----------
    actual : array-like of 0/1
        Observed outcomes. Lists, NumPy arrays and pandas Series are accepted.
    predictions : array-like or scalar of float
        Predicted probability of the outcome coded as 1. A scalar is broadcast
        against `actual` (constant-probability model).

    Returns
    -------
    numpy.float64
        -(1/n) * sum(actual * log(p) + (1 - actual) * log(1 - p)), where n is
        the number of observations in `actual`.

    Raises
    ------
    ValueError
        If `actual` is empty (the mean would be undefined).
    """
    epsilon = 1e-15

    # Work on plain float arrays: this accepts lists as well as Series, and it
    # pairs values by position instead of by pandas index label (label
    # alignment of differently-indexed Series would silently produce NaNs).
    actual = np.asarray(actual, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    if actual.size == 0:
        raise ValueError("logloss requires at least one observation")

    # Keep probabilities strictly inside (0, 1) so neither log() returns -inf.
    predictions = np.clip(predictions, epsilon, 1 - epsilon)

    logr_logloss_all_predictors = -(1 / actual.size) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

# Score the 2019 consensus probabilities against the observed results.
logloss_2019 = logloss(
    actual=betting_data_2019['higher_rank_won'],
    predictions=betting_data_2019['prob_higher_rank_winning'],
)
print(f'Logloss: {logloss_2019}')


Accuracy: 0.6739463601532567
Calibration: 1.0230901760739093
Logloss: 0.5946757499493406


## Naive Model

In [40]:
import pandas as pd
import numpy as np

# Naive benchmark evaluated on `betting_data`: every match gets the same
# probability that the higher-ranked player wins.

# Observed results (1 = higher-ranked player won) and number of matches.
w = betting_data['higher_rank_won']
N = len(betting_data)

# Always predicting the higher-ranked player is right exactly as often as that
# player wins, so the naive accuracy is the mean of the indicator.
naive_accuracy = w.mean()

# The constant predicted probability is that same observed win rate.
pi_naive = naive_accuracy

# Mean binary log-loss of the constant prediction.
log_loss_naive = -1 / N * (w * np.log(pi_naive) + (1 - w) * np.log(1 - pi_naive)).sum()

# Predicted wins (pi * N) over observed wins; equals 1 here because pi is the
# in-sample win rate.
calibration_naive = pi_naive * N / w.sum()

# One-row summary table of the validation statistics.
validation_stats = pd.DataFrame([{
    'model': 'naive',
    'pred_acc': naive_accuracy,
    'log_loss': log_loss_naive,
    'calibration': calibration_naive,
}])

print(validation_stats)


   model  pred_acc  log_loss  calibration
0  naive  0.655398  0.644041          1.0


In [47]:
# Time-based split: the combined multi-season frame is the training set and
# the separately loaded 2019 season is the test set.
betting_data_train = betting_data
betting_data_test = betting_data_2019

# Training set: size and higher-ranked win rate.
N_train = len(betting_data_train)
naive_accuracy_train = betting_data_train['higher_rank_won'].mean()

# Test set: observed results, size and higher-ranked win rate.
w_test = betting_data_test['higher_rank_won']
N_test = len(betting_data_test)
naive_accuracy_test = w_test.mean()

# The naive model's constant probability is learned on the training set only.
pi_naive = naive_accuracy_train

# Out-of-sample mean log-loss of that constant prediction.
log_loss_naive = -1 / N_test * (
    w_test * np.log(pi_naive) + (1 - w_test) * np.log(1 - pi_naive)
).sum()

# Predicted test wins (pi * N_test) over observed test wins.
calibration_naive = pi_naive * N_test / w_test.sum()

# One-row summary table of the validation statistics.
validation_stats = pd.DataFrame([{
    'model': 'naive',
    'pred_acc_train': naive_accuracy_train,
    'pred_acc_test': naive_accuracy_test,
    'log_loss': log_loss_naive,
    'calibration': calibration_naive,
}])

print(validation_stats)


   model  pred_acc_train  pred_acc_test  log_loss  calibration
0  naive        0.655398       0.613793  0.670787     1.067783
