## Import Libraries:

In [1]:
import warnings
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import os  # For interacting with the operating system

# Suppress specific UserWarnings from openpyxl
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

## Load the data:

In [2]:
# Define the directory where your files are located
# data_dir = '.'  
data_dir = os.path.join(os.path.pardir)  

In [3]:
# List to hold the dataframes
dataframes = []

# Loop through the years and load the files
for year in range(2005, 2020):
    if year <= 2012:
        file_path = os.path.join(data_dir, f'{year}.xls')
    else:
        file_path = os.path.join(data_dir, f'{year}.xlsx')
    
    # Load the file into a dataframe
    df = pd.read_excel(file_path)
    
    # Append the dataframe to the list
    dataframes.append(df)

In [4]:
# Concatenate all the dataframes into one
betting_data = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined dataframe
betting_data.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,...,,,,,,,,,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,...,,,,,,,,,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,...,,,,,,,,,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,...,,,,,,,,,,


## Fixing Anomalies

In [5]:
def is_column_numeric(df, column_name):
    # Check if the column contains only numeric values
    return df[column_name].apply(lambda x: str(x).isnumeric()).all()

# Check if columns are numeric before converting
anomaly_column = ['WRank', 'LRank', 'EXW']
for column in anomaly_column:
    if is_column_numeric(betting_data, column):
        print(f"Column '{column}' is numeric.\n")
    else:
        print(f"Column '{column}' is not numeric.\n")

Column 'WRank' is not numeric.

Column 'LRank' is not numeric.

Column 'EXW' is not numeric.



In [6]:
def find_non_numeric_values(df, column_name):
    # Function to check if a value is numeric
    def is_numeric(value):
        try:
            float(value)
            return True
        except ValueError:
            return False

    # Apply the function to the column and filter non-numeric values
    non_numeric_values = df[~df[column_name].apply(is_numeric)]

    # Display the non-numeric values
    print(f"Non-numeric values in {column_name}:")
    print(non_numeric_values[[column_name]])
    print()

In [7]:
# WRank column
find_non_numeric_values(betting_data, 'WRank')

Non-numeric values in WRank:
Empty DataFrame
Columns: [WRank]
Index: []



In [8]:
# LRank column
find_non_numeric_values(betting_data, 'LRank')

Non-numeric values in LRank:
Empty DataFrame
Columns: [LRank]
Index: []



In [9]:
# EXW column
find_non_numeric_values(betting_data, 'EXW')

Non-numeric values in EXW:
        EXW
23776  2.,3



In [10]:
# Convert WRank and LRank to numeric, coercing errors
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce')
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce')

# Fill NaN values with a high number
betting_data['WRank'].fillna(100000, inplace=True)
betting_data['LRank'].fillna(100000, inplace=True)

# Correct the typo in row 38294, column 'EXW'
if betting_data.at[38294, 'EXW'] == '2.,3':
    betting_data.at[38294, 'EXW'] = '2.3'

## Preprocess the dataset:

In [11]:
betting_data['higher_rank_won'] = (betting_data['WRank'] < betting_data['LRank']).astype(int)
betting_data['higher_rank_points'] = betting_data['higher_rank_won'] * betting_data['WPts'] + betting_data['LPts'] * (1 - betting_data['higher_rank_won'])
betting_data['lower_rank_points'] = (1 - betting_data['higher_rank_won']) * betting_data['WPts'] + betting_data['LPts'] * betting_data['higher_rank_won']

# Filter the betting_data to include only rows where the match status is 'Completed'
betting_data = betting_data.loc[betting_data['Comment'] == 'Completed']

## Computing Missing Data using Mean:

In [12]:
# Define the column names for betting odds
betting_columns = ['CBW', 'CBL', 'IWW', 'IWL', 
                   'B365W', 'B365L', 'EXW', 'EXL', 
                   'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL']

# Ensure all columns are numeric and convert if necessary
for col in betting_columns:
    if not pd.api.types.is_numeric_dtype(betting_data[col]):
        print(f"Converting column {col} to numeric.\n")
        betting_data[col] = pd.to_numeric(betting_data[col], errors='coerce')

Converting column EXW to numeric.



In [13]:
# Display the number of missing values in the betting odds columns
missing_values_count = betting_data[betting_columns].isnull().sum()
print(f'Missing values in betting columns:\n{missing_values_count}\n')

Missing values in betting columns:
CBW      31076
CBL      31076
IWW      36113
IWL      36113
B365W      523
B365L      503
EXW       3487
EXL       3481
PSW       3019
PSL       3019
UBW      28593
UBL      28593
LBW      11840
LBL      11829
SJW      23926
SJL      23919
dtype: int64



In [14]:
# Calculate the mean of the available betting odds for each column
mean_betting_odds = betting_data[betting_columns].mean()
print(f'Mean of available betting odds:\n{mean_betting_odds}\n')

Mean of available betting odds:
CBW      1.808398
CBL      3.363808
IWW      1.666774
IWL      2.653584
B365W    1.812985
B365L    3.671545
EXW      1.788476
EXL      3.315206
PSW      1.913329
PSL      4.234952
UBW      1.798580
UBL      3.572174
LBW      1.795990
LBL      3.473173
SJW      1.783411
SJL      3.579899
dtype: float64



In [15]:
# Impute the missing values with the mean using .loc
for col in betting_columns:
    betting_data.loc[betting_data[col].isnull(), col] = mean_betting_odds[col]

## Split the dataset:

In [16]:
# Convert 'tourney_date' to datetime format 
betting_data['Date'] = pd.to_datetime(betting_data['Date'], format='%Y-%m-%d')

# Define the split date for January 1, 2019
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')

In [17]:
# Splitting the dataset into training and validation (test) sets
betting_data_2019 = betting_data[betting_data['Date'] >= split_time]
betting_data = betting_data[betting_data['Date'] < split_time]

# Create a copy of the dataset
betting_data_copy = betting_data.copy()
betting_data_2019_copy = betting_data_2019.copy()

## BCM Function Setup and Definitions:

In [18]:
def process_bcm(df, betting_columns):
    # Make a copy of the dataframe to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Calculate raw implied probabilities
    for col in betting_columns:
        df.loc[:, f'implied_{col}'] = 1 / df[col]

    # Normalize the probabilities for each bookmaker
    for w_col, l_col in zip(betting_columns[::2], betting_columns[1::2]):
        df.loc[:, f'normalized_{w_col}'] = df[f'implied_{w_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])
        df.loc[:, f'normalized_{l_col}'] = df[f'implied_{l_col}'] / (df[f'implied_{w_col}'] + df[f'implied_{l_col}'])

    # Calculate logit values for normalized probabilities and then the consensus probability
    logit_cols = []
    for col in betting_columns[::2]:  # Process only the winner columns
        logit_col = f'logit_normalized_{col}'
        df.loc[:, logit_col] = df[f'normalized_{col}'].apply(logit)
        logit_cols.append(logit_col)

    # Calculate the average logit for consensus probability
    df.loc[:, 'consensus_logit_W'] = df[logit_cols].mean(axis=1)
    df.loc[:, 'consensus_prob_W'] = df['consensus_logit_W'].apply(inv_logit)

    # Create the probability of higher-ranked player winning
    df.loc[:, 'prob_higher_rank_winning'] = df.apply(
        lambda row: row['consensus_prob_W'] if row['higher_rank_won'] == 1 else (1 - row['consensus_prob_W']), axis=1
    )

    # Create the outcome column
    df.loc[:, 'outcome'] = df['prob_higher_rank_winning'].apply(lambda x: 1 if x > 0.50 else 0)

    return df

In [19]:
# Function to calculate logit
def logit(p):
    p = np.clip(p, 1e-10, 1 - 1e-10)  # Ensure probabilities are within (0, 1)
    return np.log(p / (1 - p))

In [20]:
# Function to calculate inverse logit
def inv_logit(y):
    return np.exp(y) / (1 + np.exp(y))

In [21]:
# Function to evaluate Model Performance
def evaluate_model_performance(df, outcome_col='outcome', higher_rank_won_col='higher_rank_won', prob_col='prob_higher_rank_winning'):
    # Accuracy
    accuracy_bcm = np.mean(df[outcome_col] == df[higher_rank_won_col])
    print(f'Accuracy: {accuracy_bcm}')

    # Calibration
    calibration_bcm = np.sum(df[prob_col]) / np.sum(df[higher_rank_won_col])
    print(f'Calibration: {calibration_bcm}')

    # Log-loss
    def logloss(actual, predictions):
        epsilon = 1e-15
        predictions = np.clip(predictions, epsilon, 1 - epsilon)
        logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
            actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
        return logr_logloss_all_predictors

    logloss_bcm = logloss(df[higher_rank_won_col], df[prob_col])
    print(f'Logloss: {logloss_bcm}')

    return {
        'accuracy': accuracy_bcm,
        'calibration': calibration_bcm,
        'logloss': logloss_bcm
    }

In [22]:
def print_bold(text):
    print(f"\033[1m{text}\033[0m")

## BCM (2000 - 2018) - Training:

In [23]:
betting_data = process_bcm(betting_data, betting_columns)

In [24]:
print_bold('BCM (2000-2018)') 
accuracy_bcm, calibration_bcm, logloss_bcm = evaluate_model_performance(betting_data).values()

[1mBCM (2000-2018)[0m
Accuracy: 0.8433973688556173
Calibration: 0.9233709981751599
Logloss: 0.4824082873856712


## BCM (2019) - Validation:

In [25]:
# Define the column names for betting odds
betting_columns = ['B365W', 'B365L','PSW', 'PSL']

betting_data_2019 = process_bcm(betting_data_2019, betting_columns)

In [26]:
print_bold('BCM (2019)')
accuracy_bcm_2019, calibration_bcm_2019, logloss_bcm_2019 = evaluate_model_performance(betting_data_2019).values()

[1mBCM (2019)[0m
Accuracy: 0.6744926382809391
Calibration: 1.024454324646844
Logloss: 0.5926237820296479


## Filtering Top 50 and Top 100 Ranking players from the dataset

In [27]:
df_2019 = betting_data_2019_copy

In [28]:
# Filter dataset for top 50 and top 100 players
def filter_top_players(df, top_n):
    df_top = df[(df['WRank'] <= top_n) | (df['LRank'] <= top_n)]
    return df_top

df_top_50_2019 = filter_top_players(df_2019, 50)
df_top_100_2019 = filter_top_players(df_2019, 100)

In [29]:
# Define the column names for betting odds
betting_columns = ['B365W', 'B365L','PSW', 'PSL']

df_top_50_2019 = process_bcm(df_top_50_2019, betting_columns)
df_top_100_2019 = process_bcm(df_top_100_2019, betting_columns)

## Metrics - Top 50 & Top 100:

In [30]:
print_bold('BCM (2019) : Top 50')
accuracy_bcm_top_50_2019, calibration_bcm_top_50_2019, logloss_bcm_top_50_2019 = evaluate_model_performance(df_top_50_2019).values()

[1mBCM (2019) : Top 50[0m
Accuracy: 0.6800679501698754
Calibration: 1.0312642112782036
Logloss: 0.5824990665905565


In [31]:
print_bold('BCM (2019) : Top 100')
accuracy_bcm_top_100_2019, calibration_bcm_top_100_2019, logloss_bcm_top_100_2019 = evaluate_model_performance(df_top_100_2019).values()

[1mBCM (2019) : Top 100[0m
Accuracy: 0.6749792186201163
Calibration: 1.026936090843872
Logloss: 0.5927659169372209


In [32]:
# Create a DataFrame to store the validation statistics
validation_stats = pd.DataFrame({
    'Model': [
        'BCM(2000-2018)', 'BCM(2019)',
        'BCM(2019) Top 50', 'BCM(2019) Top 100'
    ],
    'Accuracy': [
        accuracy_bcm, accuracy_bcm_2019,
        accuracy_bcm_top_50_2019, accuracy_bcm_top_100_2019
    ],
    'Log_Loss': [
        logloss_bcm, logloss_bcm_2019,
        logloss_bcm_top_50_2019, logloss_bcm_top_100_2019
    ],
    'Calibration': [
        calibration_bcm, calibration_bcm_2019,
        calibration_bcm_top_50_2019, calibration_bcm_top_100_2019
    ]
})

# Print the validation statistics DataFrame
validation_stats

Unnamed: 0,Model,Accuracy,Log_Loss,Calibration
0,BCM(2000-2018),0.843397,0.482408,0.923371
1,BCM(2019),0.674493,0.592624,1.024454
2,BCM(2019) Top 50,0.680068,0.582499,1.031264
3,BCM(2019) Top 100,0.674979,0.592766,1.026936
