## Import Libraries:

In [1]:
import warnings
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import os  # For interacting with the operating system

# Suppress specific UserWarnings from the 'openpyxl' module to prevent cluttering the output
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# Import Logistic Regression model from scikit-learn
from sklearn.linear_model import LogisticRegression

# Import metrics to evaluate accuracy and log loss
from sklearn.metrics import accuracy_score, log_loss

# Import StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler

## Load Data:

In [2]:
# Define the directory where your files are located
# data_dir = '.'  
data_dir = os.path.join(os.path.pardir)  

In [3]:
# List to hold the dataframes
dataframes = []

# Loop through the years and load the files
for year in range(2005, 2020):
    if year <= 2012:
        file_path = os.path.join(data_dir, f'{year}.xls')
    else:
        file_path = os.path.join(data_dir, f'{year}.xlsx')
    
    # Load the file into a dataframe
    df = pd.read_excel(file_path)
    
    # Append the dataframe to the list
    dataframes.append(df)

In [4]:
# Concatenate all the dataframes into one
betting_data = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined dataframe
betting_data.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,...,,,,,,,,,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,...,,,,,,,,,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,...,,,,,,,,,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,...,,,,,,,,,,


## Handling Missing Data:

In [5]:
# List of columns to convert
columns_to_convert = ['WRank', 'LRank']

# Convert specified columns to numeric, coercing errors to NaN
for column in columns_to_convert:
    betting_data[column] = pd.to_numeric(betting_data[column], errors='coerce')

In [6]:
# Impute missing ranks with 0
betting_data['WRank'].fillna(0, inplace=True)
betting_data['LRank'].fillna(0, inplace=True)

# List of columns to impute with 0
columns_to_impute_specific = ['WPts', 'LPts']
betting_data[columns_to_impute_specific] = betting_data[columns_to_impute_specific].fillna(0)

## Feature Engineering:

In [7]:
def engineer_data(df):
    # Feature engineering
    df['higher_rank_won'] = (df['WRank'] < df['LRank']).astype(int)
    df['higher_rank_points'] = df['higher_rank_won'] * df['WPts'] + df['LPts'] * (1 - df['higher_rank_won'])
    df['lower_rank_points'] = (1 - df['higher_rank_won']) * df['WPts'] + df['LPts'] * df['higher_rank_won']
    df['points_diff'] = df['higher_rank_points'] - df['lower_rank_points']
    
    # Handle zero and missing values before taking logarithms
    df['higher_rank_points'].replace(0, 1, inplace=True)
    df['lower_rank_points'].replace(0, 1, inplace=True)
    df['higher_rank_points'].fillna(1, inplace=True)
    df['lower_rank_points'].fillna(1, inplace=True)
    
    # Filter the betting_data to include only rows where the match status is 'Completed'
    df = df.loc[df['Comment'] == 'Completed']
    
    # Select essential columns
    df = df[['Date', 'WRank', 'LRank', 'WPts', 'LPts', 
             'higher_rank_points', 'lower_rank_points', 'points_diff', 
             'higher_rank_won']]
    
    return df

# Apply preprocessing
betting_data = engineer_data(betting_data)

## Split the dataset:

In [8]:
# Convert 'Date' to datetime format using .loc[] to avoid SettingWithCopyWarning
betting_data.loc[:, 'Date'] = pd.to_datetime(betting_data['Date'], format='%Y-%m-%d')

# Define the split date for January 1, 2019
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')

In [9]:
# Splitting the dataset into training and validation (test) sets
betting_data_train = betting_data[betting_data['Date'] < split_time].copy()
betting_data_validation = betting_data[betting_data['Date'] >= split_time].copy()

# Display the first few rows of the train set to confirm the split
betting_data_train.head()

Unnamed: 0,Date,WRank,LRank,WPts,LPts,higher_rank_points,lower_rank_points,points_diff,higher_rank_won
0,2005-01-03,53.0,324.0,0.0,0.0,1.0,1.0,0.0,1
1,2005-01-03,72.0,82.0,0.0,0.0,1.0,1.0,0.0,1
2,2005-01-03,39.0,45.0,0.0,0.0,1.0,1.0,0.0,1
3,2005-01-03,66.0,79.0,0.0,0.0,1.0,1.0,0.0,1
4,2005-01-03,35.0,101.0,0.0,0.0,1.0,1.0,0.0,1


In [10]:
# Display the first few rows of the validation set to confirm the split
betting_data_validation.head()

Unnamed: 0,Date,WRank,LRank,WPts,LPts,higher_rank_points,lower_rank_points,points_diff,higher_rank_won
37785,2019-01-01,63.0,49.0,810.0,974.0,974.0,810.0,164.0,0
37786,2019-01-01,40.0,57.0,1050.0,875.0,1050.0,875.0,175.0,1
37787,2019-01-01,240.0,234.0,200.0,206.0,206.0,200.0,6.0,0
37788,2019-01-01,35.0,62.0,1125.0,810.0,1125.0,810.0,315.0,1
37789,2019-01-01,239.0,146.0,200.0,367.0,367.0,200.0,167.0,0


## Univariate Logistic Model Implementation (points_diff):

In [11]:
def evaluate_logistic_regression(train_data, validation_data):
    # Features and target
    X_train = train_data['points_diff'].values.reshape(-1, 1)
    y_train = train_data['higher_rank_won'].values
    X_validation = validation_data['points_diff'].values.reshape(-1, 1)
    y_validation = validation_data['higher_rank_won'].values

    # Initialize and fit the logistic regression model
    logistic_model = LogisticRegression(solver='liblinear', fit_intercept=False)
    logistic_model.fit(X_train, y_train)

    # Make predictions
    validation_predictions = logistic_model.predict(X_validation)
    validation_prediction_probs = logistic_model.predict_proba(X_validation)[:, 1]

    # Evaluate the model
    accuracy = np.mean(validation_predictions == y_validation)

    # Calculate calibration
    calibration = np.sum(validation_prediction_probs) / np.sum(y_validation)

    # Calculate log loss
    def logloss(actual, predictions):
        epsilon = 1e-15
        predictions = np.clip(predictions, epsilon, 1 - epsilon)
        
        log_loss_value = -(1 / len(actual)) * np.sum(
            actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
        return log_loss_value

    log_loss_value = logloss(y_validation, validation_prediction_probs)

    # Output the results
    print("\033[1mUnivariate Logistic Regression Model Results:\033[0m")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Calibration: {calibration:.4f}")
    print(f'Log Loss: {log_loss_value:.4f}')

    return accuracy, calibration, log_loss_value

accuracy, calibration, log_loss = evaluate_logistic_regression(betting_data_train, betting_data_validation)

[1mUnivariate Logistic Regression Model Results:[0m
Accuracy: 0.6144
Calibration: 1.0122
Log Loss: 0.6507


## Filtering Top 50 and Top 100 Ranking players from the dataset

In [12]:
def split_dataset(df, top_n, split_date):
    # Filter the dataset for top N players
    df_top = df[(df['WRank'] <= top_n) | (df['LRank'] <= top_n)].copy()
    
    # Convert 'Date' to datetime format
    df_top['Date'] = pd.to_datetime(df_top['Date'], format='%Y-%m-%d')
    
    # Define the split date
    split_time = pd.to_datetime(split_date, format='%Y-%m-%d')
    
    # Splitting the dataset into training and validation (test) sets
    df_top_train = df_top[df_top['Date'] < split_time]
    df_top_validation = df_top[df_top['Date'] >= split_time]
    
    return df_top_train, df_top_validation

split_date = '2019-01-01'
df_top_50_train, df_top_50_validation = split_dataset(betting_data, 50, split_date)
df_top_100_train, df_top_100_validation = split_dataset(betting_data, 100, split_date)

In [13]:
# Calculate metrics for top 50 players
accuracy_50, calibration_50, log_loss_50 = evaluate_logistic_regression(df_top_50_train, df_top_50_validation)

[1mUnivariate Logistic Regression Model Results:[0m
Accuracy: 0.6377
Calibration: 1.0291
Log Loss: 0.6361


In [14]:
# Calculate metrics for top 100 players
accuracy_100, calibration_100, log_loss_100 = evaluate_logistic_regression(df_top_100_train, df_top_100_validation)

[1mUnivariate Logistic Regression Model Results:[0m
Accuracy: 0.6158
Calibration: 1.0161
Log Loss: 0.6493


## Metrics - Top 50 & Top 100:

In [15]:
# Create a DataFrame to store the validation statistics
validation_stats = pd.DataFrame({
    'Model': [
        'Logistic Regression', 'Logistic Regression Top 50', 'Logistic Regression Top 100'
    ],
    'Accuracy': [
        accuracy, accuracy_50, accuracy_100
    ],
    'Log_Loss': [
        log_loss, log_loss_50, log_loss_100
    ],
    'Calibration': [
        calibration, calibration_50, calibration_100
    ]
})

# Print the validation statistics DataFrame
validation_stats

Unnamed: 0,Model,Accuracy,Log_Loss,Calibration
0,Logistic Regression,0.614405,0.650701,1.012161
1,Logistic Regression Top 50,0.637746,0.63609,1.029149
2,Logistic Regression Top 100,0.615831,0.649291,1.016094
