## Load Data

In [1]:
import warnings
import numpy as np
import pandas as pd
import os

# Silence the UserWarnings raised from the openpyxl module so they do not clutter the output
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# Directory that holds the yearly workbooks: the parent of the current directory.
# Switch to data_dir = '.' when the files sit in the current directory instead.
data_dir = os.path.join(os.path.pardir)

# One dataframe per season, 2005 through 2019 inclusive
dataframes = []
for year in range(2005, 2020):
    # Seasons up to and including 2012 are read from .xls, later seasons from .xlsx
    file_name = f'{year}.xls' if year <= 2012 else f'{year}.xlsx'
    file_path = os.path.join(data_dir, file_name)

    df = pd.read_excel(file_path)
    dataframes.append(df)

# Stack every season into a single frame with a fresh, continuous index
betting_data = pd.concat(dataframes, ignore_index=True)

## Display Data

In [2]:
# Display the combined dataframe (the whole frame is handed to the display, not only the first rows)
betting_data

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,...,,,,,,,,,,
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,...,,,,,,,,,,
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,...,,,,,,,,,,
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,,,,,,,1.48,3.30,1.41,2.93
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,,,,,,,2.24,2.06,1.92,1.90
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,,,,,,,3.75,1.40,3.39,1.33
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,,,,,,,1.87,2.20,1.78,2.06


## Handling Missing Data

In [4]:
# Columns that are expected to be numeric but may contain stray strings
columns_to_convert = ['WRank', 'LRank', 'W2', 'L2', 'W3', 'L3', 'EXW']

# Show the rows whose 'LRank' holds the non-numeric marker 'NR' before conversion
non_numeric_lrank_rows = betting_data[betting_data['LRank'].isin(['NR'])]

print("Rows with non-numeric values in 'LRank' before conversion:")
print(non_numeric_lrank_rows[['LRank']])

# Repair the known typo '2.,3' in 'EXW'.
# The cell is located by its value instead of by a hard-coded row label, so the fix
# neither raises KeyError when that label is absent nor misses the typo when the
# row order of the loaded files changes.
exw_typo_mask = betting_data['EXW'] == '2.,3'
if exw_typo_mask.any():
    print('\n Converting EXW string value 2.,3 to 2.3')
    betting_data.loc[exw_typo_mask, 'EXW'] = '2.3'


# Convert the listed columns to numeric; anything unparseable becomes NaN
for column in columns_to_convert:
    betting_data[column] = pd.to_numeric(betting_data[column], errors='coerce')


Rows with non-numeric values in 'LRank' before conversion:
Empty DataFrame
Columns: [LRank]
Index: []


In [5]:
def check_missing_values_in_numeric_columns(df):
    """Count the missing values in every numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.Series
        Indexed by the names of the numeric columns (in frame order), holding
        the number of null entries found in each of them.
    """
    # Names of the columns whose dtype is a numpy numeric type
    numeric_names = list(df.select_dtypes(include=[np.number]).columns)

    # Null count per selected column
    return df.loc[:, numeric_names].isna().sum()

# Report the number of missing values per numeric column before any imputation is applied
print("Missing values in numeric columns:")
print(check_missing_values_in_numeric_columns(betting_data))


Missing values in numeric columns:
ATP            0
Best of        0
WRank         15
LRank         87
WPts        1689
LPts        1759
W1           235
L1           233
W2           627
L2           626
W3         21664
L3         21664
W4         36743
L4         36743
W5         39012
L5         39012
Wsets        230
Lsets        231
B365W        547
B365L        524
CBW        32337
CBL        32337
EXW         3611
EXL         3605
IWW        37571
IWL        37571
PSW         3150
PSL         3150
UBW        29719
UBL        29719
LBW        12259
LBL        12248
SJW        24818
SJL        24811
MaxW       15036
MaxL       15036
AvgW       15036
AvgL       15036
dtype: int64


In [6]:
# Impute missing ranks with 0.
# The result is assigned back instead of calling fillna(..., inplace=True) on a column
# selection: the in-place form acts on an intermediate object and does not update
# betting_data when pandas Copy-on-Write is active.
# NOTE(review): 0 is smaller than every real ranking, so a player whose rank was missing
# is treated as the higher-ranked one by the `WRank < LRank` comparison in
# engineer_data -- confirm this is intended.
rank_columns = ['WRank', 'LRank']
betting_data[rank_columns] = betting_data[rank_columns].fillna(0)

# List of columns to impute with 0
columns_to_impute_specific = ['W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'WPts', 'LPts', 'Wsets', 'Lsets']
betting_data[columns_to_impute_specific] = betting_data[columns_to_impute_specific].fillna(0)

# # Alternatively, list of columns to impute with mean
# columns_to_impute_mean = ['WPts', 'LPts', 'Wsets', 'Lsets']
# betting_data[columns_to_impute_mean] = betting_data[columns_to_impute_mean].apply(lambda x: x.fillna(x.mean()))

# Median Imputation
# betting_data[columns_to_impute_mean] = betting_data[columns_to_impute_mean].apply(lambda x: x.fillna(x.median()))

# Verify the changes
print("Missing values after imputation:")
print(betting_data[['WRank', 'LRank']].isnull().sum())
print(betting_data[columns_to_impute_specific].isnull().sum())


Missing values after imputation:
WRank    0
LRank    0
dtype: int64
W1       0
L1       0
W2       0
L2       0
W3       0
L3       0
W4       0
L4       0
W5       0
L5       0
WPts     0
LPts     0
Wsets    0
Lsets    0
dtype: int64


In [7]:
# Re-run the missing-value report after the zero imputation of ranks, points and set scores
print("Missing values in numeric columns:")
print(check_missing_values_in_numeric_columns(betting_data))


Missing values in numeric columns:
ATP            0
Best of        0
WRank          0
LRank          0
WPts           0
LPts           0
W1             0
L1             0
W2             0
L2             0
W3             0
L3             0
W4             0
L4             0
W5             0
L5             0
Wsets          0
Lsets          0
B365W        547
B365L        524
CBW        32337
CBL        32337
EXW         3611
EXL         3605
IWW        37571
IWL        37571
PSW         3150
PSL         3150
UBW        29719
UBL        29719
LBW        12259
LBL        12248
SJW        24818
SJL        24811
MaxW       15036
MaxL       15036
AvgW       15036
AvgL       15036
dtype: int64


## Feature Engineering:

In [8]:
def engineer_data(df):
    """Add the derived match features to ``df`` and return it.

    The frame is modified in place (new columns are written onto ``df``) and the
    same object is returned, so ``betting_data = engineer_data(betting_data)``
    and a bare ``engineer_data(betting_data)`` are equivalent.

    Required input columns: ``WRank``, ``LRank``, ``WPts``, ``LPts``,
    ``W1``..``W5`` and ``L1``..``L5``.

    Columns added
    -------------
    higher_rank_won : 1 when ``WRank < LRank``, else 0.
    higher_rank_points / lower_rank_points : ``WPts``/``LPts`` assigned according
        to ``higher_rank_won``; zeros and NaNs are replaced by 1 *after*
        ``points_diff``, ``points_ratio`` and ``points_mean`` have been computed,
        so that the logarithms below are finite.
    points_diff, points_ratio, points_mean : difference, ratio and mean of the
        two columns above. A zero denominator yields NaN in ``points_ratio``,
        which is then filled with the mean of the remaining ratios.
    log_winner_points, log_loser_points, diff_log_rank : natural logs of
        ``higher_rank_points`` / ``lower_rank_points`` and their difference.
        Despite the names, these are indexed by higher/lower rank, not by
        winner/loser.
    ranking_difference : ``|WRank - LRank|``.
    TotalGamesWon_MatchWinner / TotalGamesWon_MatchLoser / GameDifference :
        row sums of ``W1..W5`` and ``L1..L5`` and their difference.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    # Target flag: did the player with the numerically smaller rank win?
    df['higher_rank_won'] = (df['WRank'] < df['LRank']).astype(int)
    higher_won = df['higher_rank_won']

    # Route winner/loser points to the higher-/lower-ranked player
    df['higher_rank_points'] = higher_won * df['WPts'] + df['LPts'] * (1 - higher_won)
    df['lower_rank_points'] = (1 - higher_won) * df['WPts'] + df['LPts'] * higher_won
    df['points_diff'] = df['higher_rank_points'] - df['lower_rank_points']

    # Vectorised ratio: a zero denominator is masked to NaN before dividing.
    # This matches the former row-wise apply and also works on an empty frame.
    nonzero_lower = df['lower_rank_points'].where(df['lower_rank_points'] != 0)
    df['points_ratio'] = df['higher_rank_points'] / nonzero_lower
    df['points_mean'] = (df['higher_rank_points'] + df['lower_rank_points']) / 2

    # Replace zero and missing points by 1 before taking logarithms.
    # Assigning the result back (rather than replace/fillna with inplace=True on
    # df[col]) guarantees the frame is updated even under pandas Copy-on-Write;
    # otherwise zeros would survive and np.log would return -inf.
    df['higher_rank_points'] = df['higher_rank_points'].replace(0, 1).fillna(1)
    df['lower_rank_points'] = df['lower_rank_points'].replace(0, 1).fillna(1)

    df['log_winner_points'] = np.log(df['higher_rank_points'])
    df['log_loser_points'] = np.log(df['lower_rank_points'])
    df['diff_log_rank'] = df['log_winner_points'] - df['log_loser_points']

    df['ranking_difference'] = (df['WRank'] - df['LRank']).abs()
    df['TotalGamesWon_MatchWinner'] = df[['W1', 'W2', 'W3', 'W4', 'W5']].sum(axis=1, skipna=True)
    df['TotalGamesWon_MatchLoser'] = df[['L1', 'L2', 'L3', 'L4', 'L5']].sum(axis=1, skipna=True)
    df['GameDifference'] = df['TotalGamesWon_MatchWinner'] - df['TotalGamesWon_MatchLoser']

    # Fill undefined ratios (zero denominator) with the mean of the defined ones
    df['points_ratio'] = df['points_ratio'].fillna(df['points_ratio'].mean())

    return df

# Apply the feature engineering step (engineer_data adds the derived columns and returns the frame)
betting_data = engineer_data(betting_data)


In [9]:
# Re-run the missing-value report after feature engineering (now also covers the derived columns)
print("Missing values in numeric columns:")
print(check_missing_values_in_numeric_columns(betting_data))


Missing values in numeric columns:
ATP                              0
Best of                          0
WRank                            0
LRank                            0
WPts                             0
LPts                             0
W1                               0
L1                               0
W2                               0
L2                               0
W3                               0
L3                               0
W4                               0
L4                               0
W5                               0
L5                               0
Wsets                            0
Lsets                            0
B365W                          547
B365L                          524
CBW                          32337
CBL                          32337
EXW                           3611
EXL                           3605
IWW                          37571
IWL                          37571
PSW                           3150
PSL                 

## Split the dataset

In [10]:
# Parse the 'Date' column into datetimes (year-month-day)
betting_data['Date'] = pd.to_datetime(betting_data['Date'], format='%Y-%m-%d')

# Cut-off: 1 January 2019
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')

# Matches strictly before the cut-off form the training set; the rest is the validation (test) set
betting_data_train = betting_data.loc[betting_data['Date'].lt(split_time)]
betting_data_validation = betting_data.loc[betting_data['Date'].ge(split_time)]

# Show the training set to confirm the split
betting_data_train


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,points_diff,points_ratio,points_mean,log_winner_points,log_loser_points,diff_log_rank,ranking_difference,TotalGamesWon_MatchWinner,TotalGamesWon_MatchLoser,GameDifference
0,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Saulnier C.,...,0.0,4.656752,0.0,0.000000,0.000000,0.000000,271.0,13.0,8.0,5.0
1,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,0.0,4.656752,0.0,0.000000,0.000000,0.000000,10.0,12.0,4.0,8.0
2,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Melzer J.,...,0.0,4.656752,0.0,0.000000,0.000000,0.000000,6.0,17.0,16.0,1.0
3,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Rochus O.,...,0.0,4.656752,0.0,0.000000,0.000000,0.000000,13.0,15.0,10.0,5.0
4,1,Adelaide,Next Generation Hardcourts,2005-01-03,International,Outdoor,Hard,1st Round,3,Mayer F.,...,0.0,4.656752,0.0,0.000000,0.000000,0.000000,66.0,16.0,15.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Darcis S.,...,-714.0,0.000000,357.0,0.000000,6.570883,-6.570883,73.0,12.0,7.0,5.0
37839,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Munar J.,...,71.0,1.119932,627.5,6.496775,6.383507,0.113268,17.0,13.0,8.0,5.0
37840,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Donskoy E.,...,64.0,1.107744,626.0,6.489205,6.386879,0.102326,15.0,18.0,16.0,2.0
37841,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Mmoh M.,...,42.0,1.080614,542.0,6.333280,6.255750,0.077530,7.0,13.0,8.0,5.0


In [11]:
# Display the validation set to confirm the split (every row dated on or after split_time)
betting_data_validation


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,points_diff,points_ratio,points_mean,log_winner_points,log_loser_points,diff_log_rank,ranking_difference,TotalGamesWon_MatchWinner,TotalGamesWon_MatchLoser,GameDifference
37785,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kudla D.,...,164.0,1.202469,892.0,6.881411,6.697034,0.184377,14.0,19.0,17.0,2.0
37786,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Chardy J.,...,175.0,1.200000,962.5,6.956545,6.774224,0.182322,17.0,16.0,13.0,3.0
37787,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Murray A.,...,6.0,1.030000,203.0,5.327876,5.298317,0.029559,6.0,12.0,7.0,5.0
37788,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kyrgios N.,...,315.0,1.388889,967.5,7.025538,6.697034,0.328504,27.0,19.0,19.0,0.0
37789,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Tsonga J.W.,...,167.0,1.835000,283.5,5.905362,5.298317,0.607044,93.0,13.0,10.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,5585.0,2.396250,6792.5,9.167955,8.294050,0.873905,5.0,19.0,16.0,3.0
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,2760.0,1.937182,4325.0,8.649098,7.987864,0.661234,3.0,13.0,10.0,3.0
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,2190.0,1.547500,5095.0,8.730690,8.294050,0.436641,3.0,12.0,7.0,5.0
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,2080.0,1.706282,3985.0,8.522181,7.987864,0.534317,2.0,13.0,8.0,5.0


## Model Implementation:


## Univariate Logistic Model (points_diff)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np

def evaluate_logistic_regression(train_data, validation_data,
                                 feature='points_diff', target='higher_rank_won'):
    """Fit a one-feature logistic regression and score it on a validation set.

    Parameters
    ----------
    train_data, validation_data : pandas.DataFrame
        Frames that both contain the ``feature`` and ``target`` columns.
    feature : str, default 'points_diff'
        Name of the single predictor column.
    target : str, default 'higher_rank_won'
        Name of the binary (0/1) outcome column.

    Returns
    -------
    tuple of (accuracy, calibration, log_loss_value)
        accuracy : share of validation rows whose predicted class equals the target.
        calibration : sum of predicted positive-class probabilities divided by the
            number of observed positives; ``nan`` when the validation set has no
            positives (the ratio is undefined).
        log_loss_value : mean binary cross-entropy, with probabilities clipped to
            ``[1e-15, 1 - 1e-15]``.

    The shapes of the two design matrices and the three metrics are printed.
    """
    # Single-column design matrices (n_samples, 1) and target vectors
    X_train = train_data[feature].values.reshape(-1, 1)
    y_train = train_data[target].values
    X_validation = validation_data[feature].values.reshape(-1, 1)
    y_validation = validation_data[target].values

    # Verify the shapes
    print(X_train.shape, X_validation.shape)

    # Initialize and fit the logistic regression model
    logistic_model = LogisticRegression(solver='liblinear', max_iter=1000)
    logistic_model.fit(X_train, y_train)

    # Hard class predictions and probability of the positive class (column 1)
    validation_predictions = logistic_model.predict(X_validation)
    validation_prediction_probs = logistic_model.predict_proba(X_validation)[:, 1]

    # Share of correct class predictions
    accuracy = np.mean(validation_predictions == y_validation)

    # Calibration: expected positives over observed positives.
    # Guard the division so an all-negative validation set yields nan, not a warning and inf.
    observed_positives = np.sum(y_validation)
    if observed_positives:
        calibration = np.sum(validation_prediction_probs) / observed_positives
    else:
        calibration = np.nan

    def logloss(actual, predictions):
        """Mean binary cross-entropy with clipped probabilities."""
        epsilon = 1e-15
        # Clip to keep both logarithms finite
        predictions = np.clip(predictions, epsilon, 1 - epsilon)

        log_loss_value = -(1 / len(actual)) * np.sum(
            actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
        return log_loss_value

    log_loss_value = logloss(y_validation, validation_prediction_probs)

    # Output the results
    print("Single Feature Logistic Regression Model Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Calibration: {calibration:.4f}")
    print(f'Log Loss: {log_loss_value:.4f}')

    return accuracy, calibration, log_loss_value

# Evaluate the univariate model on the full train/validation split.
# NOTE: assigning the third result to `log_loss` rebinds the name imported from
# sklearn.metrics at the top of this cell; after this line `log_loss` is a number.
accuracy, calibration, log_loss = evaluate_logistic_regression(betting_data_train, betting_data_validation)


(37797, 1) (2593, 1)
Single Feature Logistic Regression Model Results:
Accuracy: 0.6113
Calibration: 1.0880
Log Loss: 0.6540


## Preprocess Dataset:

In [13]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def preprocess_dataset(df):
    """Select the modelling columns, impute betting odds and one-hot encode categoricals.

    Steps
    -----
    1. Keep the fixed list of feature columns plus the ``higher_rank_won`` target.
    2. Fill missing values in each betting-odds column with that column's mean,
       computed on ``df`` itself. A column that is entirely missing has a NaN
       mean and is left unfilled.
    3. One-hot encode the categorical columns with an encoder fitted on ``df``
       itself, append the indicator columns and drop the originals. The row
       index is reset to 0..n-1 in this step.

    The means and the fitted encoder are local to each call, so two frames
    processed separately can end up with different indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every selected column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the numeric features, the one-hot indicators and the target.

    The column means and the remaining missing counts are printed.
    """
    betting_columns = ["B365W", "B365L", "CBW", "CBL", "EXW", "EXL", "LBW", "LBL",
                       "PSW", "PSL", "IWW", "IWL", "UBW", "UBL", "SJW", "SJL"]

    selected_columns = ["Tournament", "Series", "Court", "Surface", "Round", "Best of",
                        "WRank", "LRank", 'ranking_difference', 'Wsets', 'Lsets',
                        'TotalGamesWon_MatchWinner', 'TotalGamesWon_MatchLoser', 'GameDifference',
                        "B365W", "B365L", "CBW", "CBL", "EXW", "EXL", "LBW", "LBL",
                        "PSW", "PSL", "IWW", "IWL", "UBW", "UBL", "SJW", "SJL",
                        "higher_rank_points", "lower_rank_points", "higher_rank_won"]

    # Work on an explicit copy so the imputation below writes to an independent frame
    # (avoids SettingWithCopyWarning and silent no-ops from chained in-place fillna).
    df_selected = df[selected_columns].copy()

    # Calculate the mean of the available betting odds for each column, excluding NaN values
    mean_betting_odds = df_selected[betting_columns].mean(skipna=True)
    print("Mean betting odds for each column:")
    print(mean_betting_odds)

    # Fill each column with its own mean; a NaN mean (column entirely missing)
    # fills nothing, so such a column stays NaN.
    df_selected[betting_columns] = df_selected[betting_columns].fillna(mean_betting_odds)

    # Report what is still missing in the betting columns
    missing_values_after_imputation = df_selected[betting_columns].isnull().sum()
    print("Missing values after imputation in betting columns:")
    print(missing_values_after_imputation)

    # Encode categorical features only if there is something to encode
    categorical_features = ['Best of', 'Round', 'Court', 'Surface', 'Series', 'Tournament']
    if not df_selected[categorical_features].empty:
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
        # name; try the new keyword first and fall back for older releases.
        try:
            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        encoded_features = encoder.fit_transform(df_selected[categorical_features])
        df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))
        # Reset the index so the positional concat lines up with df_encoded's 0..n-1 index
        df_selected = pd.concat([df_selected.reset_index(drop=True), df_encoded], axis=1).drop(columns=categorical_features)

    return df_selected


In [14]:
# Preprocess the training split (imputation means and one-hot categories come from the training rows)
betting_data_train_preprocessed = preprocess_dataset(betting_data_train)

Mean betting odds for each column:
B365W    1.827268
B365L    3.681962
CBW      1.825494
CBL      3.338149
EXW      1.802534
EXL      3.295159
LBW      1.810226
LBL      3.451461
PSW      1.929950
PSL      4.260632
IWW      1.680738
IWL      2.642355
UBW      1.815867
UBL      3.542479
SJW      1.796538
SJL      3.557943
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected[col].fillna(mean_betting_odds[col], inplace=True)


Missing values after imputation in betting columns:
B365W    0
B365L    0
CBW      0
CBL      0
EXW      0
EXL      0
LBW      0
LBL      0
PSW      0
PSL      0
IWW      0
IWL      0
UBW      0
UBL      0
SJW      0
SJL      0
dtype: int64


In [15]:
# Preprocess the validation split. preprocess_dataset recomputes the imputation means and
# refits the one-hot encoder on the frame it is given, so these come from the validation
# rows, not from the training rows.
betting_data_validation_preprocessed = preprocess_dataset(betting_data_validation)


Mean betting odds for each column:
B365W    1.845830
B365L    3.163674
CBW           NaN
CBL           NaN
EXW           NaN
EXL           NaN
LBW           NaN
LBL           NaN
PSW      1.934390
PSL      3.479733
IWW           NaN
IWL           NaN
UBW           NaN
UBL           NaN
SJW           NaN
SJL           NaN
dtype: float64
Missing values after imputation in betting columns:
B365W       0
B365L       0
CBW      2593
CBL      2593
EXW      2593
EXL      2593
LBW      2593
LBL      2593
PSW         0
PSL         0
IWW      2593
IWL      2593
UBW      2593
UBL      2593
SJW      2593
SJL      2593
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected[col].fillna(mean_betting_odds[col], inplace=True)


## Multiple Logistic Regression on the Preprocessed Features

In [16]:
# Align the two preprocessed frames on a common feature set.
# dropna(axis=1) removes every validation column that still contains a NaN; the
# training frame is then restricted to the columns both frames share.
# NOTE: this cell rebinds betting_data_train_preprocessed / betting_data_validation_preprocessed,
# so later cells see the reduced frames.
betting_data_validation_preprocessed = betting_data_validation_preprocessed.dropna(axis=1)
common_columns = betting_data_validation_preprocessed.columns.intersection(betting_data_train_preprocessed.columns)
betting_data_train_preprocessed = betting_data_train_preprocessed[common_columns]
betting_data_validation_preprocessed = betting_data_validation_preprocessed[common_columns]

# Columns to exclude from the model (described by the author as highly correlated features).
# NOTE(review): 'Wsets' is listed twice; the duplicate is redundant.
# NOTE(review): the remaining odds columns (e.g. B365W/B365L) and GameDifference look like
# they are indexed by match winner/loser, i.e. defined using the outcome -- confirm they
# are legitimate predictors of higher_rank_won and not target leakage.
columns_to_drop = ['WRank', 'LRank', 'Wsets', 'Lsets', 'TotalGamesWon_MatchLoser', 'TotalGamesWon_MatchWinner',
                   'PSW', 'PSL', 'Wsets', 'Best of_3', 'Best of_5', 'Round_Round Robin', 'Series_Grand Slam', 
                   'Series_Masters Cup', 'Court_Outdoor', 'Surface_Hard']

# Drop only the listed columns that are actually present in each frame
betting_data_train_preprocessed = betting_data_train_preprocessed.drop(
    columns=[col for col in columns_to_drop if col in betting_data_train_preprocessed.columns]
)
betting_data_validation_preprocessed = betting_data_validation_preprocessed.drop(
    columns=[col for col in columns_to_drop if col in betting_data_validation_preprocessed.columns]
)

# Features (everything except the target) and target
X_train = betting_data_train_preprocessed.drop(columns=['higher_rank_won'])  
y_train = betting_data_train_preprocessed['higher_rank_won']
X_validation = betting_data_validation_preprocessed.drop(columns=['higher_rank_won']) 
y_validation = betting_data_validation_preprocessed['higher_rank_won']

# Feature scaling (standardization): the scaler is fitted on the training features
# only and then applied unchanged to the validation features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)

# Verify the shapes
print(X_train.shape, X_validation.shape)

# Initialize and fit the logistic regression model
model = LogisticRegression(solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)

# Hard class predictions and probability of the positive class (column 1)
predictions = model.predict(X_validation)
prediction_probs = model.predict_proba(X_validation)[:, 1]


# Accuracy: share of validation rows whose predicted class equals the target
accuracy_result = np.mean(predictions == y_validation)

# Calibration: sum of predicted probabilities divided by the number of observed positives
calibration_2019 = np.sum(prediction_probs) / np.sum(y_validation)

# Output the results
print("Multiple Logistic Regression Model Results:")
print(f"Accuracy: {accuracy_result:.4f}")
print(f"Calibration: {calibration_2019:.4f}")

# Calculate log loss
def logloss(actual, predictions):
    """Return the mean binary cross-entropy of ``predictions`` against ``actual``.

    Parameters
    ----------
    actual : array-like of 0/1 outcomes.
    predictions : array-like of predicted probabilities for outcome 1.

    Probabilities are clipped to ``[1e-15, 1 - 1e-15]`` so both logarithms stay finite.
    """
    tiny = 1e-15
    clipped = np.clip(predictions, tiny, 1 - tiny)

    # Per-sample log-likelihood of the observed outcome
    per_sample = actual * np.log(clipped) + (1 - actual) * np.log(1 - clipped)
    return -(1 / len(actual)) * np.sum(per_sample)

# Log loss of the multiple-regression probabilities on the validation set
logloss_2019 = logloss(y_validation, prediction_probs)
print(f'Logloss: {logloss_2019:.4f}')


(37797, 78) (2593, 78)
Multiple Logistic Regression Model Results:
Accuracy: 0.7682
Calibration: 1.0444
Logloss: 0.4810


## Extension 1: Univariate model restricted to matches involving a top-50 / top-100 player

In [17]:
def split_dataset(df, top_n, split_date):
    """Keep the matches that involve a top-``top_n`` player and split them by date.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``WRank``, ``LRank`` and ``Date``; it is not modified.
    top_n : int
        A match is kept when the winner's or the loser's rank is ``<= top_n``.
    split_date : str
        Cut-off in ``YYYY-MM-DD`` form.

    Returns
    -------
    tuple of (train, validation) pandas.DataFrame
        Rows dated strictly before the cut-off, and rows dated on or after it.
    """
    # Matches where at least one of the two players is ranked within the top N
    involves_top_player = df['WRank'].le(top_n) | df['LRank'].le(top_n)
    subset = df.loc[involves_top_player].copy()

    # Parse the dates on the copy, then the cut-off itself
    subset['Date'] = pd.to_datetime(subset['Date'], format='%Y-%m-%d')
    cutoff = pd.to_datetime(split_date, format='%Y-%m-%d')

    earlier = subset.loc[subset['Date'] < cutoff]
    later = subset.loc[subset['Date'] >= cutoff]
    return earlier, later

# Build train/validation splits for matches involving a top-50 and a top-100 player,
# using 1 January 2019 as the cut-off date
split_date = '2019-01-01'
df_top_50_train, df_top_50_validation = split_dataset(betting_data, 50, split_date)
df_top_100_train, df_top_100_validation = split_dataset(betting_data, 100, split_date)



In [18]:
# Univariate logistic regression on the top-50 subset
accuracy_50, calibration_50, log_loss_50 = evaluate_logistic_regression(df_top_50_train, df_top_50_validation)


(27140, 1) (1838, 1)
Single Feature Logistic Regression Model Results:
Accuracy: 0.6311
Calibration: 1.1010
Log Loss: 0.6425


In [19]:
# Univariate logistic regression on the top-100 subset
accuracy_100, calibration_100, log_loss_100 = evaluate_logistic_regression(df_top_100_train, df_top_100_validation)


(35765, 1) (2493, 1)
Single Feature Logistic Regression Model Results:
Accuracy: 0.6125
Calibration: 1.0930
Log Loss: 0.6535


In [20]:
# Collect the validation metrics of the three univariate models, one row per model
validation_stats = pd.DataFrame(
    [
        ('Logistic Regression', accuracy, log_loss, calibration),
        ('Logistic Regression Top 50', accuracy_50, log_loss_50, calibration_50),
        ('Logistic Regression Top 100', accuracy_100, log_loss_100, calibration_100),
    ],
    columns=['model', 'accuracy', 'log_loss', 'calibration'],
)

# Print the comparison table
print(validation_stats)

                         model  accuracy  log_loss  calibration
0          Logistic Regression  0.611261  0.653971     1.087981
1   Logistic Regression Top 50  0.631121  0.642495     1.100985
2  Logistic Regression Top 100  0.612515  0.653478     1.093043
