In [1]:
import warnings
import numpy as np
import pandas as pd
import os

# Suppress specific UserWarnings from openpyxl
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')

# Define the directory where your files are located
data_dir = '.'  # Adjust the path according to your file location

# List to hold the dataframes
dataframes = []

# Loop through the years and load the files
for year in range(2000, 2020):
    if year <= 2012:
        file_path = os.path.join(data_dir, f'{year}.xls')
    else:
        file_path = os.path.join(data_dir, f'{year}.xlsx')
    
    # Load the file into a dataframe
    df = pd.read_excel(file_path)
    
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all the dataframes into one
betting_data = pd.concat(dataframes, ignore_index=True)

# Ensure all columns are displayed
pd.set_option('display.max_columns', None)
# Ensure all columns are displayed
pd.set_option('display.max_columns', None)
# Display the first few rows of the combined dataframe
betting_data


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,CBW,CBL,GBW,GBL,IWW,IWL,SBW,SBL,B365W,B365L,B&WW,B&WL,EXW,EXL,PSW,PSL,WPts,LPts,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,63,77,6.0,4.0,6.0,2.0,,,,,,,2.0,0.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Enqvist T.,Clement A.,5,56,6.0,3.0,6.0,3.0,,,,,,,2.0,0.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,40,655,6.0,7.0,7.0,5.0,6.0,3.0,,,,,2.0,1.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Federer R.,Knippschild J.,65,87,6.0,1.0,6.0,4.0,,,,,,,2.0,0.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,81,198,7.0,6.0,5.0,7.0,6.0,4.0,,,,,2.0,1.0,Completed,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54903,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,Tsitsipas S.,1.0,6.0,6.0,7.0,6.0,4.0,7.0,5.0,,,,,2.0,1.0,Completed,,,,,,,,,1.44,2.75,,,,,1.39,3.26,9585.0,4000.0,,,,,,,1.48,3.30,1.41,2.93
54904,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,Medvedev D.,7.0,4.0,6.0,4.0,7.0,6.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.90,1.90,,,,,2.14,1.79,2945.0,5705.0,,,,,,,2.24,2.06,1.92,1.90
54905,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,Federer R.,6.0,3.0,6.0,3.0,6.0,4.0,,,,,,,2.0,0.0,Completed,,,,,,,,,3.50,1.30,,,,,3.75,1.33,4000.0,6190.0,,,,,,,3.75,1.40,3.39,1.33
54906,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,Zverev A.,5.0,7.0,7.0,5.0,6.0,3.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.80,2.00,,,,,1.84,2.10,5025.0,2945.0,,,,,,,1.87,2.20,1.78,2.06


## Feature Engineering:

In [2]:
# Convert WRank and LRank to numeric, coercing errors
betting_data['WRank'] = pd.to_numeric(betting_data['WRank'], errors='coerce')
betting_data['LRank'] = pd.to_numeric(betting_data['LRank'], errors='coerce')

# Fill NaN values with a high number
betting_data['WRank'].fillna(100000, inplace=True)
betting_data['LRank'].fillna(100000, inplace=True)

# Correct the typo in row 38294, column 'EXW'
if betting_data.at[38294, 'EXW'] == '2.,3':
    betting_data.at[38294, 'EXW'] = '2.3'

#Adding custom features
betting_data['higher_rank_won'] = (betting_data['WRank'] < betting_data['LRank']).astype(int)
betting_data['higher_rank_points'] = betting_data['higher_rank_won'] * betting_data['WPts'] + betting_data['LPts'] * (1 - betting_data['higher_rank_won'])
betting_data['lower_rank_points'] = (1 - betting_data['higher_rank_won']) * betting_data['WPts'] + betting_data['LPts'] * betting_data['higher_rank_won']
betting_data['points_diff'] = betting_data['higher_rank_points'] - betting_data['lower_rank_points']
betting_data['points_ratio'] = betting_data['higher_rank_points'] / betting_data['lower_rank_points']
betting_data['points_mean'] = (betting_data['higher_rank_points'] + betting_data['lower_rank_points']) / 2
betting_data['log_winner_points'] = np.log(betting_data['higher_rank_points'])
betting_data['log_loser_points'] = np.log(betting_data['lower_rank_points'])
betting_data['diff_log_rank'] = betting_data['log_winner_points'] - betting_data['log_loser_points']
betting_data['ranking_difference'] = (betting_data['WRank'] - betting_data['LRank']).abs()

betting_data['TotalGamesWon_MatchWinner'] = betting_data[['W1', 'W2', 'W3', 'W4', 'W5']].sum(axis=1, skipna=True)
betting_data['TotalGamesWon_MatchLoser'] = betting_data[['L1', 'L2', 'L3', 'L4', 'L5']].sum(axis=1, skipna=True)
betting_data['GameDifference'] = betting_data['TotalGamesWon_MatchWinner'] - betting_data['TotalGamesWon_MatchLoser']


  betting_data['TotalGamesWon_MatchWinner'] = betting_data[['W1', 'W2', 'W3', 'W4', 'W5']].sum(axis=1, skipna=True)
  betting_data['TotalGamesWon_MatchLoser'] = betting_data[['L1', 'L2', 'L3', 'L4', 'L5']].sum(axis=1, skipna=True)


In [3]:
betting_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54908 entries, 0 to 54907
Data columns (total 67 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   ATP                        54908 non-null  int64         
 1   Location                   54908 non-null  object        
 2   Tournament                 54908 non-null  object        
 3   Date                       54908 non-null  datetime64[ns]
 4   Series                     54908 non-null  object        
 5   Court                      54908 non-null  object        
 6   Surface                    54908 non-null  object        
 7   Round                      54908 non-null  object        
 8   Best of                    54908 non-null  int64         
 9   Winner                     54908 non-null  object        
 10  Loser                      54908 non-null  object        
 11  WRank                      54908 non-null  float64       
 12  LRan

In [4]:
import pandas as pd

# Example DataFrame
# df = pd.read_excel('your_file.xlsx')  # Replace with your actual DataFrame

# Find columns with missing values
columns_with_missing_values = betting_data.columns[betting_data.isnull().any()].tolist()

# Print the columns with missing values
print("Columns with missing values:", columns_with_missing_values)


Columns with missing values: ['W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'CBW', 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL', 'B365W', 'B365L', 'B&WW', 'B&WL', 'EXW', 'EXL', 'PSW', 'PSL', 'WPts', 'LPts', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW', 'MaxL', 'AvgW', 'AvgL', 'higher_rank_points', 'lower_rank_points', 'points_diff', 'points_ratio', 'points_mean', 'log_winner_points', 'log_loser_points', 'diff_log_rank']


In [5]:
betting_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54908 entries, 0 to 54907
Data columns (total 67 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   ATP                        54908 non-null  int64         
 1   Location                   54908 non-null  object        
 2   Tournament                 54908 non-null  object        
 3   Date                       54908 non-null  datetime64[ns]
 4   Series                     54908 non-null  object        
 5   Court                      54908 non-null  object        
 6   Surface                    54908 non-null  object        
 7   Round                      54908 non-null  object        
 8   Best of                    54908 non-null  int64         
 9   Winner                     54908 non-null  object        
 10  Loser                      54908 non-null  object        
 11  WRank                      54908 non-null  float64       
 12  LRan

## Preprocess Dataset:

In [6]:
# Convert 'tourney_date' to datetime format 
betting_data['Date'] = pd.to_datetime(betting_data['Date'], format='%Y-%m-%d')

# Define the split date for January 1, 2019
split_time = pd.to_datetime('2019-01-01', format='%Y-%m-%d')

# Splitting the dataset into training and validation (test) sets
betting_data_train = betting_data[betting_data['Date'] < split_time]
betting_data_validation = betting_data[betting_data['Date'] >= split_time]

# Display the first few rows of the validation set to confirm the split
print(betting_data_validation.head())


       ATP  Location              Tournament       Date  Series    Court  \
52303    1  Brisbane  Brisbane International 2019-01-01  ATP250  Outdoor   
52304    1  Brisbane  Brisbane International 2019-01-01  ATP250  Outdoor   
52305    1  Brisbane  Brisbane International 2019-01-01  ATP250  Outdoor   
52306    1  Brisbane  Brisbane International 2019-01-01  ATP250  Outdoor   
52307    1  Brisbane  Brisbane International 2019-01-01  ATP250  Outdoor   

      Surface      Round  Best of       Winner          Loser  WRank  LRank  \
52303    Hard  1st Round        3     Kudla D.       Fritz T.   63.0   49.0   
52304    Hard  1st Round        3    Chardy J.    Struff J.L.   40.0   57.0   
52305    Hard  1st Round        3    Murray A.   Duckworth J.  240.0  234.0   
52306    Hard  1st Round        3   Kyrgios N.    Harrison R.   35.0   62.0   
52307    Hard  1st Round        3  Tsonga J.W.  Kokkinakis T.  239.0  146.0   

        W1   L1   W2   L2   W3   L3  W4  L4  W5  L5  Wsets  Lsets   

In [7]:
betting_data_train.tail()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,CBW,CBL,GBW,GBL,IWW,IWL,SBW,SBL,B365W,B365L,B&WW,B&WL,EXW,EXL,PSW,PSL,WPts,LPts,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,higher_rank_points,lower_rank_points,points_diff,points_ratio,points_mean,log_winner_points,log_loser_points,diff_log_rank,ranking_difference,TotalGamesWon_MatchWinner,TotalGamesWon_MatchLoser,GameDifference
52356,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Darcis S.,Carballes Baena R.,100000.0,73.0,6.0,3.0,6.0,4.0,,,,,,,2.0,0.0,Completed,,,,,,,,,2.37,1.53,,,,,2.44,1.61,,714.0,,,,,,,2.47,1.65,2.35,1.59,0,,,,,,,,,99927.0,6.0,3.0,3.0
52357,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Munar J.,Albot R.,81.0,98.0,6.0,2.0,7.0,6.0,,,,,,,2.0,0.0,Completed,,,,,,,,,2.0,1.72,,,,,1.94,1.94,663.0,592.0,,,,,,,2.08,1.95,1.94,1.86,1,663.0,592.0,71.0,1.119932,627.5,6.496775,6.383507,0.113268,17.0,6.0,2.0,4.0
52358,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Donskoy E.,Andujar P.,97.0,82.0,6.0,3.0,5.0,7.0,7.0,6.0,,,,,2.0,1.0,Completed,,,,,,,,,1.5,2.5,,,,,1.54,2.62,594.0,658.0,,,,,,,1.57,2.65,1.51,2.53,0,658.0,594.0,64.0,1.107744,626.0,6.489205,6.386879,0.102326,15.0,6.0,3.0,3.0
52359,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Mmoh M.,Gunneswaran P.,103.0,110.0,7.0,5.0,6.0,3.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.66,2.1,,,,,1.81,2.1,563.0,521.0,,,,,,,1.83,2.17,1.74,2.09,1,563.0,521.0,42.0,1.080614,542.0,6.33328,6.25575,0.07753,7.0,7.0,5.0,2.0
52360,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Gulbis E.,Sousa P.,95.0,104.0,6.0,4.0,4.0,6.0,6.0,4.0,,,,,2.0,1.0,Completed,,,,,,,,,1.33,3.25,,,,,1.36,3.39,612.0,550.0,,,,,,,1.4,3.5,1.35,3.19,1,612.0,550.0,62.0,1.112727,581.0,6.416732,6.309918,0.106814,9.0,6.0,4.0,2.0


In [8]:
# def preprocess_dataset(df):
#     #df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
    
#     df_selected = df[["Tournament", "Series", "Court", "Surface", "Round", "Best of", 
#                   "WRank", "LRank", 'Wsets', 'Lsets', 
#                   "B365W", "B365L", "EXW", "EXL", "LBW", "LBL", "PSW", "PSL", 
#                   "higher_rank_points", "lower_rank_points","GameDifference"]]

#     return df_selected.info()

# preprocess_dataset(betting_data)

In [32]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def preprocess_dataset(df):
    #df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
    
    df_selected = df[["Round", "Best of", "Court",
                  "WRank", "LRank", 'Wsets', 'Lsets', 
                  "B365W", "B365L", "PSW", "PSL", 
                  "higher_rank_points", "lower_rank_points","GameDifference", "higher_rank_won"]]
    
    betting_columns = ["B365W", "B365L", "PSW", "PSL"]
    
    # Ensure all columns are numeric and convert if necessary
    for col in betting_columns:
        if not pd.api.types.is_numeric_dtype(betting_data[col]):
            print(f"Converting column {col} to numeric.\n")
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Calculate the mean of the available betting odds for each column
    mean_betting_odds = df[betting_columns].mean()
    
    df_selected.info()
    print(mean_betting_odds)
    print()
    # Impute the missing values with the mean using .loc
    for col in betting_columns:
        df.loc[df[col].isnull(), col] = mean_betting_odds[col]

    df_selected['Wsets'] = df_selected['Wsets'].fillna(0)   
    df_selected['Lsets'] = df_selected['Lsets'].fillna(0)
    df_selected['higher_rank_points'] = df_selected['higher_rank_points'].fillna(0)   
    df_selected['lower_rank_points'] = df_selected['lower_rank_points'].fillna(0)

    df_selected = df_selected.astype({'Round': 'category', 'Court': 'category', 'Best of': 'category'})
    
    df_selected = df_selected.dropna()
    print()
    print(df_selected)
#     categorical_features = ['Best of', 'Round', 'Court', 'Surface', 'Series', 'Tournament']
    categorical_features = ['Best of', 'Round', 'Court']

    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # Ensure output is a dense array
    
    encoded_features = encoder.fit_transform(df_selected[categorical_features])
    
    df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))
    
    df_selected.reset_index(drop=True, inplace=True)
    
    df_final = pd.concat([df_selected.drop(columns=categorical_features), df_encoded], axis=1)

    return df_final


In [33]:
# import numpy as np
# from sklearn.preprocessing import OneHotEncoder

# def preprocess_dataset(df):
#     #df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
    
#     df_selected = df[["Tournament", "Series", "Court", "Surface", "Round", "Best of", 
#                   "WRank", "LRank", 'Wsets', 'Lsets', 
#                   "B365W", "B365L", "EXW", "EXL", "LBW", "LBL", "PSW", "PSL", 
#                   "higher_rank_points", "lower_rank_points","GameDifference"]]
    
#     betting_columns = ["B365W", "B365L", "CBW", "CBL", "EXW", "EXL", "LBW", 
#                        "LBL","PSW", "PSL", "SBW", "SBL"]
    
#     # Ensure all columns are numeric and convert if necessary
#     for col in betting_columns:
#         if not pd.api.types.is_numeric_dtype(betting_data[col]):
#             print(f"Converting column {col} to numeric.\n")
#             df[col] = pd.to_numeric(df[col], errors='coerce')

#     # Calculate the mean of the available betting odds for each column
#     mean_betting_odds = df[betting_columns].mean()
    
#     df_selected.info()
#     print(mean_betting_odds)
#     print()
#     # Impute the missing values with the mean using .loc
#     for col in betting_columns:
#         df.loc[df[col].isnull(), col] = mean_betting_odds[col]

#     df_selected['Wsets'] = df_selected['Wsets'].fillna(0)   
#     df_selected['Lsets'] = df_selected['Lsets'].fillna(0)
#     df_selected['higher_rank_points'] = df_selected['higher_rank_points'].fillna(0)   
#     df_selected['lower_rank_points'] = df_selected['lower_rank_points'].fillna(0)

#     df_selected = df_selected.astype({'Tournament': 'category', 'Best of': 'category', 'Round': 'category', 
#                                       'Surface': 'category', 'Court': 'category', 'Series': 'category'})
    
#     df_selected = df_selected.dropna()
#     print()
# #     print(df_selected)
#     categorical_features = ['Best of', 'Round', 'Court', 'Surface', 'Series', 'Tournament']
#     encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # Ensure output is a dense array
    
#     encoded_features = encoder.fit_transform(df_selected[categorical_features])
    
#     df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))
    
#     df_selected.reset_index(drop=True, inplace=True)
    
#     df_final = pd.concat([df_selected.drop(columns=categorical_features), df_encoded], axis=1)

#     return df_final


In [35]:
betting_data_train_preprocessed = preprocess_dataset(betting_data_train)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52315 entries, 0 to 52360
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Round               52315 non-null  object 
 1   Best of             52315 non-null  int64  
 2   Court               52315 non-null  object 
 3   WRank               52315 non-null  float64
 4   LRank               52315 non-null  float64
 5   Wsets               52091 non-null  float64
 6   Lsets               52090 non-null  float64
 7   B365W               52315 non-null  float64
 8   B365L               52315 non-null  float64
 9   PSW                 52315 non-null  float64
 10  PSL                 52315 non-null  float64
 11  higher_rank_points  36040 non-null  float64
 12  lower_rank_points   36040 non-null  float64
 13  GameDifference      52315 non-null  float64
 14  higher_rank_won     52315 non-null  int32  
dtypes: float64(11), int32(1), int64(1), object(2)
memory 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Wsets'] = df_selected['Wsets'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Lsets'] = df_selected['Lsets'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['higher_rank_points'] = df_selected['higher_rank_points'].fillna(0)
A value is trying


           Round Best of    Court     WRank  LRank  Wsets  Lsets     B365W  \
0      1st Round       3  Outdoor      63.0   77.0    2.0    0.0  1.823723   
1      1st Round       3  Outdoor       5.0   56.0    2.0    0.0  1.823723   
2      1st Round       3  Outdoor      40.0  655.0    2.0    1.0  1.823723   
3      1st Round       3  Outdoor      65.0   87.0    2.0    0.0  1.823723   
4      1st Round       3  Outdoor      81.0  198.0    2.0    1.0  1.823723   
...          ...     ...      ...       ...    ...    ...    ...       ...   
52356  1st Round       3  Outdoor  100000.0   73.0    2.0    0.0  2.370000   
52357  1st Round       3  Outdoor      81.0   98.0    2.0    0.0  2.000000   
52358  1st Round       3  Outdoor      97.0   82.0    2.0    1.0  1.500000   
52359  1st Round       3  Outdoor     103.0  110.0    2.0    0.0  1.660000   
52360  1st Round       3  Outdoor      95.0  104.0    2.0    1.0  1.330000   

          B365L       PSW      PSL  higher_rank_points  lower_

In [34]:
betting_data_validation_preprocessed = preprocess_dataset(betting_data_validation)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2593 entries, 52303 to 54907
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Round               2593 non-null   object 
 1   Best of             2593 non-null   int64  
 2   Court               2593 non-null   object 
 3   WRank               2593 non-null   float64
 4   LRank               2593 non-null   float64
 5   Wsets               2572 non-null   float64
 6   Lsets               2572 non-null   float64
 7   B365W               2593 non-null   float64
 8   B365L               2593 non-null   float64
 9   PSW                 2593 non-null   float64
 10  PSL                 2593 non-null   float64
 11  higher_rank_points  2578 non-null   float64
 12  lower_rank_points   2578 non-null   float64
 13  GameDifference      2593 non-null   float64
 14  higher_rank_won     2593 non-null   int32  
dtypes: float64(11), int32(1), int64(1), object(2)
memo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Wsets'] = df_selected['Wsets'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Lsets'] = df_selected['Lsets'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['higher_rank_points'] = df_selected['higher_rank_points'].fillna(0)
A value is trying

In [22]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import OneHotEncoder

# def preprocess_dataset(df):
#     #df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
    
#     df_selected = df[["Tournament", "Series", "Court", "Surface", "Round", "Best of", 
#                       "WRank", "LRank", 'Wsets', 'Lsets', 
#                       "B365W", "B365L", "EXW", "EXL", "LBW", "LBL", "PSW", "PSL", 
#                       "higher_rank_points", "lower_rank_points", "GameDifference"]]

#     # Display the number of missing values in the betting odds columns
#     betting_columns = ["B365W", "B365L", "CBW", "CBL", "EXW", "EXL", "LBW", 
#                        "LBL", "PSW", "PSL", "SBW", "SBL"]
    
#     # Filter existing betting columns
#     existing_betting_columns = [col for col in betting_columns if col in df.columns]
    
#     missing_values_count = df[existing_betting_columns].isnull().sum()
#     print(f'Missing values in betting columns:\n{missing_values_count}\n')
    
#     # Calculate the mean of the available betting odds for each column
#     mean_betting_odds = df[existing_betting_columns].mean(numeric_only=True)
#     print(f'Mean of available betting odds:\n{mean_betting_odds}\n')
    
#     # Impute the missing values with the mean using .loc
#     for col in existing_betting_columns:
#         df.loc[df[col].isnull(), col] = mean_betting_odds[col]

#     # Ensure the columns are in df_selected before filling NAs
#     if 'Wsets' in df_selected.columns:
#         df_selected['Wsets'] = df_selected['Wsets'].fillna(0)   
#     if 'Lsets' in df_selected.columns:
#         df_selected['Lsets'] = df_selected['Lsets'].fillna(0)
#     if 'higher_rank_points' in df_selected.columns:
#         df_selected['higher_rank_points'] = df_selected['higher_rank_points'].fillna(0)   
#     if 'lower_rank_points' in df_selected.columns:
#         df_selected['lower_rank_points'] = df_selected['lower_rank_points'].fillna(0)

#     df_selected = df_selected.astype({'Tournament': 'category', 'Best of': 'category', 'Round': 'category', 
#                                       'Surface': 'category', 'Court': 'category', 'Series': 'category'})
    
#     df_selected = df_selected.dropna()

#     categorical_features = ['Best of', 'Round', 'Court', 'Surface', 'Series', 'Tournament']
#     encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # Ensure output is a dense array
    
#     # Filter existing categorical features
#     existing_categorical_features = [col for col in categorical_features if col in df_selected.columns]
    
#     encoded_features = encoder.fit_transform(df_selected[existing_categorical_features])
    
#     df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(existing_categorical_features))
    
#     df_selected.reset_index(drop=True, inplace=True)
#     df_encoded.reset_index(drop=True, inplace=True)
    
#     df_final = pd.concat([df_selected.drop(columns=existing_categorical_features), df_encoded], axis=1)

#     return df_final

# # Example usage
# # betting_data_train_preprocessed = preprocess_dataset(betting_data_train)

# # Example usage
# betting_data_train_preprocessed = preprocess_dataset(betting_data_train)


## Model Implementation:

In [38]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

# Ensure the dataset does not include non-numeric columns, especially dates or identifiers
feature_columns = [col for col in betting_data_train_preprocessed.columns 
                   if col not in ('higher_rank_won', 'higher_rank_points', 'Wsets', 'WRank')]  

# Preparing the data
X_train = betting_data_train_preprocessed[feature_columns]
y_train = betting_data_train_preprocessed['higher_rank_won']
X_validation = betting_data_validation_preprocessed[feature_columns]
y_validation = betting_data_validation_preprocessed['higher_rank_won']

# Initialize and fit the logistic regression model
model = LogisticRegression(solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_validation)
prediction_probs = model.predict_proba(X_validation)[:, 1]

# Evaluate the model
accuracy_result = accuracy_score(y_validation, predictions)
log_loss_result = log_loss(y_validation, prediction_probs)

# Output the results
print("Multiple Logistic Regression Model Results:")
print(f"Accuracy: {accuracy_result:.4f}")
print(f"Log Loss: {log_loss_result:.4f}")


Multiple Logistic Regression Model Results:
Accuracy: 0.8299
Log Loss: 0.3820
