### Import Modules

In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Read the Merged Tennis Dataset

In [None]:
dataframe = pd.read_csv('../merged_tennis_files/merged_file.csv')

### Output Tennis Dataset to check its successfully been read

In [3]:
dataframe

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2019-M020,Brisbane,Hard,32.0,A,20181231,300,105453,2.0,,...,54.0,34.0,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0
1,2019-M020,Brisbane,Hard,32.0,A,20181231,299,106421,4.0,,...,52.0,36.0,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0
2,2019-M020,Brisbane,Hard,32.0,A,20181231,298,105453,2.0,,...,27.0,15.0,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0
3,2019-M020,Brisbane,Hard,32.0,A,20181231,297,104542,,PR,...,60.0,38.0,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0
4,2019-M020,Brisbane,Hard,32.0,A,20181231,296,106421,4.0,,...,56.0,46.0,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194991,2014-605,Tour Finals,Hard,8.0,F,20141109,514,104925,1.0,,...,38.0,29.0,5.0,11.0,3.0,9.0,1.0,10010.0,5.0,4625.0
194992,2014-605,Tour Finals,Hard,8.0,F,20141109,515,104925,1.0,,...,,,,,,,1.0,10010.0,2.0,8700.0
194993,2014-D015,Davis Cup WG F: FRA vs SUI,Clay,4.0,D,20141121,1,104527,,,...,,,,,,,4.0,5295.0,12.0,2740.0
194994,2014-D015,Davis Cup WG F: FRA vs SUI,Clay,4.0,D,20141121,2,104792,,,...,,,,,,,19.0,1825.0,2.0,9700.0


### Label Map String Values

#### We do this because NN's only understands numbers

In [4]:
surface_map = {
    "Hard": 0,
    "Clay": 1,
    "Grass": 2
}

# map values listed above
dataframe["surface"] = dataframe["surface"].map(surface_map)

### Sort Dataframe data by date

In [5]:
dataframe = dataframe.sort_values('tourney_date').reset_index(drop=True)

### Init an Elo System

** An Elo rating system helps a neural network predict tennis matches by providing a compact, history-aware measure of player strength before each match. Because Elo updates only using past results, it summarizes thousands of previous matches into a single number that reflects form and relative skill. Feeding pre-match Elo ratings (or their difference) into a neural network gives it a strong baseline signal about which player is more likely to win, reducing the amount of raw match history the model must learn from scratch and improving both convergence speed and predictive accuracy.
**

In [6]:
default_elo = 1500
player_elos = {}

def get_elo(player):
    return player_elos.get(player, default_elo)

def update_elo(winner, loser, k=32):
    winner_elo = get_elo(winner)
    loser_elo = get_elo(loser)

    expected_winner = 1 / (1+10**((loser_elo-winner_elo)/400))
    expected_loser = 1 - expected_winner

    new_winner_elo = winner_elo + k * (1-expected_winner)
    new_loser_elo = winner_elo + k * (1-expected_loser)

    player_elos[winner] = new_winner_elo
    player_elos[loser] = new_loser_elo

def assign_elo(row, player_col):
    p = row[player_col]
    w = row['winner_name']

    # If any is Series, convert to string by joining unique values (unlikely, but safe)
    if isinstance(p, pd.Series):
        p = ','.join(p.unique().astype(str))
    else:
        p = str(p)

    if isinstance(w, pd.Series):
        w = ','.join(w.unique().astype(str))
    else:
        w = str(w)

    if p == w:
        return row['winner_elo_before']
    else:
        return row['loser_elo_before']

### Add Elo Columns before each match

In [7]:
# Define 2 new columns in dataframes
dataframe['winner_elo_before'] = 0.0
dataframe['loser_elo_before'] = 0.0

# itterate through the dataframe
for index, row in dataframe.iterrows():

    # Define winners and losers so we can pull their elo
    winner = row['winner_name']
    loser = row['loser_name']

    # On a player's first appearance, this returns the default Elo (1500);
    # otherwise it returns their current Elo based on previous matches
    dataframe.at[index, 'winner_elo_before'] = get_elo(winner)
    dataframe.at[index, 'loser_elo_before'] = get_elo(loser)

    update_elo(winner, loser)

### Randomize player_1 and player_2 so NN doesn't fixate on one col

In [8]:
def randomize_players(row):
    if np.random.rand() > 0.5:
        return pd.Series({
            'player_1': row['winner_name'],
            'player_2': row['loser_name'],
            'player_1_wins': 1
        })
    else:
        return pd.Series({
            'player_1': row['loser_name'],
            'player_2': row['winner_name'],
            'player_1_wins': 0
        })

dataframe_randomized = dataframe.apply(randomize_players, axis=1)
dataframe = pd.concat([dataframe, dataframe_randomized], axis=1)

dataframe['player_1_elo_before'] = dataframe.apply(lambda row: assign_elo(row, 'player_1'), axis=1)
dataframe['player_2_elo_before'] = dataframe.apply(lambda row: assign_elo(row, 'player_2'), axis=1)

### Prepare final dataframe for modeling

In [9]:
model_dataframe = dataframe[['player_1', 'player_2', 'player_1_elo_before', 'player_2_elo_before', 'player_1_wins']].copy()
model_dataframe[:3] # Display the first 3 rows

Unnamed: 0,player_1,player_2,player_1_elo_before,player_2_elo_before,player_1_wins
0,Jaidip Mukerjea,Alex Metreveli,1500.0,1500.0,0
1,Premjit Lall,Ion Tiriac,1500.0,1500.0,0
2,Alex Metreveli,Ion Tiriac,1516.0,1516.0,1


In [10]:
player_labelencoder = LabelEncoder()

# Store all unique players
all_players = pd.concat([model_dataframe['player_1'], model_dataframe['player_2']]).unique()

# Fit all players with the label encoder so the NN understands it
player_labelencoder.fit(all_players)

all_players

array(['Jaidip Mukerjea', 'Premjit Lall', 'Alex Metreveli', ...,
       'Yousaf Khalil', 'Demetris Azoides', 'Siddharth Vishwakarma'],
      shape=(7572,), dtype=object)

In [11]:
# Transform both columns with the same encoder
model_dataframe['player_1_id'] = player_labelencoder.transform(model_dataframe['player_1'])
model_dataframe['player_2_id'] = player_labelencoder.transform(model_dataframe['player_2'])

In [12]:
model_dataframe['player_1_id'][:3]

0    3190
1    5815
2     278
Name: player_1_id, dtype: int64

In [13]:
model_dataframe['player_2_id'][:3]

0     278
1    3033
2    3033
Name: player_2_id, dtype: int64

In [14]:
X = model_dataframe[["player_1_elo_before", "player_2_elo_before", "player_1_id", "player_2_id"]]
y = model_dataframe[["player_1_wins"]]

In [15]:
X

Unnamed: 0,player_1_elo_before,player_2_elo_before,player_1_id,player_2_id
0,1500.000000,1500.000000,3190,278
1,1500.000000,1500.000000,5815,3033
2,1516.000000,1516.000000,278,3033
3,1500.000000,1532.000000,7553,3033
4,1516.000000,1500.000000,5815,7293
...,...,...,...,...
194991,28128.892161,28268.075429,4261,632
194992,28134.236003,28138.804743,3507,632
194993,28150.979579,28280.822379,4261,3205
194994,28150.446389,28172.695406,3507,4261


In [16]:
y

Unnamed: 0,player_1_wins
0,0
1,0
2,1
3,0
4,1
...,...
194991,1
194992,1
194993,1
194994,1


### Split our Testing and Training Data

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Replace NaNs in NumPy first (Data Validation)
X_train = np.nan_to_num(X_train, nan=np.nanmean(X_train, axis=0))
X_test = np.nan_to_num(X_test, nan=np.nanmean(X_test, axis=0))
y_train = np.nan_to_num(y_train, nan=np.nanmean(y_train))
y_test = np.nan_to_num(y_test, nan=np.nanmean(y_test))

In [19]:
# Intialise our tensors (training and testing)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

### Build a Dataset to feed to the Model

In [20]:
class TennisDataset(Dataset):
    def __init__(self, dataframe, player_labelencoder):
        self.dataframe = dataframe
        self.player_labelencoder = player_labelencoder

        self.player_1_ids = torch.tensor(player_labelencoder.transform(dataframe['player_1']), dtype=torch.long)
        self.player_2_ids = torch.tensor(player_labelencoder.transform(dataframe['player_2']), dtype=torch.long)

        self.player_1_elo = torch.tensor(dataframe['player_1_elo_before'].values, dtype=torch.float32)
        self.player_2_elo = torch.tensor(dataframe['player_2_elo_before'].values, dtype=torch.float32)

        # Label: 1 if player 1 wins, else 0
        self.labels = torch.tensor(dataframe['player_1_wins'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        return (self.player_1_ids[index], self.player_2_ids[index], self.player_1_elo[index], self.player_2_elo[index], self.labels[index])

### Build the model itself

In [21]:
class TennisModel(nn.Module):
    def __init__(self, num_players, embedding_dim=10):
        super(TennisModel, self).__init__()

        # Embdeding for player IDs
        self.player_embedding = nn.Embedding(num_players, embedding_dim)
        
        # Fully connected layers
        # Input size: 2 embeddings * embedding_dim + 2 elo ratings = 2 * embedding_dim + 2
        self.fc1 = nn.Linear(2 * embedding_dim + 2, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1) # output a single logit for binary classification

    def forward(self, player_1_id, player_2_id, player_1_elo, player_2_elo):

        # Embed players
        emb1 = self.player_embedding(player_1_id)
        emb2 = self.player_embedding(player_2_id)

        # Concatenate embeddings and numeric features
        x = torch.cat([emb1, emb2, player_1_elo.unsqueeze(1), player_2_elo.unsqueeze(1)], dim=1)

        # Feed forward
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = self.fc3(x) # output raw score logit

        return x.squeeze(1) # return shape


### Initialise Variables used during Training

In [22]:
### Hyper Params
batch_size = 64
epochs = 30
lr = 1e-3 # learning rate

#### Split Data

In [23]:
split_idx = int(len(dataframe) * 0.8)

train_df = dataframe.iloc[:split_idx]
val_df   = dataframe.iloc[split_idx:]

In [24]:
train_dataset = TennisDataset(train_df, player_labelencoder)
val_dataset   = TennisDataset(val_df, player_labelencoder)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False
)

In [25]:
### Init Model
num_players = len(player_labelencoder.classes_)
model = TennisModel(num_players=num_players)

In [26]:
### Loss, Optimizer, Scheduler
criterion = nn.BCEWithLogitsLoss() # sigmoid + bce
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)

# Note: gamma is what we decay the step size by (decay=lower)

### Conduct the Training/Evaluation

In [27]:
for epoch in range(epochs):

    ##################
    # Training phase #
    ##################
    model.train()
    total_loss = 0

    for batch in train_loader:
        p1_ids, p2_ids, p1_elo, p2_elo, labels = batch

        optimizer.zero_grad()

        outputs = model(p1_ids, p2_ids, p1_elo, p2_elo).squeeze()
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)

    scheduler.step()

    train_loss = total_loss / len(train_dataset)

    ###################
    # Evaluation phase #
    ###################
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            p1_ids, p2_ids, p1_elo, p2_elo, labels = batch

            outputs = model(p1_ids, p2_ids, p1_elo, p2_elo).squeeze()
            preds = (torch.sigmoid(outputs) > 0.5).float()

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    val_accuracy = correct / total * 100

    print(
        f"Epoch {epoch+1}/{epochs} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Accuracy: {val_accuracy:.2f}%"
    )

Epoch 1/30 | Train Loss: 5.2098 | Val Accuracy: 50.83%
Epoch 2/30 | Train Loss: 1.8875 | Val Accuracy: 55.66%
Epoch 3/30 | Train Loss: 1.0880 | Val Accuracy: 49.95%
Epoch 4/30 | Train Loss: 0.8341 | Val Accuracy: 52.61%
Epoch 5/30 | Train Loss: 0.6153 | Val Accuracy: 58.91%
Epoch 6/30 | Train Loss: 0.6204 | Val Accuracy: 57.90%
Epoch 7/30 | Train Loss: 0.6195 | Val Accuracy: 60.60%
Epoch 8/30 | Train Loss: 0.6209 | Val Accuracy: 60.14%
Epoch 9/30 | Train Loss: 0.5985 | Val Accuracy: 60.94%
Epoch 10/30 | Train Loss: 0.5984 | Val Accuracy: 60.76%
Epoch 11/30 | Train Loss: 0.5979 | Val Accuracy: 60.77%
Epoch 12/30 | Train Loss: 0.5978 | Val Accuracy: 60.48%
Epoch 13/30 | Train Loss: 0.5951 | Val Accuracy: 60.77%
Epoch 14/30 | Train Loss: 0.5951 | Val Accuracy: 60.86%
Epoch 15/30 | Train Loss: 0.5951 | Val Accuracy: 60.82%
Epoch 16/30 | Train Loss: 0.5951 | Val Accuracy: 60.75%
Epoch 17/30 | Train Loss: 0.5948 | Val Accuracy: 60.92%
Epoch 18/30 | Train Loss: 0.5948 | Val Accuracy: 60.89%
E