In [None]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

DataPath = "../Data/"

In [2]:
def saveModel(base_name, model):
    directory = "../Models/"
    version_numbers = []

    for filename in os.listdir(directory):
        for char in filename:
            if char.isdigit():
                version_numbers.append(int(char))
                break

    version = max(version_numbers) + 1
    torch.save(model.state_dict(), f'{directory}{base_name}_model{version}.pth')

## Data Prep


- Load in data
- Remove redundant column ['date_y']
- rename ['date_x'] as ['date']

In [None]:
df = pd.read_csv(DataPath + "dataA.csv")
df = df.drop(columns=['date_y']).rename(columns={'date_x': 'date'})

# Get all the teams and construct a unique id's
unique_teams = set(df['home_team']).union(set(df['away_team']))

# map each team to a unique integer
team_to_id = {team: idx for idx, team in enumerate(unique_teams)}

# create new columns of integer id's
df['home_team_id'] = df['home_team'].map(team_to_id)
df['away_team_id'] = df['away_team'].map(team_to_id)
df['favorite_id'] = df['favorite'].map(team_to_id)

# create a new column to hold 1's and 0's.
# 1: Draw
# 0: Not a Draw

df['is_draw'] = (df['result'] == 1).astype(int)


df.head()

Unnamed: 0,match_id,date,home_team,away_team,result,home_win_odds,draw_odds,away_win_odds,favorite,home_team_id,away_team_id,favorite_id,is_draw
0,EPL2425_001,2024-08-12,Everton,Liverpool,1,4.6,4.93,1.88,Liverpool,14,7,7,1
1,EPL2425_002,2024-08-11,West Ham,Newcastle,2,3.09,5.36,2.26,Newcastle,19,15,15,0
2,EPL2425_003,2024-08-11,Chelsea,Brighton,2,2.19,5.9,3.06,Brighton,16,18,18,0
3,EPL2425_004,2024-08-11,Burnley,Arsenal,0,3.3,5.06,2.21,Arsenal,4,9,9,0
4,EPL2425_005,2024-08-10,Brentford,Fulham,2,4.18,3.36,2.41,Fulham,5,6,6,0


Percentage of data that are draws.

In [None]:
# number of draws
draw_count = df['is_draw'].value_counts()[1]

# divide the number of draws, by the total number of matches
draws = draw_count/len(df['is_draw'])
df_numerical = df[["home_team_id", "away_team_id", "home_win_odds","draw_odds", "away_win_odds", "favorite_id", "is_draw"]]


# # disables warning text about using .iloc
# with pd.option_context("mode.chained_assignment", None):
#     df_numerical['match_id'] = df_numerical['match_id'].str.replace('EPL', '').str.replace('_', '')


# remove target to train model
X = df_numerical.drop(columns=["is_draw",])

# targets
y = df_numerical["is_draw"] # 1:draw, 0:decisive

# Splitting into train and test sets
# stratify argument ensures that the ratio of draws to decisive matches is the same in both the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Resampling should be done before converting into tensors, because this would lead to unnecessary computation.
# This resampling method does not work on Tensors and ensure this is done before scaling data.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

train_draw_count = y_train_res.value_counts()[1]
test_draw_count = y_test.value_counts()[1]

training_total_draws = y_train.shape[0]
testing_total_draws = y_test.shape[0]

training_draw_percentage = round( (train_draw_count / training_total_draws) * 100 )
testing_draw_percentage = round( (test_draw_count / testing_total_draws) * 100 )


training_output = f"""TRAINNING:\n\t Features shape: {X_train_res.shape} \t Test shape : {y_train_res.shape} \t Draw-Count: { y_train_res.value_counts()[1]} \t Draw-Percentage: {training_draw_percentage}% \n"""
testing_output = f"""TESTING:\n\t Features shape: {X_test.shape} \t Test shape: {y_test.shape} \t Draw-Count: {y_test.value_counts()[1]} \t  Draw-Percentage: {testing_draw_percentage}% \n"""

print(training_output, testing_output)

Draw-Percentage: 26%
TRAINNING:
	 Features shape: (448, 6) 	 Test shape : (448,) 	 Draw-Count: 224 	 Draw-Percentage: 74% 
 TESTING:
	 Features shape: (76, 6) 	 Test shape: (76,) 	 Draw-Count: 20 	  Draw-Percentage: 26% 



`StandardScaler.fit_transform(X)` : normalizes the dataset by standardizing its features. In the following two steps

1. `StandardScaler.fit(X)`: Learns the mean and standard deviation of X. 
2. `StandardScaler.transform(X)`: Uses mean and standard deviation of X; to scale X to have a mean of 0 and a standard deviation of 1.

$$ X_{\text{scaled}} = \frac{X - \mu}{\sigma}$$

$\mu$ := mean of the feature

$\sigma$ := standard deviation of the feature

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


# what does .view(-1,1) do?
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_res.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

batch_size = 6
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

[[-1.26031025e+00 -7.51951396e-01  8.09628755e-01 -1.25958404e+00
  -2.61573373e-01 -7.33540502e-01]
 [ 5.27024323e-01  6.24449933e-01 -6.57607424e-01  1.47646020e+00
  -4.08646841e-01  6.42183809e-01]
 [ 1.59942506e+00 -9.24001562e-01  4.71035791e-01 -1.06149939e+00
  -1.79865891e-01 -9.05506041e-01]
 [ 3.48290866e-01  1.65675093e+00 -8.34965644e-01 -1.70118459e-01
   7.84282400e-01  1.67397704e+00]
 [-1.61777716e+00 -4.07851063e-01 -1.52827505e+00  7.70783634e-01
   2.14062660e+00 -1.59336820e+00]
 [ 1.77815852e+00  1.14060043e+00  9.86986975e-01  1.57550253e+00
  -1.51986860e+00  1.15808043e+00]
 [-1.08157679e+00  4.52399767e-01 -8.67212593e-01  1.39388809e-01
   6.04525939e-01  4.70218270e-01]
 [ 7.05757779e-01  1.14060043e+00  1.96936724e-01 -4.63155518e-02
  -6.37427791e-01  1.15808043e+00]
 [-5.45376419e-01 -1.09605173e+00 -6.10388677e-02 -1.04911910e+00
   3.75744989e-01 -1.07747158e+00]
 [-3.66642962e-01  1.31265060e+00 -1.25532766e-01 -5.04386308e-01
  -4.91339190e-02  1.3300

# Neural Network


In [None]:
class DrawPredictionNN(nn.Module):
    def __init__(self, input_size):
        super(DrawPredictionNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            #nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            #nn.Dropout(0.3),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.model(x)

# Initialize model, loss function, and optimizer
input_size = X_train.shape[1]
model = DrawPredictionNN(input_size)
criterion = nn.BCELoss()  # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 2000
for epoch in range(num_epochs):
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Predictions
with torch.no_grad():
    model.eval()
    y_pred_prob = model(X_test_tensor)
    y_pred = (y_pred_prob > 0.5).float()

# Evaluating the model
model.eval()
accuracy = accuracy_score(y_test_tensor, y_pred)
classification_rep = classification_report(y_test_tensor, y_pred)

# We want HIGH F1-Scores
# The higher the F1-scores the better balance between precision and recall.
print("input_size: ", input_size)
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Epoch 5/20, Loss: 0.5972
Epoch 10/20, Loss: 0.7729
Epoch 15/20, Loss: 0.3282
Epoch 20/20, Loss: 0.3147
input_size:  6
Model Accuracy: 0.5921052631578947
Classification Report:
               precision    recall  f1-score   support

         0.0       0.74      0.70      0.72        56
         1.0       0.26      0.30      0.28        20

    accuracy                           0.59        76
   macro avg       0.50      0.50      0.50        76
weighted avg       0.61      0.59      0.60        76

