In [1]:
import sys
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

# Make the project's helper module importable from this notebook.
# NOTE(review): this is an absolute, machine-specific path, so the import below
# only resolves on a machine with this exact directory — consider a path
# relative to the notebook or an installable package.
sys.path.append('/Users/nickocruz/Developer/EPL-market-inefficiency-forked/Nickos/src')
# NOTE(review): the star import hides where names come from. construct_dataset,
# prepare_dataset and getDraws (used in later cells) are presumably defined in
# Functions — confirm and import them explicitly.
from Functions import *

# Relative path handed to construct_dataset() in the next code cell.
DataPath = "../Data/Original/"

## Pre-Processing

- Taking original datasets:
  - Merging them on match_id with an inner join
    - inner join: keeps only the rows whose match_id is present in every dataset being merged

In [2]:
# Load the raw data through the project helper and feed the result straight
# into the preparation step; only the prepared frame is kept as `df`.
df = prepare_dataset(construct_dataset(DataPath))

# Bare last expression so the notebook renders the prepared frame.
df

Unnamed: 0,match_id,date,home_team,away_team,result,home_win_odds,draw_odds,away_win_odds,favorite,home_team_id,away_team_id,favorite_id,is_draw
0,EPL2425_001,2024-08-10,Arsenal,Newcastle,0,4.60,4.93,1.88,Newcastle,9,6,6,0
1,EPL2425_002,2024-08-10,Aston Villa,Brighton,0,3.09,5.36,2.26,Brighton,12,17,17,0
2,EPL2425_003,2024-08-10,Bournemouth,Wolves,2,2.19,5.90,3.06,Wolves,2,15,15,0
3,EPL2425_004,2024-08-10,Burnley,Man United,2,3.30,5.06,2.21,Man United,13,19,19,0
4,EPL2425_005,2024-08-10,Chelsea,Crystal Palace,0,4.18,3.36,2.41,Crystal Palace,3,11,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,EPL2425_366,2025-05-19,Newcastle,Man City,0,3.87,5.05,2.01,Man City,6,0,0,0
366,EPL2425_367,2025-05-19,Nottingham Forest,Sheffield United,1,2.12,4.75,3.70,Sheffield United,7,8,8,1
367,EPL2425_368,2025-05-19,Bournemouth,Wolves,2,2.27,5.86,2.93,Wolves,2,15,15,0
368,EPL2425_369,2025-05-19,Burnley,Brighton,0,2.91,4.72,2.52,Brighton,13,17,17,0


Draw Stats on entire dataset

In [3]:
# Count the draws once and reuse the value for both lines of the report.
# Summing a boolean mask (instead of indexing value_counts()[1]) also works
# when the frame happens to contain no draws at all.
draw_flags = df["is_draw"]
draw_count = int((draw_flags == 1).sum())

# Guard the ratio so an empty frame reports 0.0 instead of raising.
draw_percentage = draw_count / len(draw_flags) if len(draw_flags) else 0.0

# No quotes are nested inside the replacement fields: re-using the f-string's
# own quote character there is a SyntaxError on Python versions before 3.12.
print(f"draw_count: {draw_count}")
print(f"draw_percentage: {draw_percentage}")

draw_count: 117
draw_percentage: 0.3162162162162162


Constructing feature and target subsets to train NN on.

In [4]:
# Columns fed to the network: both team ids, the three odds columns and the
# id of the favourite.
feature_labels = [
    "home_team_id",
    "away_team_id",
    "home_win_odds",
    "draw_odds",
    "away_win_odds",
    "favorite_id",
]
features_df = df[feature_labels]
targets = df['is_draw']

# Hold out 20% of the matches for testing. Stratifying on the target keeps the
# draw / decisive-match ratio the same in both partitions, and the fixed seed
# makes the split repeatable.
training, testing, training_targets, testing_targets = train_test_split(
    features_df,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

# Report the draw share of each partition to confirm the stratification.
getDraws(training_targets, testing_targets)

TRAINING:
        	 Draw-Count: 94
        	 Draw-Percentage: 32% 

        	 Decisive-Match %: 68%
    
TESTING:
        	 Draw-Count: 23
        	 Draw-Percentage: 31% 

        	 Decisive-Match %: 69%
    


(32, 31)

## Imbalanced Dataset

- **Problem**
  - Dataset is imbalanced: only about 30% of the matches available for training and testing are draws.
  - Imbalance causes model to ignore the minority class, in this specific situation the minority class being matches that end in draws.
- **Potential Solutions**
  - The following can be applied to balance the importance of the minority class:
    1. RandomOverSampler
    2. SMOTE

### 1. RandomOverSampler
- **Pros:**
  - Balances class distribution by randomly duplicating samples from the minority class until all classes have the same number of samples.

- **Cons:**
  - Duplicating data can cause overfitting, especially on a small dataset.
  - Doesn't add new information; it only repeats existing samples.

### 2. SMOTE (Synthetic Minority Over-sampling Technique)
- **How it works:**
    1.	For each minority class sample, SMOTE finds its k nearest neighbors (usually k=5).
	2.	It randomly picks one of those neighbors.
	3.	It creates a new point along the line between the original point and the neighbor.

- **Pros:**
  - Generates synthetic data points for the minority class instead of duplicating existing ones (like RandomOverSampler does).
  - More variety than RandomOverSampler.
  - Reduces overfitting risk by not duplicating samples.

- **Cons:**
  - Can create unrealistic samples if data isn't well-behaved.


### Thoughts

- Don't use SMOTE.
  - **Why:** The dataset is not well-behaved, so the synthetic data points SMOTE generates would most likely be unrealistic.

## Applying RandomOverSampler
- Resampling should be done before converting the data into tensors; resampling afterwards would require extra conversions, which is unnecessary computation.
- RandomOverSampler does not work on tensors. Also ensure resampling is done before scaling the data.

In [None]:
def _ros_report(stage, features, labels):
    """Format the size / draw-share summary printed around the resampling step.

    `stage` is the heading prefix, `features` the feature frame whose shape is
    reported, and `labels` the matching target series (class 1 = draw).
    """
    draw_share = round( (labels.value_counts()[1] / labels.shape[0])*100 )
    return (
        f"\n{stage} ROS:\n\n"
        f"    \t X_train.size: {features.shape}\n\n"
        f"    \t draw(%): {draw_share}% \n\n"
    )


# Snapshot of the untouched training split.
output = _ros_report("BFORE", training, training_targets)
print(output)

# Oversample only the training split; the minority class is duplicated at
# random (seeded for repeatability) until both classes are equally frequent.
ros = RandomOverSampler(random_state=42)
X_train, X_targets = ros.fit_resample(training, training_targets)

# Same summary for the resampled data.
output = _ros_report("AFTER", X_train, X_targets)
print(output)


BFORE ROS:

    	 X_train.size: (296, 6)

    	 draw(%): 32% 



AFTER ROS:

    	 X_train.size: (404, 6)

    	 draw(%): 50% 




In [5]:
class DrawPredictionNN(nn.Module):
    """Feed-forward binary classifier scoring whether a match ends in a draw.

    Layout: input_size -> hidden1 -> hidden2 -> 16 -> 1. A ReLU follows each
    of the three hidden layers; the last layer is left linear, so the network
    returns one unbounded logit per sample.

    Train with a logit-based loss such as ``nn.BCEWithLogitsLoss`` and apply
    ``torch.sigmoid`` to the output when a probability is needed.

    Args:
        input_size: Number of input features per sample.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
    """

    def __init__(self, input_size, hidden1=64, hidden2=32):
        super(DrawPredictionNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 16)
        self.fc4 = nn.Linear(16,1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw draw logit for each row of ``x``.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1, holding unnormalised logits.
        """
        # Without a non-linearity between them, stacked Linear layers collapse
        # into a single affine map, so each hidden layer is followed by ReLU.
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        # No activation on the output: clamping the logit with ReLU would make
        # negative scores impossible and zero their gradients.
        return self.fc4(x)