In [97]:
import numpy as np

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

import matplotlib.pyplot as plt

from datetime import datetime as dt

In [98]:
class Multiclass1(nn.Module):
    def __init__(self, input_size, output_size, dropout_rate):
        super().__init__()
        
        # inicialmente o ReLU para tudo
        self.act1 = nn.ELU()
        self.act2 = nn.Tanh()
        
        self.outputact = nn.Softmax(dim = 1)
        
        self.hidden1 = nn.Linear(input_size, 128)
        self.hidden2 = nn.Linear(128, 128)
        self.hidden3 = nn.Linear(128, 128)
        self.hidden4 = nn.Linear(128, 256)
        self.hidden5 = nn.Linear(256, 256)
        
        self.output = nn.Linear(256, output_size)
        
        self.dropout = nn.Dropout(dropout_rate)  # camada droupout é um tipo de regularização da rede neural
        
    def forward(self, x):
        x = self.hidden1(x)
        x = self.act1(x)
        x = self.dropout(x)  # Aplicar Dropout após a primeira camada oculta

        x = self.hidden2(x)
        x = self.act1(x)
        x = self.dropout(x)  

        x = self.hidden3(x)
        x = self.act2(x)
        x = self.dropout(x)

        x = self.hidden4(x)
        x = self.act2(x)
        x = self.dropout(x)

        x = self.hidden5(x)
        x = self.act2(x)
        x = self.dropout(x)
        
        x = self.output(x)
        x = self.outputact(x)
        return x


In [99]:
# preciso fazer o pré-processamento dos dados, OneHot e Normalização
loc = "./Premier-League/"

dataset = pd.read_csv(loc + "premier_league2001-2021.csv")

def parse_date(date):
    if date == '':
        return None
    else:
        if len(date) != 8:
            date_format = '%Y-%m-%d'
        else:
            date_format = '%y-%m-%d'
        return dt.strptime(date, date_format)

dataset["Date"] = dataset["Date"].apply(parse_date)# posso considerar a data ou não, acredito que vou manter apenas as season

dataset["Date"]

0      2000-08-19
1      2000-08-19
2      2000-08-19
3      2000-08-19
4      2000-08-19
          ...    
7975   2021-05-23
7976   2021-05-23
7977   2021-05-23
7978   2021-05-23
7979   2021-05-23
Name: Date, Length: 7980, dtype: datetime64[ns]

In [100]:
# Uma possibilidade a contornar isso pode ser unir os times em times "Bons", "Ruins" e "Medianos"
# Poderia ser com base nas taxas de vitórias, ou se eles aparecem muito nos campeonatos ou até mesmo pela media dos
# LP's

datasetParsed = dataset.drop(dataset.loc[(dataset["MW"] == 1)  | (dataset["MW"] == 2) | (dataset["MW"] == 3) | (dataset["Date"] < "2001-05-20")].index)
datasetParsed[["Date", "MW"]]

Unnamed: 0,Date,MW
410,2001-09-08,4.0
411,2001-09-08,4.0
412,2001-09-08,4.0
413,2001-09-08,4.0
414,2001-09-09,4.0
...,...,...
7975,2021-05-23,38.0
7976,2021-05-23,38.0
7977,2021-05-23,38.0
7978,2021-05-23,38.0


In [101]:
x_train = datasetParsed[["DiffLP", "DiffPts", "DiffFormPts", 
                         "HTWinStreak3", "ATWinStreak3","HTLossStreak3", "ATLossStreak3",
                         "HTGS", "ATGS", "HTGC", "ATGC", "HTFormPts", "ATFormPts",
                         "HTGD", "ATGD", "HomeTeamLP", "AwayTeamLP",
                         "HM1","HM2","HM3","HM4","HM5","AM1","AM2","AM3","AM4","AM5",
                         "HSA", "ASA", "HSTA", "ASTA"]]

columns_to_encode = ["HM1", "HM2", "HM3", "HM4", "HM5", "AM1", "AM2", "AM3", "AM4", "AM5"]

# Realiza o one-hot encoding nas colunas selecionadas
x_train_encoded = pd.get_dummies(x_train, columns=columns_to_encode)

print(x_train_encoded.columns)

x_train_encoded.info()

Index(['DiffLP', 'DiffPts', 'DiffFormPts', 'HTWinStreak3', 'ATWinStreak3',
       'HTLossStreak3', 'ATLossStreak3', 'HTGS', 'ATGS', 'HTGC', 'ATGC',
       'HTFormPts', 'ATFormPts', 'HTGD', 'ATGD', 'HomeTeamLP', 'AwayTeamLP',
       'HSA', 'ASA', 'HSTA', 'ASTA', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D',
       'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'HM4_D', 'HM4_L', 'HM4_M',
       'HM4_W', 'HM5_D', 'HM5_L', 'HM5_M', 'HM5_W', 'AM1_D', 'AM1_L', 'AM1_W',
       'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'AM4_D', 'AM4_L',
       'AM4_M', 'AM4_W', 'AM5_D', 'AM5_L', 'AM5_M', 'AM5_W'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 410 to 7979
Data columns (total 55 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   DiffLP         7000 non-null   float64
 1   DiffPts        7000 non-null   float64
 2   DiffFormPts    7000 non-null   float64
 3   HTWinStreak3   7000 non-null   int64  
 4   ATW

In [102]:
scaler = MinMaxScaler()
x_train_encoded = scaler.fit_transform(x_train_encoded)

y_train = np.array(datasetParsed["FTR"]).reshape(-1, 1)

encoder = OneHotEncoder()
print(np. unique(y_train, return_counts=True))
y_train = encoder.fit_transform(y_train).toarray()

input_size = x_train_encoded.shape[1]
output_size = y_train.shape[1]

x_train = torch.tensor(x_train_encoded).float()
y_train = torch.from_numpy(y_train).float()

print(input_size, output_size)
print(y_train)

(array(['A', 'D', 'H'], dtype=object), array([2028, 1740, 3232], dtype=int64))
55 3
tensor([[1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.]])


In [103]:
from sklearn.model_selection import train_test_split

# Dividir o conjunto de treinamento em treinamento e validação
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.4)

# Exibir o shape dos conjuntos de treinamento e validação
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_valid shape:", x_valid.shape)
print("y_valid shape:", y_valid.shape)

x_train shape: torch.Size([4200, 55])
y_train shape: torch.Size([4200, 3])
x_valid shape: torch.Size([2800, 55])
y_valid shape: torch.Size([2800, 3])


In [104]:
n_epochs = 200

learning_rates = [0.05] #0.01
weight_decay = 0.0005 

best_acc = - np.nan
models = []
final_loss = []

CROSS_losses_train = []
CROSS_losses_val = []

ACC_losses_train = []
ACC_losses_val = []

parameters = []
batch_size = 64

for momentum in momentums:
    for lr in learning_rates:
            parameters.append({"learning_rate": lr,
                               "momentum": momentum, 
                               "weight_decay": weight_decay})
                
            batches_per_epoch = len(x_train) // batch_size
            model = Multiclass1(input_size, output_size, 0.25)
                
            current_CROSS_train = []
            current_CROSS_val = []

            current_ACC_train = []
            current_ACC_val = []
                
            loss_fn = nn.CrossEntropyLoss()
            optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay = weight_decay)
            for epoch in range(n_epochs):
                epoch_loss = []
                epoch_acc = []
                model.train()
                for i in range(batches_per_epoch):
                    start = i * batch_size
                    X_batch = x_train[start:start+batch_size]
                    y_batch = y_train[start:start+batch_size]

                            # forward 
                    y_pred = model(X_batch)
                    loss = loss_fn(y_pred, y_batch)
                            # backward 
                    optimizer.zero_grad()
                    loss.backward()
                            # update weights
                    optimizer.step()
                    acc = (torch.argmax(y_pred, 1) == torch.argmax(y_batch, 1)).float().mean()
                    epoch_loss.append(float(loss))
                    epoch_acc.append(float(acc))
                    # set model in evaluation mode and run through the test set
                    
                model.eval()
                y_pred = model(x_valid)
                print(y_pred.mean(axis = 0))
                print(y_valid.mean(axis = 0))
                ce = loss_fn(y_pred, y_valid)
                acc = (torch.argmax(y_pred, 1) == torch.argmax(y_valid, 1)).float().mean()

                ce = float(ce)
                acc = float(acc)
                print(f"Epoch {epoch} train: Cross-entropy={sum(epoch_loss)/batches_per_epoch}, Accuracy={sum(epoch_acc)/batches_per_epoch}")
                    
                current_CROSS_train.append(np.mean(epoch_loss))
                current_ACC_train.append(np.mean(epoch_acc))
                    
                current_CROSS_val.append(ce)
                current_ACC_val.append(acc)
                    
                if acc > best_acc:
                    best_acc = acc
                    best_weights = copy.deepcopy(model.state_dict())
                print(f"Epoch {epoch} validation: Cross-entropy={ce}, Accuracy={acc}")
                    
                CROSS_losses_train.append(current_CROSS_train)
                CROSS_losses_val.append(current_CROSS_val)

                ACC_losses_train.append(current_ACC_train)
                ACC_losses_val.append(current_ACC_val)
                final_loss.append(ce)

tensor([0.2591, 0.2418, 0.4991], grad_fn=<MeanBackward1>)
tensor([0.2889, 0.2493, 0.4618])
Epoch 0 train: Cross-entropy=1.0841763734817504, Accuracy=0.45721153846153845
Epoch 0 validation: Cross-entropy=1.0726141929626465, Accuracy=0.4617857038974762
tensor([0.2092, 0.1772, 0.6136], grad_fn=<MeanBackward1>)
tensor([0.2889, 0.2493, 0.4618])
Epoch 1 train: Cross-entropy=1.0661052208680373, Accuracy=0.4622596153846154
Epoch 1 validation: Cross-entropy=1.0622631311416626, Accuracy=0.4617857038974762
tensor([0.1891, 0.1472, 0.6637], grad_fn=<MeanBackward1>)
tensor([0.2889, 0.2493, 0.4618])
Epoch 2 train: Cross-entropy=1.059959663794591, Accuracy=0.4622596153846154
Epoch 2 validation: Cross-entropy=1.0586891174316406, Accuracy=0.4617857038974762
tensor([0.1852, 0.1336, 0.6812], grad_fn=<MeanBackward1>)
tensor([0.2889, 0.2493, 0.4618])
Epoch 3 train: Cross-entropy=1.0568089824456435, Accuracy=0.4622596153846154
Epoch 3 validation: Cross-entropy=1.0558983087539673, Accuracy=0.4617857038974762


##### 

In [105]:
# Lutei bastante mas não consegui resolver o problema do Draw
# Não gostei de reduzir a dificuldade do problema, acho q o problema real está nas features
# Elas não trazem tanta informação quanto parece
# Vi um artigo desse mesmo problema que ele utiliza features criadas completamente diferentes, mas infelizmente
# Não tem código, o resultado dele era bem melhor
# Vamo ter que conseguir algo com as criação de outras features, talvez Kmeans e PCA