In [14]:
import numpy as np

import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt

from datetime import datetime as dt

In [15]:
# Lendo a base de dados

loc = "./Premier-League/"
dataset = pd.read_csv(loc + "premier_league2001-2021.csv")

In [16]:
# Tirando as primeiras 3 semanas rodadas de cada ano

datasetParsed = dataset.drop(dataset.loc[(dataset["MW"] == 1)  | (dataset["MW"] == 2) | (dataset["MW"] == 3)].index)
datasetParsed

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTGS,ATGS,HTGC,ATGC,...,ATWinStreak3,ATWinStreak5,ATLossStreak3,ATLossStreak5,HTGD,ATGD,DiffPts,DiffFormPts,DiffFormPts3,DiffLP
30,2000-09-05,Man United,Bradford,6,0,H,5,2,3,1,...,0,0,0,0,0.500000,0.250000,0.250000,1,1,-16.0
31,2000-09-05,Sunderland,West Ham,1,1,D,3,4,5,7,...,0,0,0,0,-0.500000,-0.750000,0.500000,2,2,-2.0
32,2000-09-05,Tottenham,Everton,3,2,H,4,5,4,4,...,0,0,0,0,0.000000,0.250000,0.000000,0,0,-3.0
33,2000-09-06,Charlton,Southampton,1,1,D,7,6,8,7,...,0,0,0,0,-0.250000,-0.250000,0.250000,1,1,3.0
34,2000-09-06,Chelsea,Arsenal,2,2,D,5,7,5,4,...,0,0,0,0,0.000000,0.750000,-0.500000,-2,-2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7975,2021-05-23,Liverpool,Crystal Palace,2,0,H,66,41,42,64,...,0,0,0,0,0.631579,-0.605263,0.578947,7,6,-13.0
7976,2021-05-23,Man City,Everton,5,0,H,78,47,32,43,...,0,0,0,0,1.210526,0.105263,0.631579,2,-1,-10.0
7977,2021-05-23,Sheffield United,Burnley,1,0,H,19,33,63,54,...,0,0,0,0,-1.157895,-0.552632,-0.500000,0,0,-1.0
7978,2021-05-23,West Ham,Southampton,3,0,H,59,47,47,65,...,0,0,0,0,0.315789,-0.473684,0.500000,0,-2,5.0


In [17]:
selectedColumns = ["Date","DiffLP", "DiffPts", "DiffFormPts3",  # Diferenças
                    "HTWinStreak3", "ATWinStreak3","HTLossStreak3", "ATLossStreak3", # Streaks
                    "HTGS", "ATGS", "HTGC", "ATGC", # Gols Feitos e Recebidos de cada um
                    "HTFormPts3", "ATFormPts3", # Pontos ganhos nas últimas semanas
                    "HTGD", "ATGD", "HomeTeamLP", "AwayTeamLP", # Diferença de Gol
                    "HM1","HM2","HM3","AM1","AM2","AM3", # Resultado dos últimos 3 jogos
                    "HSA", "ASA", "HSTA", "ASTA", "FTR"] # Chutes e Chutes a Gol


datasetSelected = datasetParsed[selectedColumns]
columns_to_encode = ["HM1", "HM2", "HM3", "AM1", "AM2", "AM3"] # Para fazer o OneHotEncoding


# Realiza o one-hot encoding nas colunas selecionadas
datasetSelected_encoded = pd.get_dummies(datasetSelected, columns=columns_to_encode)


print(datasetSelected_encoded.columns)


# x_train_encoded = x_train_encoded.drop(["HM4_M", "HM5_M", "AM4_M", "AM5_M"], axis = 1)


datasetSelected_encoded.info()

Index(['Date', 'DiffLP', 'DiffPts', 'DiffFormPts3', 'HTWinStreak3',
       'ATWinStreak3', 'HTLossStreak3', 'ATLossStreak3', 'HTGS', 'ATGS',
       'HTGC', 'ATGC', 'HTFormPts3', 'ATFormPts3', 'HTGD', 'ATGD',
       'HomeTeamLP', 'AwayTeamLP', 'HSA', 'ASA', 'HSTA', 'ASTA', 'FTR',
       'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L',
       'HM3_W', 'AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D',
       'AM3_L', 'AM3_W'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 7350 entries, 30 to 7979
Data columns (total 41 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           7350 non-null   object 
 1   DiffLP         7350 non-null   float64
 2   DiffPts        7350 non-null   float64
 3   DiffFormPts3   7350 non-null   int64  
 4   HTWinStreak3   7350 non-null   int64  
 5   ATWinStreak3   7350 non-null   int64  
 6   HTLossStreak3  7350 non-null   int64  
 7   ATLossStre

In [73]:
train = datasetSelected_encoded.loc[
                datasetSelected_encoded["Date"] < "2017-09-18"].sample(frac = 1, random_state = 42)

test = datasetSelected_encoded.loc[
                datasetSelected_encoded["Date"] >= "2017-09-18"].sample(frac = 1, random_state = 42)

y_train = np.array(train["FTR"]).reshape(-1, 1)
y_test = np.array(test["FTR"]).reshape(-1, 1)

x_train = pd.DataFrame(np.array(train.drop(["FTR", "Date"], axis = 1)))
x_test = np.array(test.drop(["FTR", "Date"], axis = 1))

In [74]:
# Normalização dos dados em MinMax e transformando o y_ em OneHot
scaler = MinMaxScaler()
x_train_encoded = scaler.fit_transform(x_train)
x_test_encoded = scaler.transform(x_train)

encoder = OneHotEncoder()
print(np.unique(y_train, return_counts=True))
y_train_encoded = encoder.fit_transform(y_train).toarray()
y_test_encoded = encoder.transform(y_test).toarray()

input_size = x_train_encoded.shape[1]
output_size = y_train_encoded.shape[1]

print(input_size, output_size)

(array(['A', 'D', 'H'], dtype=object), array([1667, 1516, 2787]))
39 3


In [75]:
x_train_encoded

array([[0.17647059, 0.68421947, 0.5       , ..., 0.        , 0.        ,
        1.        ],
       [0.79411765, 0.32446634, 0.38888889, ..., 1.        , 0.        ,
        0.        ],
       [0.52941176, 0.52134647, 0.11111111, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.52941176, 0.64477285, 0.66666667, ..., 1.        , 0.        ,
        0.        ],
       [0.64705882, 0.2955665 , 0.44444444, ..., 1.        , 0.        ,
        0.        ],
       [0.85294118, 0.36059113, 0.5       , ..., 1.        , 0.        ,
        0.        ]])

In [78]:
x_test_encoded

array([[0.17647059, 0.68421947, 0.5       , ..., 0.        , 0.        ,
        1.        ],
       [0.79411765, 0.32446634, 0.38888889, ..., 1.        , 0.        ,
        0.        ],
       [0.52941176, 0.52134647, 0.11111111, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.52941176, 0.64477285, 0.66666667, ..., 1.        , 0.        ,
        0.        ],
       [0.64705882, 0.2955665 , 0.44444444, ..., 1.        , 0.        ,
        0.        ],
       [0.85294118, 0.36059113, 0.5       , ..., 1.        , 0.        ,
        0.        ]])