In [58]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

## Dados da Partida

In [59]:
# Fixture to predict. The names are looked up in the dataset further down
# (team against the 'time' column, opponent against 'oponente'), so they must
# use spellings that exist in the data.
nomeTime = "Vizela"
nomeOponente = "Portimonense"
horaJogo = 15 # Kick-off hour, 24-hour format
dia = "dom" # Weekday abbreviation as stored in the 'dia' column: dom, seg, ter, qua, qui, sex, sáb
local = 1 # Home ("Em casa") = 0 / Away ("Visitante") = 1

## Carga dos Dados

In [60]:
# Load the match history; the first CSV column becomes the row index.
matches = pd.read_csv('portugal_matches.csv', index_col=0)

In [61]:
matches['time'].unique()

array(['Benfica', 'Sporting CP', 'Porto', 'Braga', 'Vitoria Guimaraes',
       'Moreirense', 'Arouca', 'Famalicao', 'Farense', 'Gil Vicente FC',
       'Casa Pia', 'Boavista', 'Estrela', 'Estoril', 'Rio Ave',
       'Portimonense', 'Chaves', 'Vizela', 'Maritimo',
       'Pacos de Ferreira', 'Santa Clara', 'Tondela', 'Belenenses SAD',
       'Nacional', 'Vitoria Setubal', 'Aves'], dtype=object)

In [62]:
matches.shape

(2862, 28)

In [63]:
matches.dtypes

data                     object
horário                  object
rodada                   object
dia                      object
local                    object
resultado                object
gp                      float64
gc                      float64
oponente                 object
xg                      float64
xga                     float64
posse                   float64
público                 float64
capitão                  object
formação                 object
árbitro                  object
relatório da partida     object
notas                   float64
tc                      float64
cag                     float64
dist                    float64
g/sh                    float64
fk                      float64
pb                        int64
pt                        int64
temporada                object
time                     object
camp                     object
dtype: object

In [64]:
# Parse the date strings into datetime64 so matches can be compared and sorted chronologically.
matches['data'] = pd.to_datetime(matches['data'])

In [65]:
matches.dtypes

data                    datetime64[ns]
horário                         object
rodada                          object
dia                             object
local                           object
resultado                       object
gp                             float64
gc                             float64
oponente                        object
xg                             float64
xga                            float64
posse                          float64
público                        float64
capitão                         object
formação                        object
árbitro                         object
relatório da partida            object
notas                          float64
tc                             float64
cag                            float64
dist                           float64
g/sh                           float64
fk                             float64
pb                               int64
pt                               int64
temporada                

In [66]:
# Encode the team name as an integer category code so it can be used as a model feature.
matches['cod_time'] = matches['time'].astype('category').cat.codes

In [67]:
# Encode the venue as an integer category code.
# NOTE(review): the `local` setting for the fixture assumes "Em casa" -> 0 and "Visitante" -> 1 — confirm against the data.
matches['cod_local'] = matches['local'].astype('category').cat.codes

In [68]:
# Encode the opponent name as an integer category code. The codes are derived
# from the 'oponente' column's own categories, so they are not interchangeable
# with the cod_time codes.
matches['cod_opo'] = matches['oponente'].astype('category').cat.codes

In [69]:
# Keep only the kick-off hour: strip the ":MM" part of the time string and cast to int.
# The pattern is a raw string so that \d reaches the regex engine as a digit
# class instead of being an invalid string escape (a warning on recent Python versions).
matches['hora'] = matches['horário'].str.replace(r":\d+", "", regex=True).astype('int')

In [70]:
# Day of the week as an integer (Monday = 0 ... Sunday = 6).
matches['cod_dia'] = matches['data'].dt.dayofweek

In [71]:
# Binary target: 1 when the result is a win ("V"), 0 for any other result.
matches["target"] = (matches["resultado"] == "V").astype("int")

In [72]:
# Random forest classifier; random_state is fixed so the forest's randomness is seeded.
rf = rfc(n_estimators=50, min_samples_split=10, random_state=1)

In [73]:
# Training split: matches played strictly before the cutoff date.
train = matches[matches['data'] < '2023-11-01']

In [74]:
# Evaluation split: matches played on or after the cutoff date. The comparison
# is inclusive so that games played on the cutoff day itself are not silently
# left out of both the training and the evaluation data.
test = matches[matches['data'] >= '2023-11-01']

In [75]:
# Feature columns fed to the model. More predictors could be added to achieve a better result.
predictors = ['cod_time', 'cod_local', 'cod_opo', 'hora', 'cod_dia']

In [76]:
# Fit the forest on the training split using only the predictor columns.
rf.fit(train[predictors], train['target'])

In [77]:
# Predict win (1) / no win (0) for every match in the evaluation split.
preds = rf.predict(test[predictors])

In [78]:
preds

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [79]:
# Share of evaluation matches whose win / no-win label was predicted correctly.
acc = accuracy_score(test['target'], preds)

In [80]:
acc

0.6825396825396826

In [81]:
# Side-by-side table of the true label and the model's prediction.
combined = pd.DataFrame({'actual': test['target'], 'prediction': preds})

In [82]:
combined

Unnamed: 0,actual,prediction
9,1,1
10,1,1
11,0,1
12,0,1
13,1,1
...,...,...
409,0,0
410,0,0
411,1,0
412,0,0


In [83]:
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,131,26
1,54,41


Refining the model

In [84]:
precision_score(test['target'], preds)

0.6119402985074627

Improving the model with rolling averages

In [85]:
# Group the matches by team so each team's history can be handled on its own.
grouped_matches = matches.groupby('time')

In [86]:
# Rows belonging to the team configured in nomeTime, used to try out the rolling features.
group = grouped_matches.get_group(nomeTime)

In [87]:
group.columns

Index(['data', 'horário', 'rodada', 'dia', 'local', 'resultado', 'gp', 'gc',
       'oponente', 'xg', 'xga', 'posse', 'público', 'capitão', 'formação',
       'árbitro', 'relatório da partida', 'notas', 'tc', 'cag', 'dist', 'g/sh',
       'fk', 'pb', 'pt', 'temporada', 'time', 'camp', 'cod_time', 'cod_local',
       'cod_opo', 'hora', 'cod_dia', 'target'],
      dtype='object')

In [88]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means over a team's previous matches as new columns.

    The rows are ordered by the ``data`` column and, for every column in
    ``cols``, the mean over the ``window`` preceding rows is computed. The
    current row is excluded from its own window (``closed='left'``), so each
    value is built only from matches that come earlier in the ordering.

    Parameters
    ----------
    group : pandas.DataFrame
        Matches to process; must contain ``data`` and every name in ``cols``.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names of the output columns, paired positionally with ``cols``.
    window : int, default 3
        Number of preceding rows in each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows where
        any rolling value is NaN (for example, fewer than ``window`` earlier
        rows) are removed. The input frame is not modified.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` do not have the same length.
    """
    if len(cols) != len(new_cols):
        raise ValueError(
            f"cols and new_cols must have the same length, got {len(cols)} and {len(new_cols)}"
        )

    # sort_values returns a new frame, so the caller's data is left untouched.
    group = group.sort_values("data")
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats

    return group.dropna(subset=new_cols)

In [89]:
# Per-match statistic columns to turn into rolling averages.
cols = ['gp', 'gc', 'tc', 'cag', 'dist', 'fk', 'pb', 'pt']

In [90]:
group.columns

Index(['data', 'horário', 'rodada', 'dia', 'local', 'resultado', 'gp', 'gc',
       'oponente', 'xg', 'xga', 'posse', 'público', 'capitão', 'formação',
       'árbitro', 'relatório da partida', 'notas', 'tc', 'cag', 'dist', 'g/sh',
       'fk', 'pb', 'pt', 'temporada', 'time', 'camp', 'cod_time', 'cod_local',
       'cod_opo', 'hora', 'cod_dia', 'target'],
      dtype='object')

In [91]:
# Name of each derived column: the source column name plus a "_rolling" suffix.
new_cols = [f'{c}_rolling' for c in cols]

In [92]:
new_cols

['gp_rolling',
 'gc_rolling',
 'tc_rolling',
 'cag_rolling',
 'dist_rolling',
 'fk_rolling',
 'pb_rolling',
 'pt_rolling']

In [93]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,data,horário,rodada,dia,local,resultado,gp,gc,oponente,xg,...,cod_dia,target,gp_rolling,gc_rolling,tc_rolling,cag_rolling,dist_rolling,fk_rolling,pb_rolling,pt_rolling
1471,2021-08-28,15:30,Rodada da semana 4,sáb,Em casa,E,1.0,1.0,Boavista,1.3,...,5,0,0.666667,2.666667,10.333333,3.000000,199.333333,0.333333,0.000000,0.000000
1472,2021-09-12,18:00,Rodada da semana 5,dom,Visitante,E,2.0,2.0,Gil Vicente FC,2.7,...,6,0,1.000000,2.000000,13.666667,4.000000,183.000000,0.333333,0.000000,0.000000
1473,2021-09-19,15:30,Rodada da semana 6,dom,Em casa,E,1.0,1.0,Paços,2.5,...,6,0,1.000000,2.333333,12.666667,3.333333,178.666667,0.333333,0.333333,0.333333
1474,2021-09-26,20:30,Rodada da semana 7,dom,Visitante,E,0.0,0.0,Portimonense,1.0,...,6,0,1.333333,1.333333,18.666667,4.000000,175.333333,0.333333,0.333333,0.333333
1475,2021-10-02,15:30,Rodada da semana 8,sáb,Em casa,E,1.0,1.0,Santa Clara,1.4,...,5,0,1.000000,1.000000,18.000000,3.333333,174.000000,0.333333,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409,2024-01-28,20:30,Rodada da semana 19,dom,Visitante,D,0.0,5.0,Arouca,1.1,...,6,0,1.333333,3.333333,18.000000,5.333333,182.333333,0.333333,0.000000,0.000000
410,2024-02-04,15:30,Rodada da semana 20,dom,Em casa,D,0.0,1.0,Vitória,0.4,...,6,0,1.000000,4.666667,15.000000,5.333333,180.333333,0.333333,0.000000,0.000000
411,2024-02-13,15:30,Rodada da semana 21,ter,Visitante,V,1.0,0.0,Gil Vicente FC,0.3,...,1,1,0.666667,3.666667,7.666667,3.333333,190.666667,0.333333,0.000000,0.000000
412,2024-02-18,18:00,Rodada da semana 22,dom,Visitante,D,1.0,6.0,Benfica,1.6,...,6,0,0.333333,2.000000,8.666667,3.000000,199.333333,0.333333,0.000000,0.000000


In [94]:
# Build the rolling-average columns team by team, so each rolling window is
# computed only over rows of the same team.
matches_rolling = matches.groupby('time').apply(lambda x: rolling_averages(x, cols, new_cols))

In [95]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,data,horário,rodada,dia,local,resultado,gp,gc,oponente,xg,...,cod_dia,target,gp_rolling,gc_rolling,tc_rolling,cag_rolling,dist_rolling,fk_rolling,pb_rolling,pt_rolling
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arouca,1505,2021-08-28,18:00,Rodada da semana 4,sáb,Visitante,D,0.0,3.0,Porto,0.3,...,5,0,0.666667,1.666667,12.000000,2.333333,180.333333,0.333333,0.333333,0.333333
Arouca,1506,2021-09-13,19:00,Rodada da semana 5,seg,Visitante,E,2.0,2.0,Marítimo,1.4,...,0,0,0.666667,2.000000,10.666667,1.666667,198.666667,0.333333,0.333333,0.333333
Arouca,1507,2021-09-18,20:30,Rodada da semana 6,sáb,Em casa,E,2.0,2.0,Vitória,0.9,...,5,0,1.333333,2.000000,15.333333,2.333333,205.333333,0.666667,0.333333,0.333333
Arouca,1508,2021-09-25,15:30,Rodada da semana 7,sáb,Visitante,D,1.0,2.0,Moreirense,0.7,...,5,0,1.333333,2.333333,12.000000,3.000000,201.666667,0.666667,0.000000,0.000000
Arouca,1509,2021-10-02,20:30,Rodada da semana 8,sáb,Em casa,D,1.0,2.0,Sporting CP,1.2,...,5,0,1.666667,2.000000,15.000000,4.333333,191.666667,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vizela,409,2024-01-28,20:30,Rodada da semana 19,dom,Visitante,D,0.0,5.0,Arouca,1.1,...,6,0,1.333333,3.333333,18.000000,5.333333,182.333333,0.333333,0.000000,0.000000
Vizela,410,2024-02-04,15:30,Rodada da semana 20,dom,Em casa,D,0.0,1.0,Vitória,0.4,...,6,0,1.000000,4.666667,15.000000,5.333333,180.333333,0.333333,0.000000,0.000000
Vizela,411,2024-02-13,15:30,Rodada da semana 21,ter,Visitante,V,1.0,0.0,Gil Vicente FC,0.3,...,1,1,0.666667,3.666667,7.666667,3.333333,190.666667,0.333333,0.000000,0.000000
Vizela,412,2024-02-18,18:00,Rodada da semana 22,dom,Visitante,D,1.0,6.0,Benfica,1.6,...,6,0,0.333333,2.000000,8.666667,3.000000,199.333333,0.333333,0.000000,0.000000


In [96]:
# Remove the 'time' level that groupby/apply added to the index.
matches_rolling = matches_rolling.droplevel('time')

In [97]:
# Replace the index with a fresh 0..n-1 range so every row has its own label.
matches_rolling.index = range(matches_rolling.shape[0])

In [98]:
matches_rolling

Unnamed: 0,data,horário,rodada,dia,local,resultado,gp,gc,oponente,xg,...,cod_dia,target,gp_rolling,gc_rolling,tc_rolling,cag_rolling,dist_rolling,fk_rolling,pb_rolling,pt_rolling
0,2021-08-28,18:00,Rodada da semana 4,sáb,Visitante,D,0.0,3.0,Porto,0.3,...,5,0,0.666667,1.666667,12.000000,2.333333,180.333333,0.333333,0.333333,0.333333
1,2021-09-13,19:00,Rodada da semana 5,seg,Visitante,E,2.0,2.0,Marítimo,1.4,...,0,0,0.666667,2.000000,10.666667,1.666667,198.666667,0.333333,0.333333,0.333333
2,2021-09-18,20:30,Rodada da semana 6,sáb,Em casa,E,2.0,2.0,Vitória,0.9,...,5,0,1.333333,2.000000,15.333333,2.333333,205.333333,0.666667,0.333333,0.333333
3,2021-09-25,15:30,Rodada da semana 7,sáb,Visitante,D,1.0,2.0,Moreirense,0.7,...,5,0,1.333333,2.333333,12.000000,3.000000,201.666667,0.666667,0.000000,0.000000
4,2021-10-02,20:30,Rodada da semana 8,sáb,Em casa,D,1.0,2.0,Sporting CP,1.2,...,5,0,1.666667,2.000000,15.000000,4.333333,191.666667,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2776,2024-01-28,20:30,Rodada da semana 19,dom,Visitante,D,0.0,5.0,Arouca,1.1,...,6,0,1.333333,3.333333,18.000000,5.333333,182.333333,0.333333,0.000000,0.000000
2777,2024-02-04,15:30,Rodada da semana 20,dom,Em casa,D,0.0,1.0,Vitória,0.4,...,6,0,1.000000,4.666667,15.000000,5.333333,180.333333,0.333333,0.000000,0.000000
2778,2024-02-13,15:30,Rodada da semana 21,ter,Visitante,V,1.0,0.0,Gil Vicente FC,0.3,...,1,1,0.666667,3.666667,7.666667,3.333333,190.666667,0.333333,0.000000,0.000000
2779,2024-02-18,18:00,Rodada da semana 22,dom,Visitante,D,1.0,6.0,Benfica,1.6,...,6,0,0.333333,2.000000,8.666667,3.000000,199.333333,0.333333,0.000000,0.000000


Retraining our machine learning model

In [99]:
def make_predictions(data, predictors, cutoff='2023-11-01', model=None):
    """Train on matches before ``cutoff`` and evaluate on matches from ``cutoff`` on.

    Parameters
    ----------
    data : pandas.DataFrame
        Match table containing a ``data`` date column, a ``target`` column
        and every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default '2023-11-01'
        Split date. Rows with ``data`` before it are used for training; rows
        on or after it are used for evaluation.
    model : estimator with ``fit`` / ``predict``, optional
        Classifier to train. When omitted, the notebook-level ``rf`` is used.

    Returns
    -------
    tuple
        ``(combined, precision, model)`` where ``combined`` is a DataFrame with
        the columns ``actual`` and ``predictd`` indexed like the evaluation
        rows, ``precision`` is the precision score on the evaluation rows and
        ``model`` is the fitted classifier (fitted in place).
    """
    if model is None:
        model = rf  # default to the classifier defined earlier in the notebook

    train = data[data['data'] < cutoff]
    # Inclusive lower bound: matches played on the cutoff day itself would
    # otherwise belong to neither split.
    test = data[data['data'] >= cutoff]

    model.fit(train[predictors], train['target'])
    preds = model.predict(test[predictors])

    # The 'predictd' spelling is kept on purpose: later cells reference this column name.
    combined = pd.DataFrame(dict(actual=test['target'], predictd=preds), index=test.index)
    precision = precision_score(test['target'], preds)

    return combined, precision, model

In [100]:
# NOTE(review): `predictors` is still the original five-column list, so the
# *_rolling columns computed for matches_rolling are not used by the model
# here — confirm whether `new_cols` was meant to be included.
combined, precision, rf = make_predictions(matches_rolling, predictors)

In [101]:
precision

0.6101694915254238

In [102]:
combined

Unnamed: 0,actual,predictd
74,0,0
75,0,0
76,1,0
77,0,0
78,1,1
...,...,...
2776,0,0
2777,0,0
2778,1,0
2779,0,0


In [103]:
# Attach match context (date, team, opponent, result) to each prediction, joining on the row index.
combined = combined.merge(matches_rolling[['data', 'time', 'oponente', 'resultado']], left_index=True, right_index=True)

In [104]:
combined

Unnamed: 0,actual,predictd,data,time,oponente,resultado
74,0,0,2023-11-06,Arouca,Farense,D
75,0,0,2023-11-12,Arouca,Braga,D
76,1,0,2023-12-03,Arouca,Boavista,V
77,0,0,2023-12-10,Arouca,Rio Ave,E
78,1,1,2023-12-16,Arouca,Gil Vicente FC,V
...,...,...,...,...,...,...
2776,0,0,2024-01-28,Vizela,Arouca,D
2777,0,0,2024-02-04,Vizela,Vitória,D
2778,1,0,2024-02-13,Vizela,Gil Vicente FC,V
2779,0,0,2024-02-18,Vizela,Benfica,D


Combining Home and Away Predictions

In [105]:
class MissingDict(dict):
    """Dictionary whose lookups fall back to the key itself when it is absent."""

    def __missing__(self, key):
        # Called by dict.__getitem__ for unknown keys; the key is returned
        # as-is and nothing is stored in the dictionary.
        return key


# Team names as spelled in the 'time' column -> spelling used in the 'oponente' column.
map_values = {
    "Vitoria Guimaraes": "Vitória",
    "Famalicao": "Famalicão",
    "Maritimo": "Marítimo",
    "Pacos de Ferreira": "Paços",
    "Belenenses SAD": "B-SAD",
    "Vitoria Setubal": "Vitória Setúbal",
}
mapping = MissingDict(map_values)

In [106]:
# Translate the opponent name through the mapping (names without an entry are left unchanged).
nomeOponente = mapping[nomeOponente]

In [107]:
# Team name translated through the mapping, used as a join key against 'oponente' in the self-merge.
combined['novo_time'] = combined['time'].map(mapping)

In [108]:
# Self-join: pair each row with the row of the same date whose opponent is this
# row's (translated) team, giving both sides of a match on one line (_x / _y suffixes).
merged = combined.merge(combined, left_on=['data', 'novo_time'], right_on=['data', 'oponente'])

In [109]:
merged.head()

Unnamed: 0,actual_x,predictd_x,data,time_x,oponente_x,resultado_x,novo_time_x,actual_y,predictd_y,time_y,oponente_y,resultado_y,novo_time_y
0,0,0,2023-11-06,Arouca,Farense,D,Arouca,1,0,Farense,Arouca,V,Farense
1,0,0,2023-11-12,Arouca,Braga,D,Arouca,1,1,Braga,Arouca,V,Braga
2,1,0,2023-12-03,Arouca,Boavista,V,Arouca,0,1,Boavista,Arouca,D,Boavista
3,0,0,2023-12-10,Arouca,Rio Ave,E,Arouca,0,1,Rio Ave,Arouca,E,Rio Ave
4,1,1,2023-12-16,Arouca,Gil Vicente FC,V,Arouca,0,0,Gil Vicente FC,Arouca,D,Gil Vicente FC


In [110]:
# For the rows where the model predicts that side X wins (1) and side Y does not (0),
# count what the actual outcome for X was.
merged[(merged['predictd_x'] == 1) & (merged['predictd_y'] == 0)]['actual_x'].value_counts()

actual_x
1    36
0    23
Name: count, dtype: int64

## Fazendo a Previsão

In [111]:
# Look up the integer codes used in the training data for this fixture.
codTime = matches_rolling[matches_rolling['time'] == nomeTime]['cod_time'].unique()
codOponente = matches_rolling[matches_rolling['oponente'] == nomeOponente]['cod_opo'].unique()
codDia = matches_rolling[matches_rolling['dia'] == dia]['cod_dia'].unique()

# Every lookup must resolve to exactly one code. Without this check a
# misspelled name (no rows) or inconsistent data (several codes) would only
# surface later as a length-mismatch error when the prediction row is built.
for setting_name, setting_value, found_codes in (
    ('nomeTime', nomeTime, codTime),
    ('nomeOponente', nomeOponente, codOponente),
    ('dia', dia, codDia),
):
    if len(found_codes) != 1:
        raise ValueError(
            f"{setting_name}={setting_value!r} must map to exactly one code, "
            f"found {list(found_codes)}"
        )

print(f'codTime: {codTime}')
print(f'codOponente: {codOponente}')
print(f'codDia: {codDia}')

codTime: [25]
codOponente: [17]
codDia: [6]


In [112]:
# Single-row feature frame for the upcoming match. The final column selection
# ties the feature order to `predictors`, i.e. the order the model was trained
# with, instead of relying on the order of the dict literal.
next_game = pd.DataFrame({
    'cod_time': codTime,
    'cod_local': [local],
    'cod_opo': codOponente,
    'hora': [horaJogo],
    'cod_dia': codDia,
})[predictors]
next_game

Unnamed: 0,cod_time,cod_local,cod_opo,hora,cod_dia
0,25,1,17,15,6


In [113]:
# Predict the class for the upcoming match and show it.
real_pred = rf.predict(next_game)
real_pred[0]

1

In [114]:
# The target is 1 only for a win ("V"), so class 0 covers draws as well as
# defeats; report it as "does not win" rather than claiming a defeat.
result = 'NÃO GANHA' if real_pred[0] == 0 else 'GANHA'
print(f'Resultado para a previsão - {nomeTime} x {nomeOponente}: {result}')

Resultado para a previsão - Vizela x Portimonense: GANHA
