Import data (matches from 2019 to 2025)

In [1]:
import pandas as pd
# Read the match log; the first column of the CSV is used as the row index.
matches = pd.read_csv(
    'matches_full.csv',
    index_col=0,
)
# Show the dtype of every column as the cell output.
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

Data preparation: creating predictors

In [2]:
# Parse the date strings so they can be compared and decomposed below.
matches['date'] = pd.to_datetime(matches['date'])

# Integer-encode the fixture context. Category codes follow the sorted order of
# the distinct values (so 'Away' gets a lower code than 'Home'); missing values
# are encoded as -1.
for source_col, code_col in (('venue', 'venue_code'), ('opponent', 'opponent_code')):
    matches[code_col] = matches[source_col].astype('category').cat.codes

# Kick-off hour: the part of the time string before the first ':' as an int.
kickoff_hour = matches['time'].str.split(':').str[0]
matches['hour'] = kickoff_hour.astype(int)
# Day of the week as an integer (Monday=0 ... Sunday=6).
matches['day_code'] = matches['date'].dt.dayofweek

for source_col, code_col in (('referee', 'referee_code'), ('captain', 'captain_code')):
    matches[code_col] = matches[source_col].astype('category').cat.codes

# NOTE(review): xg/xga and sot/sh look like statistics of the very match whose
# result is being predicted; if so, xg_diff and shot_accuracy are only known
# after the match and leak the outcome into the predictors - confirm intent.
matches['xg_diff'] = matches['xg'].sub(matches['xga'])

# A match with no shots gives 0/0 = NaN, which is replaced by 0.
matches['shot_accuracy'] = matches['sot'].div(matches['sh']).fillna(0)

for source_col, code_col in (
    ('round', 'round_code'),
    ('formation', 'form_code'),
    ('opp formation', 'oppform_code'),
):
    matches[code_col] = matches[source_col].astype('category').cat.codes

# Binary target: 1 for a win ('W'), 0 for any other result.
matches['target'] = matches['result'].eq('W').astype('int')

ML model implementation (Random Forest Classifier)

In [3]:
from sklearn.ensemble import RandomForestClassifier

# These hyper-parameters match the best-parameter set recorded in the comment
# cell below; random_state pins the forest's randomness for repeatable runs.
rf = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=200,
    random_state=1,
)

# Chronological split: everything before the cutoff trains the model, everything
# from the cutoff onwards is held out. `>=` (rather than `>`) makes sure a match
# dated exactly on the cutoff is evaluated instead of silently dropped from
# both sets.
split_date = '2025-01-01'
training_set = matches[matches['date'] < split_date]
test_set = matches[matches['date'] >= split_date]

predictors = [
    'venue_code', 'opponent_code', 'hour', 'day_code', 'referee_code', 'captain_code',
    'xg_diff', 'shot_accuracy', 'round_code', 'form_code', 'oppform_code',
]

rf.fit(training_set[predictors], training_set['target'])

# Predicted class (1 = win, 0 = not a win) for every held-out match.
predictions = rf.predict(test_set[predictors])

In [192]:
# Best parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
# Best score (neg MSE): 0.16669622319889055
# NOTE(review): the score is labelled "neg MSE" but the value is positive -
# confirm which scoring metric (and sign convention) produced it.

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Accuracy: share of held-out matches whose class was predicted correctly.
accuracy = accuracy_score(y_true=test_set['target'], y_pred=predictions)
# Precision: share of predicted wins that were actual wins.
precision = precision_score(y_true=test_set['target'], y_pred=predictions)

print(f"Accuracy: {round(accuracy, 3)}\nPrecision: {round(precision, 3)}")

Accuracy: 0.744
Precision: 0.659


In [5]:
from sklearn.metrics import confusion_matrix

actual = test_set['target']
predicted = predictions

# Rows of the matrix are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=actual, y_pred=predicted)

# Label the raw counts so the printed table is self-explanatory.
cm_df = (
    pd.DataFrame(cm)
    .rename(index=lambda label: f'Actual {label}')
    .add_prefix('Predicted ')
)

print(cm_df)


          Predicted 0  Predicted 1
Actual 0           87           15
Actual 1           25           29


New predictors: rolling averages of previous matches

In [6]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of earlier rows to one group of matches.

    The rows are sorted by ``'date'``; for every row the mean of each column in
    ``cols`` over the ``window`` rows immediately before it is computed.
    ``closed='left'`` keeps the current row out of its own average, so each
    value only uses earlier rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a ``'date'`` column and every column
        named in ``cols``. The input frame is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the resulting columns, positionally matched to ``cols``.
    window : int, default 3
        Number of preceding rows in each average.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``group`` with ``new_cols`` added. Rows with a
        missing value in any of ``new_cols`` (e.g. the first ``window`` rows,
        which lack enough history) are dropped.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError('cols and new_cols must have the same length')

    # sort_values returns a new frame, so the caller's data stays untouched.
    ordered = group.sort_values('date')
    prior_means = ordered[cols].rolling(window, closed='left').mean()
    # Assigning a frame to a list of keys pairs the columns up positionally.
    ordered[new_cols] = prior_means
    return ordered.dropna(subset=new_cols)

In [23]:
# Per-match columns that get a rolling average.
cols = [
    'gf', 'ga',
    'sh', 'sot', 'dist', 'fk',
    'pk', 'pkatt',
    'poss',
    'shot_accuracy', 'xg_diff',
]
# Matching output names: the source column name with a '_rolling' suffix.
new_cols = [name + '_rolling' for name in cols]

In [24]:
# Compute the rolling features separately for every team and stack the results.
# Iterating over the groups (instead of groupby.apply) keeps the 'team' column
# in the output and avoids the deprecated apply-on-grouping-columns behaviour.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
matches_rolling = pd.concat(
    [
        rolling_averages(team_matches, cols, new_cols)
        for _, team_matches in matches.groupby('team')
    ],
    ignore_index=True,
)
matches_rolling

  matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,poss_rolling,shot_accuracy_rolling,xg_diff_rolling
0,2019-09-15,14:00,La Liga,Matchweek 4,Sun,Home,L,0.0,1.0,Sevilla,...,0.333333,8.000000,1.666667,15.100000,0.333333,0.000000,0.000000,38.000000,0.207407,0.000000
1,2019-09-22,18:30,La Liga,Matchweek 5,Sun,Away,L,0.0,2.0,Athletic Club,...,0.666667,7.333333,1.000000,14.733333,0.000000,0.000000,0.000000,38.333333,0.133333,0.100000
2,2019-09-26,21:00,La Liga,Matchweek 6,Thu,Away,L,0.0,3.0,Real Sociedad,...,1.333333,5.333333,0.666667,15.433333,0.000000,0.000000,0.000000,40.333333,0.150000,-0.833333
3,2019-09-29,16:00,La Liga,Matchweek 7,Sun,Home,W,2.0,0.0,Mallorca,...,2.000000,5.333333,0.333333,16.900000,0.000000,0.000000,0.000000,41.000000,0.083333,-2.000000
4,2019-10-05,18:30,La Liga,Matchweek 8,Sat,Away,L,1.0,2.0,Valencia,...,1.666667,6.333333,2.000000,18.700000,0.000000,0.333333,0.333333,46.000000,0.250000,-1.266667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4226,2025-01-25,16:15,La Liga,Matchweek 21,Sat,Away,D,1.0,1.0,Atlético Madrid,...,1.000000,15.000000,6.333333,16.033333,0.666667,0.666667,0.666667,50.333333,0.423611,1.466667
4227,2025-02-01,16:15,La Liga,Matchweek 22,Sat,Home,W,5.0,1.0,Valladolid,...,0.666667,13.000000,4.000000,14.300000,0.333333,0.333333,0.333333,42.666667,0.277778,0.933333
4228,2025-02-08,17:30,La Liga,Matchweek 23,Sat,Away,W,2.0,1.0,Las Palmas,...,0.666667,15.333333,5.000000,14.300000,0.333333,0.333333,0.333333,45.666667,0.287879,1.800000
4229,2025-02-15,21:00,La Liga,Matchweek 24,Sat,Home,D,1.0,1.0,Valencia,...,1.000000,13.666667,4.000000,14.966667,0.333333,0.333333,0.333333,46.000000,0.269360,1.000000


In [25]:
def make_predictions(matches, predictors, cutoff='2025-01-01'):
    """Fit the notebook-level ``rf`` model on past matches and score it on later ones.

    Rows with ``date`` before ``cutoff`` form the training set; rows on or
    after ``cutoff`` form the test set (``>=`` so a match dated exactly on the
    cutoff is evaluated rather than dropped from both sets).

    Relies on ``rf``, ``precision_score`` and ``accuracy_score`` being defined
    at notebook level; ``rf`` is re-fitted in place on every call.

    Parameters
    ----------
    matches : pandas.DataFrame
        Must contain ``'date'``, ``'target'`` and every column in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str, default '2025-01-01'
        Boundary compared against the ``'date'`` column.

    Returns
    -------
    tuple
        ``(combined, precision, accuracy, predictions, actual)`` where
        ``combined`` is a DataFrame of actual vs. predicted targets indexed
        like the test set, ``predictions`` is the raw model output and
        ``actual`` is the test set's ``'target'`` column.
    """
    training_set = matches[matches['date'] < cutoff]
    test_set = matches[matches['date'] >= cutoff]

    rf.fit(training_set[predictors], training_set['target'])
    predictions = rf.predict(test_set[predictors])

    actual = test_set['target']
    combined = pd.DataFrame(dict(actual=actual, predicted=predictions), index=test_set.index)
    precision = precision_score(actual, predictions)
    accuracy = accuracy_score(actual, predictions)
    return combined, precision, accuracy, predictions, actual

In [26]:
# Re-fit and re-score the model with the rolling-average columns added to the
# original predictor list.
combined, precision, accuracy, predictions, actual = make_predictions(
    matches=matches_rolling,
    predictors=predictors + new_cols,
)

In [27]:
# Report the scores returned by make_predictions, rounded to three decimals.
print(f"Accuracy: {round(accuracy, 3)}\nPrecision: {round(precision, 3)}")

Accuracy: 0.776
Precision: 0.711


In [11]:
# Rows of the matrix are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=actual, y_pred=predictions)

# Turn the positional row/column numbers into readable labels.
cm_df = pd.DataFrame(cm).rename(
    index='Actual {}'.format,
    columns='Predicted {}'.format,
)

print(cm_df)

          Predicted 0  Predicted 1
Actual 0           88           14
Actual 1           23           31


In [12]:
# Attach match context (date, teams, raw result) to every test-set prediction
# by joining on the shared row index. Selecting only the two prediction columns
# first keeps the cell idempotent: without it, re-running the cell would merge
# onto the already-merged frame and create suffixed duplicates (date_x, date_y, ...).
combined = combined[['actual', 'predicted']].merge(
    matches_rolling[['date', 'team', 'opponent', 'result']],
    left_index=True,
    right_index=True,
)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
167,0,0,2025-01-11,Alaves,Girona,L
168,1,1,2025-01-18,Alaves,Betis,W
169,0,0,2025-01-27,Alaves,Celta Vigo,D
170,0,0,2025-02-02,Alaves,Barcelona,L
171,0,0,2025-02-09,Alaves,Getafe,L
...,...,...,...,...,...,...
4226,0,0,2025-01-25,Villarreal,Atlético Madrid,D
4227,1,1,2025-02-01,Villarreal,Valladolid,W
4228,1,0,2025-02-08,Villarreal,Las Palmas,W
4229,0,0,2025-02-15,Villarreal,Valencia,D
