In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import matplotlib.pyplot as plt


In [2]:
# Mount Google Drive at /content/drive so files stored under MyDrive are
# reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the dataset. The first CSV column (shown as "Unnamed: 0") carries the
# saved row labels, so it is used as the index rather than as a feature.
matches = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/matches.csv', index_col=0)
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,Tim Robinson,Match Report,,18.0,5.0,14.8,0.0,0,0,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,Stuart Attwell,Match Report,,19.0,8.0,13.6,1.0,0,0,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,Anthony Taylor,Match Report,,11.0,3.0,13.4,0.0,0,0,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,Michael Oliver,Match Report,,14.0,5.0,14.9,0.0,0,0,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,Tony Harrington,Match Report,,19.0,12.0,16.6,0.0,0,0,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,Andre Marriner,Match Report,,8.0,1.0,18.2,0.0,0,0,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,Simon Hooper,Match Report,,7.0,0.0,13.4,1.0,0,0,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,Jonathan Moss,Match Report,,10.0,3.0,18.5,0.0,0,0,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,Robert Jones,Match Report,,11.0,1.0,18.3,1.0,0,0,Sheffield United


In [5]:
# Quick sanity check: first rows, then the (rows, columns) shape on its own line
print(matches.head(), matches.shape, sep="\n")

         date   time            comp        round  day venue result   gf   ga  \
0  2024-08-17  12:30  Premier League  Matchweek 1  Sat  Away      W  2.0  0.0   
1  2024-08-25  16:30  Premier League  Matchweek 2  Sun  Home      W  2.0  0.0   
2  2024-09-01  16:00  Premier League  Matchweek 3  Sun  Away      W  3.0  0.0   
3  2024-09-14  15:00  Premier League  Matchweek 4  Sat  Home      L  0.0  1.0   
5  2024-09-21  15:00  Premier League  Matchweek 5  Sat  Home      W  3.0  0.0   

          opponent  ...          referee  match report  notes    sh   sot  \
0     Ipswich Town  ...     Tim Robinson  Match Report    NaN  18.0   5.0   
1        Brentford  ...   Stuart Attwell  Match Report    NaN  19.0   8.0   
2   Manchester Utd  ...   Anthony Taylor  Match Report    NaN  11.0   3.0   
3  Nott'ham Forest  ...   Michael Oliver  Match Report    NaN  14.0   5.0   
5      Bournemouth  ...  Tony Harrington  Match Report    NaN  19.0  12.0   

   dist   fk pk pkatt       team  
0  14.8  0.0  0

In [6]:
# Clean and preprocess the data
matches["date"] = pd.to_datetime(matches["date"])
# Binary label: 1 when the result is a win ("W"), 0 for any other result
matches["target"] = (matches["result"] == "W").astype("int")

# Encode categorical variables
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
# Strip everything from the first ":" onward, leaving only the kick-off hour
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek

# Remove unnecessary columns. errors="ignore" keeps this cell re-runnable:
# on a second execution "comp"/"notes" are already gone and a plain drop
# would raise KeyError.
matches = matches.drop(columns=["comp", "notes"], errors="ignore")

In [7]:
# Display the frame after preprocessing: "target" and the encoded *_code / hour
# columns have been added, and "comp" / "notes" have been dropped.
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,dist,fk,pk,pkatt,team,target,venue_code,opp_code,hour,day_code
0,2024-08-17,12:30,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,2.6,...,14.8,0.0,0,0,Liverpool,1,0,10,12,5
1,2024-08-25,16:30,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,2.5,...,13.6,1.0,0,0,Liverpool,1,1,3,16,6
2,2024-09-01,16:00,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,1.8,...,13.4,0.0,0,0,Liverpool,1,0,16,16,6
3,2024-09-14,15:00,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,0.9,...,14.9,0.0,0,0,Liverpool,0,1,19,15,5
5,2024-09-21,15:00,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,2.0,...,16.6,0.0,0,0,Liverpool,1,1,2,15,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,0.5,...,18.2,0.0,0,0,Sheffield United,0,0,22,19,6
39,2021-05-08,15:00,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,0.7,...,13.4,1.0,0,0,Sheffield United,0,1,7,15,5
40,2021-05-16,19:00,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,1.2,...,18.5,0.0,0,0,Sheffield United,1,0,8,19,6
41,2021-05-19,18:00,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,0.9,...,18.3,1.0,0,0,Sheffield United,0,0,17,18,2


In [8]:
# Define rolling averages function
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of previous rows' stats to one group's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a "date" column and every column in
        ``cols``. The caller's frame is not modified (sorting returns a copy).
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Names for the resulting rolling-average columns, matched to ``cols``
        by position.
    window : int, default 3
        Number of preceding rows included in each average.

    Returns
    -------
    pandas.DataFrame
        ``group`` sorted by "date" with ``new_cols`` added. Rows that have a
        missing value in any of ``new_cols`` (e.g. the first ``window`` rows,
        which lack enough history) are dropped.
    """
    group = group.sort_values("date")
    # closed='left' leaves the current row out of its own window, so each
    # average is built only from rows that come earlier in date order.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)

# Define columns for rolling averages
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

# Apply rolling averages team by team. Iterating over the groups hands each
# call the complete sub-frame ("team" column included) and avoids
# groupby(...).apply operating on the grouping column, which pandas has
# deprecated.
matches_rolling = pd.concat(
    [
        rolling_averages(team_matches, cols, new_cols)
        for _, team_matches in matches.groupby("team")
    ],
    ignore_index=True,  # fresh 0..n-1 index across all teams
)

# Inspect processed data
print(matches_rolling.head())

        date   time        round  day venue result   gf   ga         opponent  \
0 2020-10-04  14:00  Matchweek 4  Sun  Home      W  2.0  1.0    Sheffield Utd   
1 2020-10-17  17:30  Matchweek 5  Sat  Away      L  0.0  1.0  Manchester City   
2 2020-10-25  19:15  Matchweek 6  Sun  Home      L  0.0  1.0   Leicester City   
3 2020-11-01  16:30  Matchweek 7  Sun  Away      W  1.0  0.0   Manchester Utd   
4 2020-11-08  19:15  Matchweek 8  Sun  Home      L  0.0  3.0      Aston Villa   

    xg  ...  hour  day_code  gf_rolling ga_rolling sh_rolling sot_rolling  \
0  0.6  ...    14         6    2.000000   1.333333   8.000000    3.666667   
1  0.9  ...    17         5    1.666667   1.666667   5.666667    3.666667   
2  0.7  ...    19         6    1.000000   1.666667   7.000000    3.666667   
3  1.0  ...    16         6    0.666667   1.000000   9.666667    4.000000   
4  1.5  ...    19         6    0.333333   0.666667   9.666667    2.666667   

  dist_rolling fk_rolling  pk_rolling  pkatt_rolli

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
