In [1]:
import pandas as pd
import sqlite3
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import numpy as np

In [6]:
# Feature query, held as a string and executed later through pandas.
#
# CTE `team_stat` (one row per TEAM_MATCH_LOG row with comp='Premier League'
# and season > '2017'; the season filter is a string comparison):
#   * AVG_XG  - mean of XG over all earlier rows in the same (season, team)
#               partition ordered by date; the window ends at "1 PRECEDING",
#               so the current match is excluded.
#   * XG1..XG5 / XGA1..XGA5 - XG and XGA from the previous 1..5 rows of the
#               same partition (NULL when there are not enough earlier rows).
#
# Final select: self-joins `team_stat` so each team row also carries the
# opponent's lagged XGA values (opp_XGA1..opp_XGA5). Rows are matched on
# season, round, and team.opponent = opp.team. It is an inner join, so rows
# with no matching opponent row are dropped.
sql = '''
with team_stat as (
	select SEASON, TEAM, ROUND, DATE, OPPONENT, XG, XGA,
		AVG(XG) OVER(partition by season, team order by date
			ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
		) AS AVG_XG,
		lag(XG) over (partition by season, team order by date) XG1,
		lag(XGA) over (partition by season, team order by date) XGA1,
		lag(XG, 2) over (partition by season, team order by date) XG2,
		lag(XGA, 2) over (partition by season, team order by date) XGA2,
		lag(XG, 3) over (partition by season, team order by date) XG3,
		lag(XGA, 3) over (partition by season, team order by date) XGA3,
		lag(XG, 4) over (partition by season, team order by date) XG4,
		lag(XGA, 4) over (partition by season, team order by date) XGA4,
		lag(XG, 5) over (partition by season, team order by date) XG5,
		lag(XGA, 5) over (partition by season, team order by date) XGA5
	from TEAM_MATCH_LOG cur
	where comp='Premier League' and season >'2017'
)
select team.SEASON, team.TEAM, team.ROUND, team.DATE, team.OPPONENT, team.AVG_XG,
	team.XG, team.XG1, team.XG2, team.XG3, team.XG4, team.XG5,
	opp.XGA1 opp_XGA1, opp.XGA2 opp_XGA2, opp.XGA3 opp_XGA3, opp.XGA4 opp_XGA4, opp.XGA5 opp_XGA5
from team_stat team
inner join team_stat opp
on team.season = opp.season
and team.round = opp.round
and team.opponent = opp.team
order by team.season, team.team, team.date;
'''

In [13]:
# Open the SQLite database, run the feature query, and release the connection
# as soon as the frame is loaded (the connection was previously left open).
# try/finally is used because sqlite3's `with conn:` block only manages the
# transaction and does not close the connection.
conn = sqlite3.connect('fpl.db')
try:
    data = pd.read_sql(sql, conn)
finally:
    conn.close()

# Drop rows whose 5-match lag is null for either the team (XG5) or the
# opponent (opp_XGA5).
data = data.dropna(subset=["XG5", "opp_XGA5"])
data

Unnamed: 0,SEASON,TEAM,ROUND,DATE,OPPONENT,AVG_XG,XG,XG1,XG2,XG3,XG4,XG5,opp_XGA1,opp_XGA2,opp_XGA3,opp_XGA4,opp_XGA5
5,2017-2018,Arsenal,Matchweek 6,2017-09-25,West Brom,1.600000,2.4,1.4,2.0,0.7,1.6,2.3,0.3,0.4,1.2,1.2,0.4
6,2017-2018,Arsenal,Matchweek 7,2017-10-01,Brighton,1.733333,3.0,2.4,1.4,2.0,0.7,1.6,1.3,1.5,0.8,0.3,1.7
7,2017-2018,Arsenal,Matchweek 8,2017-10-14,Watford,1.914286,1.1,3.0,2.4,1.4,2.0,0.7,1.6,1.2,3.6,0.4,0.9
8,2017-2018,Arsenal,Matchweek 9,2017-10-22,Everton,1.812500,2.8,1.1,3.0,2.4,1.4,2.0,1.1,0.7,0.8,3.5,2.0
9,2017-2018,Arsenal,Matchweek 10,2017-10-28,Swansea City,1.922222,1.6,2.8,1.1,3.0,2.4,1.4,1.7,0.4,0.9,1.7,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2021-2022,Wolves,Matchweek 35,2022-04-30,Brighton,0.930303,0.6,0.7,0.3,1.2,0.8,0.8,0.9,1.1,0.4,1.8,0.3
3796,2021-2022,Wolves,Matchweek 36,2022-05-07,Chelsea,0.920588,2.1,0.6,0.7,0.3,1.2,0.8,1.3,0.6,0.5,2.4,0.5
3797,2021-2022,Wolves,Matchweek 33,2022-05-11,Manchester City,0.954286,0.5,2.1,0.6,0.7,0.3,1.2,0.9,1.2,0.5,0.1,1.0
3798,2021-2022,Wolves,Matchweek 37,2022-05-15,Norwich City,0.941667,0.9,0.5,2.1,0.6,0.7,0.3,2.5,3.1,2.2,2.4,2.0


In [15]:
# Chronological hold-out: rows whose DATE string sorts before '2022-01' are
# used for training, rows on or after it for testing.
train_data = data.loc[data["DATE"] < '2022-01']
test_data = data.loc[data["DATE"] >= '2022-01']

def xy_split(df):
    """Separate a match frame into a feature matrix and the XG target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns SEASON, TEAM, ROUND, DATE, OPPONENT, XG
        and AVG_XG; every other column is treated as a feature.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the columns listed above
        and ``y`` is the ``XG`` column.
    """
    non_feature_cols = ['SEASON', 'TEAM', 'ROUND', 'DATE', 'OPPONENT', 'XG', 'AVG_XG']
    features = df.drop(columns=non_feature_cols)
    target = df["XG"]
    return features, target

# Build feature/target pairs for both partitions.
X_test, y_test = xy_split(test_data)
X_train, y_train = xy_split(train_data)

# Ordinary least squares on the lagged features; fit() returns the estimator.
model = LinearRegression().fit(X_train, y_train)

# Predictions for both partitions.
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Report MAE next to the target mean so the error can be read in context.
print(
    f"Training MAE: {mean_absolute_error(y_train, y_train_pred):.2f}"
    f"\t Target mean: {np.mean(y_train):.2f}"
)
print(
    f"Test MAE: {mean_absolute_error(y_test, y_test_pred):.2f} "
    f"\t Target mean: {np.mean(y_test):.2f}"
)

Training MAE: 0.57	 Target mean: 1.32
Test MAE: 0.59 	 Target mean: 1.34


In [None]:
# Baseline: use AVG_XG (the query's running average of the team's earlier XG
# values) directly as the prediction and score it with the same MAE metric
# as the regression model.
# The column label is 'AVG_XG'; pandas labels are case-sensitive, so the
# previous lowercase 'avg_xg' lookup raised KeyError.
print(f"Training MAE: {mean_absolute_error(y_train, train_data['AVG_XG']):.2f}\t Target mean: {np.mean(y_train):.2f}")
print(f"Test MAE: {mean_absolute_error(y_test, test_data['AVG_XG']):.2f} \t Target mean: {np.mean(y_test):.2f}")

Training MAE: 0.58	 Target mean: 1.32
Test MAE: 0.60 	 Target mean: 1.34
