In [14]:
import warnings
import requests
import pandas as pd
import io
import os
import numpy as np


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer
import xgboost as xgb
from sklearn.model_selection import GridSearchCV


### Data Loading

In [15]:
# Suppress divide by zero warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, message="divide by zero encountered in log")


API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
# NOTE(review): if API_KEY is unset the header value is None - confirm how the
# service responds; raise_for_status() below surfaces any HTTP error it returns.
headers = {"x-api-key": API_KEY}


def fetch_division(division, timeout=30):
    """Download every match of one division from the data service.

    Parameters
    ----------
    division : str
        Value sent as the ``division`` query parameter (e.g. 'Premier League').
    timeout : float, default 30
        Seconds to wait for the server before giving up; without it
        ``requests`` would wait indefinitely.

    Returns
    -------
    pandas.DataFrame
        The JSON response body parsed with ``pd.read_json``.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status, so that an error
        payload is never parsed as if it were match data.
    """
    response = requests.get(url, headers=headers, params={'division': division}, timeout=timeout)
    response.raise_for_status()
    json_str = response.content.decode('utf-8')
    return pd.read_json(io.StringIO(json_str))


prem_df = fetch_division('Premier League')

# Every club appearing (home or away) in the 2024/25 Premier League season.
prem_teams_25 = prem_df[prem_df["season"] == 20242025]
prem_teams_25 = pd.concat([prem_teams_25['home_team'], prem_teams_25['away_team']]).unique()

champ_df = fetch_division('Championship')

# Championship rows first, then Premier League; the original row labels of
# each frame are kept (no ignore_index).
df = pd.concat([champ_df, prem_df])
#df = df[(df['home_team'].isin(prem_teams_25)) | (df['away_team'].isin(prem_teams_25))]

print(df[["season", "match_date", "home_team", "away_team", "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())

        season  match_date    home_team      away_team  home_goals  \
3845  20242025  2024-09-21    Tottenham      Brentford           3   
3846  20242025  2024-09-21  Aston Villa         Wolves           3   
3847  20242025  2024-09-21    Liverpool    Bournemouth           3   
3848  20242025  2024-09-22     Man City        Arsenal           2   
3849  20242025  2024-09-22     Brighton  Nott'm Forest           2   

      home_xgoals  away_goals  away_xgoals  
3845      3.89481           1     0.893565  
3846      2.37351           1     0.520117  
3847      2.43731           0     1.469790  
3848      2.75588           2     1.284110  
3849      1.56836           2     1.369230  


### Data Cleansing

In [16]:
# Parse the match date strings into pandas datetimes.
df['match_date'] = pd.to_datetime(df['match_date'])

# Columns that must be numeric; values that cannot be parsed become NaN.
numeric_cols = ["home_num_players", "home_market_value", "home_avg_market_value", "away_num_players", "away_market_value", "away_avg_market_value",
                "home_goals", "away_goals", "home_shots", "away_shots", "home_shots_on_target", "away_shots_on_target", "home_corners", "away_corners",
                "home_red", "away_red", "home_yellow", "away_yellow", "home_deep", "away_deep", "home_ppda", "away_ppda"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Null count per column
print(df.isnull().sum())


def _null_breakdown(frame, column):
    """Return the rows where `column` is null and their count per (division, season)."""
    missing_rows = frame[frame[column].isnull()]
    counts = missing_rows.groupby(["division", "season"]).size().reset_index(name='count')
    return missing_rows, counts


# Where are the home_shots nulls concentrated?
home_shot_nulls, home_shot_null_counts = _null_breakdown(df, "home_shots")
print(home_shot_null_counts)

# Where are the deep/ppda nulls concentrated?
ppda_nulls, ppda_null_counts = _null_breakdown(df, "home_ppda")
print(ppda_null_counts)


division_id                 0
division                    0
season_id                   0
season                      0
match_date                  0
match_teams                 0
home_team_id                0
home_team                   0
away_team_id                0
away_team                   0
home_num_players            0
home_market_value           0
home_avg_market_value       0
away_num_players            0
away_market_value           0
away_avg_market_value       0
home_goals                  0
away_goals                  0
home_shots                440
away_shots                  0
home_shots_on_target        0
away_shots_on_target        0
home_corners                0
away_corners                0
home_yellow                 0
away_yellow                 0
home_red                    0
away_red                    0
home_xgoals                 0
away_xgoals                 0
home_deep                3922
away_deep                3922
home_ppda                3922
away_ppda 

The analysis above shows that the Championship is missing deep/ppda data and that some seasons of both the Premier League and the Championship are missing home_shots data. I will not handle these missing values directly, because I am going to use an XGBoost model, which handles missing values itself during training.

In [17]:
# Turn each match row into two rows, one per team, so that every team has its
# own row of data. This makes the home effect easier to feature engineer.

# Per-team columns in the source data, in the order they are selected.
_side_stats = ["avg_market_value", "goals", "shots", "xgoals", "deep", "ppda",
               "shots_on_target", "corners", "yellow", "red"]


def _team_perspective(matches, side, other, is_home):
    """Build the match table from the point of view of the `side` team.

    `side`/`other` are the column prefixes ("home"/"away"). The `side` columns
    lose their prefix, the `other` columns get an "opponent_" prefix, and
    "xgoals" is renamed to "xG". A "home?" flag column holding `is_home` is
    appended as the last column.
    """
    side_cols = [f"{side}_team"] + [f"{side}_{stat}" for stat in _side_stats]
    other_cols = [f"{other}_team"] + [f"{other}_{stat}" for stat in _side_stats]
    view = matches[["division", "season", "match_date"] + side_cols + other_cols].copy()
    view["home?"] = is_home

    renames = {f"{side}_team": "team", f"{other}_team": "opponent_team"}
    for stat in _side_stats:
        short = "xG" if stat == "xgoals" else stat
        renames[f"{side}_{stat}"] = short
        renames[f"{other}_{stat}"] = f"opponent_{short}"
    return view.rename(columns=renames)


home_df = _team_perspective(df, "home", "away", 1)
away_df = _team_perspective(df, "away", "home", 0)

# Stack away rows on top of home rows, then order chronologically by division.
df = pd.concat([away_df, home_df]).sort_values(["match_date", "division"])

In [18]:
# Apply penalty to standardise championship games to Prem level.
# Only the column lists are defined here; the scaling itself is commented out below.

_OPPONENT_PREFIX = "opponent_"

# Columns that would be scaled down for Championship rows.
decrease = ["goals", "shots", "xG", "deep", "opponent_ppda", "shots_on_target", "corners"]

# Columns that would be scaled up: the opponent-side mirror of every entry in `decrease`.
increase = [
    name[len(_OPPONENT_PREFIX):] if name.startswith(_OPPONENT_PREFIX) else _OPPONENT_PREFIX + name
    for name in decrease
]

# df.loc[df["division"] == "Championship", decrease] *= 0.7
# df.loc[df["division"] == "Championship", increase] *= 1.4


### Feature Engineering

In [19]:
def penalized_ema(group_df, column_name, span=50):
    """Exponential moving average of a column that excludes the current row.

    The column is shifted down by one row before averaging, so the value
    returned for a row is built only from the rows before it; the first row
    therefore has no value (NaN).

    Parameters
    ----------
    group_df : pandas.DataFrame
        Frame holding the column to smooth, in the row order to average over.
    column_name : str
        Name of the column in `group_df`.
    span : int, default 50
        Span passed to ``Series.ewm`` (used with ``adjust=False``).

    Returns
    -------
    pandas.Series
        The smoothed values, carrying the same index as `group_df`.
    """
    prior_values = group_df[column_name].shift(1)
    return prior_values.ewm(span=span, adjust=False, min_periods=1).mean()

In [20]:
# Rolling (EMA) form features, first for the team on each row, then for its opponent.
stat_columns = ['goals', "shots", "xG", "shots_on_target", "corners", "deep", "ppda"]


def _grouped_ema(frame, group_col, value_col):
    """Apply `penalized_ema` to `value_col` separately within each `group_col` group.

    Returns one Series covering all rows of `frame`. `frame` must already be
    sorted chronologically within each group, because the EMA follows row order.
    """
    return frame.groupby(group_col, group_keys=False).apply(
        lambda grp: penalized_ema(grp, value_col), include_groups=False
    )


# Compute EMAs for team stats.
# The index is reset *before* the group-wise apply: the frame arrives here with
# duplicate row labels, and assigning the result back only works on duplicate
# labels when the row order happens to match exactly. Unique labels make the
# assignment align by label instead.
df = df.sort_values(by=["team", "match_date"]).reset_index(drop=True)

for col in stat_columns:
    df[f"rolling_{col}"] = _grouped_ema(df, "team", col)
    df[f"rolling_{col}_conceded"] = _grouped_ema(df, "team", f"opponent_{col}")

# Compute EMAs for opponent stats.
# On rows grouped by opponent_team, the opponent_* columns hold that club's own
# numbers and the unprefixed columns hold what it conceded.
df = df.sort_values(by=["opponent_team", "match_date"]).reset_index(drop=True)

for col in stat_columns:
    df[f"opponent_rolling_{col}"] = _grouped_ema(df, "opponent_team", f"opponent_{col}")
    df[f"opponent_rolling_{col}_conceded"] = _grouped_ema(df, "opponent_team", col)

In [21]:
# Evaluate MAE

### Model Training

In [22]:
# Keep only Premier League rows for modelling.
df = df[df["division"] == "Premier League"]

# Stats whose rolling averages are used as model inputs.
_model_stats = ["goals", "shots", "xG", "shots_on_target", "corners", "deep"]


def _side_features(prefix):
    """Feature names for one side: market value, rolling stats, rolling stats conceded."""
    return (
        [f"{prefix}avg_market_value"]
        + [f"{prefix}rolling_{stat}" for stat in _model_stats]
        + [f"{prefix}rolling_{stat}_conceded" for stat in _model_stats]
    )


# Home flag, then the team's own features, then the opponent's.
feature_cols = ["home?"] + _side_features("") + _side_features("opponent_")

X = df[feature_cols]
y = df["goals"]

# Random 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

In [23]:
# Baseline: XGBoost regressor with only the objective and the seed set explicitly.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=26)
xgb_model.fit(X_train, y_train)

# Predictions on both splits.
y_train_pred, y_test_pred = (xgb_model.predict(features) for features in (X_train, X_test))

# Error metrics on the training split.
train_mae_xgb = mean_absolute_error(y_train, y_train_pred)
train_mse_xgb = mean_squared_error(y_train, y_train_pred)

# Error metrics on the held-out split.
test_mae_xgb = mean_absolute_error(y_test, y_test_pred)
test_mse_xgb = mean_squared_error(y_test, y_test_pred)

print(f"Train MAE: {train_mae_xgb} | Train MSE: {train_mse_xgb}")
print(f"Test MAE: {test_mae_xgb} | Test MSE: {test_mse_xgb}")

Train MAE: 0.2964473050532752 | Train MSE: 0.15403031486654234
Test MAE: 1.0428714028877122 | Test MSE: 1.7236436306721417


In [24]:
# Hyperparameter grid: 2*2*2*2*2*3*3 = 288 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'alpha': [0, 0.1, 1],
    'lambda': [1, 1.5, 2],
}

# Exhaustive search with 5-fold cross-validation, scored on (negated) MSE.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The scorer is negated MSE, so flip the sign when reporting.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation MSE: {-grid_search.best_score_}")
best_model = grid_search.best_estimator_

# Training-set performance of the best estimator.
y_train_pred_xgb = best_model.predict(X_train)
train_mse_xgb = mean_squared_error(y_train, y_train_pred_xgb)
train_r2_xgb = r2_score(y_train, y_train_pred_xgb)
print(f"train MSE: {train_mse_xgb} | train R²: {train_r2_xgb}")

# Test-set performance of the best estimator.
y_test_pred_cv = best_model.predict(X_test)
test_mse_xgb = mean_squared_error(y_test, y_test_pred_cv)
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_cv)
print(f"Test mae: {test_mae_xgb} | Test mse: {test_mse_xgb}")

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best parameters: {'alpha': 0.1, 'colsample_bytree': 0.8, 'lambda': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best cross-validation MSE: 1.3770519018713951
train MSE: 1.1691961471750052 | train R²: 0.2737995694368809
Test mae: 0.9519687208068836 | Test mse: 1.4537571420441406


In [25]:
#TODO: Feature Validation
    #TODO: Apply Championship penalties
    #TODO: Validate wage
    #TODO: Work out how to add time decay

#TODO: Model Evaluation
    #TODO: Evaluate Feature Importance

#TODO: Model Predictions
    #TODO: Establish a method to predict goals for fixtures (finding most recent row for team?)

#TODO: Team Strength Modelling
    #TODO: Create a team strength metric based on team performance versus average team?