In [27]:
import requests
import json
import os
import numpy as np
import pandas as pd

API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}
params = {'division':'Premier League'}
response = requests.get(url, headers=headers, params=params)
json_str_prem = response.content.decode('utf-8')
prem_df= pd.read_json(json_str_prem)

  prem_df= pd.read_json(json_str_prem)


In [28]:
params = {'division':'Championship'}
response = requests.get(url, headers=headers, params=params)
json_str_champ = response.content.decode('utf-8')
champ_df= pd.read_json(json_str_champ)

  champ_df= pd.read_json(json_str_champ)


In [48]:
from datetime import datetime

df = pd.concat([prem_df, champ_df])

# Separates out the home vs away and allows every team to have an individual row of data. Allows us to feature engineer home effect easier.
home_df = df[["division", "season", "match_date",
             "home_team", "home_avg_market_value", "home_goals", "home_xgoals",
             "away_team", "away_avg_market_value", "away_goals", "away_xgoals"]].copy()
home_df["home?"] = 1

home_df = home_df.rename(columns={"home_team":"team", "home_avg_market_value":"avg_market_value", "home_goals":"goals", "home_xgoals":"xG",
             "away_team":"opponent_team", "away_avg_market_value":"opponent_avg_market_value", "away_goals":"opponent_goals", "away_xgoals": "opponent_xG"})

away_df = df[["division", "season", "match_date",
                "away_team", "away_avg_market_value", "away_goals", "away_xgoals",
                "home_team", "home_avg_market_value", "home_goals", "home_xgoals"]].copy()
away_df["home?"] = 0

away_df = away_df.rename(columns={"away_team":"team", "away_avg_market_value":"avg_market_value", "away_goals":"goals", "away_xgoals":"xG",
             "home_team":"opponent_team", "home_avg_market_value":"opponent_avg_market_value", "home_goals":"opponent_goals", "home_xgoals": "opponent_xG"})

df = pd.concat([away_df, home_df])
df.sort_values(["match_date", "division"], inplace=True)

# Create a unique identifier for each game, so even though matches are now duplicated, they have the same id.
df["match_id"] = df.apply(
    lambda row: "-".join(sorted([row["division"], str(row["season"]), str(row['match_date']), row['team'], row['opponent_team']])),
    axis=1
) # This applies a function to each row that creates a list out of the above columns, sorts and sorts them in alphabetical order.
  # It then joins the contents of the strings to create a unique ID like: 2014-2014-08-08-Ligue 1-Paris SG-Reims

df["match_id"] = df["match_id"].apply(hash) # This then hashes the string ID into a numeric value.

df["match_date"] = pd.to_datetime(df["match_date"])
df["days_since"] = (df['match_date'].max() - df['match_date']).dt.days

In [49]:
#TODO:  Placeholder to transform target variable to log scaled.

In [72]:
df.loc[df['division'] == 'Championship', ['xG', 'goals']] *= 0.6751
df.loc[df['division'] == 'Championship', ['opponent_xG', 'opponent_goals']] *= 1.4648

  df.loc[df['division'] == 'Championship', ['opponent_xG', 'opponent_goals']] *= 1.4648


In [100]:
def penalized_ema(group_df, column_name, span=38, decay_rate=0.01):
    # Calculate decay factor: higher decay_rate means older matches get lower weight
    #decay_factor = np.exp(-decay_rate * group_df['days_since'])
    
    # Calculate the weighted EMA using the decay factor
    ema_values = group_df[column_name].astype(float).ewm(span=span, adjust=False, min_periods=1).mean()
    
    # Apply the decay factor to the EMA
    #penalized_ema_values = ema_values * decay_factor
    
    return ema_values


In [101]:
# Compute EMAs for team stats
df.sort_values(by=["team", "match_date"], inplace=True)
stats_columns = ["goals", "xG"]

for col in stats_columns:
    df[f"rolling_{col}"] = df.groupby("team", group_keys=False).apply(lambda x: penalized_ema(x, col))
    df[f"rolling_{col}_conceded"] = df.groupby("team", group_keys=False).apply(lambda x: penalized_ema(x, f'opponent_{col}'))


# Compute EMAs for opponent stats
df.sort_values(by=["opponent_team", "match_date"], inplace=True)

for col in stats_columns:
    # Here, we should use the original columns, not opponent columns
    df[f"opponent_rolling_{col}"] = df.groupby("opponent_team", group_keys=False).apply(lambda x: penalized_ema(x, col))
    df[f"opponent_rolling_{col}_conceded"] = df.groupby("opponent_team", group_keys=False).apply(lambda x: penalized_ema(x, f'opponent_{col}'))



df.sort_values(by=["team", "match_date"], inplace=True)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
print(df[df["team"] == "Man City"].tail(25))

             division    season match_date      team  avg_market_value  goals  \
7652   Premier League  20232024 2023-12-10  Man City          52500000 2.0000   
4649   Premier League  20232024 2023-12-16  Man City          52500000 2.0000   
5266   Premier League  20232024 2023-12-27  Man City          52500000 3.0000   
11646  Premier League  20232024 2023-12-30  Man City          52500000 2.0000   
9451   Premier League  20232024 2024-01-13  Man City          53750000 3.0000   
3294   Premier League  20232024 2024-01-31  Man City          51600000 3.0000   
2349   Premier League  20232024 2024-02-05  Man City          57272727 3.0000   
5271   Premier League  20232024 2024-02-10  Man City          57272727 2.0000   
4089   Premier League  20232024 2024-02-17  Man City          57272727 1.0000   
2352   Premier League  20232024 2024-02-20  Man City          57272727 1.0000   
2053   Premier League  20232024 2024-02-24  Man City          57272727 1.0000   
8427   Premier League  20232

  df[f"rolling_{col}"] = df.groupby("team", group_keys=False).apply(lambda x: penalized_ema(x, col))
  df[f"rolling_{col}_conceded"] = df.groupby("team", group_keys=False).apply(lambda x: penalized_ema(x, f'opponent_{col}'))
  df[f"rolling_{col}"] = df.groupby("team", group_keys=False).apply(lambda x: penalized_ema(x, col))
  df[f"rolling_{col}_conceded"] = df.groupby("team", group_keys=False).apply(lambda x: penalized_ema(x, f'opponent_{col}'))
  df[f"opponent_rolling_{col}"] = df.groupby("opponent_team", group_keys=False).apply(lambda x: penalized_ema(x, col))
  df[f"opponent_rolling_{col}_conceded"] = df.groupby("opponent_team", group_keys=False).apply(lambda x: penalized_ema(x, f'opponent_{col}'))
  df[f"opponent_rolling_{col}"] = df.groupby("opponent_team", group_keys=False).apply(lambda x: penalized_ema(x, col))
  df[f"opponent_rolling_{col}_conceded"] = df.groupby("opponent_team", group_keys=False).apply(lambda x: penalized_ema(x, f'opponent_{col}'))


In [102]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

df["goals_log"] = np.log1p(df["goals"]) #log1p to deal with 0s.

# Begin with just predicting xG and not xGC
X = df[["home?","rolling_goals", "rolling_xG", "opponent_rolling_xG_conceded", "opponent_rolling_goals_conceded",
         "avg_market_value", "opponent_avg_market_value"]]
y_log = df["goals"]

X_train, X_test, y_log_train, y_log_test = train_test_split(X, y_log, test_size=0.2, random_state=42)



# Define XGBoost model and use MSE as loss function
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Train model
xgb_model.fit(X_train, y_log_train)

# Predicting values on the trained model with the train and test datasets.
y_log_train_pred_xgb = xgb_model.predict(X_train)
y_log_test_pred_xgb = xgb_model.predict(X_test)


k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
mse_scorer = make_scorer(mean_squared_error)
mae_scorer = make_scorer(mean_absolute_error)
r2_scorer = make_scorer(r2_score)

# Compute MSE and R² for the training set
train_mse_xgb = mean_squared_error(y_log_train, y_log_train_pred_xgb)
train_r2_xgb = r2_score(y_log_train, y_log_train_pred_xgb)
print(f"Training MSE: {train_mse_xgb} | Training R²: {train_r2_xgb}")

# Compute MSE and R² for the test set
test_mse_xgb = mean_squared_error(y_log_test, y_log_test_pred_xgb)
test_r2_xgb = r2_score(y_log_test, y_log_test_pred_xgb)
print(f"Test MSE: {test_mse_xgb} | Test R²: {test_r2_xgb}")

# Perform k-fold cross-validation for MSE
mse_scores_xgb = cross_val_score(xgb_model, X, y_log, scoring=mse_scorer, cv=kf)
print(f'MSE scores for each fold: {mse_scores_xgb} | Average MSE: {np.mean(mse_scores_xgb)}')

# Perform k-fold cross-validation for MAE
mae_scores_xgb = cross_val_score(xgb_model, X, y_log, scoring=mae_scorer, cv=kf)
print(f'MAE scores for each fold: {mae_scores_xgb} | Average MSE: {np.mean(mae_scores_xgb)}')

# Perform k-fold cross-validation for R²
r2_scores_xgb = cross_val_score(xgb_model, X, y_log, scoring=r2_scorer, cv=kf)
print(f'R² scores for each fold: {r2_scores_xgb} | Average R²: {np.mean(r2_scores_xgb)}')

Training MSE: 0.30746954489952844 | Training R²: 0.7237145787290397
Test MSE: 0.813832195136507 | Test R²: 0.24502758900714305
MSE scores for each fold: [0.83173106 0.89623637 0.86176207 0.87791574 0.79375237] | Average MSE: 0.85227952281503
MAE scores for each fold: [0.65334566 0.68557824 0.67998186 0.67694689 0.6510585 ] | Average MSE: 0.6693822309766647
R² scores for each fold: [0.22842325 0.22531397 0.22001445 0.2225487  0.25039851] | Average R²: 0.2293397756807341


In [103]:
pred_goals = xgb_model.predict(X)
pred_goals_series = pd.Series(pred_goals, index=df.index)
df["pred_goals"] = pred_goals_series

home_df = df[df["home?"] == 1].rename(columns={'team': 'home_team', 'pred_goals': 'home_pred_goals', "xG": "home_xG", "goals":"home_goals"})
away_df = df[df['home?'] == 0].rename(columns={'team': 'away_team', 'pred_goals': 'away_pred_goals', "xG": "away_xG", "goals":"away_goals"})

merged_df = pd.merge(home_df[['match_id', "division", 'match_date', 'home_team', 'home_pred_goals', "home_xG", "home_goals"]],
                     away_df[['match_id', 'away_team', 'away_pred_goals', "away_xG", "away_goals"]],
                     on='match_id')

eval_df = merged_df[["match_date", "division", "home_team", "home_xG", "home_pred_goals", "home_goals", "away_team", "away_xG", "away_goals", "away_pred_goals"]]

print(eval_df[eval_df["division"] == "Premier League"].tail(25))

     match_date        division home_team  home_xG  home_pred_goals  \
7664 2023-04-08  Premier League    Wolves   0.6000           0.3333   
7665 2023-04-15  Premier League    Wolves   0.9400           1.4169   
7666 2023-04-25  Premier League    Wolves   1.1300           1.5061   
7667 2023-05-06  Premier League    Wolves   0.8100           0.7291   
7668 2023-05-20  Premier League    Wolves   1.2900           0.9957   
7669 2023-08-19  Premier League    Wolves   2.6882           1.1781   
7670 2023-09-16  Premier League    Wolves   0.5388           0.7373   
7671 2023-09-30  Premier League    Wolves   0.6740           0.4118   
7672 2023-10-08  Premier League    Wolves   1.7623           0.8278   
7673 2023-10-28  Premier League    Wolves   0.8634           1.5687   
7674 2023-11-11  Premier League    Wolves   1.9088           1.4899   
7675 2023-12-05  Premier League    Wolves   1.1189           1.2924   
7676 2023-12-09  Premier League    Wolves   1.0220           1.5520   
7677 2