In [1]:
import pandas as pd
import requests
import pandas as pd
import json
import os
import numpy as np
from scipy.stats import poisson
from scipy.stats import norm


API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}
params = {'division':'Premier League'}
response = requests.get(url, headers=headers, params=params)
json_str = response.content.decode('utf-8')
df = pd.read_json(json_str)

df['match_date'] = pd.to_datetime(df['match_date'])
df = df[df["match_date"] > '2023-06-30']

print(df[["match_date", "home_team", "away_team", "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())

     match_date    home_team  away_team  home_goals  home_xgoals  away_goals  \
3815 2024-08-24       Fulham  Leicester           2      1.96218           1   
3816 2024-08-24     Man City    Ipswich           4      3.07818           1   
3817 2024-08-25  Bournemouth  Newcastle           1      2.53513           1   
3818 2024-08-25       Wolves    Chelsea           2      2.41810           6   
3819 2024-08-25    Liverpool  Brentford           2      2.71877           0   

      away_xgoals  
3815     0.857005  
3816     0.479999  
3817     1.892870  
3818     2.029350  
3819     0.457239  


  df = pd.read_json(json_str)


In [2]:
def decay(xi, t):
    return np.exp(-xi * t)

In [3]:
def rho_correction(goals_home, goals_away, home_exp, away_exp, rho):
    if goals_home == 0 and goals_away == 0:
        return 1 - (home_exp * away_exp * rho)
    elif goals_home == 0 and goals_away == 1:
        return 1 + (home_exp * rho)
    elif goals_home == 1 and goals_away == 0:
        return 1 + (away_exp * rho)
    elif goals_home == 1 and goals_away == 1:
        return 1 - rho
    else:
        return 1.0

In [4]:
def log_likelihood(
    xG_home_observed,
    xG_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    rho,
    weight,
    sigma=1.0
):
    xG_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    xG_expectation_away = np.exp(away_attack + home_defence)

    home_llk = norm.pdf(xG_home_observed, loc=xG_expectation_home, scale=sigma)
    away_llk = norm.pdf(xG_away_observed, loc=xG_expectation_away, scale=sigma)

    adj_llk = rho_correction(
        xG_home_observed,
        xG_away_observed,
        xG_expectation_home,
        xG_expectation_away,
        rho,
    )

    if xG_expectation_home < 0 or xG_expectation_away < 0 or adj_llk < 0:
        return 10000

    log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))

    return -log_llk

In [15]:
from pprint import pprint
import numpy as np
from scipy.optimize import minimize
from scipy.stats import poisson

def fit_xG_model(df, xi=0.0001, sigma=1.0):
    teams = np.sort(np.unique(np.concatenate([df["home_team"], df["away_team"]])))
    n_teams = len(teams)
    
    df["days_since"] = (df["match_date"].max() - df["match_date"]).dt.days
    df["weight"] = decay(xi, df["days_since"])

    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
            [-0.1],  # rho
        )
    )

    def _fit(params, df, teams, sigma):
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-2]
        rho = params[-1]

        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood(
                row["home_xgoals"],
                row["away_xgoals"],
                attack_params[row["home_team"]],
                defence_params[row["home_team"]],
                attack_params[row["away_team"]],
                defence_params[row["away_team"]],
                home_advantage,
                rho,
                row["weight"],
                sigma=sigma
            )
            llk.append(tmp)

        return np.sum(llk)

    options = {
        "maxiter": 100,
        "disp": False,
    }

    constraints = [{"type": "eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

    res = minimize(
        _fit,
        params,
        args=(df, teams, sigma),
        constraints=constraints,
        options=options,
    )

    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv", "rho"],
            res["x"],
        )
    )

    print("Log Likelihood: ", res["fun"])

    return model_params


In [16]:
model_params = fit_xG_model(df, xi=0.001, sigma=1.0)
pprint(model_params)

  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))
  log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))

Log Likelihood:  794.8145282773155
{'attack_Arsenal': 1.332100960449865,
 'attack_Aston Villa': 1.1668556003172978,
 'attack_Bournemouth': 1.1243189705145138,
 'attack_Brentford': 1.0863256258189316,
 'attack_Brighton': 1.0888746480393117,
 'attack_Burnley': 0.7449391112598716,
 'attack_Chelsea': 1.333008859806275,
 'attack_Crystal Palace': 0.9382245498796479,
 'attack_Everton': 1.0323535016278804,
 'attack_Fulham': 0.9056912132408192,
 'attack_Ipswich': -0.018384141602792677,
 'attack_Leicester': 0.47777280607558115,
 'attack_Liverpool': 1.5294983615919278,
 'attack_Luton': 0.8817160264228198,
 'attack_Man City': 1.4056932112759999,
 'attack_Man United': 1.0898657696393659,
 'attack_Newcastle': 1.4098644808156455,
 "attack_Nott'm Forest": 0.9139219417014396,
 'attack_Sheffield United': 0.7535024593071522,
 'attack_Southampton': 0.7063724154743659,
 'attack_Tottenham': 1.2312441870601027,
 'attack_West Ham': 0.9918617672033726,
 'attack_Wolves': 0.8743776740806023,
 'defence_Arsenal': 

In [17]:
def predict(params, home_team, away_team):
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_adv"]
    rho = params["rho"]

    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    home_probs = poisson.pmf(range(10), home_goal_expectation)
    away_probs = poisson.pmf(range(10), away_goal_expectation)

    m = np.outer(home_probs, away_probs)

    m[0, 0] *= 1 - home_goal_expectation * away_goal_expectation * rho
    m[0, 1] *= 1 + home_goal_expectation * rho
    m[1, 0] *= 1 + away_goal_expectation * rho
    m[1, 1] *= 1 - rho    

    home = np.sum(np.tril(m, -1))
    draw = np.sum(np.diag(m))
    away = np.sum(np.triu(m, 1))

    # Calculate the probability of a clean sheet for the home team (away team scores 0)
    home_clean_sheet_prob = m[:, 0].sum() * 100

    # Calculate the probability of a clean sheet for the away team (home team scores 0)
    away_clean_sheet_prob = m[0, :].sum() * 100

    return home, draw, away, home_clean_sheet_prob, away_clean_sheet_prob, home_goal_expectation, away_goal_expectation

In [20]:
predict(model_params, "Tottenham", "Arsenal")

(0.20102693633537044,
 0.24215894458540216,
 0.5567819581590949,
 14.793401614037135,
 33.92127722872275,
 1.0810957860676678,
 1.9109887156418695)