In [1]:
import pandas as pd
import requests
import json
import os
import numpy as np
from scipy.stats import poisson
from scipy.stats import norm
from pprint import pprint
import numpy as np
from scipy.optimize import minimize
from scipy.stats import poisson
import warnings
import io


# Suppress divide by zero warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, message="divide by zero encountered in log")


API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}
params = {'division':'Premier League'}
response = requests.get(url, headers=headers, params=params)
json_str = response.content.decode('utf-8')
prem_df = pd.read_json(io.StringIO(json_str))
prem_teams_25 = prem_df[prem_df["season"] == 20242025]
prem_teams_25 = pd.concat([prem_teams_25['home_team'], prem_teams_25['away_team']]).unique()

params = {'division':'Championship'}
response = requests.get(url, headers=headers, params=params)
json_str = response.content.decode('utf-8')
champ_df = pd.read_json(io.StringIO(json_str))

df = pd.concat([champ_df, prem_df])
#df = df[(df['home_team'].isin(prem_teams_25)) | (df['away_team'].isin(prem_teams_25))]

df['match_date'] = pd.to_datetime(df['match_date'])
df = df[df["match_date"] > '2023-06-01']

print(df[["season", "match_date", "home_team", "away_team", "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())

        season match_date    home_team  away_team  home_goals  home_xgoals  \
3835  20242025 2024-09-14       Fulham   West Ham           1     2.886010   
3836  20242025 2024-09-14  Aston Villa    Everton           3     3.158760   
3837  20242025 2024-09-14     Brighton    Ipswich           0     1.756960   
3838  20242025 2024-09-15    Tottenham    Arsenal           0     0.792595   
3839  20242025 2024-09-15       Wolves  Newcastle           1     1.469690   

      away_goals  away_xgoals  
3835           1     0.682719  
3836           2     0.852158  
3837           0     0.721323  
3838           1     1.120900  
3839           2     1.483430  


In [2]:
# Apply averaged penalty when Championship match contains two promoted teams.

# Penalty xG = 0.665
# Penalty xGA = 1.465

# mask_both_prem = (df['division'] == 'Championship') & (df['home_team'].isin(prem_teams_25)) & (df['away_team'].isin(prem_teams_25))
# # #df.loc[mask_both_prem, 'home_goals'] *= 0.661
# # #df.loc[mask_both_prem, 'away_goals'] *= 0.661
# df.loc[mask_both_prem, 'home_xgoals'] *= 0.665
# df.loc[mask_both_prem, 'away_xgoals'] *= 0.655

# # Adjust performance penalty to championship games when one team is a promoted team.
# mask_home = (df['division'] == 'Championship') & (df['home_team'].isin(prem_teams_25)) & ~(df['away_team'].isin(prem_teams_25))
# # #df.loc[mask_home, 'home_goals'] *= 0.661
# # #df.loc[mask_home, 'away_goals'] *= 2.060
# df.loc[mask_home, 'home_xgoals'] *= 0.665
# df.loc[mask_home, 'away_xgoals'] *= 1.465

# mask_away = (df['division'] == 'Championship') & (df['away_team'].isin(prem_teams_25)) & ~(df['home_team'].isin(prem_teams_25))
# # #df.loc[mask_away, 'home_goals'] *= 2.060
# # #df.loc[mask_away, 'away_goals'] *= 0.661
# df.loc[mask_away, 'home_xgoals'] *= 1.465
# df.loc[mask_away, 'away_xgoals'] *= 0.665


# df[['home_goals', 'away_goals']] = df[['home_goals', 'away_goals']].round()


In [3]:
def decay(xi, t):
    return np.exp(-xi * t)

def rho_correction(goals_home, goals_away, home_exp, away_exp, rho):
    if goals_home == 0 and goals_away == 0:
        return 1 - (home_exp * away_exp * rho)
    elif goals_home == 0 and goals_away == 1:
        return 1 + (home_exp * rho)
    elif goals_home == 1 and goals_away == 0:
        return 1 + (away_exp * rho)
    elif goals_home == 1 and goals_away == 1:
        return 1 - rho
    else:
        return 1.0

In [4]:
def log_likelihood_xg(
    xG_home_observed,
    xG_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    rho,
    weight,
    sigma=1.0
):
    xG_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    xG_expectation_away = np.exp(away_attack + home_defence)

    home_llk = norm.pdf(xG_home_observed, loc=xG_expectation_home, scale=sigma)
    away_llk = norm.pdf(xG_away_observed, loc=xG_expectation_away, scale=sigma)

    adj_llk = rho_correction(
        xG_home_observed,
        xG_away_observed,
        xG_expectation_home,
        xG_expectation_away,
        rho,
    )

    if xG_expectation_home < 0 or xG_expectation_away < 0 or adj_llk < 0:
        return 10000

    log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))

    return -log_llk


def fit_xG_model(df, xi=0.0001, sigma=1.0):
    teams = np.sort(np.unique(np.concatenate([df["home_team"], df["away_team"]])))
    n_teams = len(teams)
    
    df["league_strength"] = df["division"].apply(lambda x: 1 if x == "Premier League" else 0.615) 
    df["days_since"] = (df["match_date"].max() - df["match_date"]).dt.days
    df["weight"] = decay(xi, df["days_since"]) * df["league_strength"]

    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
            [-0.1],  # rho
        )
    )

    bounds = [(None, None)] * (2 * n_teams) + [(None, None), (-0.5, 0.5)]

    def _fit(params, df, teams, sigma):
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-2]
        rho = params[-1]

        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood_xg(
                row["home_xgoals"],
                row["away_xgoals"],
                attack_params[row["home_team"]],
                defence_params[row["home_team"]],
                attack_params[row["away_team"]],
                defence_params[row["away_team"]],
                home_advantage,
                rho,
                row["weight"],
                sigma=sigma
            )
            llk.append(tmp)

        return np.sum(llk)

    options = {
        "maxiter": 100,
        "disp": False,
    }

    constraints = [{"type": "eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

    res = minimize(
        _fit,
        params,
        args=(df, teams, sigma),
        constraints=constraints,
        bounds=bounds,
        method='L-BFGS-B',
        options=options,
    )

    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv", "rho"],
            res["x"],
        )
    )

    # Prepare table data
    attack_values = [model_params[f'attack_{team}'] for team in teams]
    defence_values = [model_params[f'defence_{team}'] for team in teams]
    home_adv = model_params['home_adv']
    rho = model_params['rho']

    # Create DataFrame for attack and defense
    team_strength_df = pd.DataFrame({
        'Team': teams,
        'Attack Strength': attack_values,
        'Defense Strength': defence_values
    })

    # Add home advantage and rho to the DataFrame
    additional_params_df = pd.DataFrame({
        'Parameter': ['Home Advantage', 'Rho'],
        'Value': [home_adv, rho]
    })

    # Print DataFrames
    print("Team Strength (Attack and Defense):")
    print(team_strength_df)
    print("\nAdditional Parameters (Home Advantage and Rho):")
    print(additional_params_df)


    print("Log Likelihood: ", res["fun"])

    return model_params


xg_model_params = fit_xG_model(df, xi=0.001, sigma=1.0)

  res = minimize(


Team Strength (Attack and Defense):
                Team  Attack Strength  Defense Strength
0            Arsenal         1.401767         -1.464089
1        Aston Villa         1.262587         -0.772069
2         Birmingham         0.338008         -0.362095
3          Blackburn         0.528731         -0.325543
4        Bournemouth         1.211068         -0.788503
5          Brentford         1.193518         -0.886905
6           Brighton         1.155393         -0.883415
7       Bristol City         0.416008         -0.440810
8            Burnley         0.717887         -0.643206
9            Cardiff         0.160793         -0.277864
10           Chelsea         1.403176         -0.823802
11          Coventry         0.552603         -0.476068
12    Crystal Palace         1.024753         -0.850290
13             Derby         0.370922         -0.296788
14           Everton         1.114666         -0.788711
15            Fulham         0.993809         -0.780896
16      Hudd

In [5]:
def log_likelihood_goals(
    goals_home_observed,
    goals_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    rho,
    weight
):
    goal_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    goal_expectation_away = np.exp(away_attack + home_defence)

    home_llk = poisson.pmf(goals_home_observed, goal_expectation_home)
    away_llk = poisson.pmf(goals_away_observed, goal_expectation_away)
    adj_llk = rho_correction(
        goals_home_observed,
        goals_away_observed,
        goal_expectation_home,
        goal_expectation_away,
        rho,
    )

    if goal_expectation_home < 0 or goal_expectation_away < 0 or adj_llk < 0:
        return 10000

    log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))

    return -log_llk


def fit_poisson_model(df, xi=0.0001):
    teams = np.sort(np.unique(np.concatenate([df["home_team"], df["away_team"]])))
    n_teams = len(teams)
    
    df["league_strength"] = df["division"].apply(lambda x: 1 if x == "Premier League" else 0.615) 
    df["days_since"] = (df["match_date"].max() - df["match_date"]).dt.days
    df["weight"] = decay(xi, df["days_since"]) * df["league_strength"]

    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
            [-0.1], # rho
        )
    )

    bounds = [(None, None)] * (2 * n_teams) + [(None, None), (-1, 1)]

    def _fit(params, df, teams):
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-2]
        rho = params[-1]

        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood_goals(
                row["home_goals"],
                row["away_goals"],
                attack_params[row["home_team"]],
                defence_params[row["home_team"]],
                attack_params[row["away_team"]],
                defence_params[row["away_team"]],
                home_advantage,
                rho,
                row["weight"]
            )
            llk.append(tmp)

        return np.sum(llk)

    options = {
        "maxiter": 100,
        "disp": False,
    }

    constraints = [{"type": "eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

    res = minimize(
        _fit,
        params,
        args=(df, teams),
        constraints=constraints,
        method='L-BFGS-B',
        options=options,
        bounds=bounds
    )

    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv", "rho"],
            res["x"],
        )
    )

    # Prepare table data
    attack_values = [model_params[f'attack_{team}'] for team in teams]
    defence_values = [model_params[f'defence_{team}'] for team in teams]
    home_adv = model_params['home_adv']
    rho = model_params['rho']

    # Create DataFrame for attack and defense
    team_strength_df = pd.DataFrame({
        'Team': teams,
        'Attack Strength': attack_values,
        'Defense Strength': defence_values
    })

    # Add home advantage and rho to the DataFrame
    additional_params_df = pd.DataFrame({
        'Parameter': ['Home Advantage', 'Rho'],
        'Value': [home_adv, rho]
    })

    # Print DataFrames
    print("Team Strength (Attack and Defense):")
    print(team_strength_df)
    print("\nAdditional Parameters (Home Advantage and Rho):")
    print(additional_params_df)

    print("Log Likelihood: ", res["fun"])

    return model_params

goals_model_params = fit_poisson_model(df, xi=0.001)

  res = minimize(


Team Strength (Attack and Defense):
                Team  Attack Strength  Defense Strength
0            Arsenal         1.581964         -1.703281
1        Aston Villa         1.458131         -0.850375
2         Birmingham         0.368578         -0.333619
3          Blackburn         0.650443         -0.242608
4        Bournemouth         1.118181         -0.825292
5          Brentford         1.168994         -0.843941
6           Brighton         1.116110         -0.946210
7       Bristol City         0.441756         -0.461458
8            Burnley         0.910546         -0.676623
9            Cardiff         0.337956         -0.204067
10           Chelsea         1.507195         -0.853467
11          Coventry         0.649912         -0.430435
12    Crystal Palace         1.156436         -0.915987
13             Derby         0.698318         -0.464330
14           Everton         0.816024         -0.927179
15            Fulham         1.094570         -0.908564
16      Hudd

In [6]:
def predict(params, home_team, away_team):
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_adv"]
    #rho = params["rho"] #TODO: Debug RHO on XG model
    rho = -0.13

    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    home_probs = poisson.pmf(range(10), home_goal_expectation)
    away_probs = poisson.pmf(range(10), away_goal_expectation)

    m = np.outer(home_probs, away_probs)

    m[0, 0] *= 1 - home_goal_expectation * away_goal_expectation * rho
    m[0, 1] *= 1 + home_goal_expectation * rho
    m[1, 0] *= 1 + away_goal_expectation * rho
    m[1, 1] *= 1 - rho    

    home = np.sum(np.tril(m, -1)) 
    draw = np.sum(np.diag(m)) 
    away = np.sum(np.triu(m, 1))

    total_prob = home + draw + away
    home /= total_prob
    draw /= total_prob
    away /= total_prob 

    # Calculate the probability of a clean sheet for the home team (away team scores 0)
    home_clean_sheet_prob = m[:, 0].sum() 

    # Calculate the probability of a clean sheet for the away team (home team scores 0)
    away_clean_sheet_prob = m[0, :].sum() 

    # Calculate the probability of the home team scoring 3 or more goals
    home_3_plus_goals_prob = home_probs[3:].sum() 

    # Calculate the probability of the away team scoring 3 or more goals
    away_3_plus_goals_prob = away_probs[3:].sum() 


    return {
        "home_win_prob": home.round(2),
        "draw_prob": draw.round(2),
        "away_win_prob": away.round(2),
        "home_clean_sheet_prob": home_clean_sheet_prob.round(2),
        "away_clean_sheet_prob": away_clean_sheet_prob.round(2),
        "home_goal_expectation": home_goal_expectation.round(2),
        "away_goal_expectation": away_goal_expectation.round(2),
        "home_3_plus_goals_prob": home_3_plus_goals_prob.round(2),
        "away_3_plus_goals_prob": away_3_plus_goals_prob.round(2)
    }

In [7]:
# Initialize lists to store predictions
xg_home_preds = []
xg_away_preds = []
goals_home_preds = []
goals_away_preds = []

# Iterate through the games in your dataset
for idx, row in df.iterrows():
    xg_pred = predict(xg_model_params, row['home_team'], row['away_team'])
    goals_pred = predict(goals_model_params, row['home_team'], row['away_team'])

    xg_home_preds.append(xg_pred['home_goal_expectation'])
    xg_away_preds.append(xg_pred['away_goal_expectation'])
    goals_home_preds.append(goals_pred['home_goal_expectation'])
    goals_away_preds.append(goals_pred['away_goal_expectation'])

# Add these predictions back to the DataFrame
df['xg_home_pred'] = xg_home_preds
df['xg_away_pred'] = xg_away_preds
df['goals_home_pred'] = goals_home_preds
df['goals_away_pred'] = goals_away_preds

# Calculate absolute errors for xG predictions
df['xg_home_error'] = abs(df['home_xgoals'] - df['xg_home_pred'])
df['xg_away_error'] = abs(df['away_xgoals'] - df['xg_away_pred'])

# Calculate absolute errors for goals predictions
df['goals_home_error'] = abs(df['home_goals'] - df['goals_home_pred'])
df['goals_away_error'] = abs(df['away_goals'] - df['goals_away_pred'])


# Calculate MAE for xG predictions
xg_mae_home = df['xg_home_error'].mean()
xg_mae_away = df['xg_away_error'].mean()

# Calculate MAE for goals predictions
goals_mae_home = df['goals_home_error'].mean()
goals_mae_away = df['goals_away_error'].mean()

# Combine MAEs for home and away
xg_mae_total = (xg_mae_home + xg_mae_away) / 2
goals_mae_total = (goals_mae_home + goals_mae_away) / 2

print(f"xG MAE (home): {xg_mae_home}")
print(f"xG MAE (away): {xg_mae_away}")
print(f"xG MAE (total): {xg_mae_total}")

print(f"Goals MAE (home): {goals_mae_home}")
print(f"Goals MAE (away): {goals_mae_away}")
print(f"Goals MAE (total): {goals_mae_total}")

xG MAE (home): 0.567614165851272
xG MAE (away): 0.5222580645792564
xG MAE (total): 0.5449361152152642
Goals MAE (home): 0.9492465753424658
Goals MAE (away): 0.8424755381604696
Goals MAE (total): 0.8958610567514678


In [9]:
fixtures = [
    ("West Ham", "Chelsea"),
    ("Aston Villa", "Wolves"),
    ("Fulham", "Newcastle"),
    ("Southampton", "Ipswich"),
    ("Tottenham", "Brentford"),
    ("Leicester", "Everton"),
    ("Liverpool", "Bournemouth"),
    ("Crystal Palace", "Man United"),
    ("Brighton", "Nott'm Forest"),
    ("Man City", "Arsenal")
]

results = []

for fixture in fixtures:
    home_team, away_team = fixture
    
    xg_preds = predict(xg_model_params, home_team, away_team)
    goals_preds = predict(goals_model_params, home_team, away_team)
    
    weighted_preds = {}
    
    for key in xg_preds.keys():
        weighted_value = (0.7 * xg_preds[key]) + (0.3 * goals_preds[key])
        weighted_preds[key] = weighted_value
    
    results.append({
        'home_team': home_team,
        'away_team': away_team,
        **weighted_preds
    })

results_df = pd.DataFrame(results)


# Display the results DataFrame
display(results_df)

Unnamed: 0,home_team,away_team,home_win_prob,draw_prob,away_win_prob,home_clean_sheet_prob,away_clean_sheet_prob,home_goal_expectation,away_goal_expectation,home_3_plus_goals_prob,away_3_plus_goals_prob
0,West Ham,Chelsea,0.272,0.22,0.508,0.103,0.191,1.666,2.276,0.239,0.394
1,Aston Villa,Wolves,0.63,0.207,0.16,0.307,0.09,2.396,1.193,0.43,0.12
2,Fulham,Newcastle,0.321,0.243,0.436,0.153,0.206,1.591,1.868,0.218,0.287
3,Southampton,Ipswich,0.434,0.267,0.299,0.264,0.193,1.638,1.34,0.227,0.156
4,Tottenham,Brentford,0.497,0.237,0.269,0.235,0.134,2.003,1.447,0.325,0.175
5,Leicester,Everton,0.334,0.285,0.384,0.245,0.262,1.342,1.456,0.151,0.181
6,Liverpool,Bournemouth,0.727,0.166,0.107,0.363,0.069,2.754,1.03,0.52,0.085
7,Crystal Palace,Man United,0.459,0.259,0.282,0.259,0.171,1.769,1.367,0.26,0.161
8,Brighton,Nott'm Forest,0.456,0.29,0.254,0.336,0.217,1.529,1.085,0.2,0.097
9,Man City,Arsenal,0.395,0.313,0.302,0.337,0.275,1.285,1.099,0.141,0.1


In [10]:
results_df.to_csv("predictions/ensmeble_gw4_preds.csv")