In [None]:
import pandas as pd
import requests
import json
import os
import io
from datetime import datetime
import numpy as np

# Data-service connection settings. The key is read from the environment so it
# never has to be written into the notebook; it is None when the variable is unset.
API_KEY = os.environ.get("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {
    "x-api-key": API_KEY,
}

# Function to fetch data for a specific division and season
def fetch_data(division, season, timeout=30):
    """Download the matches of one division/season from the data service.

    Parameters
    ----------
    division : str
        Sent as the ``division`` query parameter.
    season : str
        Sent as the ``season`` query parameter.
    timeout : float or tuple, optional
        Seconds to wait for the server (passed straight to ``requests.get``).
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    pandas.DataFrame
        The JSON payload parsed by ``pd.read_json`` when the service answers
        with HTTP 200. On any other status code, or when the request itself
        fails (connection error, timeout, ...), the problem is printed and an
        empty DataFrame is returned so callers can skip it with ``df.empty``.
    """
    params = {
        'division': division,
        'season': season
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Same soft-failure contract as a non-200 answer: report and carry on.
        print(f"Error fetching {division} {season}: {exc}")
        return pd.DataFrame()

    if response.status_code != 200:
        print(f"Error fetching {division} {season}: {response.status_code}")
        print(response.content.decode('utf-8'))
        return pd.DataFrame()

    return pd.read_json(io.StringIO(response.content.decode('utf-8')))

# Fetch data for every division/season combination, keeping only non-empty results
seasons = ['2023_2024', '2024_2025']
divisions = ['Premier League', 'Championship']
dataframes = []

for division in divisions:
    for season in seasons:
        # Separate name so the per-request frame never masquerades as the combined `df`
        season_df = fetch_data(division, season)
        if not season_df.empty:
            dataframes.append(season_df)

# Combine all dataframes
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)

    # Convert match_date to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])

    # Filter for dates after January 11, 2024.
    # .copy() makes the result an independent frame, so later cells can add
    # columns to it without pandas' SettingWithCopyWarning.
    df = df[df["match_date"] > '2024-01-11'].copy()

    # Get unique teams from 2024/25 Premier League.
    # The JSON parser can coerce a label such as '2024_2025' to the integer
    # 20242025 (that is what this notebook's output shows), in which case a
    # comparison against the string '2024_2025' never matches. Compare on an
    # underscore-free string key so both representations are recognised.
    season_key = df['season'].astype(str).str.replace('_', '', regex=False)
    prem_matches_25 = df[
        (season_key == '2024_2025'.replace('_', '')) &
        (df['division'] == 'Premier League')
    ]
    prem_teams_25 = pd.concat([
        prem_matches_25['home_team'],
        prem_matches_25['away_team']
    ]).unique()

    # Display the results
    print("\nSample of filtered matches:")
    print(df[["season", "division", "match_date", "home_team", "away_team", 
              "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())

    print("\nTotal matches:", len(df))
    print("\nUnique seasons:", df['season'].unique())
    print("Unique divisions:", df['division'].unique())
else:
    # Keep `df` defined (as an empty frame) exactly as before when nothing was fetched
    df = pd.DataFrame()
    print("No data was successfully fetched")



Sample of filtered matches:
        season      division match_date        home_team     away_team  \
1481  20242025  Championship 2025-01-21            Derby    Sunderland   
1482  20242025  Championship 2025-01-22    FC Portsmouth         Stoke   
1483  20242025  Championship 2025-01-22            Leeds       Norwich   
1484  20242025  Championship 2025-01-22   Sheffield Weds  Bristol City   
1485  20242025  Championship 2025-01-22  Plymouth Argyle       Burnley   

      home_goals  home_xgoals  away_goals  away_xgoals  
1481           0          0.7           1          0.6  
1482           3          2.0           1          0.7  
1483           2          1.1           0          0.1  
1484           2          0.9           2          0.9  
1485           0          0.4           5          2.7  

Total matches: 976

Unique seasons: [20232024 20242025]
Unique divisions: ['Premier League' 'Championship']
[REDACTED: value that looks like the x-api-key was printed into this cell's output — clear the output and rotate the key]


In [2]:
# Apply averaged penalty when Championship match contains two promoted teams.

# Penalty xG = 0.665
# Penalty xGA = 1.465

# mask_both_prem = (df['division'] == 'Championship') & (df['home_team'].isin(prem_teams_25)) & (df['away_team'].isin(prem_teams_25))
# # #df.loc[mask_both_prem, 'home_goals'] *= 0.661
# # #df.loc[mask_both_prem, 'away_goals'] *= 0.661
# df.loc[mask_both_prem, 'home_xgoals'] *= 0.665
# df.loc[mask_both_prem, 'away_xgoals'] *= 0.665

# # Adjust performance penalty to championship games when one team is a promoted team.
# mask_home = (df['division'] == 'Championship') & (df['home_team'].isin(prem_teams_25)) & ~(df['away_team'].isin(prem_teams_25))
# # #df.loc[mask_home, 'home_goals'] *= 0.661
# # #df.loc[mask_home, 'away_goals'] *= 2.060
# df.loc[mask_home, 'home_xgoals'] *= 0.665
# df.loc[mask_home, 'away_xgoals'] *= 1.465

# mask_away = (df['division'] == 'Championship') & (df['away_team'].isin(prem_teams_25)) & ~(df['home_team'].isin(prem_teams_25))
# # #df.loc[mask_away, 'home_goals'] *= 2.060
# # #df.loc[mask_away, 'away_goals'] *= 0.661
# df.loc[mask_away, 'home_xgoals'] *= 1.465
# df.loc[mask_away, 'away_xgoals'] *= 0.665


# df[['home_goals', 'away_goals']] = df[['home_goals', 'away_goals']].round()


In [3]:
def decay(xi, t):
    """Exponential time-decay weight ``exp(-xi * t)``.

    ``xi`` is the decay rate and ``t`` the elapsed time; ``t`` may be a scalar
    or anything NumPy can broadcast (array, pandas Series).
    """
    exponent = -xi * t
    return np.exp(exponent)

def rho_correction(goals_home, goals_away, home_exp, away_exp, rho):
    """Low-score dependence factor for a single scoreline (Dixon-Coles style).

    Only the four scorelines 0-0, 0-1, 1-0 and 1-1 are adjusted; every other
    scoreline gets a neutral factor of ``1.0``.

    Parameters
    ----------
    goals_home, goals_away : observed goals for the home / away side.
    home_exp, away_exp : expected goals for the home / away side.
    rho : dependence parameter.
    """
    scoreline = (goals_home, goals_away)

    if scoreline == (0, 0):
        return 1 - (home_exp * away_exp * rho)
    if scoreline == (0, 1):
        return 1 + (home_exp * rho)
    if scoreline == (1, 0):
        return 1 + (away_exp * rho)
    if scoreline == (1, 1):
        return 1 - rho
    return 1.0

In [4]:
from scipy.optimize import minimize
from scipy.stats import norm, poisson

def log_likelihood_xg(
    xG_home_observed,
    xG_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    weight,
    sigma=1.0
):
    """Weighted negative log-likelihood of one match's observed xG.

    The observed home/away xG are modelled as normally distributed around

    * home: ``exp(home_attack + away_defence + home_advantage)``
    * away: ``exp(away_attack + home_defence)``

    with standard deviation ``sigma``.

    Parameters
    ----------
    xG_home_observed, xG_away_observed : observed xG of the home / away side.
    home_attack, home_defence, away_attack, away_defence : team strengths
        (log scale; they are summed and exponentiated).
    home_advantage : additive home term (log scale).
    weight : multiplier applied to the match's log-likelihood.
    sigma : standard deviation of the normal distribution.

    Returns
    -------
    float
        ``-weight * (log N(home) + log N(away))``; smaller is a better fit.
    """
    xG_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    xG_expectation_away = np.exp(away_attack + home_defence)

    # np.exp() can never be negative, so no guard on the expectations is needed.
    # logpdf is evaluated directly in log space: log(pdf(...)) underflows to
    # log(0) = -inf (plus a RuntimeWarning) once the observation is far from
    # the expectation, which hands the optimiser an infinite objective.
    home_llk = norm.logpdf(xG_home_observed, loc=xG_expectation_home, scale=sigma)
    away_llk = norm.logpdf(xG_away_observed, loc=xG_expectation_away, scale=sigma)

    log_llk = weight * (home_llk + away_llk)

    return -log_llk


def fit_xG_model(df, xi=0.0001, sigma=1.0):
    """Fit per-team attack/defence strengths and a home advantage to match xG.

    Every match contributes ``log_likelihood_xg`` for its observed home/away
    xG, weighted by recency (``decay(xi, days before the latest match)``) and
    by division (``"Premier League"`` counts 1.0, any other division 0.615).
    The summed objective is minimised with L-BFGS-B (at most 100 iterations).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``home_team``, ``away_team``, ``division``,
        ``match_date`` (datetime), ``home_xgoals`` and ``away_xgoals``.
        The frame is only read, never modified.
    xi : float
        Time-decay rate handed to ``decay``.
    sigma : float
        Standard deviation handed to ``log_likelihood_xg``.

    Returns
    -------
    dict
        ``attack_<team>``, ``defence_<team>`` and ``home_adv`` mapped to the
        fitted values.

    Side effects
    ------------
    Prints the fitted tables and writes
    ``predictions/xg_team_strength_jan25.csv`` (the directory is created when
    missing).

    Notes
    -----
    * L-BFGS-B cannot handle ``constraints``: SciPy ignores them and emits a
      RuntimeWarning, so none are passed. Only the sums ``attack + defence``
      enter the likelihood, hence the strengths are identified up to a
      constant shift (add ``c`` to every attack, subtract it from every
      defence); expected goals are unaffected by that shift. To actually pin
      ``sum(attack) == n_teams`` a constraint-aware method such as SLSQP
      would be required.
    * Starting values come from the unseeded global NumPy RNG, so fitted
      values vary slightly between runs.
    """
    home_teams = df["home_team"].to_numpy()
    away_teams = df["away_team"].to_numpy()
    teams = np.sort(np.unique(np.concatenate([home_teams, away_teams])))
    n_teams = len(teams)

    # Per-match inputs are extracted once as plain arrays: the objective is
    # evaluated many times by the optimiser, and rebuilding them with
    # df.iterrows() on every evaluation dominates the run time.
    xg_home = df["home_xgoals"].to_numpy()
    xg_away = df["away_xgoals"].to_numpy()
    league_strength = np.where(df["division"].eq("Premier League").to_numpy(), 1.0, 0.615)
    days_since = (df["match_date"].max() - df["match_date"]).dt.days.to_numpy()
    weights = decay(xi, days_since) * league_strength

    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
        )
    )

    bounds = [(None, None)] * (2 * n_teams) + [(None, None)]

    def _fit(params):
        """Summed weighted negative log-likelihood over all matches."""
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-1]

        llk = [
            log_likelihood_xg(
                obs_home,
                obs_away,
                attack_params[home],
                defence_params[home],
                attack_params[away],
                defence_params[away],
                home_advantage,
                weight,
                sigma=sigma,
            )
            for home, away, obs_home, obs_away, weight in zip(
                home_teams, away_teams, xg_home, xg_away, weights
            )
        ]

        return np.sum(llk)

    options = {
        "maxiter": 100,
        "disp": False,
    }

    res = minimize(
        _fit,
        params,
        bounds=bounds,
        method='L-BFGS-B',
        options=options,
    )

    if not res.success:
        # e.g. the iteration cap was hit; the values below are then only approximate
        print("Warning: optimiser stopped before converging:", res.message)

    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv"],
            res["x"],
        )
    )

    # Prepare table data
    attack_values = [model_params[f'attack_{team}'] for team in teams]
    defence_values = [model_params[f'defence_{team}'] for team in teams]
    home_adv = model_params['home_adv']

    # Create DataFrame for attack and defense
    team_strength_df_xg = pd.DataFrame({
        'Team': teams,
        'Attack Strength': attack_values,
        'Defense Strength': defence_values
    })

    # Add home advantage to the DataFrame
    additional_params_df = pd.DataFrame({
        'Parameter': ['Home Advantage'],
        'Value': [home_adv]
    })

    # Print DataFrames
    print("Team Strength (Attack and Defense):")
    print(team_strength_df_xg)
    print("\nAdditional Parameters (Home Advantage):")
    print(additional_params_df)

    # res["fun"] is the minimised objective, i.e. the *negative* weighted log-likelihood
    print("Log Likelihood: ", res["fun"])

    # Don't lose a finished fit just because the output folder is missing
    os.makedirs("predictions", exist_ok=True)
    team_strength_df_xg.to_csv("predictions/xg_team_strength_jan25.csv")

    return model_params


# Fit the xG-based model on the filtered matches (decay rate 0.001, unit sigma)
xg_model_params = fit_xG_model(
    df,
    xi=0.001,
    sigma=1.0,
)

  res = minimize(


Team Strength (Attack and Defense):
                Team  Attack Strength  Defense Strength
0            Arsenal         1.513143         -1.430846
1        Aston Villa         1.268040         -0.828361
2         Birmingham         0.400007         -0.563297
3          Blackburn         0.517099         -0.556929
4        Bournemouth         1.371382         -0.943147
5          Brentford         1.308699         -0.732809
6           Brighton         1.111554         -0.899151
7       Bristol City         0.619512         -0.548758
8            Burnley         0.633390         -0.655731
9            Cardiff         0.353984         -0.300396
10           Chelsea         1.548715         -0.849641
11          Coventry         0.756583         -0.585467
12    Crystal Palace         1.192838         -0.897460
13             Derby         0.468784         -0.525127
14           Everton         0.984869         -0.824104
15     FC Portsmouth         0.457568         -0.201887
16          

In [5]:
def log_likelihood_goals(
    goals_home_observed,
    goals_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    rho,
    weight
):
    """Weighted negative log-likelihood of one match's observed scoreline.

    Home and away goals are modelled as Poisson counts with means

    * home: ``exp(home_attack + away_defence + home_advantage)``
    * away: ``exp(away_attack + home_defence)``

    multiplied by the low-score factor returned by ``rho_correction``.

    Parameters
    ----------
    goals_home_observed, goals_away_observed : goals scored by each side.
    home_attack, home_defence, away_attack, away_defence : team strengths
        (log scale; they are summed and exponentiated).
    home_advantage : additive home term (log scale).
    rho : dependence parameter forwarded to ``rho_correction``.
    weight : multiplier applied to the match's log-likelihood.

    Returns
    -------
    float
        ``-weight * (log P(home) + log P(away) + log(correction))``, or the
        fixed penalty ``10000`` when the correction factor is not positive.
    """
    goal_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    goal_expectation_away = np.exp(away_attack + home_defence)

    adj_llk = rho_correction(
        goals_home_observed,
        goals_away_observed,
        goal_expectation_home,
        goal_expectation_away,
        rho,
    )

    # np.exp() can never be negative, so only the correction factor needs a
    # guard. A factor of exactly 0 is rejected as well: log(0) is -inf and
    # would turn the objective infinite.
    if adj_llk <= 0:
        return 10000

    # logpmf works in log space; log(pmf(...)) underflows to log(0) = -inf
    # when the observed count is very unlikely under the current parameters.
    home_llk = poisson.logpmf(goals_home_observed, goal_expectation_home)
    away_llk = poisson.logpmf(goals_away_observed, goal_expectation_away)

    log_llk = weight * (home_llk + away_llk + np.log(adj_llk))

    return -log_llk

def fit_poisson_model(df, xi=0.0001):
    """Fit attack/defence strengths, home advantage and rho to match scores.

    Every match contributes ``log_likelihood_goals`` for its observed
    scoreline, weighted by recency (``decay(xi, days before the latest
    match)``) and by division (``"Premier League"`` counts 1.0, any other
    division 0.615). The summed objective is minimised with L-BFGS-B (at most
    100 iterations) with ``rho`` bounded to ``[-1, 1]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``home_team``, ``away_team``, ``division``,
        ``match_date`` (datetime), ``home_goals`` and ``away_goals``.
        The frame is only read, never modified.
    xi : float
        Time-decay rate handed to ``decay``.

    Returns
    -------
    dict
        ``attack_<team>``, ``defence_<team>``, ``home_adv`` and ``rho`` mapped
        to the fitted values.

    Side effects
    ------------
    Prints the fitted tables and writes
    ``predictions/gls_team_strength_jan25.csv`` (the directory is created when
    missing).

    Notes
    -----
    * L-BFGS-B cannot handle ``constraints``: SciPy ignores them and emits a
      RuntimeWarning, so none are passed. Only the sums ``attack + defence``
      enter the likelihood, hence the strengths are identified up to a
      constant shift (add ``c`` to every attack, subtract it from every
      defence); expected goals are unaffected by that shift. To actually pin
      ``sum(attack) == n_teams`` a constraint-aware method such as SLSQP
      would be required.
    * Starting values come from the unseeded global NumPy RNG, so fitted
      values vary slightly between runs.
    """
    home_teams = df["home_team"].to_numpy()
    away_teams = df["away_team"].to_numpy()
    teams = np.sort(np.unique(np.concatenate([home_teams, away_teams])))
    n_teams = len(teams)

    # Per-match inputs are extracted once as plain arrays: the objective is
    # evaluated many times by the optimiser, and rebuilding them with
    # df.iterrows() on every evaluation dominates the run time.
    goals_home = df["home_goals"].to_numpy()
    goals_away = df["away_goals"].to_numpy()
    league_strength = np.where(df["division"].eq("Premier League").to_numpy(), 1.0, 0.615)
    days_since = (df["match_date"].max() - df["match_date"]).dt.days.to_numpy()
    weights = decay(xi, days_since) * league_strength

    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
            [-0.1], # rho
        )
    )

    bounds = [(None, None)] * (2 * n_teams) + [(None, None), (-1, 1)]

    def _fit(params):
        """Summed weighted negative log-likelihood over all matches."""
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-2]
        rho = params[-1]

        llk = [
            log_likelihood_goals(
                obs_home,
                obs_away,
                attack_params[home],
                defence_params[home],
                attack_params[away],
                defence_params[away],
                home_advantage,
                rho,
                weight,
            )
            for home, away, obs_home, obs_away, weight in zip(
                home_teams, away_teams, goals_home, goals_away, weights
            )
        ]

        return np.sum(llk)

    options = {
        "maxiter": 100,
        "disp": False,
    }

    res = minimize(
        _fit,
        params,
        method='L-BFGS-B',
        options=options,
        bounds=bounds
    )

    if not res.success:
        # e.g. the iteration cap was hit; the values below are then only approximate
        print("Warning: optimiser stopped before converging:", res.message)

    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv", "rho"],
            res["x"],
        )
    )

    # Prepare table data
    attack_values = [model_params[f'attack_{team}'] for team in teams]
    defence_values = [model_params[f'defence_{team}'] for team in teams]
    home_adv = model_params['home_adv']
    rho = model_params['rho']

    # Create DataFrame for attack and defense
    team_strength_df_gls = pd.DataFrame({
        'Team': teams,
        'Attack Strength': attack_values,
        'Defense Strength': defence_values
    })

    # Add home advantage and rho to the DataFrame
    additional_params_df = pd.DataFrame({
        'Parameter': ['Home Advantage', 'Rho'],
        'Value': [home_adv, rho]
    })

    # Print DataFrames
    print("Team Strength (Attack and Defense):")
    print(team_strength_df_gls)
    print("\nAdditional Parameters (Home Advantage and Rho):")
    print(additional_params_df)

    # res["fun"] is the minimised objective, i.e. the *negative* weighted log-likelihood
    print("Log Likelihood: ", res["fun"])

    # Don't lose a finished fit just because the output folder is missing
    os.makedirs("predictions", exist_ok=True)
    team_strength_df_gls.to_csv("predictions/gls_team_strength_jan25.csv")

    return model_params

# Fit the goals-based model on the same matches (decay rate 0.001)
goals_model_params = fit_poisson_model(
    df,
    xi=0.001,
)

  res = minimize(


Team Strength (Attack and Defense):
                Team  Attack Strength  Defense Strength
0            Arsenal         1.653670         -1.613194
1        Aston Villa         1.328238         -0.850955
2         Birmingham         0.311610         -0.509582
3          Blackburn         0.457412         -0.670861
4        Bournemouth         1.242283         -1.056546
5          Brentford         1.385244         -0.805604
6           Brighton         1.138169         -1.008053
7       Bristol City         0.491558         -0.557748
8            Burnley         0.693157         -0.949877
9            Cardiff         0.400437         -0.186044
10           Chelsea         1.608486         -0.988797
11          Coventry         0.690553         -0.301086
12    Crystal Palace         1.190131         -1.048042
13             Derby         0.488486         -0.385279
14           Everton         0.687177         -1.124938
15     FC Portsmouth         0.601682         -0.132011
16          

In [6]:
def predict(params, home_team, away_team):
    """Turn fitted model parameters into match-level probabilities.

    Parameters
    ----------
    params : dict
        Must contain ``attack_<team>`` / ``defence_<team>`` for both teams and
        ``home_adv``; ``rho`` is optional and treated as 0 when absent.
    home_team, away_team : str
        Team names as used in the parameter keys.

    Returns
    -------
    dict
        Win/draw/loss probabilities (renormalised over a 0-9 goal grid),
        clean-sheet probabilities, goal expectations and the probability of
        each side scoring three or more, all rounded to two decimals.
    """
    # Parameter lookups (a missing team raises KeyError)
    home_attack, home_defence = (
        params[f"{kind}_{home_team}"] for kind in ("attack", "defence")
    )
    away_attack, away_defence = (
        params[f"{kind}_{away_team}"] for kind in ("attack", "defence")
    )
    home_advantage = params["home_adv"]
    rho = params.get("rho", 0)

    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    # Independent Poisson marginals for 0..9 goals per side
    goal_counts = range(10)
    home_probs = poisson.pmf(goal_counts, home_goal_expectation)
    away_probs = poisson.pmf(goal_counts, away_goal_expectation)

    # Joint score grid: rows are home goals, columns are away goals
    score_grid = np.outer(home_probs, away_probs)

    # Low-score adjustment on the 0-0, 0-1, 1-0 and 1-1 cells
    score_grid[:2, :2] *= np.array([
        [
            1 - home_goal_expectation * away_goal_expectation * rho,
            1 + home_goal_expectation * rho,
        ],
        [
            1 + away_goal_expectation * rho,
            1 - rho,
        ],
    ])

    # Below the diagonal the home side scored more, above it the away side did
    home_mass = np.sum(np.tril(score_grid, -1))
    draw_mass = np.sum(np.diag(score_grid))
    away_mass = np.sum(np.triu(score_grid, 1))

    # The grid is truncated, so rescale the three outcomes to sum to one
    total_mass = home_mass + draw_mass + away_mass
    home_win = home_mass / total_mass
    draw = draw_mass / total_mass
    away_win = away_mass / total_mass

    # Clean sheets: the opponent's zero-goal column / row of the grid
    home_clean_sheet_prob = score_grid[:, 0].sum()
    away_clean_sheet_prob = score_grid[0, :].sum()

    # Three or more goals, taken from the unadjusted marginals
    home_3_plus_goals_prob = home_probs[3:].sum()
    away_3_plus_goals_prob = away_probs[3:].sum()

    summary = {
        "home_win_prob": home_win,
        "draw_prob": draw,
        "away_win_prob": away_win,
        "home_clean_sheet_prob": home_clean_sheet_prob,
        "away_clean_sheet_prob": away_clean_sheet_prob,
        "home_goal_expectation": home_goal_expectation,
        "away_goal_expectation": away_goal_expectation,
        "home_3_plus_goals_prob": home_3_plus_goals_prob,
        "away_3_plus_goals_prob": away_3_plus_goals_prob,
    }
    return {name: value.round(2) for name, value in summary.items()}

In [7]:
# In-sample check: predict every match in the dataset with both fitted models
# and compare the predicted goal expectations with what was observed.
match_preds = [
    (
        predict(xg_model_params, home_team, away_team),
        predict(goals_model_params, home_team, away_team),
    )
    for home_team, away_team in zip(df['home_team'], df['away_team'])
]

# Split the paired predictions into one list per model and side
xg_home_preds = [xg_pred['home_goal_expectation'] for xg_pred, _ in match_preds]
xg_away_preds = [xg_pred['away_goal_expectation'] for xg_pred, _ in match_preds]
goals_home_preds = [goals_pred['home_goal_expectation'] for _, goals_pred in match_preds]
goals_away_preds = [goals_pred['away_goal_expectation'] for _, goals_pred in match_preds]

# Attach the predictions to the match table
df['xg_home_pred'] = xg_home_preds
df['xg_away_pred'] = xg_away_preds
df['goals_home_pred'] = goals_home_preds
df['goals_away_pred'] = goals_away_preds

# Absolute errors: xG model against observed xG, goals model against observed goals
df['xg_home_error'] = (df['home_xgoals'] - df['xg_home_pred']).abs()
df['xg_away_error'] = (df['away_xgoals'] - df['xg_away_pred']).abs()
df['goals_home_error'] = (df['home_goals'] - df['goals_home_pred']).abs()
df['goals_away_error'] = (df['away_goals'] - df['goals_away_pred']).abs()

# Mean absolute error per model and side
xg_mae_home, xg_mae_away = df['xg_home_error'].mean(), df['xg_away_error'].mean()
goals_mae_home, goals_mae_away = df['goals_home_error'].mean(), df['goals_away_error'].mean()

# Overall MAE per model: plain average of the home and away figures
xg_mae_total = (xg_mae_home + xg_mae_away) / 2
goals_mae_total = (goals_mae_home + goals_mae_away) / 2

print(f"xG MAE (home): {xg_mae_home}")
print(f"xG MAE (away): {xg_mae_away}")
print(f"xG MAE (total): {xg_mae_total}")

print(f"Goals MAE (home): {goals_mae_home}")
print(f"Goals MAE (away): {goals_mae_away}")
print(f"Goals MAE (total): {goals_mae_total}")

xG MAE (home): 0.580767299273859
xG MAE (away): 0.5415073954356846
xG MAE (total): 0.5611373473547718
Goals MAE (home): 0.9160995850622408
Goals MAE (away): 0.8311514522821577
Goals MAE (total): 0.8736255186721993


In [8]:
import pandas as pd
import sys
import os


# Fixtures to predict (GW22), entered by hand as (home_team, away_team)
fixtures = [
    ("Brighton", "Everton"),
    ("Liverpool", "Ipswich"),
    ("Southampton", "Newcastle"),
    ("Wolves", "Arsenal"),
    ("Bournemouth", "Nott'm Forest"),
    ("Man City", "Chelsea"),
    ("Tottenham", "Leicester"),
    ("Crystal Palace", "Brentford"),
    ("Aston Villa", "West Ham"),
    ("Fulham", "Man United")
]

# Ensemble: every predicted quantity is blended 70% xG model / 30% goals model
results = []

for home_team, away_team in fixtures:
    xg_preds = predict(xg_model_params, home_team, away_team)
    goals_preds = predict(goals_model_params, home_team, away_team)

    weighted_preds = {
        key: (0.7 * xg_value) + (0.3 * goals_preds[key])
        for key, xg_value in xg_preds.items()
    }

    results.append({
        'home_team': home_team,
        'away_team': away_team,
        **weighted_preds
    })

# One row per fixture, team names first and the blended metrics after them
results_df = pd.DataFrame(results)

display(results_df)


Unnamed: 0,home_team,away_team,home_win_prob,draw_prob,away_win_prob,home_clean_sheet_prob,away_clean_sheet_prob,home_goal_expectation,away_goal_expectation,home_3_plus_goals_prob,away_3_plus_goals_prob
0,Brighton,Everton,0.481,0.274,0.245,0.382,0.23,1.473,0.982,0.183,0.082
1,Liverpool,Ipswich,0.885,0.082,0.033,0.49,0.026,3.643,0.707,0.697,0.037
2,Southampton,Newcastle,0.132,0.164,0.704,0.067,0.32,1.14,2.719,0.112,0.509
3,Wolves,Arsenal,0.084,0.154,0.755,0.091,0.501,0.692,2.45,0.033,0.44
4,Bournemouth,Nott'm Forest,0.442,0.258,0.293,0.299,0.218,1.536,1.207,0.199,0.124
5,Man City,Chelsea,0.521,0.206,0.269,0.204,0.103,2.254,1.602,0.39,0.223
6,Tottenham,Leicester,0.708,0.157,0.135,0.301,0.057,2.846,1.211,0.539,0.125
7,Crystal Palace,Brentford,0.464,0.229,0.307,0.229,0.159,1.845,1.477,0.278,0.184
8,Aston Villa,West Ham,0.593,0.193,0.204,0.253,0.09,2.382,1.368,0.423,0.157
9,Fulham,Man United,0.496,0.242,0.269,0.28,0.165,1.793,1.26,0.266,0.13


In [10]:
# from viz.model_pred_viz.gw_dict import gw_dict
# current_date = pd.Timestamp.today()
# for gw, date in gw_dict.items():
#         if current_date >= pd.to_datetime(date):
#             current_gw = gw
#         else:
#             break
# print (current_gw)

# Persist the blended fixture predictions.
# NOTE(review): "ensmeble" looks like a typo for "ensemble"; the path is kept
# unchanged because other tooling may already read this exact file name.
ensemble_csv_path = "predictions/ensmeble_23_preds.csv"
results_df.to_csv(ensemble_csv_path)