In [1]:
import pandas as pd
import requests
import json
import os
import io
from datetime import datetime
import numpy as np

API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}

# Function to fetch data for a specific division and season
def fetch_data(division, season):
    params = {
        'division': division,
        'season': season
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return pd.read_json(io.StringIO(response.content.decode('utf-8')))
    else:
        print(f"Error fetching {division} {season}: {response.status_code}")
        print(response.content.decode('utf-8'))
        return pd.DataFrame()

# Fetch data for all combinations
seasons = ['2023_2024', '2024_2025']
divisions = ['Premier League', 'Championship']
dataframes = []

for division in divisions:
    for season in seasons:
        df = fetch_data(division, season)
        if not df.empty:
            dataframes.append(df)

# Combine all dataframes
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
    
    # Convert match_date to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])
    
    # Filter for dates after January 11, 2024
    df = df[df["match_date"] > '2024-01-11']
    
    # Get unique teams from 2024/25 Premier League
    prem_teams_25 = df[
        (df['season'] == '2024_2025') & 
        (df['division'] == 'Premier League')
    ]
    prem_teams_25 = pd.concat([
        prem_teams_25['home_team'], 
        prem_teams_25['away_team']
    ]).unique()
    
    # Display the results
    print("\nSample of filtered matches:")
    print(df[["season", "division", "match_date", "home_team", "away_team", 
              "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())
    
    print("\nTotal matches:", len(df))
    print("\nUnique seasons:", df['season'].unique())
    print("Unique divisions:", df['division'].unique())
else:
    print("No data was successfully fetched")



Sample of filtered matches:
        season      division match_date      home_team         away_team  \
1569  20242025  Championship 2025-02-15      Blackburn   Plymouth Argyle   
1570  20242025  Championship 2025-02-15          Luton  Sheffield United   
1571  20242025  Championship 2025-02-15       Millwall         West Brom   
1572  20242025  Championship 2025-02-15  Oxford United     FC Portsmouth   
1573  20242025  Championship 2025-02-17          Leeds        Sunderland   

      home_goals  home_xgoals  away_goals  away_xgoals  
1569           2          2.2           0          1.5  
1570           0          1.4           1          1.8  
1571           1          0.7           1          0.4  
1572           0          1.4           2          1.5  
1573           2          1.9           1          0.5  

Total matches: 1064

Unique seasons: [20232024 20242025]
Unique divisions: ['Premier League' 'Championship']


In [2]:
# Apply averaged penalty when Championship match contains two promoted teams.

# Penalty xG = 0.665
# Penalty xGA = 1.465

# mask_both_prem = (df['division'] == 'Championship') & (df['home_team'].isin(prem_teams_25)) & (df['away_team'].isin(prem_teams_25))
# # #df.loc[mask_both_prem, 'home_goals'] *= 0.661
# # #df.loc[mask_both_prem, 'away_goals'] *= 0.661
# df.loc[mask_both_prem, 'home_xgoals'] *= 0.665
# df.loc[mask_both_prem, 'away_xgoals'] *= 0.655

# # Adjust performance penalty to championship games when one team is a promoted team.
# mask_home = (df['division'] == 'Championship') & (df['home_team'].isin(prem_teams_25)) & ~(df['away_team'].isin(prem_teams_25))
# # #df.loc[mask_home, 'home_goals'] *= 0.661
# # #df.loc[mask_home, 'away_goals'] *= 2.060
# df.loc[mask_home, 'home_xgoals'] *= 0.665
# df.loc[mask_home, 'away_xgoals'] *= 1.465

# mask_away = (df['division'] == 'Championship') & (df['away_team'].isin(prem_teams_25)) & ~(df['home_team'].isin(prem_teams_25))
# # #df.loc[mask_away, 'home_goals'] *= 2.060
# # #df.loc[mask_away, 'away_goals'] *= 0.661
# df.loc[mask_away, 'home_xgoals'] *= 1.465
# df.loc[mask_away, 'away_xgoals'] *= 0.665


# df[['home_goals', 'away_goals']] = df[['home_goals', 'away_goals']].round()


In [3]:
def decay(xi, t):
    return np.exp(-xi * t)

def rho_correction(goals_home, goals_away, home_exp, away_exp, rho):
    if goals_home == 0 and goals_away == 0:
        return 1 - (home_exp * away_exp * rho)
    elif goals_home == 0 and goals_away == 1:
        return 1 + (home_exp * rho)
    elif goals_home == 1 and goals_away == 0:
        return 1 + (away_exp * rho)
    elif goals_home == 1 and goals_away == 1:
        return 1 - rho
    else:
        return 1.0

In [6]:
from scipy.optimize import minimize
from scipy.stats import norm, poisson, gamma

def log_likelihood_xg(
    xG_home_observed,
    xG_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    weight,
    shape_factor=3.0  # Controls the shape of the gamma distribution
):
    # Calculate expected xG values
    xG_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    xG_expectation_away = np.exp(away_attack + home_defence)
    
    # Check for valid expected values
    if xG_expectation_home <= 0 or xG_expectation_away <= 0:
        return 10000
    
    # Calculate gamma parameters
    # For gamma distribution, mean = shape * scale
    # We'll use shape_factor to determine shape, and then calculate scale
    
    # Home team parameters
    shape_home = shape_factor
    scale_home = xG_expectation_home / shape_home
    
    # Away team parameters
    shape_away = shape_factor
    scale_away = xG_expectation_away / shape_away
    
    # Handle zero or near-zero observations (rare but possible in xG data)
    xG_home_observed = max(xG_home_observed, 1e-6)
    xG_away_observed = max(xG_away_observed, 1e-6)
    
    # Calculate likelihoods using gamma PDF
    home_llk = gamma.pdf(xG_home_observed, a=shape_home, scale=scale_home)
    away_llk = gamma.pdf(xG_away_observed, a=shape_away, scale=scale_away)
    
    # Handle very small likelihoods
    if home_llk <= 1e-10 or away_llk <= 1e-10:
        return 10000
    
    # Calculate negative log-likelihood
    log_llk = weight * (np.log(home_llk) + np.log(away_llk))
    
    return -log_llk

def fit_xG_model(df, xi=0.0001, shape_factor=3.0):
    teams = np.sort(np.unique(np.concatenate([df["home_team"], df["away_team"]])))
    n_teams = len(teams)
         
    df["league_strength"] = df["division"].apply(lambda x: 1 if x == "Premier League" else 0.615)
     
    df["days_since"] = (df["match_date"].max() - df["match_date"]).dt.days
    df["weight"] = decay(xi, df["days_since"]) * df["league_strength"]
     
    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
        )
    )
     
    bounds = [(None, None)] * (2 * n_teams) + [(None, None)]
     
    def _fit(params, df, teams, shape_factor):
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-1]
         
        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood_xg(
                row["home_xgoals"],
                row["away_xgoals"],
                attack_params[row["home_team"]],
                defence_params[row["home_team"]],
                attack_params[row["away_team"]],
                defence_params[row["away_team"]],
                home_advantage,
                row["weight"],
                shape_factor=shape_factor
            )
            llk.append(tmp)
         
        return np.sum(llk)
     
    options = {
        "maxiter": 100,
        "disp": False,
    }
     
    constraints = [{"type": "eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]
     
    res = minimize(
        _fit,
        params,
        args=(df, teams, shape_factor),
        constraints=constraints,
        bounds=bounds,
        method='L-BFGS-B',
        options=options,
    )
     
    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv"],
            res["x"],
        )
    )
     
    # Prepare table data
    attack_values = [model_params[f'attack_{team}'] for team in teams]
    defence_values = [model_params[f'defence_{team}'] for team in teams]
    home_adv = model_params['home_adv']
     
    # Create DataFrame for attack and defense
    team_strength_df_xg = pd.DataFrame({
        'Team': teams,
        'Attack Strength': attack_values,
        'Defense Strength': defence_values
    })
     
    # Add home advantage to the DataFrame
    additional_params_df = pd.DataFrame({
        'Parameter': ['Home Advantage'],
        'Value': [home_adv]
    })
     
    # Print DataFrames
    print("Team Strength (Attack and Defense):")
    print(team_strength_df_xg)
    print("\nAdditional Parameters (Home Advantage):")
    print(additional_params_df)
     
    print("Log Likelihood: ", res["fun"])
     
    team_strength_df_xg.to_csv("predictions/xg_team_strength_jan25.csv")
     
    return model_params


xg_model_params = fit_xG_model(df, xi=0.001, shape_factor=3.0)

  res = minimize(


Team Strength (Attack and Defense):
                Team  Attack Strength  Defense Strength
0            Arsenal         1.428614         -1.495588
1        Aston Villa         1.224644         -0.756155
2         Birmingham         0.375692         -0.649890
3          Blackburn         0.591922         -0.469659
4        Bournemouth         1.385034         -0.849635
5          Brentford         1.149959         -0.751465
6           Brighton         1.116295         -0.846430
7       Bristol City         0.556083         -0.525399
8            Burnley         0.678709         -0.778373
9            Cardiff         0.325354         -0.258345
10           Chelsea         1.423694         -0.765934
11          Coventry         0.727686         -0.575384
12    Crystal Palace         1.128123         -0.844139
13             Derby         0.386764         -0.596489
14           Everton         0.901721         -0.845836
15     FC Portsmouth         0.612776         -0.259409
16          

In [7]:
def log_likelihood_goals(
    goals_home_observed,
    goals_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    rho,
    weight
):
    goal_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    goal_expectation_away = np.exp(away_attack + home_defence)

    home_llk = poisson.pmf(goals_home_observed, goal_expectation_home)
    away_llk = poisson.pmf(goals_away_observed, goal_expectation_away)
    adj_llk = rho_correction(
        goals_home_observed,
        goals_away_observed,
        goal_expectation_home,
        goal_expectation_away,
        rho,
    )

    if goal_expectation_home < 0 or goal_expectation_away < 0 or adj_llk < 0:
        return 10000

    log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))

    return -log_llk

def fit_poisson_model(df, xi=0.0001):
    teams = np.sort(np.unique(np.concatenate([df["home_team"], df["away_team"]])))
    n_teams = len(teams)
    
    df["league_strength"] = df["division"].apply(lambda x: 1 if x == "Premier League" else 0.615) 
    df["days_since"] = (df["match_date"].max() - df["match_date"]).dt.days
    df["weight"] = decay(xi, df["days_since"]) * df["league_strength"]

    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
            [-0.1], # rho
        )
    )

    bounds = [(None, None)] * (2 * n_teams) + [(None, None), (-1, 1)]

    def _fit(params, df, teams):
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-2]
        rho = params[-1]

        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood_goals(
                row["home_goals"],
                row["away_goals"],
                attack_params[row["home_team"]],
                defence_params[row["home_team"]],
                attack_params[row["away_team"]],
                defence_params[row["away_team"]],
                home_advantage,
                rho,
                row["weight"]
            )
            llk.append(tmp)

        return np.sum(llk)

    options = {
        "maxiter": 100,
        "disp": False,
    }

    constraints = [{"type": "eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

    res = minimize(
        _fit,
        params,
        args=(df, teams),
        constraints=constraints,
        method='L-BFGS-B',
        options=options,
        bounds=bounds
    )

    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv", "rho"],
            res["x"],
        )
    )

    # Prepare table data
    attack_values = [model_params[f'attack_{team}'] for team in teams]
    defence_values = [model_params[f'defence_{team}'] for team in teams]
    home_adv = model_params['home_adv']
    rho = model_params['rho']

    # Create DataFrame for attack and defense
    team_strength_df_gls = pd.DataFrame({
        'Team': teams,
        'Attack Strength': attack_values,
        'Defense Strength': defence_values
    })

    # Add home advantage and rho to the DataFrame
    additional_params_df = pd.DataFrame({
        'Parameter': ['Home Advantage', 'Rho'],
        'Value': [home_adv, rho]
    })

    # Print DataFrames
    print("Team Strength (Attack and Defense):")
    print(team_strength_df_gls)
    print("\nAdditional Parameters (Home Advantage and Rho):")
    print(additional_params_df)

    print("Log Likelihood: ", res["fun"])

    team_strength_df_gls.to_csv("predictions/gls_team_strength_jan25.csv")

    return model_params

goals_model_params = fit_poisson_model(df, xi=0.001)

  res = minimize(


Team Strength (Attack and Defense):
                Team  Attack Strength  Defense Strength
0            Arsenal         1.697713         -1.692111
1        Aston Villa         1.286610         -0.890362
2         Birmingham         0.343542         -0.543878
3          Blackburn         0.517261         -0.684087
4        Bournemouth         1.333313         -1.104655
5          Brentford         1.376182         -0.884678
6           Brighton         1.158949         -0.972528
7       Bristol City         0.531711         -0.594078
8            Burnley         0.706015         -1.069331
9            Cardiff         0.454683         -0.155907
10           Chelsea         1.604297         -0.985766
11          Coventry         0.734952         -0.422244
12    Crystal Palace         1.224236         -1.069354
13             Derby         0.366265         -0.403478
14           Everton         0.880083         -1.200146
15     FC Portsmouth         0.632844         -0.194220
16          

In [8]:
def predict(params, home_team, away_team):
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_adv"]
    rho = params.get("rho", 0)

    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    home_probs = poisson.pmf(range(10), home_goal_expectation)
    away_probs = poisson.pmf(range(10), away_goal_expectation)

    m = np.outer(home_probs, away_probs)

    m[0, 0] *= 1 - home_goal_expectation * away_goal_expectation * rho
    m[0, 1] *= 1 + home_goal_expectation * rho
    m[1, 0] *= 1 + away_goal_expectation * rho
    m[1, 1] *= 1 - rho    

    home = np.sum(np.tril(m, -1)) 
    draw = np.sum(np.diag(m)) 
    away = np.sum(np.triu(m, 1))

    total_prob = home + draw + away
    home /= total_prob
    draw /= total_prob
    away /= total_prob 

    # Calculate the probability of a clean sheet for the home team (away team scores 0)
    home_clean_sheet_prob = m[:, 0].sum() 

    # Calculate the probability of a clean sheet for the away team (home team scores 0)
    away_clean_sheet_prob = m[0, :].sum() 

    # Calculate the probability of the home team scoring 3 or more goals
    home_3_plus_goals_prob = home_probs[3:].sum() 

    # Calculate the probability of the away team scoring 3 or more goals
    away_3_plus_goals_prob = away_probs[3:].sum() 


    return {
        "home_win_prob": home.round(2),
        "draw_prob": draw.round(2),
        "away_win_prob": away.round(2),
        "home_clean_sheet_prob": home_clean_sheet_prob.round(2),
        "away_clean_sheet_prob": away_clean_sheet_prob.round(2),
        "home_goal_expectation": home_goal_expectation.round(2),
        "away_goal_expectation": away_goal_expectation.round(2),
        "home_3_plus_goals_prob": home_3_plus_goals_prob.round(2),
        "away_3_plus_goals_prob": away_3_plus_goals_prob.round(2)
    }

In [9]:
# Initialize lists to store predictions
xg_home_preds = []
xg_away_preds = []
goals_home_preds = []
goals_away_preds = []

# Iterate through the games in your dataset
for idx, row in df.iterrows():
    xg_pred = predict(xg_model_params, row['home_team'], row['away_team'])
    goals_pred = predict(goals_model_params, row['home_team'], row['away_team'])

    xg_home_preds.append(xg_pred['home_goal_expectation'])
    xg_away_preds.append(xg_pred['away_goal_expectation'])
    goals_home_preds.append(goals_pred['home_goal_expectation'])
    goals_away_preds.append(goals_pred['away_goal_expectation'])

# Add these predictions back to the DataFrame
df['xg_home_pred'] = xg_home_preds
df['xg_away_pred'] = xg_away_preds
df['goals_home_pred'] = goals_home_preds
df['goals_away_pred'] = goals_away_preds

# Calculate absolute errors for xG predictions
df['xg_home_error'] = abs(df['home_xgoals'] - df['xg_home_pred'])
df['xg_away_error'] = abs(df['away_xgoals'] - df['xg_away_pred'])

# Calculate absolute errors for goals predictions
df['goals_home_error'] = abs(df['home_goals'] - df['goals_home_pred'])
df['goals_away_error'] = abs(df['away_goals'] - df['goals_away_pred'])


# Calculate MAE for xG predictions
xg_mae_home = df['xg_home_error'].mean()
xg_mae_away = df['xg_away_error'].mean()

# Calculate MAE for goals predictions
goals_mae_home = df['goals_home_error'].mean()
goals_mae_away = df['goals_away_error'].mean()

# Combine MAEs for home and away
xg_mae_total = (xg_mae_home + xg_mae_away) / 2
goals_mae_total = (goals_mae_home + goals_mae_away) / 2

print(f"xG MAE (home): {xg_mae_home}")
print(f"xG MAE (away): {xg_mae_away}")
print(f"xG MAE (total): {xg_mae_total}")

print(f"Goals MAE (home): {goals_mae_home}")
print(f"Goals MAE (away): {goals_mae_away}")
print(f"Goals MAE (total): {goals_mae_total}")

xG MAE (home): 0.5931893604323308
xG MAE (away): 0.5302216778195488
xG MAE (total): 0.5617055191259398
Goals MAE (home): 0.935657894736842
Goals MAE (away): 0.8163157894736841
Goals MAE (total): 0.875986842105263


In [11]:
import pandas as pd
import sys
import os


# Manually define fixtures for GW22
fixtures = [
    ("Brighton", "Bournemouth"),
]

# Get predictions for each fixture
results = []

for fixture in fixtures:
    home_team, away_team = fixture
    
    xg_preds = predict(xg_model_params, home_team, away_team)
    goals_preds = predict(goals_model_params, home_team, away_team)
    
    weighted_preds = {}
    
    for key in xg_preds.keys():
        weighted_value = (0.7 * xg_preds[key]) + (0.3 * goals_preds[key])
        weighted_preds[key] = weighted_value
    
    results.append({
        'home_team': home_team,
        'away_team': away_team,
        **weighted_preds
    })

results_df = pd.DataFrame(results)


# Display the results DataFrame
display(results_df)


Unnamed: 0,home_team,away_team,home_win_prob,draw_prob,away_win_prob,home_clean_sheet_prob,away_clean_sheet_prob,home_goal_expectation,away_goal_expectation,home_3_plus_goals_prob,away_3_plus_goals_prob
0,Brighton,Bournemouth,0.355,0.242,0.407,0.198,0.224,1.512,1.626,0.193,0.226


In [None]:
# from viz.model_pred_viz.gw_dict import gw_dict
# current_date = pd.Timestamp.today()
# for gw, date in gw_dict.items():
#         if current_date >= pd.to_datetime(date):
#             current_gw = gw
#         else:
#             break
# print (current_gw)

results_df.to_csv(f"predictions/ensmeble_27_preds.csv")