In [3]:
import pymc as pm
import numpy as np
import pandas as pd
import os
import requests
import io

# get data
API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}

# Function to fetch data for a specific division and season
def fetch_data(division, season):
    params = {
        'division': division,
        'season': season
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return pd.read_json(io.StringIO(response.content.decode('utf-8')))
    else:
        print(f"Error fetching {division} {season}: {response.status_code}")
        print(response.content.decode('utf-8'))
        return pd.DataFrame()

# Fetch data for all combinations
seasons = ['2024_2025']
divisions = ['Premier League']
dataframes = []

for division in divisions:
    for season in seasons:
        df = fetch_data(division, season)
        if not df.empty:
            dataframes.append(df)

# Combine all dataframes
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
    
    # Convert match_date to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])

In [4]:
# build model

def build_bayesian_model(home_teams, away_teams, home_goals, away_goals):
    # get unique teams
    teams = sorted(list(set(home_teams) | set(away_teams))) # alphabetically sorts and de-dupes list of team names
    team_indices = {team: idx for idx, team in enumerate(teams)} # sets index values for each team within a dict

    # convert team names to index vals
    home_idx = [team_indices[team] for team in home_teams]
    away_idx = [team_indices[team] for team in away_teams]

    with pm.Model() as model:
        # team strength initalisation
        attack = pm.Normal("attack", mu=0, sigma=0.3, shape=len(teams)) # initalising normal distribution for relative attacking strength with mean 0 and std of 0.3
        defense = pm.Normal('defense', mu=0, sigma=0.3, shape=len(teams))

        # initalise home advantage
        home_advantage = pm.Normal("home_advantage", mu=0.2, sigma=0.1) # initalises home_adv to 0.2 and has std of 0.1 so val can extend or reduce that much

        # expected goals
        home_theta = pm.math.exp(attack[home_idx] - defense[away_idx] + home_advantage) # we use exponential so it's always positive and team strengths are multiplicative
        away_theta = pm.math.exp(attack[away_idx] - defense[home_idx])

        # goals likelihood
        home_goals_like = pm.Poisson("home_goals", mu=home_theta, observed=home_goals) # the likelihood function modelling goals on a Poisson distribution
        away_goals_like = pm.Poisson("away_goals", mu=away_theta, observed=away_goals)

    return model, team_indices

def fit_bayesian_model(model, draws=1000):
    with model:
        trace = pm.sample(draws=draws) # x draws means the MCMC is generating x different sets of parameters that are ALL plausible given our data, with more likely parameter sets appearing more frequently in our draws.
    return trace


def get_team_strengths(trace, team_indices):
    teams = list(team_indices.keys())
    attack_means = np.mean(trace.posterior['attack'], axis=(0,1))
    defense_means = np.mean(trace.posterior['defense'], axis=(0,1))
    
    # Create DataFrame with better formatting
    results = pd.DataFrame({
        'team': teams,
        'attack_strength': attack_means,
        'defense_strength': defense_means
    })
    
    # Round the values and sort by attack strength
    results = results.round(3).sort_values('attack_strength', ascending=False)
    
    return results
    

In [6]:
data = df[["home_team", "away_team", "home_goals", "away_goals"]]

# Build model
model, team_indices = build_bayesian_model(
        home_teams=data['home_team'],
        away_teams=data['away_team'],
        home_goals=np.array(data['home_goals']),
        away_goals=np.array(data['away_goals'])
    )
    
# Fit model
trace = fit_bayesian_model(model)
    
# Get results
strengths = get_team_strengths(trace, team_indices)
print("\nTeam Strengths:")
print(strengths.to_string(index=False))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [attack, defense, home_advantage]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 322 seconds.



Team Strengths:
          team                                                                                          attack_strength                                                                                           defense_strength
       Arsenal    <xarray.DataArray 'attack' ()> Size: 8B\narray(0.3095619)\nCoordinates:\n    attack_dim_0  int32 4B 0   <xarray.DataArray 'defense' ()> Size: 8B\narray(0.22268613)\nCoordinates:\n    defense_dim_0  int32 4B 0
   Aston Villa   <xarray.DataArray 'attack' ()> Size: 8B\narray(0.11556464)\nCoordinates:\n    attack_dim_0  int32 4B 1  <xarray.DataArray 'defense' ()> Size: 8B\narray(-0.15549263)\nCoordinates:\n    defense_dim_0  int32 4B 1
   Bournemouth    <xarray.DataArray 'attack' ()> Size: 8B\narray(0.2937062)\nCoordinates:\n    attack_dim_0  int32 4B 2   <xarray.DataArray 'defense' ()> Size: 8B\narray(0.07721247)\nCoordinates:\n    defense_dim_0  int32 4B 2
     Brentford   <xarray.DataArray 'attack' ()> Size: 8B\narray(0.29839308)