In [22]:
import pymc as pm
import numpy as np
import pandas as pd
import os
import requests
import io
from datetime import datetime, timedelta
import multiprocessing
import arviz as az
import logging
import tqdm

# Connection settings for the match-data service.
# The key comes from the API_KEY environment variable (None when it is unset).
API_KEY = os.environ.get("API_KEY")
url = "https://data-service.beatthebookie.blog/data"
headers = {
    "x-api-key": API_KEY,
}

# Function to fetch data for a specific division and season
def fetch_data(division, season, timeout=30):
    """Download the match data for one division/season pair.

    Args:
        division: Division name, sent as the ``division`` query parameter.
        season: Season identifier, sent as the ``season`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A DataFrame parsed from the JSON response body, or an empty DataFrame
        when the request fails or the server answers with a non-200 status.
    """
    params = {
        'division': division,
        'season': season
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network-level failures the same way as HTTP errors so callers
        # that skip empty frames keep working.
        print(f"Error fetching {division} {season}: {exc}")
        return pd.DataFrame()

    if response.status_code != 200:
        print(f"Error fetching {division} {season}: {response.status_code}")
        print(response.content.decode('utf-8'))
        return pd.DataFrame()

    return pd.read_json(io.StringIO(response.content.decode('utf-8')))

# Fetch data for all combinations
seasons = ['2024_2025', '2023_2024']
divisions = ['Premier League', 'Championship']
dataframes = []

for division in divisions:
    for season in seasons:
        # Dedicated name: the per-request frame must never stand in for the
        # combined `df` when every request fails.
        season_df = fetch_data(division, season)
        if not season_df.empty:
            dataframes.append(season_df)

# Combine all dataframes
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)

    # Convert match_date to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])
else:
    # Nothing was downloaded: keep `df` defined, but surface the problem here
    # rather than as a KeyError when columns are selected later on.
    df = pd.DataFrame()
    print("No match data fetched - check API_KEY and the division/season values")

In [None]:
# build model
import psutil


def build_bayesian_model(home_teams, away_teams, home_goals, away_goals, home_xg, away_xg, dates, leagues):
    """Build a hierarchical PyMC model of team attack/defense strength.

    Goals are modelled as Poisson and xG as Gamma; both likelihoods share the
    same per-match expected-goals term, which combines team strengths, a
    league offset, home advantage and an exponential time-decay factor.

    Args:
        home_teams: Home team name for each match.
        away_teams: Away team name for each match.
        home_goals: Observed home goals per match.
        away_goals: Observed away goals per match.
        home_xg: Observed home expected goals per match.
        away_xg: Observed away expected goals per match.
        dates: Match dates as a pandas Series of datetimes (the ``.dt``
            accessor is used on the differences).
        leagues: League name for each match. "Premier League" must be present
            because it is the fixed reference league.

    Returns:
        Tuple ``(model, team_indices, league_indices)``; the two dicts map
        team / league names to their positions in the model's vectors.

    Raises:
        KeyError: If "Premier League" does not occur in ``leagues``.
    """
    print("Building Bayesian model...")
    print(f"Dataset size: {len(home_teams)} matches")
    print(f"Time span: {dates.min()} to {dates.max()}")

    # get unique teams and leagues; sorting makes the index assignment deterministic
    teams = sorted(set(home_teams) | set(away_teams))
    unique_leagues = sorted(set(leagues))

    # sets index values for each team/league within a dict
    team_indices = {team: idx for idx, team in enumerate(teams)}
    league_indices = {league: idx for idx, league in enumerate(unique_leagues)}

    # convert dates into "days before the most recent match"
    max_date = np.max(dates)
    time_diffs = (max_date - dates).dt.days.to_numpy()

    # convert team names to index vals
    home_idx = np.array([team_indices[team] for team in home_teams], dtype=int)
    away_idx = np.array([team_indices[team] for team in away_teams], dtype=int)

    # League of each team = league of its first home match, falling back to its
    # first away match. Membership goes through plain dicts because
    # `team in series` on a pandas Series tests the index labels, not the values.
    # NOTE(review): "first" follows row order, not date order - confirm that is
    # what is wanted for teams that changed division inside the data window.
    first_home_league = {}
    first_away_league = {}
    for home, away, league in zip(home_teams, away_teams, leagues):
        first_home_league.setdefault(home, league)
        first_away_league.setdefault(away, league)

    team_league_idx = np.zeros(len(teams), dtype=int)
    for team, idx in team_indices.items():
        if team in first_home_league:
            team_league = first_home_league[team]
        else:
            team_league = first_away_league[team]
        team_league_idx[idx] = league_indices[team_league]

    with pm.Model() as model:
        # league level parameters: shared mean and spread of team attack/defense
        league_attack_mu = pm.Normal("league_attack_mu", mu=0, sigma=0.5)
        league_attack_sigma = pm.HalfNormal("league_attack_sigma", sigma=0.5)  # half normal as std must be positive
        league_defense_mu = pm.Normal("league_defense_mu", mu=0, sigma=0.5)
        league_defense_sigma = pm.HalfNormal("league_defense_sigma", sigma=0.5)

        # raw league strengths for all leagues EXCEPT Premier League, which is pinned to 0
        premier_league_idx = league_indices["Premier League"]
        # mu=-0.5 as other leagues are expected to be weaker; one entry fewer than the number of leagues
        league_strength_raw = pm.Normal("league_strength_raw", mu=-0.5, sigma=0.3, shape=len(unique_leagues)-1)
        league_strength = pm.Deterministic(
            "league_strength",
            pm.math.concatenate([
                league_strength_raw[:premier_league_idx],
                pm.math.zeros(1),  # Premier League slot, inserted at its own index
                league_strength_raw[premier_league_idx:]
            ])
        )

        # team strength initialisation: standard-normal offsets, scaled below
        attack_raw = pm.Normal("attack_raw", mu=0, sigma=1, shape=len(teams))
        defense_raw = pm.Normal('defense_raw', mu=0, sigma=1, shape=len(teams))

        # scale team strengths by the shared mean/std and shift by the team's league strength
        attack = pm.Deterministic(
            "attack",
            attack_raw * league_attack_sigma + league_attack_mu + league_strength[team_league_idx]
        )
        defense = pm.Deterministic(
            "defense",
            defense_raw * league_defense_sigma + league_defense_mu + league_strength[team_league_idx]
        )

        # time decay parameter; the prior scale is divided by 365 to express a daily rate
        decay_rate = pm.HalfNormal("decay_rate", sigma=1.5/365)

        # home advantage, centred on 0.2 with std 0.1
        home_advantage = pm.Normal("home_advantage", mu=0.2, sigma=0.1)

        # time decay factor applied to expected goals
        # NOTE(review): the factor multiplies the expected-goals mean, so older
        # matches are modelled as lower scoring rather than being down-weighted
        # in the likelihood - confirm this is intended.
        time_factor = pm.math.exp(-decay_rate * time_diffs)

        # expected goals for both likelihoods; exp keeps it positive and makes strengths multiplicative
        home_theta = time_factor * pm.math.exp(attack[home_idx] - defense[away_idx] + home_advantage)
        away_theta = time_factor * pm.math.exp(attack[away_idx] - defense[home_idx])

        # goals likelihood (poisson for actual goals)
        pm.Poisson("home_goals", mu=home_theta, observed=home_goals)
        pm.Poisson("away_goals", mu=away_theta, observed=away_goals)

        # xG likelihood (gamma for expected goals)
        xg_alpha = pm.HalfNormal("xg_alpha", sigma=1.0)  # shape parameter, must be positive
        # rate = alpha / theta, so the Gamma mean (alpha / rate) equals theta
        home_xg_beta = xg_alpha / home_theta
        away_xg_beta = xg_alpha / away_theta

        # add small constant to not allow 0s which breaks Gamma dist
        epsilon = 0.00001
        home_xg_adj = home_xg + epsilon
        away_xg_adj = away_xg + epsilon

        pm.Gamma("home_xg", alpha=xg_alpha, beta=home_xg_beta, observed=home_xg_adj)
        pm.Gamma("away_xg", alpha=xg_alpha, beta=away_xg_beta, observed=away_xg_adj)

        print("Model building completed!")

    return model, team_indices, league_indices

def fit_bayesian_model(model, draws=500, tune=500):
    """Sample the posterior of ``model`` with NUTS and print the divergence count.

    Args:
        model: The PyMC model to sample from.
        draws: Number of posterior draws per chain.
        tune: Number of tuning steps per chain.

    Returns:
        The object returned by ``pm.sample`` with ``return_inferencedata=True``.
    """
    # Leave one core free and cap at four, but never drop below one:
    # cpu_count() == 1 would otherwise ask for zero chains.
    n_cores = max(1, min(4, multiprocessing.cpu_count() - 1))

    print(f"Starting model fitting with {n_cores} cores...")
    print(f"Planning {draws} draws with {tune} tuning steps...")

    with model:
        trace = pm.sample(
            draws=draws,
            tune=tune,
            chains=n_cores,
            cores=n_cores,
            progressbar=True,
            return_inferencedata=True,
            init='adapt_diag',
            target_accept=0.95,
            nuts={"max_treedepth": 15}  # NUTS-specific option, nested under "nuts"
        )

        # Print sampling diagnostics
        print("\nSampling Statistics:")
        print(f"Number of divergences: {trace.sample_stats.diverging.sum().values}")

        return trace
    
# Function to monitor memory usage
def print_memory_usage():
    """Print the resident set size (RSS) of a ``psutil.Process()`` in MB."""
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 / 1024
    print(f"Memory usage: {rss_megabytes:.2f} MB")

# Setup logging: set the 'pymc' logger to INFO level
_pymc_logger = logging.getLogger('pymc')
_pymc_logger.setLevel(logging.INFO)



def get_league_strengths(trace, league_indices):
    """Return the posterior-mean strength of every league, strongest first.

    Args:
        trace: Sampling result whose ``posterior`` holds ``league_strength``.
        league_indices: Mapping of league name to its position in the
            ``league_strength`` vector.

    Returns:
        DataFrame with columns ``league`` and ``league_strength`` (rounded to
        three decimals), sorted by strength in descending order.
    """
    # Order the names by model index instead of relying on dict iteration
    # order, so each name is paired with its own posterior entry.
    leagues = sorted(league_indices, key=league_indices.get)
    league_strength_means = trace.posterior['league_strength'].mean(dim=['chain', 'draw']).values

    results = pd.DataFrame({
        'league': leagues,
        'league_strength': league_strength_means
    })

    return results.round(3).sort_values('league_strength', ascending=False)

def get_hierarchical_team_strengths(trace, team_indices, league_indices, team_leagues, current_teams):
    """Summarise posterior-mean team strengths for the current teams.

    Args:
        trace: Sampling result whose ``posterior`` holds ``attack``,
            ``defense`` and ``home_advantage``.
        team_indices: Mapping of team name to model index; its key order is
            used as the row order of the posterior vectors.
        league_indices: Mapping of league name to model index.
        team_leagues: Mapping of team name to league name; teams missing from
            it are labelled 'Unknown'.
        current_teams: Team names to keep in the result.

    Returns:
        Tuple ``(results, home_adv)``: the per-team table sorted by
        ``overall_strength`` descending, and the posterior-mean home advantage.
    """
    posterior = trace.posterior
    sample_dims = ['chain', 'draw']
    attack_means = posterior['attack'].mean(dim=sample_dims).values
    defense_means = posterior['defense'].mean(dim=sample_dims).values
    home_adv = posterior['home_advantage'].mean(dim=sample_dims).values

    team_names = list(team_indices)

    # Overall strength: goals expected against an average defense minus goals
    # expected to be conceded against an average attack.
    scored_vs_average = np.exp(attack_means - np.mean(defense_means))
    conceded_vs_average = np.exp(np.mean(attack_means) - defense_means)

    team_table = pd.DataFrame({
        'team': team_names,
        'league': [team_leagues.get(name, 'Unknown') for name in team_names],
        'attack_strength': attack_means,
        'defense_strength': defense_means,
        'overall_strength': scored_vs_average - conceded_vs_average,
        'home_advantage': home_adv
    })

    # Attach each team's league strength for reference
    league_table = get_league_strengths(trace, league_indices)
    team_table = team_table.merge(league_table, on='league', how='left')

    # Keep current teams only, then round and rank
    is_current = team_table['team'].isin(current_teams)
    ranked = team_table[is_current].round(3)
    ranked = ranked.sort_values('overall_strength', ascending=False)

    return ranked, home_adv

def analyze_league_strengths(trace, league_indices, team_indices, team_leagues):
    """Build a per-league summary table from the posterior means.

    Args:
        trace: Sampling result whose ``posterior`` holds ``league_strength``,
            ``league_attack_sigma`` and ``league_defense_sigma``.
        league_indices: Mapping of league name to its position in
            ``league_strength``.
        team_indices: Unused; kept so existing callers keep working.
        team_leagues: Mapping of team name to league name, used to count and
            list the teams of each league.

    Returns:
        DataFrame with one row per league (``league``, ``base_strength``,
        ``attack_variation``, ``defense_variation``, ``num_teams``, ``teams``,
        ``expected_goals_vs_avg``), rounded to three decimals and sorted by
        ``base_strength`` descending.
    """
    posterior = trace.posterior
    sample_dims = ['chain', 'draw']
    league_strength_means = posterior['league_strength'].mean(dim=sample_dims).values

    # The sigma summaries are single numbers shared by all leagues. Convert
    # them to plain floats so the columns are numeric and round(3) applies
    # (presumably `.values` is a 0-d array here - hence the explicit float()).
    attack_variation = float(posterior['league_attack_sigma'].mean(dim=sample_dims).values)
    defense_variation = float(posterior['league_defense_sigma'].mean(dim=sample_dims).values)

    detailed_results = []
    for league, league_idx in league_indices.items():
        league_teams = sorted(team for team, team_league in team_leagues.items() if team_league == league)
        # Show at most five team names, with an ellipsis when more exist
        team_preview = ', '.join(league_teams[:5]) + ('...' if len(league_teams) > 5 else '')

        detailed_results.append({
            'league': league,
            'base_strength': float(league_strength_means[league_idx]),
            'attack_variation': attack_variation,  # How much attack strength varies between teams
            'defense_variation': defense_variation,  # How much defense strength varies between teams
            'num_teams': len(league_teams),
            'teams': team_preview
        })

    results_df = pd.DataFrame(detailed_results)

    # Relative change in expected goals implied by the league offset
    results_df['expected_goals_vs_avg'] = np.exp(results_df['base_strength']) - 1

    return results_df.round(3).sort_values('base_strength', ascending=False)
    

In [24]:
# Columns the model consumes
model_columns = [
    "home_team", "away_team",
    "home_goals", "away_goals",
    "home_xgoals", "away_xgoals",
    "match_date", "division",
]
data = df[model_columns]

# filter to matches only in previous 365 days
cutoff = datetime.now() - timedelta(days=365)
data = data[data["match_date"] > cutoff]

# get list of current teams
current_teams = df.loc[df["season"] == 20242025, "home_team"].unique()

# get list of leagues
team_leagues = dict(zip(df["home_team"], df["division"]))

# Build model
model, team_indices, league_indices = build_bayesian_model(
    home_teams=data['home_team'],
    away_teams=data['away_team'],
    home_goals=data['home_goals'].to_numpy(),
    away_goals=data['away_goals'].to_numpy(),
    home_xg=data["home_xgoals"].to_numpy(),
    away_xg=data["away_xgoals"].to_numpy(),
    dates=data["match_date"],
    leagues=data["division"],
)

# Fit model, reporting memory before and after sampling
print_memory_usage()
trace = fit_bayesian_model(model)
print_memory_usage()




Building Bayesian model...
Dataset size: 912 matches
Time span: 2024-02-10 00:00:00 to 2025-01-26 00:00:00
Model building completed!
Memory usage: 302.32 MB
Starting model fitting with 4 cores...
Planning 500 draws with 250 tuning steps...


  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
  warn(
  warn(
  warn(
  warn(
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [league_attack_mu, league_attack_sigma, league_defense_mu, league_defense_sigma, league_strength_raw, attack_raw, defense_raw, decay_rate, home_advantage, xg_alpha]


Sampling 4 chains for 500 tune and 65 draw iterations (2_000 + 260 draws total) took 3361 seconds.
The number of samples is too small to check convergence reliably.



Sampling Statistics:
Number of divergences: 0
Memory usage: 121.39 MB


In [None]:
# Create a dictionary mapping each team to its league based on the most recent season.
# Season codes are numbers such as 20242025, so "latest - 1" is not a real
# season; take the two most recent codes that actually occur in the data.
season_codes = sorted(df["season"].unique())
latest_season = season_codes[-1]
previous_season = season_codes[-2] if len(season_codes) > 1 else None

# Combine previous and current season data, oldest first: dict(zip(...)) keeps
# the LAST value per key, so a team's most recent league wins.
combined_df = pd.concat([df[df["season"] == code] for code in season_codes[-2:]])

# Create a dictionary mapping each team to its league
team_leagues = dict(zip(combined_df["home_team"], combined_df["division"]))

# Get results
team_strengths, home_advantage = get_hierarchical_team_strengths(
    trace=trace,
    team_indices=team_indices,
    league_indices=league_indices,
    team_leagues=team_leagues,
    current_teams=current_teams
)

# Analyze league strengths
league_analysis = analyze_league_strengths(
    trace=trace,
    league_indices=league_indices,
    team_indices=team_indices,
    team_leagues=team_leagues
)

# Print results
print("\nTeam Strengths:")
print(team_strengths)

print("\nLeague Analysis:")
print(league_analysis)


Team Strengths:
                team          league  attack_strength  defense_strength  \
22         Liverpool  Premier League            0.606             0.121   
0            Arsenal  Premier League            0.484             0.424   
24          Man City  Premier League            0.497             0.094   
10           Chelsea  Premier League            0.500            -0.132   
28         Newcastle  Premier League            0.384            -0.084   
4        Bournemouth  Premier League            0.364            -0.108   
42         Tottenham  Premier League            0.388            -0.206   
16            Fulham  Premier League            0.163            -0.054   
12    Crystal Palace  Premier League            0.158            -0.058   
30     Nott'm Forest  Premier League            0.125            -0.006   
1        Aston Villa  Premier League            0.240            -0.215   
5          Brentford  Premier League            0.226            -0.210   
6       

In [None]:
import pickle

def save_model_results(trace, team_indices, league_indices, team_strengths, league_analysis, filename=None):
    """Pickle the trace, index mappings and result tables into one file.

    When ``filename`` is None a date-stamped default name is used. The path
    that was written is printed.
    """
    target = filename if filename is not None else f'model_results_{datetime.now().strftime("%Y%m%d")}.pkl'

    payload = dict(
        trace=trace,
        team_indices=team_indices,
        league_indices=league_indices,
        team_strengths=team_strengths,
        league_analysis=league_analysis,
    )
    with open(target, 'wb') as handle:
        pickle.dump(payload, handle)
    print(f"Results saved to {target}")

def load_model_results(filename):
    """Read a results pickle and return its five entries as a tuple.

    The tuple order is (trace, team_indices, league_indices, team_strengths,
    league_analysis).
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(filename, 'rb') as handle:
        saved = pickle.load(handle)

    ordered_keys = ('trace', 'team_indices', 'league_indices',
                    'team_strengths', 'league_analysis')
    return tuple(saved[key] for key in ordered_keys)

# Persist this run's results under a date-stamped file name
filename = f'model_results_{datetime.now().strftime("%Y%m%d")}.pkl'
save_model_results(
    trace=trace,
    team_indices=team_indices,
    league_indices=league_indices,
    team_strengths=team_strengths,
    league_analysis=league_analysis,
    filename=filename,
)

Results saved to model_results_20250208.pkl


In [None]:
# Restore everything saved in `filename`
(trace, team_indices, league_indices,
 team_strengths, league_analysis) = load_model_results(filename)

# Print results
for heading, table in (
    ("\nTeam Strengths:", team_strengths),
    ("\nLeague Analysis:", league_analysis),
):
    print(heading)
    print(table)


Team Strengths:
                team          league  attack_strength  defense_strength  \
22         Liverpool  Premier League            0.606             0.121   
0            Arsenal  Premier League            0.484             0.424   
24          Man City  Premier League            0.497             0.094   
10           Chelsea  Premier League            0.500            -0.132   
28         Newcastle  Premier League            0.384            -0.084   
4        Bournemouth  Premier League            0.364            -0.108   
42         Tottenham  Premier League            0.388            -0.206   
16            Fulham  Premier League            0.163            -0.054   
12    Crystal Palace  Premier League            0.158            -0.058   
30     Nott'm Forest  Premier League            0.125            -0.006   
1        Aston Villa  Premier League            0.240            -0.215   
5          Brentford  Premier League            0.226            -0.210   
6       

In [None]:
def predict_match(home_team, away_team, trace, team_indices, n_sims=1000, random_seed=None):
    """Predict expected goals and outcome probabilities for one fixture.

    Args:
        home_team: Name of the home team; must be a key of ``team_indices``.
        away_team: Name of the away team; must be a key of ``team_indices``.
        trace: Sampling result whose ``posterior`` holds ``attack``,
            ``defense`` and ``home_advantage``.
        team_indices: Mapping of team name to its position on the last axis
            of the ``attack`` / ``defense`` samples.
        n_sims: Number of simulated matches used for the probabilities.
        random_seed: Seed for reproducible simulations. When None the global
            ``np.random`` state is used, so ``np.random.seed`` still applies.

    Returns:
        Dict with ``home_xg`` and ``away_xg`` (rounded to 2 decimals) and
        ``home_win_prob``, ``draw_prob``, ``away_win_prob`` as percentages
        rounded to 1 decimal.

    Raises:
        KeyError: If either team is not in ``team_indices``.
        ValueError: If ``n_sims`` is smaller than 1.
    """
    if n_sims < 1:
        raise ValueError("n_sims must be at least 1")

    home_idx = team_indices[home_team]
    away_idx = team_indices[away_team]

    # posterior samples of team strengths and home advantage
    attack_samples = trace.posterior["attack"].values
    defense_samples = trace.posterior["defense"].values
    home_advantage = trace.posterior["home_advantage"].values

    # per-sample expected goals; `...` keeps every leading (chain/draw) axis
    home_theta = np.exp(attack_samples[..., home_idx] -
                        defense_samples[..., away_idx] +
                        home_advantage)
    away_theta = np.exp(attack_samples[..., away_idx] -
                        defense_samples[..., home_idx])

    # collapse the posterior to its mean expected goals
    # NOTE(review): only the mean enters the simulation below, so posterior
    # uncertainty in the strengths is not propagated - confirm this is intended.
    home_xg = float(home_theta.mean())
    away_xg = float(away_theta.mean())

    # simulate the match many times using Poisson-distributed scores
    sampler = np.random if random_seed is None else np.random.default_rng(random_seed)
    home_goals = sampler.poisson(home_xg, n_sims)
    away_goals = sampler.poisson(away_xg, n_sims)

    # share of simulations ending in each outcome
    home_wins = np.mean(home_goals > away_goals)
    draws = np.mean(home_goals == away_goals)
    away_wins = np.mean(home_goals < away_goals)

    return {
        'home_xg': round(home_xg, 2),
        'away_xg': round(away_xg, 2),
        'home_win_prob': round(home_wins * 100, 1),
        'draw_prob': round(draws * 100, 1),
        'away_win_prob': round(away_wins * 100, 1)
    }

def print_prediction(home_team, away_team, prediction):
    """Print a five-line, human-readable summary of a match prediction.

    ``prediction`` is read for the keys ``home_xg``, ``away_xg``,
    ``home_win_prob``, ``draw_prob`` and ``away_win_prob``.
    """
    def _report_lines():
        # Each line is formatted only when it is about to be printed
        yield f"\nMatch Prediction: {home_team} (H) vs {away_team} (A)"
        yield f"Expected Goals: {home_team} {prediction['home_xg']} - {prediction['away_xg']} {away_team}"
        yield f"Win Probability: {home_team}: {prediction['home_win_prob']}%"
        yield f"Draw Probability: {prediction['draw_prob']}%"
        yield f"Win Probability: {away_team}: {prediction['away_win_prob']}%"

    for line in _report_lines():
        print(line)



# Example fixture: predict it and print the summary
home_side, away_side = "Tottenham", "Man United"
prediction = predict_match(home_side, away_side, trace, team_indices)
print_prediction(home_side, away_side, prediction)

NameError: name 'trace' is not defined