In [13]:

import pymc as pm

import pandas as pd
import pymc as pm
import arviz as az
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from scipy.stats import poisson
import sqlite3

# Location of the SQLite database that holds the match table queried below.
# NOTE(review): absolute, machine-specific path — consider reading it from
# configuration or an environment variable so the notebook runs elsewhere.
DB_PATH = r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db'

# One row per (team, opponent) record for the two English divisions since
# 2024-01-01, keeping only rows where both sides have an xG value.
# NOTE(review): the displayed output shows every fixture twice, once from each
# side's perspective (e.g. Tottenham/Brighton and Brighton/Tottenham on the
# same date), so "home_team" is really just "team" here. Confirm whether a
# home/away flag should be used to keep a single row per fixture.
MATCH_QUERY = """
    SELECT
        team as home_team,
        summary_xg as home_xg,
        summary_goals as home_goals,
        keeper_psxg as home_psxg,
        opp_team as away_team,
        opp_summary_xg as away_xg,
        opp_summary_goals as away_goals,
        opp_keeper_psxg as away_psxg,
        match_date as date,
        division
    FROM fbref_match_all_columns
    WHERE division IN ('Premier League',  'Championship')
        AND match_date > '2024-01-01'
        AND summary_xg IS NOT NULL
        AND opp_summary_xg IS NOT NULL
"""

conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(MATCH_QUERY, conn)
finally:
    # Release the database handle even when the query fails.
    conn.close()

# Age of each match in days, measured from the most recent match in the data
# (not from today's date). Parse the date column once and reuse it.
match_dates = pd.to_datetime(df["date"])
df["days_ago"] = (match_dates.max() - match_dates).dt.days

df

Unnamed: 0,home_team,home_xg,home_goals,home_psxg,away_team,away_xg,away_goals,away_psxg,date,division,days_ago
0,Tottenham,2.0,1.0,1.5,Brighton,2.2,4.0,4.1,2025-05-25,Premier League,0
1,Brighton,2.2,4.0,4.1,Tottenham,2.0,1.0,1.5,2025-05-25,Premier League,0
2,Bournemouth,1.6,2.0,2.1,Leicester City,0.3,0.0,0.0,2025-05-25,Premier League,0
3,Leicester City,0.3,0.0,0.0,Bournemouth,1.6,2.0,2.1,2025-05-25,Premier League,0
4,Newcastle Utd,1.2,0.0,1.7,Everton,1.2,1.0,1.1,2025-05-25,Premier League,0
...,...,...,...,...,...,...,...,...,...,...,...
2725,Watford,1.1,3.0,1.6,Millwall,2.0,2.0,1.7,2024-08-10,Championship,288
2726,Preston,0.6,0.0,0.7,Sheffield Utd,1.0,2.0,0.9,2024-08-09,Championship,289
2727,Sheffield Utd,1.0,2.0,0.9,Preston,0.6,0.0,0.7,2024-08-09,Championship,289
2728,Blackburn,2.1,4.0,3.4,Derby County,1.1,2.0,2.0,2024-08-09,Championship,289


In [14]:
def scoreline_probability_matrix(home_xg, away_xg, max_goals=9):
    """
    Joint probability table of scorelines under independent Poisson goals.

    Each side's goal count is modelled as Poisson with its xG as the mean,
    evaluated for 0..max_goals goals. Scores above max_goals are not
    included, so the table sums to slightly less than 1.

    Returns: (max_goals + 1) x (max_goals + 1) array where
    [i, j] = P(home_goals=i) * P(away_goals=j)
    """
    goal_counts = range(max_goals + 1)

    # Marginal distribution of goals for each side
    home_pmf = poisson.pmf(goal_counts, home_xg)
    away_pmf = poisson.pmf(goal_counts, away_xg)

    # Column vector times row vector -> every (home, away) combination
    return home_pmf[:, np.newaxis] * away_pmf[np.newaxis, :]

def create_weighted_scoreline_data(df, max_goals=9, actual_scoreline_boost=0.3,
                                   decay_rate=0.0001):
    """
    Expand every match into one weighted row per candidate scoreline.

    For each match, all (max_goals + 1)^2 scorelines are emitted. A scoreline's
    weight is

        exp(-decay_rate * days_ago)
            * ((1 - actual_scoreline_boost) * P(scoreline | xG)
               + actual_scoreline_boost   [only on the observed scoreline])

    where P(scoreline | xG) comes from scoreline_probability_matrix.

    Parameters
    ----------
    df : DataFrame with columns home_team, away_team, home_xg, away_xg,
        home_goals, away_goals, days_ago and division. Its index is used as
        match_id.
    max_goals : largest goal count per side included in the grid.
    actual_scoreline_boost : extra weight placed on the observed scoreline.
    decay_rate : exponential time-decay rate per day (default keeps the
        previously hard-coded 0.0001).

    Returns
    -------
    DataFrame with columns match_id, home_team, away_team, home_goals,
    away_goals, weight, days_ago, is_actual, division. Rows are ordered by
    match, then home goals, then away goals. An empty input yields an empty
    frame that still carries these columns.

    Notes
    -----
    If the observed score is missing or exceeds max_goals on either side, no
    row is flagged is_actual and the boost is not applied for that match.
    """
    output_columns = [
        'match_id', 'home_team', 'away_team', 'home_goals', 'away_goals',
        'weight', 'days_ago', 'is_actual', 'division',
    ]

    n_matches = len(df)
    if n_matches == 0:
        return pd.DataFrame(columns=output_columns)

    # Scoreline grid flattened in row-major order: home goals vary slowest,
    # matching the layout of the flattened probability matrix.
    n_scores = max_goals + 1
    cells_per_match = n_scores * n_scores
    home_grid = np.repeat(np.arange(n_scores), n_scores)
    away_grid = np.tile(np.arange(n_scores), n_scores)

    # Share of the weight that is spread over scorelines according to xG
    remaining_weight = 1.0 - actual_scoreline_boost

    # Time decay depends only on the match, so compute it once per match
    time_weights = np.exp(-decay_rate * df['days_ago'].to_numpy(dtype=float))

    weight_blocks = []
    actual_blocks = []
    per_match = zip(df['home_xg'], df['away_xg'],
                    df['home_goals'], df['away_goals'], time_weights)
    for home_xg, away_xg, actual_home, actual_away, time_weight in per_match:
        prob_matrix = scoreline_probability_matrix(home_xg, away_xg, max_goals)

        # Flag the grid cell that matches the observed result (if any)
        is_actual = (home_grid == actual_home) & (away_grid == actual_away)

        xg_weight = remaining_weight * np.asarray(prob_matrix).ravel()
        final_xg_weight = np.where(
            is_actual, actual_scoreline_boost + xg_weight, xg_weight
        )

        weight_blocks.append(time_weight * final_xg_weight)
        actual_blocks.append(is_actual)

    def per_scoreline(values):
        # Repeat one value per match across all of that match's scorelines
        return np.repeat(np.asarray(values), cells_per_match)

    return pd.DataFrame(
        {
            'match_id': per_scoreline(df.index),
            'home_team': per_scoreline(df['home_team']),
            'away_team': per_scoreline(df['away_team']),
            'home_goals': np.tile(home_grid, n_matches),
            'away_goals': np.tile(away_grid, n_matches),
            'weight': np.concatenate(weight_blocks),
            'days_ago': per_scoreline(df['days_ago']),
            'is_actual': np.concatenate(actual_blocks),
            'division': per_scoreline(df['division']),
        },
        columns=output_columns,
    )


In [15]:
# Expand each match into one weighted row per candidate scoreline.
# `df` is rebound to the expanded frame because later cells read df["weight"].
# The guard keeps a handle on the match-level frame so that re-running this
# cell does not feed the already-expanded frame back into the expansion.
if "home_xg" in df.columns:
    matches_df = df
df = create_weighted_scoreline_data(matches_df, max_goals=9)

# Index teams over BOTH columns. Building the category list from home_team
# alone would give code -1 to any team that only appears as away_team, and -1
# silently indexes the last team's parameters in the model.
teams = pd.unique(pd.concat([df["home_team"], df["away_team"]], ignore_index=True))
n_teams = len(teams)
home_idx = pd.Categorical(df["home_team"], categories=teams).codes
away_idx = pd.Categorical(df["away_team"], categories=teams).codes
assert (home_idx >= 0).all() and (away_idx >= 0).all(), "team missing from index"

# Fix the division order explicitly instead of relying on row order: the
# query has no ORDER BY, and the model cell's division priors are positional
# two-element lists.
# NOTE(review): presumably index 0 = Premier League and index 1 = Championship
# in those priors (that is the order seen in the displayed data) — confirm.
divisions = np.array(["Premier League", "Championship"], dtype=object)
n_divisions = len(divisions)
division_idx = pd.Categorical(df["division"], categories=divisions).codes
assert (division_idx >= 0).all(), "unexpected division in data"

# Candidate scorelines (not only the observed ones) as plain arrays
home_goals_obs = df["home_goals"].to_numpy()
away_goals_obs = df["away_goals"].to_numpy()

In [None]:
# Sampler settings, named so they can be tuned in one place.
# NOTE(review): 100 draws after 50 tuning steps is very short — presumably a
# smoke-test configuration; increase before relying on the posterior.
N_DRAWS = 100
N_TUNE = 50
N_CHAINS = 4
RANDOM_SEED = 42  # fixed so repeated runs give the same draws

with pm.Model() as model:
    # Unconstrained per-team attack / defence parameters
    att_str_raw = pm.Normal("att_str_raw", mu=0, sigma=1, shape=n_teams)
    def_str_raw = pm.Normal("def_str_raw", mu=0, sigma=1, shape=n_teams)

    # Division effects. The prior means/sigmas are positional two-element
    # lists, so they rely on `divisions` having exactly two entries in a
    # known order.
    # NOTE(review): presumably index 0 = Premier League, index 1 =
    # Championship — verify against how `divisions` is built.
    div_att_effect = pm.Normal("div_att_effect",
                              mu=[0, -0.22],
                              sigma=[0.05, 0.1],
                              shape=n_divisions)

    div_def_effect = pm.Normal("div_def_effect",
                              mu=[0, 0.26],
                              sigma=[0.05, 0.1],
                              shape=n_divisions)

    # Center the raw team parameters so each set averages to zero
    att_str = pm.Deterministic("att_str", att_str_raw - pm.math.mean(att_str_raw))
    def_str = pm.Deterministic("def_str", def_str_raw - pm.math.mean(def_str_raw))

    # Apply division effects to centered team strengths
    effective_home_att = att_str[home_idx] + div_att_effect[division_idx]
    effective_home_def = def_str[home_idx] + div_def_effect[division_idx]
    effective_away_att = att_str[away_idx] + div_att_effect[division_idx]
    effective_away_def = def_str[away_idx] + div_def_effect[division_idx]

    # Home advantage by division (constrained to be non-negative)
    home_adv_by_div = pm.HalfNormal("home_adv_by_div", sigma=0.25, shape=n_divisions)

    # Log-linear scoring rates; only the home side receives the home advantage
    home_goals_mu = pm.math.exp(effective_home_att + effective_away_def + home_adv_by_div[division_idx])
    away_goals_mu = pm.math.exp(effective_away_att + effective_home_def)

    weights = pm.ConstantData("weights", df["weight"].values)

    # Poisson log-probability of every candidate scoreline row
    home_logp = pm.logp(pm.Poisson.dist(mu=home_goals_mu), np.asarray(home_goals_obs))
    away_logp = pm.logp(pm.Poisson.dist(mu=away_goals_mu), np.asarray(away_goals_obs))

    # Weighted log-likelihood: each row contributes in proportion to its weight
    pm.Potential("weighted_home_goals", pm.math.sum(weights * home_logp))
    pm.Potential("weighted_away_goals", pm.math.sum(weights * away_logp))

    # NOTE(review): the saved output of this cell shows a ValueError about the
    # chain-drawing method raised during sampling; its cause is not visible
    # here — confirm the installed pymc / blackjax versions work together.
    trace = pm.sample(
        N_DRAWS,
        tune=N_TUNE,
        chains=N_CHAINS,
        cores=4,
        nuts_sampler="blackjax",
        random_seed=RANDOM_SEED,
        return_inferencedata=True,
    )




ValueError: Only supporting the following methods to draw chains: "parallel" or "vectorized"

In [None]:
# Summary statistics for all parameters, computed once. Previously the summary
# was built twice and the first result was discarded without being shown.
summary_df = az.summary(trace)

# Trace plots for all parameters
az.plot_trace(trace, var_names=[
    "att_str_raw",
    "def_str_raw",
    "att_str",
    "def_str",
    "div_att_effect",
    "div_def_effect",
    "home_adv_by_div"
])


print(summary_df)

In [None]:

# Posterior summaries per team, relabelled with team names (the rows follow
# the order of `teams`, which is how the model indexed the teams).
ranking_columns = ['mean', 'hdi_3%', 'hdi_97%']

# Attack: strongest first
att_summary = az.summary(trace, var_names=["att_str"])
att_summary.index = teams
print("Attack Strength Rankings:")
attack_ranking = att_summary[ranking_columns].sort_values("mean", ascending=False)
print(attack_ranking)

# Defense: lowest value first
def_summary = az.summary(trace, var_names=["def_str"])
def_summary.index = teams
print("\nDefense Strength Rankings (lower is better):")
defense_ranking = def_summary[ranking_columns].sort_values("mean", ascending=True)
print(defense_ranking)



In [None]:
import numpy as np

# att_str / def_str are the raw team parameters minus their mean, so the
# average team sits at exactly 0 on both scales.
# NOTE(review): the division effects (div_att_effect / div_def_effect) are not
# included in these baselines — confirm that is intended.
att_league_avg = 0.0
def_league_avg = 0.0

# The model registers home advantage per division under "home_adv_by_div";
# there is no variable called "home_adv", so asking for it fails.
# `.iloc[0]` takes the entry for the first division in `divisions`.
home_adv_mean = az.summary(trace, var_names=["home_adv_by_div"])['mean'].iloc[0]

print(f"League average attack strength: {att_league_avg:.3f}")
print(f"League average defense strength: {def_league_avg:.3f}")
print(f"Home advantage: {home_adv_mean:.3f}")

def convert_to_expected_goals_constrained(att_summary, def_summary, home_adv,
                                          att_league_avg=0.0, def_league_avg=0.0):
    """
    Turn posterior-mean team strengths into expected goals against an
    average opponent.

    Rates use the log-linear form exp(attack + defence [+ home_adv]); the
    home-advantage term is added for whichever side is playing at home.

    Parameters
    ----------
    att_summary, def_summary : DataFrames indexed by team name with a 'mean'
        column (attack and defence strength respectively). Teams are taken
        from att_summary's index; def_summary is aligned to it by label.
    home_adv : scalar home-advantage term on the log scale.
    att_league_avg, def_league_avg : strength of the average opponent on the
        log scale. Defaults of 0.0 correspond to centred strengths.

    Returns
    -------
    DataFrame with one row per team and columns Team, Goals_For_Home,
    Goals_For_Away, Goals_Against_Home, Goals_Against_Away, Avg_Goals_For,
    Avg_Goals_Against, Avg_Goal_Diff.
    """
    team_names = att_summary.index
    team_att = att_summary['mean'].to_numpy(dtype=float)
    # Align defence to the attack table by team label, not by position
    team_def = def_summary.loc[team_names, 'mean'].to_numpy(dtype=float)

    # Expected goals FOR each team vs league average opponent
    goals_for_home = np.exp(team_att + def_league_avg + home_adv)
    goals_for_away = np.exp(team_att + def_league_avg)

    # Expected goals AGAINST each team vs league average opponent; when the
    # team is away, the opponent is the one receiving the home advantage
    goals_against_home = np.exp(att_league_avg + team_def)
    goals_against_away = np.exp(att_league_avg + team_def + home_adv)

    avg_goals_for = (goals_for_home + goals_for_away) / 2
    avg_goals_against = (goals_against_home + goals_against_away) / 2

    return pd.DataFrame({
        'Team': list(team_names),
        'Goals_For_Home': goals_for_home,
        'Goals_For_Away': goals_for_away,
        'Goals_Against_Home': goals_against_home,
        'Goals_Against_Away': goals_against_away,
        'Avg_Goals_For': avg_goals_for,
        'Avg_Goals_Against': avg_goals_against,
        'Avg_Goal_Diff': avg_goals_for - avg_goals_against,
    })

# Convert posterior-mean strengths into expected goals for every team
goals_df = convert_to_expected_goals_constrained(att_summary, def_summary, home_adv_mean)

# Show the averaged figures, best expected goal difference first
print("\nExpected Goals vs League Average Team:")
summary_columns = ['Team', 'Avg_Goals_For', 'Avg_Goals_Against', 'Avg_Goal_Diff']
goals_ranked = goals_df[summary_columns].sort_values('Avg_Goal_Diff', ascending=False)
print(goals_ranked)