In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import sqlite3

# Location of the SQLite database holding the scraped FBref match table.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a config cell / environment variable so the notebook runs elsewhere.
DB_PATH = r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db'

# One row per Premier League 2024-2025 match played after 2025-05-01, taken from
# the home side's perspective (is_home = 1) and restricted to matches that have
# xG recorded for both teams.
MATCH_QUERY = """
                SELECT 
                    team as home_team,
                    opp_team as away_team,
                    summary_xg as home_xg,
                    opp_summary_xg as away_xg,
                    summary_goals as home_goals,
                    opp_summary_goals as away_goals,
                    match_date as date,
                    season
                FROM fbref_match_all_columns
                WHERE division = 'Premier League'
                    AND season IN ('2024-2025')
                    AND summary_xg IS NOT NULL
                    AND opp_summary_xg IS NOT NULL
                    AND is_home = 1
                    AND match_date > '2025-05-01'
                       """

conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(MATCH_QUERY, conn)
finally:
    # Always release the database handle, even if the query raises.
    conn.close()

df

Unnamed: 0,home_team,away_team,home_xg,away_xg,home_goals,away_goals,date,season
0,Tottenham,Brighton,2.0,2.2,1.0,4.0,2025-05-25,2024-2025
1,Bournemouth,Leicester City,1.6,0.3,2.0,0.0,2025-05-25,2024-2025
2,Newcastle Utd,Everton,1.2,1.2,0.0,1.0,2025-05-25,2024-2025
3,Fulham,Manchester City,1.3,3.1,0.0,2.0,2025-05-25,2024-2025
4,Nott'ham Forest,Chelsea,1.2,1.1,0.0,1.0,2025-05-25,2024-2025
5,Liverpool,Crystal Palace,2.1,1.8,1.0,1.0,2025-05-25,2024-2025
6,Southampton,Arsenal,0.6,2.4,1.0,2.0,2025-05-25,2024-2025
7,Wolves,Brentford,1.0,1.4,1.0,1.0,2025-05-25,2024-2025
8,Ipswich Town,West Ham,0.7,1.1,1.0,3.0,2025-05-25,2024-2025
9,Manchester Utd,Aston Villa,2.9,0.4,2.0,0.0,2025-05-25,2024-2025


In [2]:
import pymc as pm
import numpy as np
import pandas as pd
import pytensor.tensor as pt
from scipy.stats import poisson
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from tqdm import tqdm

class BayesianTeamStrengthModel:
    """
    Bayesian attack/defence team-strength model fitted with PyMC.

    Each match contributes a mixture likelihood over possible scorelines
    (weights derived from xG by default) and is down-weighted by an
    exponential time decay relative to the most recent match in the data.

    Attributes set during use:
    - model: the PyMC model created by ``build_model``
    - trace: the InferenceData returned by ``fit``
    - teams: sorted list of team names seen in the data
    - team_to_idx: mapping from team name to its index in ``teams``
    """

    # Column names accepted as the match date, in order of preference.
    _DATE_COLUMNS = ('match_date', 'date')

    def __init__(self, decay_rate=0.1):
        """
        Bayesian model for team strength estimation with weighted scorelines and time decay
        
        Parameters:
        -----------
        decay_rate : float
            Rate of exponential time decay per year (higher = faster decay)
        """
        self.decay_rate = decay_rate
        self.model = None
        self.trace = None
        self.teams = None
        self.team_to_idx = None

    def prepare_data(self, matches_df, show_progress=True):
        """
        Prepare match data for modeling

        Expected columns:
        - home_team, away_team: team names
        - home_xg, away_xg: expected goals
        - home_goals, away_goals: actual goals
        - match_date (or date): date of match; if neither is present every
          match gets a time weight of 1.0
        - scoreline_weights: dict of {(home_goals, away_goals): probability} or None

        Returns a copy of ``matches_df`` with ``home_idx``, ``away_idx`` and
        ``time_weight`` columns added (plus a parsed ``match_date`` column when
        a date column was found). Also sets ``self.teams`` / ``self.team_to_idx``.
        """
        if show_progress:
            print("🔄 Preparing data...")

        # Create team mappings
        all_teams = set(matches_df['home_team'].unique()) | set(matches_df['away_team'].unique())
        self.teams = sorted(all_teams)
        self.team_to_idx = {team: idx for idx, team in enumerate(self.teams)}

        if show_progress:
            print(f"   📊 Found {len(self.teams)} teams, {len(matches_df)} matches")

        # Convert team names to indices (work on a copy so the caller's frame is untouched)
        matches_df = matches_df.copy()
        matches_df['home_idx'] = matches_df['home_team'].map(self.team_to_idx)
        matches_df['away_idx'] = matches_df['away_team'].map(self.team_to_idx)

        # Calculate time weights. Accept either `match_date` or `date`; previously only
        # `match_date` was recognised, so frames using `date` silently lost time decay.
        date_col = next((col for col in self._DATE_COLUMNS if col in matches_df.columns), None)
        if date_col is not None:
            if show_progress:
                print("   ⏰ Calculating time decay weights...")
            matches_df['match_date'] = pd.to_datetime(matches_df[date_col])
            current_date = matches_df['match_date'].max()
            days_ago = (current_date - matches_df['match_date']).dt.days
            # decay_rate is expressed per year, hence the division by 365.25
            matches_df['time_weight'] = np.exp(-self.decay_rate * days_ago / 365.25)
        else:
            matches_df['time_weight'] = 1.0

        if show_progress:
            print("   ✅ Data preparation complete\n")

        return matches_df

    def generate_scoreline_probabilities(self, home_xg, away_xg, max_goals=6):
        """
        Generate scoreline probabilities using Poisson distribution from xG

        Home and away goals are treated as independent Poisson variables with
        means ``home_xg`` and ``away_xg``. The grid is truncated at
        ``max_goals`` per side and renormalised to sum to 1.

        Returns a dict {(home_goals, away_goals): probability}.
        """
        goals = np.arange(max_goals + 1)
        # Outer product of the two marginal pmfs = joint pmf under independence
        grid = np.outer(poisson.pmf(goals, home_xg), poisson.pmf(goals, away_xg))
        grid = grid / grid.sum()  # renormalise the truncated grid

        return {
            (int(h), int(a)): float(grid[h, a])
            for h in goals
            for a in goals
        }

    def _match_scoreline_weights(self, row, columns):
        """Return the {(home_goals, away_goals): weight} mapping for one match row."""
        if 'scoreline_weights' in columns:
            provided = row['scoreline_weights']
            # Only accept a non-empty mapping; None / NaN mean "not provided"
            if provided is not None and hasattr(provided, 'items') and len(provided) > 0:
                return provided

        # Generate from xG if not provided
        if 'home_xg' in columns and 'away_xg' in columns:
            return self.generate_scoreline_probabilities(row['home_xg'], row['away_xg'])

        # Fallback to simple Poisson on the observed score if no xG data
        return {(row['home_goals'], row['away_goals']): 1.0}

    @staticmethod
    def _stack_scorelines(weights_per_match):
        """
        Pack per-match scoreline dicts into three (n_matches, width) float arrays:
        home goals, away goals and weights. Rows with fewer scorelines are padded
        with weight 0 so the padding contributes nothing to the likelihood.
        """
        n_matches = len(weights_per_match)
        width = max((len(scorelines) for scorelines in weights_per_match), default=0)

        home_goals = np.zeros((n_matches, width))
        away_goals = np.zeros((n_matches, width))
        weights = np.zeros((n_matches, width))

        for i, scorelines in enumerate(weights_per_match):
            for j, ((h_goals, a_goals), weight) in enumerate(scorelines.items()):
                home_goals[i, j] = h_goals
                away_goals[i, j] = a_goals
                weights[i, j] = weight

        return home_goals, away_goals, weights

    def _posterior_strengths(self):
        """
        Return posterior samples of the *effective* attack and defence effects,
        i.e. the centred effects multiplied by their sigma, exactly as they enter
        the goal rates in ``build_model``. Each array is (n_samples, n_teams).
        """
        posterior = self.trace.posterior
        n_teams = len(self.teams)

        scaled = []
        for name in ('attack', 'defense'):
            # posterior arrays are laid out as (chain, draw, team) / (chain, draw)
            centred = np.asarray(posterior[name]).reshape(-1, n_teams)
            sigma = np.asarray(posterior[f'{name}_sigma']).reshape(-1, 1)
            scaled.append(centred * sigma)

        return scaled[0], scaled[1]

    def build_model(self, matches_df, show_progress=True):
        """
        Build the PyMC model

        Stores the model on ``self.model`` and returns it.

        Raises
        ------
        ValueError
            If ``matches_df`` contains no matches.
        """
        if show_progress:
            print("🏗️  Building Bayesian model...")

        matches_df = self.prepare_data(matches_df, show_progress=show_progress)
        n_teams = len(self.teams)
        n_matches = len(matches_df)
        if n_matches == 0:
            raise ValueError("matches_df contains no matches")

        home_idx = matches_df['home_idx'].to_numpy()
        away_idx = matches_df['away_idx'].to_numpy()
        time_weight = matches_df['time_weight'].to_numpy(dtype=float)

        with pm.Model() as model:
            if show_progress:
                print("   📈 Setting up priors and parameters...")

            # Team strength parameters
            attack_raw = pm.Normal('attack_raw', mu=0, sigma=1, shape=n_teams)
            defense_raw = pm.Normal('defense_raw', mu=0, sigma=1, shape=n_teams)

            # Sum-to-zero constraints
            attack = pm.Deterministic('attack', attack_raw - pt.mean(attack_raw))
            defense = pm.Deterministic('defense', defense_raw - pt.mean(defense_raw))

            # Home advantage
            home_advantage = pm.Normal('home_advantage', mu=0, sigma=0.5)

            # Hyperpriors for variance
            attack_sigma = pm.HalfNormal('attack_sigma', sigma=1)
            defense_sigma = pm.HalfNormal('defense_sigma', sigma=1)

            # Scale team strengths
            attack_scaled = attack * attack_sigma
            defense_scaled = defense * defense_sigma

            if show_progress:
                print("   ⚽ Computing expected goals...")

            # Expected goals for each match (Dixon-Coles parameterization)
            # Defense represents defensive weakness (higher = more goals conceded)
            # NOTE(review): there is no global intercept; with both effects centred the
            # away log-rate averages 0 (about 1 goal) - consider adding a baseline term.
            home_eta = attack_scaled[home_idx] + defense_scaled[away_idx] + home_advantage
            away_eta = attack_scaled[away_idx] + defense_scaled[home_idx]

            home_lambda = pm.Deterministic('home_lambda', pt.exp(home_eta))
            away_lambda = pm.Deterministic('away_lambda', pt.exp(away_eta))

            if show_progress:
                print("   🎯 Building likelihood function...")

            # Collect the scoreline weights of every match (pure numpy work)
            columns = set(matches_df.columns)
            match_iterator = tqdm(range(n_matches), desc="   Processing matches",
                                  disable=not show_progress, leave=False)
            weights_per_match = [
                self._match_scoreline_weights(matches_df.iloc[i], columns)
                for i in match_iterator
            ]
            home_k, away_k, weights = self._stack_scorelines(weights_per_match)

            # Vectorised Poisson log-pmf for every (match, scoreline) pair:
            #   log P(k | lambda) = k * log(lambda) - lambda - log(k!)
            log_factorials = pt.gammaln(home_k + 1) + pt.gammaln(away_k + 1)
            joint_logp = (
                home_k * home_eta[:, None] - home_lambda[:, None]
                + away_k * away_eta[:, None] - away_lambda[:, None]
                - log_factorials
            )

            # Mixture over scorelines, then log; small constant for numerical stability
            match_ll = pt.sum(weights * pt.exp(joint_logp), axis=1)
            match_logp = time_weight * pt.log(match_ll + 1e-10)

            # Custom likelihood
            pm.Potential('match_likelihood', pt.sum(match_logp))

            if show_progress:
                print("   ✅ Model built successfully\n")

        self.model = model
        return model

    def fit(self, matches_df, draws=2000, tune=1000, chains=2, show_progress=True,
            random_seed=None):
        """
        Fit the model using MCMC sampling

        The model is always rebuilt from ``matches_df`` so that a second call
        with different data does not silently reuse a stale model.

        Parameters
        ----------
        matches_df : DataFrame of matches (see ``prepare_data``)
        draws, tune, chains : forwarded to ``pm.sample``
        show_progress : print status messages and show the sampler progress bar
        random_seed : optional seed forwarded to ``pm.sample`` for reproducibility

        Returns the sampled trace (also stored on ``self.trace``).
        """
        self.build_model(matches_df, show_progress=show_progress)

        if show_progress:
            print("🔥 Starting MCMC sampling...")
            print(f"   ⛓️  Chains: {chains}")
            print(f"   🎯 Tuning steps: {tune}")
            print(f"   📊 Sampling steps: {draws}")
            print()

        with self.model:
            # PyMC automatically shows progress bars for sampling
            self.trace = pm.sample(
                draws=draws,
                tune=tune,
                chains=chains,
                target_accept=0.9,
                return_inferencedata=True,
                progressbar=show_progress,  # This enables/disables the built-in progress bar
                random_seed=random_seed
            )

        if show_progress:
            print("\n🎉 Sampling completed successfully!")
            print("📋 Computing summary statistics...")

        return self.trace

    def get_team_strengths(self, show_progress=True):
        """
        Extract team strength estimates from trace

        Returns a DataFrame with columns ``team``, ``attack``, ``defense`` and
        ``overall`` sorted by ``overall`` descending. ``attack`` / ``defense``
        are posterior means of the sigma-scaled effects (the log-rate
        contributions used by the likelihood); ``defense`` is defensive
        weakness, so ``overall = attack - defense``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.trace is None:
            raise ValueError("Model must be fitted first")

        if show_progress:
            print("📊 Extracting team strengths...")

        attack_samples, defense_samples = self._posterior_strengths()
        attack_means = attack_samples.mean(axis=0)
        defense_means = defense_samples.mean(axis=0)

        team_strengths = pd.DataFrame({
            'team': self.teams,
            'attack': attack_means,
            'defense': defense_means,  # Represents defensive weakness
            'overall': attack_means - defense_means  # High attack, low defensive weakness = good
        })

        if show_progress:
            print("   ✅ Team strengths computed\n")

        return team_strengths.sort_values('overall', ascending=False)

    def predict_match(self, home_team, away_team, n_samples=1000, show_progress=True,
                      random_seed=None):
        """
        Predict match outcome between two teams

        Draws ``n_samples`` posterior samples (with replacement), simulates one
        Poisson scoreline per sample and summarises the simulated results.

        Parameters
        ----------
        home_team, away_team : team names present in the fitted data
        n_samples : number of simulated scorelines
        show_progress : print status messages
        random_seed : optional seed; when None the global ``np.random`` state is used

        Returns a dict with ``home_win_prob``, ``draw_prob``, ``away_win_prob``,
        ``expected_home_goals`` and ``expected_away_goals``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        KeyError
            If either team was not part of the fitted data.
        """
        if self.trace is None:
            raise ValueError("Model must be fitted first")

        for team in (home_team, away_team):
            if team not in self.team_to_idx:
                raise KeyError(f"Unknown team: {team!r}")

        if show_progress:
            print(f"🔮 Predicting {home_team} vs {away_team}...")

        home_idx = self.team_to_idx[home_team]
        away_idx = self.team_to_idx[away_team]

        # Sample from posterior (sigma-scaled effects, matching the likelihood)
        attack_samples, defense_samples = self._posterior_strengths()
        home_adv_samples = np.asarray(self.trace.posterior['home_advantage']).reshape(-1)

        # Legacy global RNG by default (so np.random.seed still applies), or a seeded one
        rng = np.random if random_seed is None else np.random.RandomState(random_seed)

        # Random sample selection
        sample_idx = rng.choice(len(attack_samples), n_samples)

        # Dixon-Coles parameterization: defense is defensive weakness
        home_lambda = np.exp(attack_samples[sample_idx, home_idx] +
                             defense_samples[sample_idx, away_idx] +
                             home_adv_samples[sample_idx])
        away_lambda = np.exp(attack_samples[sample_idx, away_idx] +
                             defense_samples[sample_idx, home_idx])

        home_goals = rng.poisson(home_lambda)
        away_goals = rng.poisson(away_lambda)

        # Calculate probabilities
        result = {
            'home_win_prob': np.mean(home_goals > away_goals),
            'draw_prob': np.mean(home_goals == away_goals),
            'away_win_prob': np.mean(home_goals < away_goals),
            'expected_home_goals': np.mean(home_goals),
            'expected_away_goals': np.mean(away_goals)
        }

        if show_progress:
            print("   ✅ Prediction complete\n")

        return result




In [3]:
# Super fast for debugging/testing: a deliberately short chain, not suitable for inference
model = BayesianTeamStrengthModel()

model.fit(df, draws=100, tune=100)

# The class exposes `get_team_strengths`; `get_team_ratings` does not exist
# and raised AttributeError.
ratings = model.get_team_strengths()

ratings

🏗️  Building Bayesian model...
🔄 Preparing data...
   📊 Found 20 teams, 40 matches
   ✅ Data preparation complete

   📈 Setting up priors and parameters...
   ⚽ Computing expected goals...
   🎯 Building likelihood function...


                                                                      

   ✅ Model built successfully

🔥 Starting MCMC sampling...
   ⛓️  Chains: 2
   🎯 Tuning steps: 100
   📊 Sampling steps: 100



KeyboardInterrupt: 