In [4]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import sqlite3

# Location of the local SQLite database holding the scraped FBref match data.
DB_PATH = r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db'

# One row per match from the home side's perspective (is_home = 1), restricted
# to 2024-2025 Premier League fixtures that have xG recorded for both teams.
MATCH_QUERY = """
                SELECT 
                    team as home_team,
                    opp_team as away_team,
                    summary_xg as home_xg,
                    opp_summary_xg as away_xg,
                    summary_goals as home_goals,
                    opp_summary_goals as away_goals,
                    match_date as date,
                    season
                FROM fbref_match_all_columns
                WHERE division = 'Premier League'
                    AND season IN ('2024-2025')
                    AND summary_xg IS NOT NULL
                    AND opp_summary_xg IS NOT NULL
                    AND is_home = 1
                    --AND match_date > '2025-05-01'
                       """

conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(MATCH_QUERY, conn)
finally:
    # Close the handle even when the query fails, so a bad run does not leave
    # the database file open for the rest of the kernel session.
    conn.close()

df

Unnamed: 0,home_team,away_team,home_xg,away_xg,home_goals,away_goals,date,season
0,Tottenham,Brighton,2.0,2.2,1.0,4.0,2025-05-25,2024-2025
1,Bournemouth,Leicester City,1.6,0.3,2.0,0.0,2025-05-25,2024-2025
2,Newcastle Utd,Everton,1.2,1.2,0.0,1.0,2025-05-25,2024-2025
3,Fulham,Manchester City,1.3,3.1,0.0,2.0,2025-05-25,2024-2025
4,Nott'ham Forest,Chelsea,1.2,1.1,0.0,1.0,2025-05-25,2024-2025
...,...,...,...,...,...,...,...,...
375,Nott'ham Forest,Bournemouth,1.4,1.2,1.0,1.0,2024-08-17,2024-2025
376,Everton,Brighton,0.5,1.4,0.0,3.0,2024-08-17,2024-2025
377,Newcastle Utd,Southampton,0.3,1.8,1.0,0.0,2024-08-17,2024-2025
378,Ipswich Town,Liverpool,0.5,2.6,0.0,2.0,2024-08-17,2024-2025


In [5]:
import pandas as pd
import numpy as np
import pymc as pm
import pytensor.tensor as pt
from sklearn.preprocessing import LabelEncoder
from scipy.stats import poisson
import matplotlib.pyplot as plt

def calculate_scoreline_probabilities(home_xg, away_xg, max_goals=5):
    """
    Calculate probability of each possible scoreline given xG values

    Home and away goal counts are treated as independent Poisson variables
    whose means are the supplied xG values.

    Parameters:
    home_xg, away_xg: Expected goals for home and away teams
    max_goals: Maximum goals (per team) to consider for probability calculation

    Returns:
    Tuple (scoreline_probs, residual_prob):
      scoreline_probs: dict mapping (home_goals, away_goals) -> probability for
                       every scoreline with 0..max_goals goals per side
      residual_prob:   1 minus the sum of scoreline_probs, i.e. the probability
                       mass of scorelines outside that grid
    """
    goal_counts = range(max_goals + 1)

    # Evaluate each side's pmf once for all goal counts instead of calling
    # poisson.pmf twice per grid cell inside the nested loop.
    home_pmf = poisson.pmf(np.arange(max_goals + 1), home_xg)
    away_pmf = poisson.pmf(np.arange(max_goals + 1), away_xg)

    scoreline_probs = {
        (home_goals, away_goals): home_pmf[home_goals] * away_pmf[away_goals]
        for home_goals in goal_counts
        for away_goals in goal_counts
    }

    # Handle higher goal counts with residual probability
    residual_prob = 1 - sum(scoreline_probs.values())

    return scoreline_probs, residual_prob

def prepare_data_with_xg(df, home_col='home_team', away_col='away_team', 
                        home_goals='home_goals', away_goals='away_goals',
                        home_xg='home_xg', away_xg='away_xg',
                        date_col='date', decay_rate=0.0056, max_goals=9):
    """
    Prepare data with both time decay and xG-based scoreline weighting

    Parameters:
    df: DataFrame with match data including xG (not modified)
    home_col, away_col: column names for the home / away team names
    home_goals, away_goals: column names for actual goals scored
    home_xg, away_xg: column names for expected goals
    date_col: column name for the match date (anything pd.to_datetime accepts)
    decay_rate: per-day exponential decay; weight = exp(-decay_rate * days_ago)
    max_goals: Maximum goals to consider for xG probability calculation

    Returns:
    dict with keys 'home_team', 'away_team' (integer team indices),
    'home_goals', 'away_goals', 'time_weights', 'xg_weights',
    'combined_weights', 'n_teams', 'teams', 'team_encoder', 'days_ago',
    'dates' and 'xg_details' (one dict per match). All per-match entries are
    ordered by ascending date.

    Raises:
    ValueError: if df contains no rows.
    """
    if len(df) == 0:
        raise ValueError("prepare_data_with_xg requires at least one match")

    # Parse dates BEFORE sorting: sorting the raw column first would order
    # string dates lexicographically, which is only chronological for ISO
    # formatted strings. A stable sort keeps same-day matches in input order.
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col, kind="stable").reset_index(drop=True)

    # Encode teams
    teams = sorted(set(df[home_col].unique()) | set(df[away_col].unique()))
    team_encoder = LabelEncoder()
    team_encoder.fit(teams)

    home_team_idx = team_encoder.transform(df[home_col])
    away_team_idx = team_encoder.transform(df[away_col])

    # Time weights
    most_recent_date = df[date_col].max()
    days_ago = (most_recent_date - df[date_col]).dt.days
    time_weights = np.exp(-decay_rate * days_ago)

    # Calculate xG-based scoreline weights
    xg_weights = []
    xg_details = []

    # Iterate over plain Python values column-wise; this avoids the per-row
    # Series construction (and dtype upcasting) that DataFrame.iterrows does.
    match_rows = zip(df[home_goals].tolist(), df[away_goals].tolist(),
                     df[home_xg].tolist(), df[away_xg].tolist())

    for home_scored, away_scored, home_xg_val, away_xg_val in match_rows:
        actual_score = (home_scored, away_scored)

        # Get probability of actual scoreline given xG
        scoreline_probs, residual = calculate_scoreline_probabilities(
            home_xg_val, away_xg_val, max_goals
        )

        # Probability of the actual scoreline. Float goal counts such as 1.0
        # hash and compare equal to the integer grid keys, so they are found.
        if actual_score in scoreline_probs:
            scoreline_prob = scoreline_probs[actual_score]
        else:
            # For high-scoring games, use residual probability divided by remaining combinations
            scoreline_prob = residual / 100  # Rough approximation

        # Convert probability to weight - higher probability = higher weight.
        # The weight is the scoreline probability relative to the average
        # probability of a cell in the (max_goals+1) x (max_goals+1) grid.
        # NOTE(review): these weights are not normalised to mean 1 (the
        # recorded run shows values from 0 up to ~33), so they also rescale the
        # total amount of evidence in the likelihood - confirm this is intended.
        xg_weight = scoreline_prob / np.mean(list(scoreline_probs.values()))

        xg_weights.append(xg_weight)
        xg_details.append({
            'actual_score': actual_score,
            'home_xg': home_xg_val,
            'away_xg': away_xg_val,
            'scoreline_prob': scoreline_prob,
            'xg_weight': xg_weight
        })

    xg_weights = np.array(xg_weights)

    # Combined weights: time decay * xG weighting
    combined_weights = time_weights.values * xg_weights

    # Print some examples
    print("Time and xG weighting examples:")
    print(f"Most recent match: time_weight={time_weights.iloc[-1]:.4f}, xg_weight={xg_weights[-1]:.4f}, combined={combined_weights[-1]:.4f}")

    # Show some interesting cases
    high_xg_weight_idx = np.argmax(xg_weights)
    low_xg_weight_idx = np.argmin(xg_weights)

    print("\nHighest xG weight match (most expected result):")
    detail = xg_details[high_xg_weight_idx]
    print(f"  Score: {detail['actual_score']}, xG: {detail['home_xg']:.1f}-{detail['away_xg']:.1f}, weight: {detail['xg_weight']:.4f}")

    print("\nLowest xG weight match (most unexpected result):")
    detail = xg_details[low_xg_weight_idx]
    print(f"  Score: {detail['actual_score']}, xG: {detail['home_xg']:.1f}-{detail['away_xg']:.1f}, weight: {detail['xg_weight']:.4f}")

    return {
        'home_team': home_team_idx,
        'away_team': away_team_idx,
        'home_goals': df[home_goals].values,
        'away_goals': df[away_goals].values,
        'time_weights': time_weights.values,
        'xg_weights': xg_weights,
        'combined_weights': combined_weights,
        'n_teams': len(teams),
        'teams': teams,
        'team_encoder': team_encoder,
        'days_ago': days_ago.values,
        'dates': df[date_col].values,
        'xg_details': xg_details
    }

def build_model_with_xg_weights(data):
    """
    Build the Bayesian model with both time decay and xG weighting

    The goal counts enter as ordinary observed Poisson variables, which
    contribute each match's log-probability once. A potential then adds
    (weight - 1) * log-probability per match, so the total contribution of a
    match to the joint log-density is weight * log-probability.

    Parameters:
    data: dict providing 'n_teams', 'home_team', 'away_team', 'home_goals',
          'away_goals' and 'combined_weights'

    Returns:
    The constructed pm.Model (not yet sampled)
    """
    n_teams = data['n_teams']
    home_idx = data['home_team']
    away_idx = data['away_team']
    observed_home = data['home_goals']
    observed_away = data['away_goals']

    with pm.Model() as weighted_model:
        # Home advantage (log scale)
        home_adv = pm.Normal("home_adv", mu=0, sigma=1)

        # Attack ratings, with the precision drawn from a Gamma prior
        tau_att = pm.Gamma("tau_att", alpha=1, beta=1)
        atts_star = pm.Normal(
            "atts_star", mu=0, sigma=1/pt.sqrt(tau_att), shape=n_teams
        )

        # Defense ratings, same structure as the attack ratings
        tau_def = pm.Gamma("tau_def", alpha=1, beta=1)
        defs_star = pm.Normal(
            "defs_star", mu=0, sigma=1/pt.sqrt(tau_def), shape=n_teams
        )

        # Centre both rating vectors so they sum to zero
        atts = pm.Deterministic("atts", atts_star - pt.mean(atts_star))
        defs = pm.Deterministic("defs", defs_star - pt.mean(defs_star))

        # Scoring rates: log-linear in attack, opposing defense and home advantage
        home_rate = pt.exp(home_adv + atts[home_idx] + defs[away_idx])
        away_rate = pt.exp(atts[away_idx] + defs[home_idx])

        # Unweighted Poisson likelihood on the observed goals
        home_goals_rv = pm.Poisson("home_goals", mu=home_rate, observed=observed_home)
        away_goals_rv = pm.Poisson("away_goals", mu=away_rate, observed=observed_away)

        # Per-match weights (time * xG)
        match_weights = pm.ConstantData("combined_weights", data['combined_weights'])

        # Element-wise log-probabilities of the observed goals
        home_logp = pm.logp(home_goals_rv, observed_home)
        away_logp = pm.logp(away_goals_rv, observed_away)

        # Top the likelihood up from 1x to weight-x per match
        extra_weight = match_weights - 1
        pm.Potential(
            "weight_adjustment",
            pt.sum(extra_weight * home_logp) + pt.sum(extra_weight * away_logp),
        )

    return weighted_model

def analyze_weighting_impact(data):
    """
    Analyze how the weighting affects different types of matches

    Prints summary statistics of the time, xG and combined weights plus the
    most and least heavily weighted match, and returns the per-match table.

    Parameters:
    data: dict as produced by prepare_data_with_xg; reads 'time_weights',
          'xg_weights', 'combined_weights', 'home_goals', 'away_goals',
          'days_ago' and 'xg_details'

    Returns:
    DataFrame with one row per match and columns time_weight, xg_weight,
    combined_weight, home_goals, away_goals, days_ago, home_xg, away_xg,
    scoreline_prob
    """
    df_analysis = pd.DataFrame({
        'time_weight': data['time_weights'],
        'xg_weight': data['xg_weights'],
        'combined_weight': data['combined_weights'],
        'home_goals': data['home_goals'],
        'away_goals': data['away_goals'],
        'days_ago': data['days_ago']
    })

    # Add xG details one whole column at a time. Setting individual cells with
    # .loc inside a loop is far slower and would silently append rows if the
    # details list were longer than the weight arrays; a length mismatch now
    # raises instead.
    xg_details = data['xg_details']
    for column in ('home_xg', 'away_xg', 'scoreline_prob'):
        df_analysis[column] = [detail[column] for detail in xg_details]

    print("\nWeighting Analysis:")
    print(f"Time weights - Mean: {df_analysis.time_weight.mean():.4f}, Std: {df_analysis.time_weight.std():.4f}")
    print(f"xG weights - Mean: {df_analysis.xg_weight.mean():.4f}, Std: {df_analysis.xg_weight.std():.4f}")
    print(f"Combined weights - Mean: {df_analysis.combined_weight.mean():.4f}, Std: {df_analysis.combined_weight.std():.4f}")

    # Find most and least weighted matches
    most_weighted = df_analysis.loc[df_analysis.combined_weight.idxmax()]
    least_weighted = df_analysis.loc[df_analysis.combined_weight.idxmin()]

    print("\nMost weighted match:")
    print(f"  Score: {most_weighted.home_goals:.0f}-{most_weighted.away_goals:.0f}, xG: {most_weighted.home_xg:.1f}-{most_weighted.away_xg:.1f}")
    print(f"  Days ago: {most_weighted.days_ago:.0f}, Combined weight: {most_weighted.combined_weight:.4f}")

    print("\nLeast weighted match:")
    print(f"  Score: {least_weighted.home_goals:.0f}-{least_weighted.away_goals:.0f}, xG: {least_weighted.home_xg:.1f}-{least_weighted.away_xg:.1f}")
    print(f"  Days ago: {least_weighted.days_ago:.0f}, Combined weight: {least_weighted.combined_weight:.4f}")

    return df_analysis

def plot_weighting_analysis(df_analysis):
    """
    Create visualizations of the weighting scheme

    Draws a 2x2 grid: time weight vs. days ago, histogram of xG weights,
    combined weight vs. days ago (coloured by xG weight), and xG weight vs.
    the gap between actual and xG goal difference.

    Side effect: adds a 'goal_diff_vs_xg' column to df_analysis.

    Returns:
    The matplotlib Figure that was shown
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    decay_ax, hist_ax = axes[0, 0], axes[0, 1]
    combined_ax, surprise_ax = axes[1, 0], axes[1, 1]

    # Panel 1: time weights over time
    decay_ax.scatter(df_analysis.days_ago, df_analysis.time_weight, alpha=0.6)
    decay_ax.set_xlabel('Days Ago')
    decay_ax.set_ylabel('Time Weight')
    decay_ax.set_title('Time Decay Weighting')

    # Panel 2: distribution of the xG weights
    hist_ax.hist(df_analysis.xg_weight, bins=30, alpha=0.7)
    hist_ax.set_xlabel('xG Weight')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of xG Weights')

    # Panel 3: combined weights over time, coloured by xG weight
    combined_points = combined_ax.scatter(
        df_analysis.days_ago,
        df_analysis.combined_weight,
        c=df_analysis.xg_weight,
        alpha=0.6,
        cmap='viridis',
    )
    combined_ax.set_xlabel('Days Ago')
    combined_ax.set_ylabel('Combined Weight')
    combined_ax.set_title('Combined Weighting (colored by xG weight)')
    plt.colorbar(combined_points, ax=combined_ax)

    # Panel 4: xG weight against how far the result strayed from the xG margin
    actual_margin = df_analysis.home_goals - df_analysis.away_goals
    xg_margin = df_analysis.home_xg - df_analysis.away_xg
    df_analysis['goal_diff_vs_xg'] = actual_margin - xg_margin

    surprise_ax.scatter(df_analysis.goal_diff_vs_xg, df_analysis.xg_weight, alpha=0.6)
    surprise_ax.set_xlabel('Actual Goal Diff - xG Goal Diff')
    surprise_ax.set_ylabel('xG Weight')
    surprise_ax.set_title('xG Weighting vs Result vs Performance')
    surprise_ax.axvline(x=0, color='red', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

    return fig

# Example usage function
def run_xg_weighted_model(df, draws=1000, tune=1000, chains=2):
    """
    Complete workflow for running the xG-weighted Bayesian model

    Expected DataFrame columns:
    - home_team, away_team: team names
    - home_goals, away_goals: actual goals scored
    - home_xg, away_xg: expected goals
    - date: match date

    Parameters:
    df: match DataFrame with the columns listed above
    draws, tune, chains: MCMC settings forwarded to fit_model; the defaults
        match the values that were previously hard-coded here

    Returns:
    dict with keys 'model', 'trace', 'ratings', 'home_advantage', 'data'
    and 'analysis'
    """

    print("Preparing data with xG weighting...")
    data = prepare_data_with_xg(df)

    print("\nAnalyzing weighting impact...")
    df_analysis = analyze_weighting_impact(data)

    print("\nBuilding Bayesian model...")
    model = build_model_with_xg_weights(data)

    print("\nFitting model (this may take a few minutes)...")
    trace = fit_model(model, data, draws=draws, tune=tune, chains=chains)

    print("\nExtracting team ratings...")
    ratings_df, home_advantage = get_team_ratings(trace, data['teams'])

    # Create plots
    plot_weighting_analysis(df_analysis)

    return {
        'model': model,
        'trace': trace,
        'ratings': ratings_df,
        'home_advantage': home_advantage,
        'data': data,
        'analysis': df_analysis
    }

# You'll also need the original functions from the base model
def fit_model(model, data, draws=2000, tune=1000, chains=2, progress_bar=True,
              random_seed=None):
    """
    Fit the model using MCMC

    Parameters:
    model: the pm.Model to sample from
    data: unused here; kept so existing call sites keep working
    draws: number of posterior draws per chain
    tune: number of tuning iterations per chain
    chains: number of chains
    progress_bar: whether to show the sampling progress bar
    random_seed: optional seed forwarded to pm.sample so a run can be
        reproduced; None (the default) leaves the sampler unseeded as before

    Returns:
    The InferenceData returned by pm.sample
    """

    with model:
        trace = pm.sample(draws=draws, tune=tune, chains=chains,
                         return_inferencedata=True,
                         progressbar=progress_bar,
                         compute_convergence_checks=True,
                         random_seed=random_seed)

    return trace

def get_team_ratings(trace, teams):
    """
    Extract team ratings and convert to interpretable metrics

    Posterior means of the attack, defense and home-advantage parameters are
    plugged into the model's log-linear rate formula against an opponent whose
    attack and defense ratings are both zero.

    Parameters:
    trace: sampling result exposing posterior.atts, posterior.defs and
           posterior.home_adv
    teams: team names, in the same order as the rating vectors

    Returns:
    (ratings_df, home_advantage): the ratings table sorted by
    'goal_diff_avg_vs_avg' descending, and the posterior-mean home advantage
    """
    posterior = trace.posterior
    att = posterior.atts.mean(dim=['chain', 'draw']).values
    dfn = posterior.defs.mean(dim=['chain', 'draw']).values
    home_advantage = posterior.home_adv.mean().values

    # Expected goals for/against when facing a zero-rated opponent
    scored_at_home = np.exp(home_advantage + att)
    scored_away = np.exp(att)
    conceded_at_home = np.exp(dfn)
    conceded_away = np.exp(home_advantage + dfn)

    margin_home = scored_at_home - conceded_at_home
    margin_away = scored_away - conceded_away

    columns = {
        'team': teams,
        'attack_rating': att,
        'defense_rating': dfn,
        'goals_scored_home_vs_avg': scored_at_home,
        'goals_scored_away_vs_avg': scored_away,
        'goals_conceded_home_vs_avg': conceded_at_home,
        'goals_conceded_away_vs_avg': conceded_away,
        'goal_diff_home_vs_avg': margin_home,
        'goal_diff_away_vs_avg': margin_away,
        'goal_diff_avg_vs_avg': (margin_home + margin_away) / 2,
        'attack_strength': np.exp(att),
        'defense_strength': np.exp(-dfn),
    }
    ranked = pd.DataFrame(columns).sort_values('goal_diff_avg_vs_avg', ascending=False)

    return ranked, home_advantage

In [6]:
# Build the encoded team indices, goal arrays and per-match weights from the
# match frame loaded above (all column names left at their defaults).
data = prepare_data_with_xg(df
)

# Assemble the weighted Poisson model, then sample it.
model = build_model_with_xg_weights(data)
# NOTE(review): draws=100 / tune=100 is a very short run. The recorded output
# below reports rhat > 1.01 and fewer than 100 effective samples per chain for
# some parameters, so the ratings shown should be treated as a smoke test
# rather than converged estimates.
trace = fit_model(model, data, draws=100, tune=100)
# Summarise the posterior into a per-team ratings table plus the home advantage.
ratings, home_adv = get_team_ratings(trace, data['teams'])

# Display both results as the cell output.
ratings, home_adv

Time and xG weighting examples:
Most recent match: time_weight=1.0000, xg_weight=2.9278, combined=2.9278

Highest xG weight match (most expected result):
  Score: (0.0, 0.0), xG: 0.4-0.7, weight: 33.2871

Lowest xG weight match (most unexpected result):
  Score: (1.0, 1.0), xG: 1.8-0.0, weight: 0.0000


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [home_adv, tau_att, atts_star, tau_def, defs_star]


Output()

Sampling 2 chains for 100 tune and 100 draw iterations (200 + 200 draws total) took 239 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


(               team  attack_rating  defense_rating  goals_scored_home_vs_avg  \
 11        Liverpool       0.579796       -0.189732                  1.850531   
 14    Newcastle Utd       0.424611       -0.151178                  1.584529   
 0           Arsenal       0.205801       -0.444080                  1.273130   
 12  Manchester City       0.220830       -0.385735                  1.292407   
 4          Brighton       0.475167        0.109857                  1.666697   
 5           Chelsea       0.130600       -0.286582                  1.180900   
 6    Crystal Palace       0.091796       -0.241075                  1.135954   
 3         Brentford       0.199922        0.000743                  1.265666   
 1       Aston Villa       0.143122       -0.045198                  1.195780   
 2       Bournemouth      -0.034848       -0.205911                  1.000829   
 15  Nott'ham Forest       0.000905       -0.150286                  1.037259   
 7           Everton      -0