In [2]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import sqlite3

conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

df = pd.read_sql_query("""
                SELECT 
                    team as home_team,
                    opp_team as away_team,
                    summary_xg as xg_home,
                    opp_summary_xg as xg_away,
                    summary_goals as home_goals,
                    opp_summary_goals as away_goals,
                    match_date as date
                FROM fbref_match_all_columns
                WHERE division = 'Premier League'
                    AND season = '2024-2025'
                    AND summary_xg IS NOT NULL
                    AND opp_summary_xg IS NOT NULL
                    AND is_home = 1
                       """, conn)

conn.close()

df

Unnamed: 0,home_team,away_team,xg_home,xg_away,home_goals,away_goals,date
0,Tottenham,Brighton,2.0,2.2,1.0,4.0,2025-05-25
1,Bournemouth,Leicester City,1.6,0.3,2.0,0.0,2025-05-25
2,Newcastle Utd,Everton,1.2,1.2,0.0,1.0,2025-05-25
3,Fulham,Manchester City,1.3,3.1,0.0,2.0,2025-05-25
4,Nott'ham Forest,Chelsea,1.2,1.1,0.0,1.0,2025-05-25
...,...,...,...,...,...,...,...
375,Nott'ham Forest,Bournemouth,1.4,1.2,1.0,1.0,2024-08-17
376,Everton,Brighton,0.5,1.4,0.0,3.0,2024-08-17
377,Newcastle Utd,Southampton,0.3,1.8,1.0,0.0,2024-08-17
378,Ipswich Town,Liverpool,0.5,2.6,0.0,2.0,2024-08-17


In [None]:
class DixonColesXGModel:
    """
    Dixon-Coles model modified to use xG data with MCMC-generated scoreline weights.
    
    Instead of using actual scorelines, this model creates weighted synthetic matches
    based on the probability distribution of scorelines given xG values.
    """
    
    def __init__(self, xi=0.0018, n_simulations=1000, max_goals=9):
        """
        Initialize the Dixon-Coles xG model.
        
        Parameters:
        xi (float): Time decay parameter
        n_simulations (int): Number of MCMC simulations per match for scoreline weights
        max_goals (int): Maximum goals to consider in scoreline generation
        """
        self.xi = xi
        self.n_simulations = n_simulations
        self.max_goals = max_goals
        self.teams = None
        self.attack_params = None
        self.defense_params = None
        self.home_advantage = None
        self.rho = None
        self.is_fitted = False
        
    def time_weight(self, match_dates, current_date=None):
        """Calculate time decay weights for matches."""
        if current_date is None:
            current_date = max(match_dates)
        
        days_ago = [(current_date - date).days for date in match_dates]
        weights = np.exp(-self.xi * np.array(days_ago))
        return weights
    
    def tau_correction(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """Dixon-Coles correction factor for low-scoring games."""
        if home_goals == 0 and away_goals == 0:
            return 1 - lambda_home * lambda_away * rho
        elif home_goals == 0 and away_goals == 1:
            return 1 + lambda_home * rho
        elif home_goals == 1 and away_goals == 0:
            return 1 + lambda_away * rho
        elif home_goals == 1 and away_goals == 1:
            return 1 - rho
        else:
            return 1.0
    
    def poisson_prob(self, goals, lambda_param):
        """Calculate Poisson probability."""
        return (lambda_param ** goals) * np.exp(-lambda_param) / factorial(goals)
    
    def match_probability(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """Calculate probability of specific scoreline with Dixon-Coles correction."""
        basic_prob = self.poisson_prob(home_goals, lambda_home) * self.poisson_prob(away_goals, lambda_away)
        correction = self.tau_correction(home_goals, away_goals, lambda_home, lambda_away, rho)
        return basic_prob * correction
    
    def mcmc_scoreline_weights(self, xg_home, xg_away):
        """
        Generate scoreline weights via MCMC simulation from xG values.
        
        Parameters:
        xg_home (float): Expected goals for home team
        xg_away (float): Expected goals for away team
        
        Returns:
        dict: Scoreline -> probability mapping
        """
        scoreline_counts = {}
        
        # Run MCMC simulations
        for _ in range(self.n_simulations):
            # Sample goals from Poisson distributions
            home_goals = np.random.poisson(xg_home)
            away_goals = np.random.poisson(xg_away)
            
            # Cap at max_goals to keep computational load reasonable
            home_goals = min(home_goals, self.max_goals)
            away_goals = min(away_goals, self.max_goals)
            
            scoreline = f"{home_goals}-{away_goals}"
            scoreline_counts[scoreline] = scoreline_counts.get(scoreline, 0) + 1
        
        # Convert counts to probabilities
        total_sims = sum(scoreline_counts.values())
        scoreline_weights = {k: v/total_sims for k, v in scoreline_counts.items()}
        
        return scoreline_weights
    
    def create_weighted_dataset(self, matches_df):
        """
        Create expanded dataset with MCMC-generated scoreline weights.
        
        Parameters:
        matches_df (DataFrame): Match data with 'date', 'home_team', 'away_team', 'xg_home', 'xg_away'
        
        Returns:
        DataFrame: Expanded dataset with weighted synthetic scorelines
        """
        required_columns = ['date', 'home_team', 'away_team', 'xg_home', 'xg_away']
        missing_columns = [col for col in required_columns if col not in matches_df.columns]
        if missing_columns:
            raise ValueError(f"DataFrame must contain columns: {missing_columns}")
        
        expanded_matches = []
        
        print(f"Generating weighted scorelines for {len(matches_df)} matches...")
        
        for i, (match_idx, match) in enumerate(matches_df.iterrows()):
            if i % 100 == 0:
                print(f"Processing match {i+1}/{len(matches_df)}")
                
            xg_home = match['xg_home']
            xg_away = match['xg_away']
            
            # Get scoreline probability weights from MCMC
            scoreline_weights = self.mcmc_scoreline_weights(xg_home, xg_away)
            
            # Create weighted matches for each possible scoreline
            for scoreline, weight in scoreline_weights.items():
                if weight > 0.001:  # Only include scorelines with reasonable probability
                    home_goals, away_goals = map(int, scoreline.split('-'))
                    
                    # Create new match record
                    weighted_match = match.copy()
                    weighted_match['home_goals'] = home_goals
                    weighted_match['away_goals'] = away_goals
                    weighted_match['xg_weight'] = weight
                    weighted_match['original_match_id'] = i  # Track which original match this came from
                    
                    expanded_matches.append(weighted_match)
        
        expanded_df = pd.DataFrame(expanded_matches)
        print(f"Created {len(expanded_df)} weighted match records from {len(matches_df)} original matches")

        expanded_df.to_csv("test.csv", index=False)
        
        return expanded_df
    
    def negative_log_likelihood(self, params, matches, weights, xg_weights):
        """
        Calculate negative log-likelihood with xG-based weights.
        
        Parameters:
        params: Model parameters to optimize
        matches: Expanded match dataset
        weights: Time decay weights
        xg_weights: MCMC-generated scoreline weights
        """
        n_teams = len(self.teams)
        
        # Extract parameters
        attack_params = params[:n_teams]
        defense_params_partial = params[n_teams:2*n_teams-1]
        home_advantage = params[2*n_teams-1]
        rho = params[2*n_teams]
        
        # Compute first team's defense parameter
        first_team_defense = -np.sum(defense_params_partial)
        defense_params = np.concatenate([[first_team_defense], defense_params_partial])
        
        total_log_likelihood = 0
        
        for i, match in matches.iterrows():
            home_idx = self.team_to_idx[match['home_team']]
            away_idx = self.team_to_idx[match['away_team']]
            
            # Calculate expected goals
            lambda_home = np.exp(attack_params[home_idx] + defense_params[away_idx] + home_advantage)
            lambda_away = np.exp(attack_params[away_idx] + defense_params[home_idx])
            
            # Get match probability
            prob = self.match_probability(
                match['home_goals'], match['away_goals'], 
                lambda_home, lambda_away, rho
            )
            
            # Weight by both time decay and xG-based scoreline probability
            combined_weight = weights[i] * xg_weights[i]
            
            if prob > 0:
                total_log_likelihood += combined_weight * np.log(prob)
            else:
                total_log_likelihood += combined_weight * np.log(1e-10)
        
        return -total_log_likelihood
    
    def fit(self, matches_df, current_date=None):
        """
        Fit the Dixon-Coles model using xG data with MCMC scoreline weights.
        
        Parameters:
        matches_df (DataFrame): Match data with 'date', 'home_team', 'away_team', 'xg_home', 'xg_away'
        current_date (datetime): Date for time decay calculation
        """
        # Ensure date column is datetime
        if 'date' in matches_df.columns:
            matches_df['date'] = pd.to_datetime(matches_df['date'])
        else:
            raise ValueError("DataFrame must contain a 'date' column")
        
        # Get unique teams
        self.teams = sorted(list(set(matches_df['home_team'].tolist() + matches_df['away_team'].tolist())))
        self.team_to_idx = {team: i for i, team in enumerate(self.teams)}
        
        # Create weighted dataset using MCMC
        expanded_matches = self.create_weighted_dataset(matches_df)
        
        # Calculate time weights for expanded dataset
        # Each synthetic match gets the same time weight as its original match
        original_time_weights = self.time_weight(matches_df['date'].tolist(), current_date)
        
        expanded_time_weights = []
        xg_scoreline_weights = []
        
        for _, match in expanded_matches.iterrows():
            original_idx = int(match['original_match_id'])
            expanded_time_weights.append(original_time_weights[original_idx])
            xg_scoreline_weights.append(match['xg_weight'])
        
        expanded_time_weights = np.array(expanded_time_weights)
        xg_scoreline_weights = np.array(xg_scoreline_weights)
        
        # Initial parameters
        n_teams = len(self.teams)
        initial_params = np.concatenate([
            np.zeros(n_teams),      # attack parameters
            np.zeros(n_teams - 1),  # defense parameters (n-1)
            [0.1],  # home advantage
            [0.0]   # rho
        ])
        
        print("Optimizing model parameters...")
        
        # Optimize parameters
        result = minimize(
            self.negative_log_likelihood,
            initial_params,
            args=(expanded_matches, expanded_time_weights, xg_scoreline_weights),
            method='L-BFGS-B',
            options={'maxiter': 1000}
        )
        
        if result.success:
            # Extract fitted parameters
            self.attack_params = result.x[:n_teams]
            defense_params_partial = result.x[n_teams:2*n_teams-1]
            
            first_team_defense = -np.sum(defense_params_partial)
            self.defense_params = np.concatenate([[first_team_defense], defense_params_partial])
            
            self.home_advantage = result.x[2*n_teams-1]
            self.rho = result.x[2*n_teams]
            self.is_fitted = True
            
            print(f"Model fitted successfully!")
            print(f"Home advantage: {np.exp(self.home_advantage):.4f}")
            print(f"Rho parameter: {self.rho:.4f}")
            print(f"Used {len(expanded_matches)} weighted data points from {len(matches_df)} original matches")
        else:
            raise RuntimeError("Optimization failed to converge")
    
    def predict_match(self, home_team, away_team):
        """Predict match outcome using fitted parameters."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before making predictions")
        
        if home_team not in self.teams or away_team not in self.teams:
            raise ValueError("One or both teams not in training data")
        
        home_idx = self.team_to_idx[home_team]
        away_idx = self.team_to_idx[away_team]
        
        # Calculate expected goals
        lambda_home = np.exp(self.attack_params[home_idx] + self.defense_params[away_idx] + self.home_advantage)
        lambda_away = np.exp(self.attack_params[away_idx] + self.defense_params[home_idx])
        
        # Calculate outcome probabilities
        home_win_prob = 0
        away_win_prob = 0
        draw_prob = 0
        
        scoreline_probs = {}
        for h_goals in range(6):
            for a_goals in range(6):
                prob = self.match_probability(h_goals, a_goals, lambda_home, lambda_away, self.rho)
                scoreline_probs[f"{h_goals}-{a_goals}"] = prob
                
                if h_goals > a_goals:
                    home_win_prob += prob
                elif h_goals < a_goals:
                    away_win_prob += prob
                else:
                    draw_prob += prob
        
        return {
            'expected_goals_home': lambda_home,
            'expected_goals_away': lambda_away,
            'home_win_prob': home_win_prob,
            'draw_prob': draw_prob,
            'away_win_prob': away_win_prob,
            'most_likely_scoreline': max(scoreline_probs, key=scoreline_probs.get),
            'scoreline_probabilities': scoreline_probs
        }
    
    def get_team_ratings(self):
        """Get team strength ratings from fitted parameters."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before getting ratings")
        
        attack_vs_avg = []
        defense_vs_avg = []
        
        for i, team in enumerate(self.teams):
            goals_scored_vs_avg = np.exp(self.attack_params[i] + 0)
            
            avg_attack = np.mean(self.attack_params)
            goals_conceded_vs_avg = np.exp(avg_attack + self.defense_params[i])
            
            attack_vs_avg.append(goals_scored_vs_avg)
            defense_vs_avg.append(goals_conceded_vs_avg)
        
        ratings = pd.DataFrame({
            'team': self.teams,
            'attack_rating': attack_vs_avg,
            'defense_rating': defense_vs_avg,
            'goal_difference': np.array(attack_vs_avg) - np.array(defense_vs_avg)
        })
        
        return ratings.sort_values('goal_difference', ascending=False)

In [17]:
model = DixonColesXGModel(xi=0.0018)  # Slightly faster decay
model.fit(df)
    
# Get team ratings
print("\nTeam ratings:")
print(model.get_team_ratings())
    


Generating weighted scorelines for 380 matches...
Processing match 1/380
Processing match 101/380
Processing match 201/380
Processing match 301/380
Created 11159 weighted match records from 380 original matches
Optimizing model parameters...
Model fitted successfully!
Home advantage: 1.1127
Rho parameter: -0.0076
Used 11159 weighted data points from 380 original matches

Team ratings:
               team  attack_rating  defense_rating  goal_difference
11        Liverpool       1.505374        0.719404         0.785970
0           Arsenal       1.108883        0.633237         0.475646
5           Chelsea       1.258451        0.874369         0.384081
12  Manchester City       1.260911        0.886150         0.374761
14    Newcastle Utd       1.177645        0.845166         0.332479
2       Bournemouth       1.183511        0.898037         0.285474
6    Crystal Palace       1.104267        0.922091         0.182176
1       Aston Villa       1.045886        0.910420         0.135466


In [18]:
# Make a prediction
home_team = "Arsenal"
away_team = "Liverpool"
prediction = model.predict_match(home_team, away_team)
print(f"\n{home_team} vs {away_team} prediction:")
print(f"Expected goals - {home_team}: {prediction['expected_goals_home']:.2f}, {away_team}: {prediction['expected_goals_away']:.2f}")
print(f"Win probabilities - {home_team}: {prediction['home_win_prob']:.3f}, Draw: {prediction['draw_prob']:.3f}, {away_team}: {prediction['away_win_prob']:.3f}")
print(f"Most likely scoreline: {prediction['most_likely_scoreline']}")


Arsenal vs Liverpool prediction:
Expected goals - Arsenal: 0.91, Liverpool: 0.98
Win probabilities - Arsenal: 0.321, Draw: 0.321, Liverpool: 0.357
Most likely scoreline: 0-0


In [19]:
calibration = model.model_calibration_check(df)

=== XG MODEL CALIBRATION CHECK ===

📊 GOALS PER GAME COMPARISON
--------------------------------------------------
Actual avg home goals:     1.51
xG avg home goals:         1.53
Model avg home goals:      1.14

Actual avg away goals:     1.42
xG avg away goals:         1.37
Model avg away goals:      1.02

Actual avg total goals:    2.93
xG avg total goals:        2.90
Model avg total goals:     2.16


🎯 MATCH OUTCOME COMPARISON
--------------------------------------------------
Actual home win rate:      40.8%
Model home win rate:       38.4%
Difference:                -2.4%

Actual draw rate:          24.5%
Model draw rate:           28.1%
Difference:                +3.6%

Actual away win rate:      34.7%
Model away win rate:       33.1%
Difference:                -1.6%


📈 DETAILED ANALYSIS
--------------------------------------------------
Model vs Actual (Home):    -0.37
Model vs Actual (Away):    -0.40
Model vs Actual (Total):   -0.77

xG vs Model (Home):        +0.39
xG vs Mode