In [23]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import sqlite3

conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

df = pd.read_sql_query("""
                SELECT 
                    team as home_team,
                    opp_team as away_team,
                    summary_xg as xg_home,
                    opp_summary_xg as xg_away,
                    summary_goals as home_goals,
                    opp_summary_goals as away_goals,
                    match_date as date
                FROM fbref_match_all_columns
                WHERE division = 'Premier League'
                    AND season = '2024-2025'
                    AND summary_xg IS NOT NULL
                    AND opp_summary_xg IS NOT NULL
                    AND is_home = 1
                       """, conn)

conn.close()

df

Unnamed: 0,home_team,away_team,xg_home,xg_away,home_goals,away_goals,date
0,Tottenham,Brighton,2.0,2.2,1.0,4.0,2025-05-25
1,Bournemouth,Leicester City,1.6,0.3,2.0,0.0,2025-05-25
2,Newcastle Utd,Everton,1.2,1.2,0.0,1.0,2025-05-25
3,Fulham,Manchester City,1.3,3.1,0.0,2.0,2025-05-25
4,Nott'ham Forest,Chelsea,1.2,1.1,0.0,1.0,2025-05-25
...,...,...,...,...,...,...,...
375,Nott'ham Forest,Bournemouth,1.4,1.2,1.0,1.0,2024-08-17
376,Everton,Brighton,0.5,1.4,0.0,3.0,2024-08-17
377,Newcastle Utd,Southampton,0.3,1.8,1.0,0.0,2024-08-17
378,Ipswich Town,Liverpool,0.5,2.6,0.0,2.0,2024-08-17


In [26]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

class DixonColesXGModel:
    """
    Dixon-Coles model modified to use xG data following the Ben Torvaney approach.
    
    This creates synthetic matches for each possible scoreline, weighted by their
    probability given the xG values, then fits a standard Dixon-Coles model.
    """
    
    def __init__(self, xi=0.0018, max_goals=10):
        """
        Initialize the Dixon-Coles xG model.
        
        Parameters:
        xi (float): Time decay parameter
        max_goals (int): Maximum goals to consider in scoreline generation
        """
        self.xi = xi
        self.max_goals = max_goals
        self.teams = None
        self.attack_params = None
        self.defense_params = None
        self.home_advantage = None
        self.rho = None
        self.is_fitted = False
        
    def time_weight(self, match_dates, current_date=None):
        """Calculate time decay weights for matches."""
        if current_date is None:
            current_date = max(match_dates)
        
        days_ago = [(current_date - date).days for date in match_dates]
        weights = np.exp(-self.xi * np.array(days_ago))
        return weights
    
    def tau_correction(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """Dixon-Coles correction factor for low-scoring games."""
        if home_goals == 0 and away_goals == 0:
            return 1 - lambda_home * lambda_away * rho
        elif home_goals == 0 and away_goals == 1:
            return 1 + lambda_home * rho
        elif home_goals == 1 and away_goals == 0:
            return 1 + lambda_away * rho
        elif home_goals == 1 and away_goals == 1:
            return 1 - rho
        else:
            return 1.0
    
    def poisson_prob(self, goals, lambda_param):
        """Calculate Poisson probability."""
        if goals > 20:  # Prevent factorial overflow
            return 0.0
        return (lambda_param ** goals) * np.exp(-lambda_param) / factorial(int(goals))
    
    def match_probability(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """Calculate probability of specific scoreline with Dixon-Coles correction."""
        basic_prob = self.poisson_prob(home_goals, lambda_home) * self.poisson_prob(away_goals, lambda_away)
        correction = self.tau_correction(home_goals, away_goals, lambda_home, lambda_away, rho)
        return basic_prob * correction
    
    def create_weighted_dataset(self, matches_df):
        """
        Create expanded dataset with probability-weighted scorelines based on xG.
        
        Following the Ben Torvaney approach: for each match, create synthetic
        scorelines weighted by their Poisson probability given the xG values.
        """
        required_columns = ['date', 'home_team', 'away_team', 'xg_home', 'xg_away']
        missing_columns = [col for col in required_columns if col not in matches_df.columns]
        if missing_columns:
            raise ValueError(f"DataFrame must contain columns: {missing_columns}")
        
        expanded_matches = []
        
        print(f"Generating weighted scorelines for {len(matches_df)} matches...")
        
        for i, (match_idx, match) in enumerate(matches_df.iterrows()):
            if i % 100 == 0:
                print(f"Processing match {i+1}/{len(matches_df)}")
                
            xg_home = match['xg_home']
            xg_away = match['xg_away']
            
            # Calculate exact Poisson probabilities for each scoreline
            for h_goals in range(self.max_goals + 1):
                for a_goals in range(self.max_goals + 1):
                    # Probability of this scoreline given xG values
                    prob_h = self.poisson_prob(h_goals, xg_home)
                    prob_a = self.poisson_prob(a_goals, xg_away)
                    prob = prob_h * prob_a
                    
                    # Only include scorelines with non-negligible probability
                    if prob > 0.001:
                        weighted_match = match.copy()
                        weighted_match['home_goals'] = h_goals
                        weighted_match['away_goals'] = a_goals
                        weighted_match['scoreline_prob'] = prob
                        weighted_match['original_match_id'] = i
                        
                        expanded_matches.append(weighted_match)
        
        expanded_df = pd.DataFrame(expanded_matches)
        print(f"Created {len(expanded_df)} weighted match records from {len(matches_df)} original matches")
        
        return expanded_df
    
    def negative_log_likelihood(self, params, matches, weights):
        """
        Calculate negative log-likelihood for the Dixon-Coles model.
        
        This is the STANDARD Dixon-Coles likelihood, but each observation
        is weighted by (time_weight * scoreline_probability).
        """
        n_teams = len(self.teams)
        
        # Extract parameters
        attack_params = params[:n_teams]
        defense_params_partial = params[n_teams:2*n_teams-1]
        home_advantage = params[2*n_teams-1]
        rho = params[2*n_teams]
        
        # Compute first team's defense parameter (sum-to-zero constraint)
        first_team_defense = -np.sum(defense_params_partial)
        defense_params = np.concatenate([[first_team_defense], defense_params_partial])
        
        total_log_likelihood = 0
        
        for i, match in matches.iterrows():
            home_idx = self.team_to_idx[match['home_team']]
            away_idx = self.team_to_idx[match['away_team']]
            
            # Calculate expected goals according to current parameters
            lambda_home = np.exp(attack_params[home_idx] + defense_params[away_idx] + home_advantage)
            lambda_away = np.exp(attack_params[away_idx] + defense_params[home_idx])
            
            # Get match probability according to Dixon-Coles
            prob = self.match_probability(
                match['home_goals'], match['away_goals'], 
                lambda_home, lambda_away, rho
            )
            
            # Weight combines time decay and scoreline probability
            # This is the key: we're fitting to all possible scorelines,
            # but weighted by how likely they are given the xG
            if prob > 0:
                total_log_likelihood += weights[i] * np.log(prob)
            else:
                total_log_likelihood += weights[i] * np.log(1e-10)
        
        return -total_log_likelihood
    
    def fit(self, matches_df, current_date=None):
        """
        Fit the Dixon-Coles model using xG data.
        
        Following the Ben Torvaney approach from the R/regista implementation,
        with calibration adjustment to match xG averages.
        """
        # Ensure date column is datetime
        if 'date' in matches_df.columns:
            matches_df['date'] = pd.to_datetime(matches_df['date'])
        else:
            raise ValueError("DataFrame must contain a 'date' column")
        
        # Get unique teams
        self.teams = sorted(list(set(matches_df['home_team'].tolist() + matches_df['away_team'].tolist())))
        self.team_to_idx = {team: i for i, team in enumerate(self.teams)}
        
        # Create weighted dataset
        expanded_matches = self.create_weighted_dataset(matches_df)
        
        # Calculate time weights for original matches
        original_time_weights = self.time_weight(matches_df['date'].tolist(), current_date)
        
        # Combine weights: time weight (from original match) * scoreline probability
        combined_weights = []
        for _, match in expanded_matches.iterrows():
            original_idx = int(match['original_match_id'])
            time_weight = original_time_weights[original_idx]
            scoreline_prob = match['scoreline_prob']
            
            # The key insight: multiply time weight by scoreline probability
            # This is exactly what the R code does: weights = prob
            combined_weight = time_weight * scoreline_prob
            combined_weights.append(combined_weight)
        
        combined_weights = np.array(combined_weights)
        
        # Initial parameters
        n_teams = len(self.teams)
        initial_params = np.concatenate([
            np.zeros(n_teams),      # attack parameters
            np.zeros(n_teams - 1),  # defense parameters (n-1)
            [0.1],  # home advantage
            [0.0]   # rho
        ])
        
        print("Optimizing model parameters...")
        
        # Optimize parameters
        result = minimize(
            self.negative_log_likelihood,
            initial_params,
            args=(expanded_matches, combined_weights),
            method='L-BFGS-B',
            bounds=[(None, None)] * (2*n_teams) + [(-0.2, 0.2)],  # Bound rho to reasonable range
            options={'maxiter': 1000}
        )
        
        if result.success or result.status == 2:  # Accept "converged" or "max iterations"
            # Extract fitted parameters
            attack_params = result.x[:n_teams]
            defense_params_partial = result.x[n_teams:2*n_teams-1]
            home_advantage = result.x[2*n_teams-1]
            rho = result.x[2*n_teams]
            
            # Apply sum-to-zero constraint for defense
            first_team_defense = -np.sum(defense_params_partial)
            defense_params = np.concatenate([[first_team_defense], defense_params_partial])
            
            # Normalize parameters to maintain calibration
            # Calculate what the model currently predicts on average
            avg_lambda_home = 0
            avg_lambda_away = 0
            count = 0
            
            for i in range(n_teams):
                for j in range(n_teams):
                    if i != j:
                        lambda_h = np.exp(attack_params[i] + defense_params[j] + home_advantage)
                        lambda_a = np.exp(attack_params[j] + defense_params[i])
                        avg_lambda_home += lambda_h
                        avg_lambda_away += lambda_a
                        count += 1
            
            avg_lambda_home /= count
            avg_lambda_away /= count
            
            # Calculate target averages from the xG data
            target_home = matches_df['xg_home'].mean()
            target_away = matches_df['xg_away'].mean()
            
            # Calculate scaling factors
            home_scale = target_home / avg_lambda_home if avg_lambda_home > 0 else 1.0
            away_scale = target_away / avg_lambda_away if avg_lambda_away > 0 else 1.0
            
            # Use geometric mean of scales to preserve relative strengths
            overall_scale = np.sqrt(home_scale * away_scale)
            
            # Apply scaling to attack parameters to calibrate the model
            self.attack_params = attack_params + np.log(overall_scale)
            self.defense_params = defense_params
            self.home_advantage = home_advantage
            self.rho = rho
            self.is_fitted = True
            
            print(f"Model fitted successfully!")
            print(f"Home advantage: {np.exp(self.home_advantage):.4f}")
            print(f"Rho parameter: {self.rho:.4f}")
            print(f"Used {len(expanded_matches)} weighted data points from {len(matches_df)} original matches")
            
            # Verify total weight
            print(f"Total weight check - Original: {original_time_weights.sum():.2f}, Combined: {combined_weights.sum():.2f}")
        else:
            raise RuntimeError(f"Optimization failed: {result.message}")
    
    def predict_match(self, home_team, away_team):
        """Predict match outcome using fitted parameters."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before making predictions")
        
        if home_team not in self.teams or away_team not in self.teams:
            raise ValueError("One or both teams not in training data")
        
        home_idx = self.team_to_idx[home_team]
        away_idx = self.team_to_idx[away_team]
        
        # Calculate expected goals
        lambda_home = np.exp(self.attack_params[home_idx] + self.defense_params[away_idx] + self.home_advantage)
        lambda_away = np.exp(self.attack_params[away_idx] + self.defense_params[home_idx])
        
        # Calculate outcome probabilities
        home_win_prob = 0
        away_win_prob = 0
        draw_prob = 0
        
        scoreline_probs = {}
        for h_goals in range(6):
            for a_goals in range(6):
                prob = self.match_probability(h_goals, a_goals, lambda_home, lambda_away, self.rho)
                scoreline_probs[f"{h_goals}-{a_goals}"] = prob
                
                if h_goals > a_goals:
                    home_win_prob += prob
                elif h_goals < a_goals:
                    away_win_prob += prob
                else:
                    draw_prob += prob
        
        return {
            'expected_goals_home': lambda_home,
            'expected_goals_away': lambda_away,
            'home_win_prob': home_win_prob,
            'draw_prob': draw_prob,
            'away_win_prob': away_win_prob,
            'most_likely_scoreline': max(scoreline_probs, key=scoreline_probs.get),
            'scoreline_probabilities': scoreline_probs
        }
    
    def get_team_ratings(self):
        """Get team strength ratings from fitted parameters."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before getting ratings")
        
        # Following the R implementation's approach
        attack_vs_avg = []
        defense_vs_avg = []
        
        for i, team in enumerate(self.teams):
            # Attack: goals scored vs average defense (defense=0 after sum-to-zero)
            goals_scored_vs_avg = np.exp(self.attack_params[i])
            
            # Defense: goals conceded vs average attack
            avg_attack = np.mean(self.attack_params)
            goals_conceded_vs_avg = np.exp(avg_attack + self.defense_params[i])
            
            attack_vs_avg.append(goals_scored_vs_avg)
            defense_vs_avg.append(goals_conceded_vs_avg)
        
        ratings = pd.DataFrame({
            'team': self.teams,
            'attack_rating': attack_vs_avg,
            'defense_rating': defense_vs_avg,
            'goal_difference': np.array(attack_vs_avg) - np.array(defense_vs_avg)
        })
        
        return ratings.sort_values('goal_difference', ascending=False)
    
    def get_average_xg_per_game(self, matches_df):
        """Calculate actual average xG per game from the original data."""
        total_xg = matches_df['xg_home'].sum() + matches_df['xg_away'].sum()
        total_games = len(matches_df)
        avg_xg_per_game = total_xg / total_games
        avg_home_xg = matches_df['xg_home'].mean()
        avg_away_xg = matches_df['xg_away'].mean()
        
        return {
            'avg_total_xg_per_game': avg_xg_per_game,
            'avg_home_xg_per_game': avg_home_xg,
            'avg_away_xg_per_game': avg_away_xg
        }
    
    def predict_average_goals(self):
        """Predict what the model thinks average goals should be."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before predictions")
            
        # Calculate average expected goals across all possible matchups
        total_home_goals = 0
        total_away_goals = 0
        total_matches = 0
        
        for i, home_team in enumerate(self.teams):
            for j, away_team in enumerate(self.teams):
                if i != j:  # Teams don't play themselves
                    pred = self.predict_match(home_team, away_team)
                    total_home_goals += pred['expected_goals_home']
                    total_away_goals += pred['expected_goals_away']
                    total_matches += 1
        
        avg_home_predicted = total_home_goals / total_matches
        avg_away_predicted = total_away_goals / total_matches
        avg_total_predicted = avg_home_predicted + avg_away_predicted
        
        return {
            'predicted_avg_home_goals': avg_home_predicted,
            'predicted_avg_away_goals': avg_away_predicted,
            'predicted_avg_total_goals': avg_total_predicted
        }
    
    def model_calibration_check(self, matches_df):
        """Compare model predictions to actual xG data statistics."""
        actual_xg_stats = self.get_average_xg_per_game(matches_df)
        predicted_stats = self.predict_average_goals()
        
        print("\n=== MODEL CALIBRATION CHECK ===")
        print("(Comparing model predictions to original xG data)")
        print()
        print(f"Actual avg home xG per game:    {actual_xg_stats['avg_home_xg_per_game']:.2f}")
        print(f"Model avg home goals per game:  {predicted_stats['predicted_avg_home_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_home_goals'] - actual_xg_stats['avg_home_xg_per_game']:.2f}")
        print()
        print(f"Actual avg away xG per game:    {actual_xg_stats['avg_away_xg_per_game']:.2f}")
        print(f"Model avg away goals per game:  {predicted_stats['predicted_avg_away_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_away_goals'] - actual_xg_stats['avg_away_xg_per_game']:.2f}")
        print()
        print(f"Actual avg total xG per game:   {actual_xg_stats['avg_total_xg_per_game']:.2f}")
        print(f"Model avg total goals per game: {predicted_stats['predicted_avg_total_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_total_goals'] - actual_xg_stats['avg_total_xg_per_game']:.2f}")
        print()
        print("Note: Small differences (~10%) are expected due to the Dixon-Coles")
        print("adjustments (rho parameter for low-scoring games, team effects, etc.)")
        
        return {
            'actual_xg': actual_xg_stats,
            'predicted_goals': predicted_stats
        }

In [27]:
model = DixonColesXGModel(xi=0.0018)  # Slightly faster decay
model.fit(df)
    
# Get team ratings
print("\nTeam ratings:")
print(model.get_team_ratings())
    


Generating weighted scorelines for 380 matches...
Processing match 1/380
Processing match 101/380
Processing match 201/380
Processing match 301/380
Created 12194 weighted match records from 380 original matches
Optimizing model parameters...
Model fitted successfully!
Home advantage: 1.0498
Rho parameter: -0.0447
Used 12194 weighted data points from 380 original matches
Total weight check - Original: 299.81, Combined: 297.82

Team ratings:
               team  attack_rating  defense_rating  goal_difference
11        Liverpool       1.804911        1.012646         0.792264
5           Chelsea       1.612257        1.265761         0.346497
2       Bournemouth       1.615219        1.311978         0.303241
4          Brighton       1.648634        1.394437         0.254196
0           Arsenal       1.405199        1.162174         0.243025
12  Manchester City       1.609117        1.387917         0.221199
1       Aston Villa       1.470564        1.259800         0.210764
3         Br

In [28]:
# Make a prediction
home_team = "Arsenal"
away_team = "Liverpool"
prediction = model.predict_match(home_team, away_team)
print(f"\n{home_team} vs {away_team} prediction:")
print(f"Expected goals - {home_team}: {prediction['expected_goals_home']:.2f}, {away_team}: {prediction['expected_goals_away']:.2f}")
print(f"Win probabilities - {home_team}: {prediction['home_win_prob']:.3f}, Draw: {prediction['draw_prob']:.3f}, {away_team}: {prediction['away_win_prob']:.3f}")
print(f"Most likely scoreline: {prediction['most_likely_scoreline']}")


Arsenal vs Liverpool prediction:
Expected goals - Arsenal: 1.08, Liverpool: 1.52
Win probabilities - Arsenal: 0.264, Draw: 0.267, Liverpool: 0.463
Most likely scoreline: 1-1


In [29]:
calibration = model.model_calibration_check(df)


=== MODEL CALIBRATION CHECK ===
(Comparing model predictions to original xG data)

Actual avg home xG per game:    1.53
Model avg home goals per game:  1.48
Difference: -0.04

Actual avg away xG per game:    1.37
Model avg away goals per game:  1.41
Difference: 0.04

Actual avg total xG per game:   2.90
Model avg total goals per game: 2.90
Difference: -0.00

Note: Small differences (~10%) are expected due to the Dixon-Coles
adjustments (rho parameter for low-scoring games, team effects, etc.)
