In [11]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import sqlite3

conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

df = pd.read_sql_query("""
                SELECT 
                    team as home_team,
                    opp_team as away_team,
                    summary_xg as home_xg,
                    opp_summary_xg as away_xg,
                    summary_goals as home_goals,
                    opp_summary_goals as away_goals,
                    match_date as date
                FROM fbref_match_all_columns
                WHERE division = 'Premier League'
                    AND season = '2024-2025'
                    AND summary_xg IS NOT NULL
                    AND opp_summary_xg IS NOT NULL
                    AND is_home = 1
                       """, conn)

conn.close()

df

Unnamed: 0,home_team,away_team,home_xg,away_xg,home_goals,away_goals,date
0,Tottenham,Brighton,2.0,2.2,1.0,4.0,2025-05-25
1,Bournemouth,Leicester City,1.6,0.3,2.0,0.0,2025-05-25
2,Newcastle Utd,Everton,1.2,1.2,0.0,1.0,2025-05-25
3,Fulham,Manchester City,1.3,3.1,0.0,2.0,2025-05-25
4,Nott'ham Forest,Chelsea,1.2,1.1,0.0,1.0,2025-05-25
...,...,...,...,...,...,...,...
375,Nott'ham Forest,Bournemouth,1.4,1.2,1.0,1.0,2024-08-17
376,Everton,Brighton,0.5,1.4,0.0,3.0,2024-08-17
377,Newcastle Utd,Southampton,0.3,1.8,1.0,0.0,2024-08-17
378,Ipswich Town,Liverpool,0.5,2.6,0.0,2.0,2024-08-17


In [12]:
class DixonColesModelWithResim:
    """
    Enhanced Dixon-Coles model with xG-based resimulation for better parameter estimation.
    
    The model can optionally resimulate matches using xG data to create additional
    training examples, potentially improving parameter estimation accuracy.
    """
    
    def __init__(self, xi=0.0018, n_simulations=100, use_resimulation=True):
        """
        Initialize the enhanced Dixon-Coles model.
        
        Parameters:
        xi (float): Time decay parameter. Higher values = faster decay of old matches.
                   Default 0.0018 gives half-weight to matches ~1 year old.
        n_simulations (int): Number of resimulations per match when using xG data.
        use_resimulation (bool): Whether to use xG resimulation or standard fitting.
        """
        self.xi = xi
        self.n_simulations = n_simulations
        self.use_resimulation = use_resimulation
        self.teams = None
        self.attack_params = None
        self.defense_params = None
        self.home_advantage = None
        self.rho = None  # Low-scoring game adjustment parameter
        self.is_fitted = False
        
    def time_weight(self, match_dates, current_date=None):
        """Calculate time decay weights for matches."""
        if current_date is None:
            current_date = max(match_dates)
        
        days_ago = [(current_date - date).days for date in match_dates]
        weights = np.exp(-self.xi * np.array(days_ago))
        return weights
    
    def tau_correction(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """
        Dixon-Coles correction factor for low-scoring games.
        Adjusts probabilities for 0-0, 0-1, 1-0, and 1-1 scorelines.
        """
        if home_goals == 0 and away_goals == 0:
            return 1 - lambda_home * lambda_away * rho
        elif home_goals == 0 and away_goals == 1:
            return 1 + lambda_home * rho
        elif home_goals == 1 and away_goals == 0:
            return 1 + lambda_away * rho
        elif home_goals == 1 and away_goals == 1:
            return 1 - rho
        else:
            return 1.0
    
    def poisson_prob(self, goals, lambda_param):
        """Calculate Poisson probability."""
        return (lambda_param ** goals) * np.exp(-lambda_param) / factorial(goals)
    
    def match_probability(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """Calculate probability of specific scoreline with Dixon-Coles correction."""
        basic_prob = self.poisson_prob(home_goals, lambda_home) * self.poisson_prob(away_goals, lambda_away)
        correction = self.tau_correction(home_goals, away_goals, lambda_home, lambda_away, rho)
        return basic_prob * correction
    
    def _resimulate_matches_with_xg(self, matches_df):
        """
        Resimulate matches using xG data to create additional training examples.
        
        Parameters:
        matches_df (DataFrame): Original match data with xG columns
        
        Returns:
        DataFrame: Expanded dataset with original + simulated matches
        """
        if not self.use_resimulation:
            # Add weight column and return original data
            matches_df = matches_df.copy()
            matches_df['weight'] = 1.0
            matches_df['is_simulation'] = False
            return matches_df
        
        # Check if xG data is available
        has_xg_data = 'home_xg' in matches_df.columns and 'away_xg' in matches_df.columns
        if not has_xg_data:
            print("Warning: No xG data found. Using original matches only.")
            matches_df = matches_df.copy()
            matches_df['weight'] = 1.0
            matches_df['is_simulation'] = False
            return matches_df
        
        print(f"Resimulating {len(matches_df)} matches ({self.n_simulations} simulations each)")
        
        # Start with original matches (zero weight)
        expanded_matches = matches_df.copy()
        expanded_matches['weight'] = 0.3  # Original matches get zero weight
        expanded_matches['is_simulation'] = False
        expanded_matches['simulation_id'] = -1
        
        # Generate simulated matches
        simulated_matches = []
        
        for idx, match in matches_df.iterrows():
            home_xg = match.get('home_xg', 0)
            away_xg = match.get('away_xg', 0)
            
            # Skip matches without valid xG data
            if pd.isna(home_xg) or pd.isna(away_xg) or home_xg <= 0 or away_xg <= 0:
                continue
            
            # Create n_simulations for this match
            for sim_id in range(self.n_simulations):
                # Generate simulated goals using Poisson distribution
                home_goals_sim = np.random.poisson(home_xg)
                away_goals_sim = np.random.poisson(away_xg)
                
                # Create simulated match record
                sim_match = match.copy()
                sim_match['home_goals'] = home_goals_sim
                sim_match['away_goals'] = away_goals_sim
                sim_match['is_simulation'] = True
                sim_match['simulation_id'] = sim_id
                sim_match['weight'] = 0.7 / self.n_simulations  # Equal weight distributed across sims
                
                simulated_matches.append(sim_match)
        
        # Combine original and simulated matches
        if simulated_matches:
            sim_df = pd.DataFrame(simulated_matches)
            expanded_df = pd.concat([expanded_matches, sim_df], ignore_index=True)
            print(f"Expanded from {len(matches_df)} to {len(expanded_df)} matches")
        else:
            expanded_df = expanded_matches
            print("No valid xG data found for resimulation. Using original matches with standard weights.")
            expanded_df['weight'] = 1.0
        
        return expanded_df
    
    def negative_log_likelihood(self, params, matches, weights):
        """Calculate negative log-likelihood for parameter optimization."""
        n_teams = len(self.teams)
        
        # Extract parameters following R approach
        attack_params = params[:n_teams]
        defense_params_partial = params[n_teams:2*n_teams-1]  # n-1 defense params
        home_advantage = params[2*n_teams-1]
        rho = params[2*n_teams]
        
        # Compute first team's defense parameter to ensure sum = 0 (R approach)
        first_team_defense = -np.sum(defense_params_partial)
        defense_params = np.concatenate([[first_team_defense], defense_params_partial])
        
        total_log_likelihood = 0
        
        for i, match in matches.iterrows():
            home_idx = self.team_to_idx[match['home_team']]
            away_idx = self.team_to_idx[match['away_team']]
            
            # Calculate expected goals (no baseline, like R)
            lambda_home = np.exp(attack_params[home_idx] + defense_params[away_idx] + home_advantage)
            lambda_away = np.exp(attack_params[away_idx] + defense_params[home_idx])
                        
            # Get match probability with Dixon-Coles correction
            prob = self.match_probability(
                match['home_goals'], match['away_goals'], 
                lambda_home, lambda_away, rho
            )
            
            # Add to weighted log-likelihood
            if prob > 0:
                total_log_likelihood += weights[i] * np.log(prob)
            else:
                total_log_likelihood += weights[i] * np.log(1e-10)  # Avoid log(0)
        
        return -total_log_likelihood
    
    def fit(self, matches_df, current_date=None):
        """
        Fit the Dixon-Coles model to match data with optional xG resimulation.
        
        Parameters:
        matches_df (DataFrame): Must contain columns: 'date', 'home_team', 'away_team', 
                               'home_goals', 'away_goals'. Optional: 'home_xg', 'away_xg'
        current_date (datetime): Date to calculate time weights from (default: latest match date)
        """
        # Ensure date column is datetime
        if 'date' in matches_df.columns:
            matches_df['date'] = pd.to_datetime(matches_df['date'])
        else:
            raise ValueError("DataFrame must contain a 'date' column")
        
        # Apply xG resimulation if enabled
        if self.use_resimulation:
            print("Using xG resimulation for enhanced parameter estimation...")
            expanded_matches = self._resimulate_matches_with_xg(matches_df)
        else:
            print("Using standard Dixon-Coles fitting...")
            expanded_matches = matches_df.copy()
            expanded_matches['weight'] = 1.0
            expanded_matches['is_simulation'] = False
        
        # Get unique teams from expanded dataset
        self.teams = sorted(list(set(expanded_matches['home_team'].tolist() + expanded_matches['away_team'].tolist())))
        self.team_to_idx = {team: i for i, team in enumerate(self.teams)}
        
        # Calculate time weights
        time_weights = self.time_weight(expanded_matches['date'].tolist(), current_date)
        
        # Combine time weights with resimulation weights
        if 'weight' in expanded_matches.columns:
            combined_weights = time_weights * expanded_matches['weight'].values
        else:
            combined_weights = time_weights
        
        # Initial parameter guess - R approach
        n_teams = len(self.teams)
        initial_params = np.concatenate([
            np.zeros(n_teams),      # attack parameters for all teams
            np.zeros(n_teams - 1),  # defense parameters for n-1 teams (first computed automatically)
            [0.1],  # home advantage
            [0.0]   # rho (low-scoring correction)
        ])
        
        # Optimize parameters
        result = minimize(
            self.negative_log_likelihood,
            initial_params,
            args=(expanded_matches, combined_weights),
            method='L-BFGS-B',
            options={'maxiter': 1000}
        )
        
        if result.success:
            # Extract fitted parameters following R approach
            self.attack_params = result.x[:n_teams]
            defense_params_partial = result.x[n_teams:2*n_teams-1]
            
            # Compute first team's defense parameter
            first_team_defense = -np.sum(defense_params_partial)
            self.defense_params = np.concatenate([[first_team_defense], defense_params_partial])
            
            self.home_advantage = result.x[2*n_teams-1]
            self.rho = result.x[2*n_teams]
            self.is_fitted = True
            
            print(f"Model fitted successfully!")
            print(f"Home advantage: {np.exp(self.home_advantage):.4f}")
            print(f"Rho parameter: {self.rho:.4f}")
            if self.use_resimulation:
                n_original = len(matches_df)
                n_expanded = len(expanded_matches)
                print(f"Used {n_expanded} total matches ({n_original} original + {n_expanded - n_original} simulated)")
        else:
            raise RuntimeError("Optimization failed to converge")
    
    def predict_match(self, home_team, away_team):
        """
        Predict expected goals and match probabilities for a single match.
        
        Returns:
        dict: Contains expected goals, win probabilities, and scoreline probabilities
        """
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before making predictions")
        
        if home_team not in self.teams or away_team not in self.teams:
            raise ValueError("One or both teams not in training data")
        
        home_idx = self.team_to_idx[home_team]
        away_idx = self.team_to_idx[away_team]
        
        # Calculate expected goals (no baseline, like R)
        lambda_home = np.exp(self.attack_params[home_idx] + self.defense_params[away_idx] + self.home_advantage)
        lambda_away = np.exp(self.attack_params[away_idx] + self.defense_params[home_idx])
        
        # Calculate outcome probabilities
        home_win_prob = 0
        away_win_prob = 0
        draw_prob = 0
        
        # Calculate probabilities for scorelines up to 5-5
        scoreline_probs = {}
        for h_goals in range(6):
            for a_goals in range(6):
                prob = self.match_probability(h_goals, a_goals, lambda_home, lambda_away, self.rho)
                scoreline_probs[f"{h_goals}-{a_goals}"] = prob
                
                if h_goals > a_goals:
                    home_win_prob += prob
                elif h_goals < a_goals:
                    away_win_prob += prob
                else:
                    draw_prob += prob
        
        return {
            'expected_goals_home': lambda_home,
            'expected_goals_away': lambda_away,
            'home_win_prob': home_win_prob,
            'draw_prob': draw_prob,
            'away_win_prob': away_win_prob,
            'most_likely_scoreline': max(scoreline_probs, key=scoreline_probs.get),
            'scoreline_probabilities': scoreline_probs
        }
    
    def get_team_ratings(self):
        """Get team ratings showing attack/defense strength vs average opposition."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before getting ratings")
        
        # Calculate what each team scores/concedes vs "average" opposition
        # With defense sum-to-zero constraint, average defense = 0
        attack_vs_avg = []
        defense_vs_avg = []
        
        for i, team in enumerate(self.teams):
            # Goals this team scores vs average defense (defense=0)
            goals_scored_vs_avg = np.exp(self.attack_params[i] + 0)
            
            # Goals this team concedes vs average attack (attack=mean(attack))
            avg_attack = np.mean(self.attack_params)
            goals_conceded_vs_avg = np.exp(avg_attack + self.defense_params[i])
            
            attack_vs_avg.append(goals_scored_vs_avg)
            defense_vs_avg.append(goals_conceded_vs_avg)
        
        ratings = pd.DataFrame({
            'team': self.teams,
            'attack_rating': attack_vs_avg,  # Goals scored vs avg team
            'defense_rating': defense_vs_avg,  # Goals conceded vs avg team  
            'goal_difference': np.array(attack_vs_avg) - np.array(defense_vs_avg)
        })
        
        return ratings.sort_values('goal_difference', ascending=False)

    def get_average_goals_per_game(self, matches_df):
        """Calculate actual average goals per game from the data."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before analysis")
            
        total_goals = matches_df['home_goals'].sum() + matches_df['away_goals'].sum()
        total_games = len(matches_df)
        avg_goals_per_game = total_goals / total_games
        avg_home_goals = matches_df['home_goals'].mean()
        avg_away_goals = matches_df['away_goals'].mean()
        
        return {
            'avg_total_goals_per_game': avg_goals_per_game,
            'avg_home_goals_per_game': avg_home_goals,
            'avg_away_goals_per_game': avg_away_goals
        }
    
    def predict_average_goals(self):
        """Predict what the model thinks average goals should be."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before predictions")
            
        # Calculate average expected goals across all possible matchups
        total_home_goals = 0
        total_away_goals = 0
        total_matches = 0
        
        for i, home_team in enumerate(self.teams):
            for j, away_team in enumerate(self.teams):
                if i != j:  # Teams don't play themselves
                    pred = self.predict_match(home_team, away_team)
                    total_home_goals += pred['expected_goals_home']
                    total_away_goals += pred['expected_goals_away']
                    total_matches += 1
        
        avg_home_predicted = total_home_goals / total_matches
        avg_away_predicted = total_away_goals / total_matches
        avg_total_predicted = avg_home_predicted + avg_away_predicted
        
        return {
            'predicted_avg_home_goals': avg_home_predicted,
            'predicted_avg_away_goals': avg_away_predicted,
            'predicted_avg_total_goals': avg_total_predicted
        }
    
    def model_calibration_check(self, matches_df):
        """Compare model predictions to actual data statistics."""
        actual_stats = self.get_average_goals_per_game(matches_df)
        predicted_stats = self.predict_average_goals()
        
        print("=== MODEL CALIBRATION CHECK ===")
        print(f"Actual avg home goals per game: {actual_stats['avg_home_goals_per_game']:.2f}")
        print(f"Model avg home goals per game:  {predicted_stats['predicted_avg_home_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_home_goals'] - actual_stats['avg_home_goals_per_game']:.2f}")
        print()
        print(f"Actual avg away goals per game: {actual_stats['avg_away_goals_per_game']:.2f}")
        print(f"Model avg away goals per game:  {predicted_stats['predicted_avg_away_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_away_goals'] - actual_stats['avg_away_goals_per_game']:.2f}")
        print()
        print(f"Actual avg total goals per game: {actual_stats['avg_total_goals_per_game']:.2f}")
        print(f"Model avg total goals per game:  {predicted_stats['predicted_avg_total_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_total_goals'] - actual_stats['avg_total_goals_per_game']:.2f}")
        
        return {
            'actual': actual_stats,
            'predicted': predicted_stats
        }

In [13]:
model = DixonColesModelWithResim(n_simulations=100, xi=0.0056, use_resimulation=True)  # Slightly faster decay
model.fit(df)
    
# Get team ratings
print("\nTeam ratings:")
print(model.get_team_ratings())
    


Using xG resimulation for enhanced parameter estimation...
Resimulating 380 matches (100 simulations each)
Expanded from 380 to 38280 matches
Model fitted successfully!
Home advantage: 1.0809
Rho parameter: -0.0140
Used 38280 total matches (380 original + 37900 simulated)

Team ratings:
               team  attack_rating  defense_rating  goal_difference
11        Liverpool       2.058387        1.113823         0.944564
0           Arsenal       1.544176        0.904530         0.639647
12  Manchester City       1.703666        1.094811         0.608856
14    Newcastle Utd       1.723777        1.128338         0.595440
5           Chelsea       1.537354        1.074133         0.463222
2       Bournemouth       1.460005        1.176664         0.283342
6    Crystal Palace       1.527373        1.259583         0.267790
3         Brentford       1.556847        1.309312         0.247534
1       Aston Villa       1.433781        1.199668         0.234112
4          Brighton       1.5651

In [14]:
# Make a prediction
home_team = "Arsenal"
away_team = "Liverpool"
prediction = model.predict_match(home_team, away_team)
print(f"\n{home_team} vs {away_team} prediction:")
print(f"Expected goals - {home_team}: {prediction['expected_goals_home']:.2f}, {away_team}: {prediction['expected_goals_away']:.2f}")
print(f"Win probabilities - {home_team}: {prediction['home_win_prob']:.3f}, Draw: {prediction['draw_prob']:.3f}, {away_team}: {prediction['away_win_prob']:.3f}")
print(f"Most likely scoreline: {prediction['most_likely_scoreline']}")


Arsenal vs Liverpool prediction:
Expected goals - Arsenal: 1.42, Liverpool: 1.42
Win probabilities - Arsenal: 0.369, Draw: 0.254, Liverpool: 0.370
Most likely scoreline: 1-1


In [15]:
calibration = model.model_calibration_check(df)

=== MODEL CALIBRATION CHECK ===
Actual avg home goals per game: 1.51
Model avg home goals per game:  1.50
Difference: -0.02

Actual avg away goals per game: 1.42
Model avg away goals per game:  1.38
Difference: -0.04

Actual avg total goals per game: 2.93
Model avg total goals per game:  2.88
Difference: -0.05
