In [2]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import sqlite3

# Load one season of home-side match rows from the local SQLite database.
# NOTE(review): DB_PATH is an absolute, machine-specific path; adjust it (or
# move it to a shared config cell) before running this notebook elsewhere.
DB_PATH = r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db'
DIVISION = 'Premier League'
SEASON = '2024-2025'

conn = sqlite3.connect(DB_PATH)
try:
    # Each match is kept once, from the home side's perspective (is_home = 1),
    # and only when both xG columns are populated. The division/season filters
    # are bound as SQL parameters instead of being baked into the query text.
    df = pd.read_sql_query("""
                SELECT 
                    team as home_team,
                    opp_team as away_team,
                    summary_goals as home_goals,
                    opp_summary_goals as away_goals,
                    match_date as date
                FROM fbref_match_all_columns
                WHERE division = ?
                    AND season = ?
                    AND summary_xg IS NOT NULL
                    AND opp_summary_xg IS NOT NULL
                    AND is_home = 1
                       """, conn, params=(DIVISION, SEASON))
finally:
    # Release the database handle even when the query raises.
    conn.close()

df

Unnamed: 0,home_team,away_team,home_goals,away_goals,date
0,Tottenham,Brighton,1.0,4.0,2025-05-25
1,Bournemouth,Leicester City,2.0,0.0,2025-05-25
2,Newcastle Utd,Everton,0.0,1.0,2025-05-25
3,Fulham,Manchester City,0.0,2.0,2025-05-25
4,Nott'ham Forest,Chelsea,0.0,1.0,2025-05-25
...,...,...,...,...,...
375,Nott'ham Forest,Bournemouth,1.0,1.0,2024-08-17
376,Everton,Brighton,0.0,3.0,2024-08-17
377,Newcastle Utd,Southampton,1.0,0.0,2024-08-17
378,Ipswich Town,Liverpool,0.0,2.0,2024-08-17


In [3]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

class DixonColesModel:
    """
    Dixon-Coles model for predicting football match outcomes with time decay.

    Every team gets an attack and a defense parameter. Together with a shared
    home-advantage term they define log-linear Poisson scoring rates:

        lambda_home = exp(attack[home] + defense[away] + home_advantage)
        lambda_away = exp(attack[away] + defense[home])

    The ``rho`` parameter rescales the probabilities of the 0-0, 0-1, 1-0 and
    1-1 scorelines, and each match is weighted by ``exp(-xi * days_ago)`` so
    that recent matches count more than old ones. The defense parameters are
    constrained to sum to zero: only n-1 of them are optimized and the first
    team's value is derived from the others.
    """

    def __init__(self, xi=0.0018):
        """
        Initialize the Dixon-Coles model.

        Parameters:
        xi (float): Time decay parameter. Higher values = faster decay of old matches.
                   Default 0.0018 gives half-weight to matches ~1 year old.
        """
        self.xi = xi
        self.teams = None          # sorted list of team names, set by fit()
        self.team_to_idx = None    # team name -> position in the parameter vectors
        self.attack_params = None
        self.defense_params = None
        self.home_advantage = None
        self.rho = None  # Low-scoring game adjustment parameter
        self.is_fitted = False

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _unpack_params(self, params):
        """Split the flat optimizer vector into (attack, defense, home_advantage, rho)."""
        params = np.asarray(params, dtype=float)
        n_teams = len(self.teams)
        attack = params[:n_teams]
        defense_tail = params[n_teams:2 * n_teams - 1]  # n-1 free defense params
        # The first team's defense is derived so that all defenses sum to zero.
        defense = np.concatenate([[-np.sum(defense_tail)], defense_tail])
        return attack, defense, params[2 * n_teams - 1], params[2 * n_teams]

    def _team_indices(self, names):
        """Map team names to parameter positions (KeyError for an unknown team)."""
        return np.array([self.team_to_idx[name] for name in names], dtype=int)

    def _expected_goals(self, home_idx, away_idx):
        """Return (lambda_home, lambda_away) for a fixture using the fitted parameters."""
        lambda_home = np.exp(
            self.attack_params[home_idx] + self.defense_params[away_idx] + self.home_advantage
        )
        lambda_away = np.exp(self.attack_params[away_idx] + self.defense_params[home_idx])
        return lambda_home, lambda_away

    @staticmethod
    def _tau_array(home_goals, away_goals, lambda_home, lambda_away, rho):
        """Array version of tau_correction: one correction factor per match."""
        tau = np.ones_like(lambda_home, dtype=float)
        home_0, home_1 = home_goals == 0, home_goals == 1
        away_0, away_1 = away_goals == 0, away_goals == 1

        nil_nil = home_0 & away_0
        tau[nil_nil] = 1 - lambda_home[nil_nil] * lambda_away[nil_nil] * rho
        nil_one = home_0 & away_1
        tau[nil_one] = 1 + lambda_home[nil_one] * rho
        one_nil = home_1 & away_0
        tau[one_nil] = 1 + lambda_away[one_nil] * rho
        tau[home_1 & away_1] = 1 - rho
        return tau

    def _weighted_nll(self, params, home_idx, away_idx, home_goals, away_goals, weights):
        """Weighted negative log-likelihood computed on pre-extracted arrays."""
        attack, defense, home_advantage, rho = self._unpack_params(params)

        # Expected goals (no baseline/intercept term)
        lambda_home = np.exp(attack[home_idx] + defense[away_idx] + home_advantage)
        lambda_away = np.exp(attack[away_idx] + defense[home_idx])

        prob = (
            self.poisson_prob(home_goals, lambda_home)
            * self.poisson_prob(away_goals, lambda_away)
            * self._tau_array(home_goals, away_goals, lambda_home, lambda_away, rho)
        )

        # Non-positive (or NaN) probabilities are floored at 1e-10 to avoid log(0).
        log_prob = np.full(prob.shape, np.log(1e-10))
        usable = prob > 0
        log_prob[usable] = np.log(prob[usable])

        return -float(np.dot(weights, log_prob))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def time_weight(self, match_dates, current_date=None):
        """
        Calculate time decay weights for matches.

        Parameters:
        match_dates (sequence): Match dates supporting subtraction (datetime / Timestamp).
        current_date: Reference date; defaults to the latest date in match_dates.

        Returns:
        numpy.ndarray: exp(-xi * days_ago) for each match, in input order.
        """
        if len(match_dates) == 0:
            # Nothing to weight; avoids max() failing on an empty sequence.
            return np.array([], dtype=float)

        if current_date is None:
            current_date = max(match_dates)

        days_ago = [(current_date - date).days for date in match_dates]
        weights = np.exp(-self.xi * np.array(days_ago))
        return weights

    def tau_correction(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """
        Dixon-Coles correction factor for low-scoring games.
        Adjusts probabilities for 0-0, 0-1, 1-0, and 1-1 scorelines.
        Returns 1.0 for every other scoreline. Scalar inputs only.
        """
        if home_goals == 0 and away_goals == 0:
            return 1 - lambda_home * lambda_away * rho
        elif home_goals == 0 and away_goals == 1:
            return 1 + lambda_home * rho
        elif home_goals == 1 and away_goals == 0:
            return 1 + lambda_away * rho
        elif home_goals == 1 and away_goals == 1:
            return 1 - rho
        else:
            return 1.0

    def poisson_prob(self, goals, lambda_param):
        """Calculate Poisson probability of `goals` given rate `lambda_param` (scalars or arrays)."""
        return (lambda_param ** goals) * np.exp(-lambda_param) / factorial(goals)

    def match_probability(self, home_goals, away_goals, lambda_home, lambda_away, rho):
        """Calculate probability of specific scoreline with Dixon-Coles correction."""
        basic_prob = self.poisson_prob(home_goals, lambda_home) * self.poisson_prob(away_goals, lambda_away)
        correction = self.tau_correction(home_goals, away_goals, lambda_home, lambda_away, rho)
        return basic_prob * correction

    def negative_log_likelihood(self, params, matches, weights):
        """
        Calculate negative log-likelihood for parameter optimization.

        Parameters:
        params (array-like): [attack (n), defense (n-1), home_advantage, rho].
        matches (DataFrame): Columns 'home_team', 'away_team', 'home_goals', 'away_goals'.
        weights (array-like): One weight per row of `matches`, in row order.

        Returns:
        float: Weighted negative log-likelihood.

        Raises:
        ValueError: If `weights` does not have exactly one entry per match.
        KeyError: If a team in `matches` is not known to the model.
        """
        # Weights are aligned with rows by POSITION. Looking them up by the
        # DataFrame index label breaks for sorted or filtered frames.
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (len(matches),):
            raise ValueError("weights must contain exactly one value per match")

        return self._weighted_nll(
            params,
            self._team_indices(matches['home_team']),
            self._team_indices(matches['away_team']),
            matches['home_goals'].to_numpy(dtype=float),
            matches['away_goals'].to_numpy(dtype=float),
            weights,
        )

    def fit(self, matches_df, current_date=None):
        """
        Fit the Dixon-Coles model to match data.

        The input DataFrame is not modified.

        Parameters:
        matches_df (DataFrame): Must contain columns: 'date', 'home_team', 'away_team', 
                               'home_goals', 'away_goals'
        current_date (datetime): Date to calculate time weights from (default: latest match date)

        Raises:
        ValueError: If the 'date' column is missing or the DataFrame is empty.
        RuntimeError: If the optimizer does not report success.
        """
        if 'date' not in matches_df.columns:
            raise ValueError("DataFrame must contain a 'date' column")
        if len(matches_df) == 0:
            raise ValueError("Cannot fit the model on an empty DataFrame")

        # A previous fit no longer matches the team list built below, so the
        # model counts as unfitted until this optimization succeeds.
        self.is_fitted = False

        # Convert dates on a separate Series so the caller's column is untouched.
        match_dates = pd.to_datetime(matches_df['date'])
        if current_date is not None:
            current_date = pd.to_datetime(current_date)

        # Get unique teams
        self.teams = sorted(set(matches_df['home_team']) | set(matches_df['away_team']))
        self.team_to_idx = {team: i for i, team in enumerate(self.teams)}

        # Calculate time weights (row order)
        weights = self.time_weight(match_dates.tolist(), current_date)

        # Extract everything the likelihood needs once, not on every evaluation.
        home_idx = self._team_indices(matches_df['home_team'])
        away_idx = self._team_indices(matches_df['away_team'])
        home_goals = matches_df['home_goals'].to_numpy(dtype=float)
        away_goals = matches_df['away_goals'].to_numpy(dtype=float)

        # Initial parameter guess
        n_teams = len(self.teams)
        initial_params = np.concatenate([
            np.zeros(n_teams),      # attack parameters for all teams
            np.zeros(n_teams - 1),  # defense parameters for n-1 teams (first computed automatically)
            [0.1],  # home advantage
            [0.0]   # rho (low-scoring correction)
        ])

        # Optimize parameters (sum-to-zero constraint is built into the parameterization)
        result = minimize(
            self._weighted_nll,
            initial_params,
            args=(home_idx, away_idx, home_goals, away_goals, weights),
            method='L-BFGS-B',
            options={'maxiter': 1000}
        )

        if not result.success:
            raise RuntimeError(f"Optimization failed to converge: {result.message}")

        (self.attack_params, self.defense_params,
         self.home_advantage, self.rho) = self._unpack_params(result.x)
        self.is_fitted = True

        print(f"Model fitted successfully!")
        print(f"Home advantage: {np.exp(self.home_advantage):.4f}")
        print(f"Rho parameter: {self.rho:.4f}")

    def predict_match(self, home_team, away_team, max_goals=5):
        """
        Predict expected goals and match probabilities for a single match.

        Parameters:
        home_team (str): Name of the home team (must have been in the training data).
        away_team (str): Name of the away team (must have been in the training data).
        max_goals (int): Largest goal count per side included in the scoreline grid
                         (default 5, i.e. scorelines up to 5-5). Scorelines beyond the
                         grid are ignored, so the three outcome probabilities sum to
                         slightly less than 1.

        Returns:
        dict: Contains expected goals, win probabilities, and scoreline probabilities

        Raises:
        RuntimeError: If the model has not been fitted.
        ValueError: If a team is unknown or max_goals is negative.
        """
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before making predictions")

        if home_team not in self.teams or away_team not in self.teams:
            raise ValueError("One or both teams not in training data")

        if max_goals < 0:
            raise ValueError("max_goals must be non-negative")

        lambda_home, lambda_away = self._expected_goals(
            self.team_to_idx[home_team], self.team_to_idx[away_team]
        )

        # Calculate outcome probabilities
        home_win_prob = 0
        away_win_prob = 0
        draw_prob = 0

        # Calculate probabilities for scorelines up to max_goals-max_goals
        scoreline_probs = {}
        for h_goals in range(max_goals + 1):
            for a_goals in range(max_goals + 1):
                prob = self.match_probability(h_goals, a_goals, lambda_home, lambda_away, self.rho)
                scoreline_probs[f"{h_goals}-{a_goals}"] = prob

                if h_goals > a_goals:
                    home_win_prob += prob
                elif h_goals < a_goals:
                    away_win_prob += prob
                else:
                    draw_prob += prob

        return {
            'expected_goals_home': lambda_home,
            'expected_goals_away': lambda_away,
            'home_win_prob': home_win_prob,
            'draw_prob': draw_prob,
            'away_win_prob': away_win_prob,
            'most_likely_scoreline': max(scoreline_probs, key=scoreline_probs.get),
            'scoreline_probabilities': scoreline_probs
        }

    def get_team_ratings(self):
        """
        Summarize each team's strength as goals scored/conceded against average opposition.

        Returns:
        DataFrame: Columns 'team', 'attack_rating', 'defense_rating', 'goal_difference',
                   sorted by 'goal_difference' in descending order.

        Raises:
        RuntimeError: If the model has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before getting ratings")

        attack = np.asarray(self.attack_params, dtype=float)
        defense = np.asarray(self.defense_params, dtype=float)

        # Goals scored vs average defense: defenses sum to zero, so the average is 0.
        attack_vs_avg = np.exp(attack)
        # Goals conceded vs average attack (attack = mean of all attack parameters).
        defense_vs_avg = np.exp(np.mean(attack) + defense)

        ratings = pd.DataFrame({
            'team': self.teams,
            'attack_rating': attack_vs_avg,  # Goals scored vs avg team
            'defense_rating': defense_vs_avg,  # Goals conceded vs avg team
            'goal_difference': attack_vs_avg - defense_vs_avg
        })

        return ratings.sort_values('goal_difference', ascending=False)

    def get_average_goals_per_game(self, matches_df):
        """Calculate actual average goals per game from the data."""
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before analysis")

        total_goals = matches_df['home_goals'].sum() + matches_df['away_goals'].sum()
        total_games = len(matches_df)
        avg_goals_per_game = total_goals / total_games
        avg_home_goals = matches_df['home_goals'].mean()
        avg_away_goals = matches_df['away_goals'].mean()

        return {
            'avg_total_goals_per_game': avg_goals_per_game,
            'avg_home_goals_per_game': avg_home_goals,
            'avg_away_goals_per_game': avg_away_goals
        }

    def predict_average_goals(self):
        """
        Predict what the model thinks average goals should be.

        Averages the expected home and away goals over every ordered pairing
        of two distinct teams.
        """
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before predictions")

        total_home_goals = 0
        total_away_goals = 0
        total_matches = 0

        n_teams = len(self.teams)
        for home_idx in range(n_teams):
            for away_idx in range(n_teams):
                if home_idx != away_idx:  # Teams don't play themselves
                    # Only the scoring rates are needed here, not the scoreline grid.
                    lambda_home, lambda_away = self._expected_goals(home_idx, away_idx)
                    total_home_goals += lambda_home
                    total_away_goals += lambda_away
                    total_matches += 1

        avg_home_predicted = total_home_goals / total_matches
        avg_away_predicted = total_away_goals / total_matches
        avg_total_predicted = avg_home_predicted + avg_away_predicted

        return {
            'predicted_avg_home_goals': avg_home_predicted,
            'predicted_avg_away_goals': avg_away_predicted,
            'predicted_avg_total_goals': avg_total_predicted
        }

    def model_calibration_check(self, matches_df):
        """
        Compare model predictions to actual data statistics.

        Prints a side-by-side report and returns a dict with the 'actual'
        and 'predicted' statistics.
        """
        actual_stats = self.get_average_goals_per_game(matches_df)
        predicted_stats = self.predict_average_goals()

        print("=== MODEL CALIBRATION CHECK ===")
        print(f"Actual avg home goals per game: {actual_stats['avg_home_goals_per_game']:.2f}")
        print(f"Model avg home goals per game:  {predicted_stats['predicted_avg_home_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_home_goals'] - actual_stats['avg_home_goals_per_game']:.2f}")
        print()
        print(f"Actual avg away goals per game: {actual_stats['avg_away_goals_per_game']:.2f}")
        print(f"Model avg away goals per game:  {predicted_stats['predicted_avg_away_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_away_goals'] - actual_stats['avg_away_goals_per_game']:.2f}")
        print()
        print(f"Actual avg total goals per game: {actual_stats['avg_total_goals_per_game']:.2f}")
        print(f"Model avg total goals per game:  {predicted_stats['predicted_avg_total_goals']:.2f}")
        print(f"Difference: {predicted_stats['predicted_avg_total_goals'] - actual_stats['avg_total_goals_per_game']:.2f}")

        return {
            'actual': actual_stats,
            'predicted': predicted_stats
        }

In [9]:
# Fit the Dixon-Coles model on the season loaded into `df`.
# xi=0.0018 is the same value as the class default, so this is not a faster decay than usual.
model = DixonColesModel(xi=0.0018)
model.fit(df)
    
# Get team ratings (a DataFrame sorted by goal_difference, best team first)
print("\nTeam ratings:")
print(model.get_team_ratings())
    


Model fitted successfully!
Home advantage: 1.0679
Rho parameter: -0.0177

Team ratings:
               team  attack_rating  defense_rating  goal_difference
11        Liverpool       2.097231        1.080320         1.016910
0           Arsenal       1.662329        0.848957         0.813372
12  Manchester City       1.749922        1.065375         0.684547
14    Newcastle Utd       1.700746        1.166156         0.534590
5           Chelsea       1.511660        1.036369         0.475291
15  Nott'ham Forest       1.424624        1.152486         0.272139
2       Bournemouth       1.401803        1.129855         0.271949
3         Brentford       1.634254        1.381134         0.253120
1       Aston Villa       1.432422        1.209110         0.223312
4          Brighton       1.665966        1.480630         0.185335
6    Crystal Palace       1.302454        1.250243         0.052211
7           Everton       1.051234        1.021265         0.029969
8            Fulham       1.

In [8]:
# Make a prediction for a single fixture using the `model` fitted in an earlier cell.
# Both names must match team names seen during fitting, otherwise predict_match
# raises ValueError.
home_team = "Liverpool"
away_team = "Arsenal"
prediction = model.predict_match(home_team, away_team)
print(f"\n{home_team} vs {away_team} prediction:")
# Expected goals are the model's Poisson scoring rates for each side.
print(f"Expected goals - {home_team}: {prediction['expected_goals_home']:.2f}, {away_team}: {prediction['expected_goals_away']:.2f}")
# Outcome probabilities are summed over the scoreline grid returned by predict_match.
print(f"Win probabilities - {home_team}: {prediction['home_win_prob']:.3f}, Draw: {prediction['draw_prob']:.3f}, {away_team}: {prediction['away_win_prob']:.3f}")
print(f"Most likely scoreline: {prediction['most_likely_scoreline']}")


Liverpool vs Arsenal prediction:
Expected goals - Liverpool: 1.46, Arsenal: 1.45
Win probabilities - Liverpool: 0.377, Draw: 0.244, Arsenal: 0.371
Most likely scoreline: 1-1


In [6]:
# Compare the model's average expected goals (over all home/away pairings of distinct
# teams) with the averages observed in `df`. Prints a report and returns a dict with
# the 'actual' and 'predicted' statistics.
calibration = model.model_calibration_check(df)

=== MODEL CALIBRATION CHECK ===
Actual avg home goals per game: 1.51
Model avg home goals per game:  1.51
Difference: -0.00

Actual avg away goals per game: 1.42
Model avg away goals per game:  1.41
Difference: -0.01

Actual avg total goals per game: 2.93
Model avg total goals per game:  2.92
Difference: -0.01
