In [3]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import poisson
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')


# Open the local SQLite database that holds the scraped match data.
# NOTE(review): this is an absolute, machine-specific path, and the connection
# is left open after the query — presumably so other cells can reuse `conn`;
# confirm, and consider a configurable path plus an explicit close.
conn = sqlite3.connect(
    r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db'
)

# One row per fixture, taken from the home side's record (is_home = 1), for the
# 2024-2025 Premier League season. DISTINCT drops exact duplicate rows.
df = pd.read_sql_query(
    sql="""
                        SELECT DISTINCT
                            match_date,
                            team as home_team,
                            summary_goals as home_goals,
                            opp_team as away_team,
                            opp_summary_goals as away_goals
                        FROM 
                            fbref_match_summary_v2
                        WHERE 
                            is_home = 1 AND division = 'Premier League' AND season = '2024-2025'
                       """,
    con=conn,
)

df

Unnamed: 0,match_date,home_team,home_goals,away_team,away_goals
0,2025-05-25,Tottenham,1.0,Brighton,4.0
1,2025-05-25,Bournemouth,2.0,Leicester City,0.0
2,2025-05-25,Newcastle Utd,0.0,Everton,1.0
3,2025-05-25,Fulham,0.0,Manchester City,2.0
4,2025-05-25,Nott'ham Forest,0.0,Chelsea,1.0
...,...,...,...,...,...
375,2024-08-17,Nott'ham Forest,1.0,Bournemouth,1.0
376,2024-08-17,Everton,0.0,Brighton,3.0
377,2024-08-17,Newcastle Utd,1.0,Southampton,0.0
378,2024-08-17,Ipswich Town,0.0,Liverpool,2.0


In [43]:
class DixonColesModel:
    """
    Dixon-Coles model for football match prediction with time decay.

    Expected goals are modelled multiplicatively:

        lambda_home = attack[home] * defense[away] * home_advantage
        lambda_away = attack[away] * defense[home]

    so a larger ``defense`` value means the team concedes more. Parameters are
    estimated by minimising a time-weighted negative log-likelihood plus a soft
    quadratic penalty that pulls the sums of the attack and defense parameters
    towards the number of teams. The penalty is soft, so the fitted averages
    are only approximately 1.
    """

    def __init__(self, xi=0.0018):
        """
        Initialize the Dixon-Coles model.

        Parameters:
        xi (float): Time decay rate per day. A match ``d`` days before the
            reference date gets weight ``exp(-xi * d)``; the default 0.0018
            corresponds to a half-life of roughly one year (ln 2 / xi ~ 385 days).
        """
        self.xi = xi
        self.teams = None
        self.team_attack = {}
        self.team_defense = {}
        self.home_advantage = 0.0
        self.rho = 0.0
        self.fitted = False

    @staticmethod
    def dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho):
        """
        Calculate the Dixon-Coles adjusted probability of an exact scoreline.

        Parameters:
        home_goals, away_goals (int): Scoreline to evaluate.
        lambda_home, lambda_away (float): Expected goals for each side.
        rho (float): Low-score dependence parameter.

        Returns:
        float: Product of the two independent Poisson probabilities, scaled by
            the correction factor tau for the 0-0, 0-1, 1-0 and 1-1 scorelines
            (tau is 1 for every other scoreline). The result is not clipped, so
            it can be negative for extreme parameter values.
        """
        # Base Poisson probabilities
        p_home = poisson.pmf(home_goals, lambda_home)
        p_away = poisson.pmf(away_goals, lambda_away)

        # Dixon-Coles adjustment for low-scoring dependencies
        tau = 1.0
        if home_goals == 0 and away_goals == 0:
            tau = 1 - lambda_home * lambda_away * rho
        elif home_goals == 0 and away_goals == 1:
            tau = 1 + lambda_home * rho
        elif home_goals == 1 and away_goals == 0:
            tau = 1 + lambda_away * rho
        elif home_goals == 1 and away_goals == 1:
            tau = 1 - rho

        return tau * p_home * p_away

    def _time_weight(self, match_date, reference_date):
        """
        Calculate the time decay weight for a match.

        Both dates may be strings (parsed with ``pd.to_datetime``) or objects
        whose difference exposes ``.days``. Returns ``exp(-xi * days)`` where
        ``days`` is the whole-day gap from the match to the reference date.
        """
        if isinstance(match_date, str):
            match_date = pd.to_datetime(match_date)
        if isinstance(reference_date, str):
            reference_date = pd.to_datetime(reference_date)

        days_diff = (reference_date - match_date).days
        return np.exp(-self.xi * days_diff)

    def _log_likelihood(self, params, matches, teams, reference_date, time_weights=None):
        """
        Objective for the optimizer: penalised negative weighted log-likelihood.

        Parameters:
        params (sequence of float): ``[home_advantage, rho, attack..., defense...]``
            with the attack and defense values ordered like ``teams``.
        matches (list of dict): Matches with keys 'home_team', 'away_team',
            'home_goals', 'away_goals', 'match_date'.
        teams (list): Team names, in the order used inside ``params``.
        reference_date: Date the time decay is measured from.
        time_weights (sequence of float, optional): Precomputed weight per match,
            aligned with ``matches``. They do not depend on ``params``, so ``fit``
            computes them once instead of once per objective evaluation. When
            omitted they are derived from ``reference_date``.

        Returns:
        float: ``-sum(weight * log(p)) + penalty``, where the penalty is the
            squared deviation of the attack sum and of the defense sum from
            ``len(teams)``.
        """
        n_teams = len(teams)

        # Extract parameters
        home_advantage = params[0]
        rho = params[1]
        attack = dict(zip(teams, params[2:2 + n_teams]))
        defense = dict(zip(teams, params[2 + n_teams:]))

        if time_weights is None:
            time_weights = [
                self._time_weight(match['match_date'], reference_date)
                for match in matches
            ]

        log_likelihood = 0

        for match, time_weight in zip(matches, time_weights):
            home_team = match['home_team']
            away_team = match['away_team']
            home_goals = int(match['home_goals'])
            away_goals = int(match['away_goals'])

            # Expected goals
            lambda_home = attack[home_team] * defense[away_team] * home_advantage
            lambda_away = attack[away_team] * defense[home_team]

            # Calculate probability with Dixon-Coles adjustment
            probability = self.dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho)

            # Safeguard against log(0) (tau can also drive the value negative)
            if probability <= 0:
                probability = 1e-10

            log_likelihood += np.log(probability) * time_weight

        # Soft constraint: attack and defense parameters should each sum to the
        # number of teams, i.e. average to 1
        sum_attack = sum(attack.values())
        sum_defense = sum(defense.values())
        constraint_penalty = (sum_attack - n_teams) ** 2 + (sum_defense - n_teams) ** 2

        return -log_likelihood + constraint_penalty

    def fit(self, matches):
        """
        Fit the Dixon-Coles model to match data.

        The caller's data is never modified: list input is shallow-copied before
        string dates are parsed. Any state from a previous fit is replaced.

        Parameters:
        matches (DataFrame or list): Match data with columns/keys ['home_team', 'away_team', 'home_goals', 'away_goals', 'match_date']

        Raises:
        ValueError: If ``matches`` is empty.
        """
        # Work on our own records so date parsing below cannot leak into the
        # caller's objects (to_dict already builds fresh dicts for a DataFrame)
        if isinstance(matches, pd.DataFrame):
            matches = matches.to_dict('records')
        else:
            matches = [dict(match) for match in matches]

        if not matches:
            raise ValueError("No matches provided to fit the model")

        # Ensure dates are datetime
        for match in matches:
            if isinstance(match['match_date'], str):
                match['match_date'] = pd.to_datetime(match['match_date'])

        # Sort by date
        matches = sorted(matches, key=lambda x: x['match_date'])

        # Unique teams (sorted for a stable parameter order) and reference date
        self.teams = sorted(
            {match['home_team'] for match in matches}
            | {match['away_team'] for match in matches}
        )
        n_teams = len(self.teams)
        reference_date = max(match['match_date'] for match in matches)

        # Time weights are independent of the model parameters: compute once
        time_weights = [
            self._time_weight(match['match_date'], reference_date)
            for match in matches
        ]

        print(f"Fitting model with {len(matches)} matches and {n_teams} teams")

        # Initial parameters
        initial_params = [1.2, 0.0]  # home_advantage, rho
        initial_params.extend([1.0] * n_teams)  # attack parameters
        initial_params.extend([1.0] * n_teams)  # defense parameters

        # Parameter bounds
        bounds = [(0.8, 2.0), (-0.5, 0.5)]  # home_advantage, rho
        bounds.extend([(0.1, 3.0)] * n_teams)  # attack bounds
        bounds.extend([(0.1, 3.0)] * n_teams)  # defense bounds

        # Optimize
        result = minimize(
            self._log_likelihood,
            initial_params,
            args=(matches, self.teams, reference_date, time_weights),
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 1000}
        )

        # Extract and store results
        self.home_advantage = result.x[0]
        self.rho = result.x[1]

        # Rebuild the dictionaries so teams from an earlier fit do not linger
        self.team_attack = dict(zip(self.teams, result.x[2:2 + n_teams]))
        self.team_defense = dict(zip(self.teams, result.x[2 + n_teams:]))

        # The last optimizer iterate is kept even without convergence
        self.fitted = True

        # -result.fun is the penalised objective, sign-flipped
        if result.success:
            print(f"Model fitted successfully. Log-likelihood: {-result.fun:.2f}")
        else:
            print(f"Warning: Optimization did not converge. Message: {result.message}")
            print(f"Model fitted without convergence. Log-likelihood: {-result.fun:.2f}")
        print(f"Home advantage: {self.home_advantage:.3f}")
        print(f"Rho parameter: {self.rho:.3f}")

        # Validation
        avg_attack = np.mean(list(self.team_attack.values()))
        avg_defense = np.mean(list(self.team_defense.values()))
        print(f"Average attack: {avg_attack:.3f}")
        print(f"Average defense: {avg_defense:.3f}")

    def predict_match(self, home_team, away_team, max_goals=10):
        """
        Predict match outcome probabilities.

        Parameters:
        home_team, away_team (str): Teams seen during the most recent fit.
        max_goals (int): Largest goal count per side in the score grid.

        Returns:
        dict: Team names, expected goals for each side, home win / draw / away
            win probabilities, and 'probability_matrix' indexed as
            [home_goals, away_goals]. The grid is truncated at ``max_goals`` and
            renormalised to sum to 1.

        Raises:
        ValueError: If the model is not fitted or a team is unknown.
        """
        if not self.fitted:
            raise ValueError("Model must be fitted before making predictions")

        if home_team not in self.team_attack or away_team not in self.team_attack:
            available_teams = sorted(self.team_attack.keys())
            raise ValueError(f"Teams not found. Available teams: {available_teams}")

        # Calculate expected goals
        lambda_home = self.team_attack[home_team] * self.team_defense[away_team] * self.home_advantage
        lambda_away = self.team_attack[away_team] * self.team_defense[home_team]

        # Calculate probability matrix
        prob_matrix = np.zeros((max_goals + 1, max_goals + 1))

        for i in range(max_goals + 1):
            for j in range(max_goals + 1):
                prob_matrix[i, j] = self.dc_probability(i, j, lambda_home, lambda_away, self.rho)

        # Normalize probabilities
        prob_matrix = prob_matrix / prob_matrix.sum()

        # Rows are home goals and columns away goals: below the diagonal the
        # home side scored more, above it the away side did
        home_win = np.sum(np.tril(prob_matrix, -1))
        draw = np.sum(np.diag(prob_matrix))
        away_win = np.sum(np.triu(prob_matrix, 1))

        return {
            'home_team': home_team,
            'away_team': away_team,
            'expected_home_goals': lambda_home,
            'expected_away_goals': lambda_away,
            'home_win_prob': home_win,
            'draw_prob': draw,
            'away_win_prob': away_win,
            'probability_matrix': prob_matrix
        }

    def get_team_strengths(self):
        """
        Get team attack and defense strengths.

        Returns:
        DataFrame: One row per team with 'team', 'attack', 'defense' and
            'overall' (attack minus defense), sorted by 'overall' descending.

        Raises:
        ValueError: If the model is not fitted.
        """
        if not self.fitted:
            raise ValueError("Model must be fitted before getting team strengths")

        strengths = [
            {
                'team': team,
                'attack': self.team_attack[team],
                'defense': self.team_defense[team],
                'overall': self.team_attack[team] - self.team_defense[team]
            }
            for team in self.teams
        ]

        return pd.DataFrame(strengths).sort_values(by='overall',ascending=False).reset_index(drop=True)

In [45]:
# Fit the model on the fixtures loaded into `df`, using the same decay rate as
# the class default.
model = DixonColesModel(xi=0.0018)
model.fit(df)

# Per-team attack/defense table, ordered by attack minus defense.
strengths = model.get_team_strengths()
print("Team Strengths:", strengths, sep="\n")

Fitting model with 380 matches and 20 teams
Model fitted successfully. Log-likelihood: -873.57
Home advantage: 1.297
Rho parameter: 0.005
Average attack: 1.078
Average defense: 1.078
Team Strengths:
               team    attack   defense   overall
0         Liverpool  1.661098  0.838776  0.822322
1           Arsenal  1.321675  0.679528  0.642148
2   Manchester City  1.386082  0.828242  0.557840
3     Newcastle Utd  1.331376  0.915405  0.415971
4           Chelsea  1.197146  0.827855  0.369291
5         Brentford  1.293375  1.064479  0.228896
6       Bournemouth  1.112497  0.902420  0.210077
7   Nott'ham Forest  1.130985  0.923780  0.207205
8       Aston Villa  1.118791  0.967072  0.151719
9          Brighton  1.298346  1.180645  0.117701
10           Fulham  1.056640  1.030803  0.025838
11          Everton  0.833617  0.814822  0.018795
12   Crystal Palace  0.995716  0.978305  0.017411
13        Tottenham  1.180013  1.324469 -0.144457
14   Manchester Utd  0.862825  1.037127 -0.174302
1