In [14]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import poisson
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("prem_shots.csv")

print(df.head())

   round  match_date          home_team away_team  \
0      1  2024-08-16  Manchester United    Fulham   
1      1  2024-08-16  Manchester United    Fulham   
2      1  2024-08-16  Manchester United    Fulham   
3      1  2024-08-16  Manchester United    Fulham   
4      1  2024-08-16  Manchester United    Fulham   

                                           match_url         player_name  \
0  https://www.fotmob.com/en-GB/matches/fulham-vs...      Joshua Zirkzee   
1  https://www.fotmob.com/en-GB/matches/fulham-vs...       Calvin Bassey   
2  https://www.fotmob.com/en-GB/matches/fulham-vs...  Alejandro Garnacho   
3  https://www.fotmob.com/en-GB/matches/fulham-vs...        Adama Traoré   
4  https://www.fotmob.com/en-GB/matches/fulham-vs...          Kenny Tete   

         Team  minute    xg  xgot         result  
0  Man United      87  0.07  0.86           Goal  
1      Fulham      91  0.05   NaN           Miss  
2  Man United      94  0.67   NaN           Miss  
3      Fulham       

In [27]:
import numpy as np
import pandas as pd
from scipy.stats import poisson
from scipy.optimize import minimize


class DixonColesModel:
    """
    Dixon-Coles model for football match prediction with time decay.
    Uses resimulation approach for xG data instead of Poisson-Binomial.
    """
    
    def __init__(self, xi=0.0018, n_simulations=1000):
        """
        Initialize the Dixon-Coles model.
        
        Parameters:
        xi (float): Time decay parameter (default: 0.0018, equivalent to half-life of ~1 year)
        n_simulations (int): Number of simulations per match when using xG data
        """
        self.xi = xi
        self.n_simulations = n_simulations
        self.teams = None
        self.team_attack = {}
        self.team_defense = {}
        self.home_advantage = 0.0
        self.rho = 0.0
        self.fitted = False
    
    @staticmethod
    def dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho):
        """Calculate Dixon-Coles adjusted probability for a match outcome."""
        # Base Poisson probabilities
        p_home = poisson.pmf(home_goals, lambda_home)
        p_away = poisson.pmf(away_goals, lambda_away)
        
        # Dixon-Coles adjustment for low-scoring dependencies
        tau = 1.0
        if home_goals == 0 and away_goals == 0:
            tau = 1 - lambda_home * lambda_away * rho
        elif home_goals == 0 and away_goals == 1:
            tau = 1 + lambda_home * rho
        elif home_goals == 1 and away_goals == 0:
            tau = 1 + lambda_away * rho
        elif home_goals == 1 and away_goals == 1:
            tau = 1 - rho
        
        return tau * p_home * p_away
    
    def _time_weight(self, match_date, reference_date):
        """Calculate time decay weight for a match."""
        if isinstance(match_date, str):
            match_date = pd.to_datetime(match_date)
        if isinstance(reference_date, str):
            reference_date = pd.to_datetime(reference_date)
        
        days_diff = (reference_date - match_date).days
        return np.exp(-self.xi * days_diff)
    
    def _resimulate_matches_with_xg(self, matches_with_shots):
        """
        Resimulate matches using xG values for each shot.
        
        Parameters:
        matches_with_shots (list): List of match dictionaries with 'home_shots' and 'away_shots' xG lists
        
        Returns:
        list: Expanded list of matches including simulations
        """
        expanded_matches = []
        
        print(f"Resimulating {len(matches_with_shots)} matches ({self.n_simulations} simulations each)")
        
        for match in matches_with_shots:
            home_xg_values = match.get('home_shots', [])
            away_xg_values = match.get('away_shots', [])
            
            # Skip if no shot data
            if not home_xg_values and not away_xg_values:
                # Add original match with weight 1.0
                match_copy = match.copy()
                match_copy['weight'] = 1.0
                match_copy['home_goals'] = match.get('home_goals', 0)
                match_copy['away_goals'] = match.get('away_goals', 0)
                expanded_matches.append(match_copy)
                continue
            
            # Create simulations
            for sim_id in range(self.n_simulations):
                # Simulate each shot as Bernoulli trial with p = xG
                home_goals_sim = sum(np.random.random() < xg for xg in home_xg_values)
                away_goals_sim = sum(np.random.random() < xg for xg in away_xg_values)
                
                # Create simulated match
                sim_match = {
                    'home_team': match['home_team'],
                    'away_team': match['away_team'],
                    'match_date': match['match_date'],
                    'home_goals': home_goals_sim,
                    'away_goals': away_goals_sim,
                    'weight': 1.0 / self.n_simulations,
                    'is_simulation': True,
                    'simulation_id': sim_id
                }
                
                expanded_matches.append(sim_match)
        
        print(f"Expanded from {len(matches_with_shots)} to {len(expanded_matches)} matches")
        return expanded_matches
    
    def _log_likelihood(self, params, matches, teams, reference_date):
        """Calculate log-likelihood with penalty constraints, using match weights."""
        # Extract parameters
        home_advantage = params[0]
        rho = params[1]
        attack_params = params[2:2+len(teams)]
        defense_params = params[2+len(teams):]
        
        # Create team parameter dictionaries
        attack = {team: attack_params[i] for i, team in enumerate(teams)}
        defense = {team: defense_params[i] for i, team in enumerate(teams)}
        
        log_likelihood = 0
        
        for match in matches:
            home_team = match['home_team']
            away_team = match['away_team']
            home_goals = int(match['home_goals'])
            away_goals = int(match['away_goals'])
            
            # Get match weight (1.0 for real matches, 1/n_simulations for simulations)
            match_weight = match.get('weight', 1.0)
            
            # Time weight
            time_weight = self._time_weight(match['match_date'], reference_date)
            
            # Combined weight
            combined_weight = match_weight * time_weight
            
            # Expected goals
            lambda_home = attack[home_team] * defense[away_team] * home_advantage
            lambda_away = attack[away_team] * defense[home_team]
            
            # Calculate probability with Dixon-Coles adjustment
            probability = self.dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho)
            
            # Safeguard against log(0)
            if probability <= 0:
                probability = 1e-10
            
            log_likelihood += np.log(probability) * combined_weight
        
        # Constraint penalties to ensure attack and defense parameters average to 1
        constraint_penalty = 0
        sum_attack = sum(attack.values())
        sum_defense = sum(defense.values())
        constraint_penalty += (sum_attack - len(teams)) ** 2
        constraint_penalty += (sum_defense - len(teams)) ** 2
        
        return -log_likelihood + constraint_penalty
    
    def fit(self, matches):
        """
        Fit the Dixon-Coles model to match data.
        
        Parameters:
        matches (DataFrame or list): Match data with columns/keys ['home_team', 'away_team', 'home_goals', 'away_goals', 'match_date']
        """
        # Convert to list of dictionaries if DataFrame
        if isinstance(matches, pd.DataFrame):
            matches = matches.to_dict('records')
        
        # Ensure dates are datetime
        for match in matches:
            if isinstance(match['match_date'], str):
                match['match_date'] = pd.to_datetime(match['match_date'])
        
        # Get unique teams and reference date
        teams = set()
        for match in matches:
            teams.add(match['home_team'])
            teams.add(match['away_team'])
        
        self.teams = sorted(list(teams))
        reference_date = max(match['match_date'] for match in matches)
        
        # Add weight to matches (all real matches have weight 1.0)
        for match in matches:
            match['weight'] = 1.0
        
        print(f"Fitting model with {len(matches)} matches and {len(self.teams)} teams")
        
        # Initial parameters
        initial_params = [1.2, 0.0]  # home_advantage, rho
        initial_params.extend([1.0] * len(self.teams))  # attack parameters
        initial_params.extend([1.0] * len(self.teams))  # defense parameters
        
        # Parameter bounds
        bounds = [(0.8, 2.0), (-0.5, 0.5)]  # home_advantage, rho
        bounds.extend([(0.1, 3.0)] * len(self.teams))  # attack bounds
        bounds.extend([(0.1, 3.0)] * len(self.teams))  # defense bounds
        
        # Optimize
        result = minimize(
            self._log_likelihood,
            initial_params,
            args=(matches, self.teams, reference_date),
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 1000}
        )
        
        if not result.success:
            print(f"Warning: Optimization did not converge. Message: {result.message}")
        
        # Extract and store results
        self.home_advantage = result.x[0]
        self.rho = result.x[1]
        
        attack_params = result.x[2:2+len(self.teams)]
        defense_params = result.x[2+len(self.teams):]
        
        for i, team in enumerate(self.teams):
            self.team_attack[team] = attack_params[i]
            self.team_defense[team] = defense_params[i]
        
        self.fitted = True
        
        print(f"Model fitted successfully. Log-likelihood: {-result.fun:.2f}")
        print(f"Home advantage: {self.home_advantage:.3f}")
        print(f"Rho parameter: {self.rho:.3f}")
        
        # Validation
        avg_attack = np.mean(list(self.team_attack.values()))
        avg_defense = np.mean(list(self.team_defense.values()))
        print(f"Average attack: {avg_attack:.3f}")
        print(f"Average defense: {avg_defense:.3f}")
    
    def fit_with_xg(self, matches_with_shots):
        """
        Fit the Dixon-Coles model using xG shot data.
        
        Parameters:
        matches_with_shots (list): List of match dictionaries with:
            - home_team, away_team, match_date
            - home_shots: list of xG values for home team shots
            - away_shots: list of xG values for away team shots
        """
        # Resimulate matches using xG data
        expanded_matches = self._resimulate_matches_with_xg(matches_with_shots)
        
        # Fit model on expanded dataset
        self.fit(expanded_matches)
    
    def predict_match(self, home_team, away_team, max_goals=10):
        """Predict match outcome probabilities."""
        if not self.fitted:
            raise ValueError("Model must be fitted before making predictions")
        
        if home_team not in self.team_attack or away_team not in self.team_attack:
            available_teams = sorted(self.team_attack.keys())
            raise ValueError(f"Teams not found. Available teams: {available_teams}")
        
        # Calculate expected goals
        lambda_home = self.team_attack[home_team] * self.team_defense[away_team] * self.home_advantage
        lambda_away = self.team_attack[away_team] * self.team_defense[home_team]
        
        # Calculate probability matrix
        prob_matrix = np.zeros((max_goals + 1, max_goals + 1))
        
        for i in range(max_goals + 1):
            for j in range(max_goals + 1):
                prob_matrix[i, j] = self.dc_probability(i, j, lambda_home, lambda_away, self.rho)
        
        # Normalize probabilities
        prob_matrix = prob_matrix / prob_matrix.sum()
        
        # Calculate outcome probabilities
        home_win = np.sum(np.tril(prob_matrix, -1))
        draw = np.sum(np.diag(prob_matrix))
        away_win = np.sum(np.triu(prob_matrix, 1))
        
        return {
            'home_team': home_team,
            'away_team': away_team,
            'expected_home_goals': lambda_home,
            'expected_away_goals': lambda_away,
            'home_win_prob': home_win,
            'draw_prob': draw,
            'away_win_prob': away_win,
            'probability_matrix': prob_matrix
        }
    
    def get_team_strengths(self):
        """Get team attack and defense strengths."""
        if not self.fitted:
            raise ValueError("Model must be fitted before getting team strengths")
        
        strengths = []
        for team in self.teams:
            attack = self.team_attack[team]
            defense = self.team_defense[team]
            
            strengths.append({
                'team': team,
                'attack': attack,
                'defense': defense,
                'overall': attack - defense
            })
        
        return pd.DataFrame(strengths).sort_values(by='overall', ascending=False).reset_index(drop=True)


def preprocess_shot_dataframe(df):
    """
    Convert shot-level DataFrame to match-level format required by Dixon-Coles xG model.
    
    Parameters:
    df (DataFrame): Shot data with columns:
        - match_date: Date of match
        - home_team: Home team name
        - away_team: Away team name
        - Team: Team that took the shot
        - xg: Expected goals value
        - (other columns are ignored)
    
    Returns:
    list: List of match dictionaries with aggregated shot xG values
    """
    matches_with_shots = []
    
    # Group by match (assuming unique combination of date, home_team, away_team)
    match_groups = df.groupby(['match_date', 'home_team', 'away_team'])
    
    for (match_date, home_team, away_team), match_shots in match_groups:
        # Separate shots by team
        home_shots_df = match_shots[match_shots['Team'] == home_team]
        away_shots_df = match_shots[match_shots['Team'] == away_team]
        
        # Extract xG values (filtering out NaN values)
        home_xg_values = home_shots_df['xg'].dropna().tolist()
        away_xg_values = away_shots_df['xg'].dropna().tolist()
        
        # Create match dictionary
        match_data = {
            'home_team': home_team,
            'away_team': away_team,
            'match_date': match_date,
            'home_shots': home_xg_values,
            'away_shots': away_xg_values
        }
        
        matches_with_shots.append(match_data)
    
    return matches_with_shots




In [28]:
matches_with_shots = preprocess_shot_dataframe(df)

model = DixonColesModel(n_simulations=5)
model.fit_with_xg(matches_with_shots)

# Get team strengths
print("\nTeam Strengths:")
print(model.get_team_strengths())

Resimulating 380 matches (5 simulations each)
Expanded from 380 to 1460 matches
Fitting model with 1460 matches and 20 teams
Model fitted successfully. Log-likelihood: -2018.51
Home advantage: 0.965
Rho parameter: -0.019
Average attack: 0.921
Average defense: 0.806

Team Strengths:
                       team    attack   defense   overall
0                 Liverpool  2.869247  0.616454  2.252793
1                   Chelsea  2.391527  0.731612  1.659915
2                   Arsenal  1.955058  0.501762  1.453296
3            Crystal Palace  1.996258  0.862526  1.133731
4               Aston Villa  1.908151  0.782414  1.125737
5                 Brentford  2.005391  0.901401  1.103991
6                    Fulham  1.586819  0.770994  0.815825
7                   Everton  1.369336  0.661002  0.708335
8               Southampton  1.234395  1.278696 -0.044301
9           AFC Bournemouth  0.100000  0.579241 -0.479241
10        Nottingham Forest  0.100000  0.722334 -0.622334
11        Manchester 