In [7]:
import numpy as np
from scipy.stats import poisson
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from skopt import gp_minimize
from scipy.optimize import minimize
import pandas as pd

In [8]:
class TeamModel:
    """Blend of two Dixon-Coles team-strength models.

    A "standard" model is fitted to the actual scorelines, and a "resimmed"
    model is fitted to scorelines re-drawn from each match's blended xG/PSxG.
    ``model_weight`` blends the two models' outputs; ``xg_weight`` blends xG
    and PSxG into the Poisson rate used for resimulation.
    """

    def __init__(self, xg_weight=0.78, model_weight=0.6):  # weights optimised last 26/2/25
        """Store the blend weights and initialise empty parameter containers.

        Args:
            xg_weight: Weight on xG (1 - xg_weight goes to PSxG) in resimming.
            model_weight: Weight on the standard model (1 - model_weight goes
                to the resimmed model) when blending predictions.
        """
        # Weight for blending xG and PSxG in resimming
        self.xg_weight = xg_weight

        # Weight for blending goals DC model and resimmed DC model predictions
        self.model_weight = model_weight

        # Team attack and defense strength parameters
        self.team_attack = {}
        self.team_defense = {}
        self.home_advantage = 0.0
        self.rho = 0.0  # Dixon-Coles parameter to account for low scoring games

        # Same parameters for resimmed model
        self.resim_team_attack = {}
        self.resim_team_defense = {}
        self.resim_home_advantage = 0.0
        self.resim_rho = 0.0

    def resim_matches(self, matches, num_sims=10, rng=None):
        """Re-draw each match's scoreline ``num_sims`` times from Poisson distributions.

        Args:
            matches: Iterable of match dicts with keys 'home_xg', 'home_psxg',
                'away_xg' and 'away_psxg'.
            num_sims: Number of simulated copies generated per match.
            rng: Optional object exposing ``poisson(lam)`` (e.g. a
                ``numpy.random.Generator``). When None, NumPy's global random
                state is used, so ``np.random.seed`` still controls the draws.

        Returns:
            A list of shallow copies of the input dicts with 'home_goals' and
            'away_goals' replaced by simulated values, plus
            'is_simulation' = True and 'simulation_weight' = 1 / num_sims.
            The input dicts are not modified.
        """
        sampler = np.random if rng is None else rng
        resimulated_matches = []

        for match in matches:
            # Blend xG and PSxG using weight parameter
            home_blended_xg = (self.xg_weight * match['home_xg'] +
                               (1 - self.xg_weight) * match['home_psxg'])
            away_blended_xg = (self.xg_weight * match['away_xg'] +
                               (1 - self.xg_weight) * match['away_psxg'])

            # Resimulate match num_sims times
            for _ in range(num_sims):
                sim_match = match.copy()
                sim_match['home_goals'] = sampler.poisson(home_blended_xg)
                sim_match['away_goals'] = sampler.poisson(away_blended_xg)
                sim_match['is_simulation'] = True
                # Each copy carries 1/num_sims so one real match keeps a total weight of 1
                sim_match['simulation_weight'] = 1.0 / num_sims

                resimulated_matches.append(sim_match)

        return resimulated_matches

    def _get_unique_teams(self, matches):
        """Return the set of team names appearing as home or away in ``matches``."""
        teams = set()
        for match in matches:
            teams.add(match['home_team'])
            teams.add(match['away_team'])
        return teams

    @staticmethod
    def dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho):
        """Probability of a scoreline under the Dixon-Coles adjusted Poisson model.

        Args:
            home_goals: Goals scored by the home side.
            away_goals: Goals scored by the away side.
            lambda_home: Poisson rate for the home side.
            lambda_away: Poisson rate for the away side.
            rho: Dixon-Coles low-score dependence parameter.

        Returns:
            tau * Poisson(home_goals; lambda_home) * Poisson(away_goals; lambda_away),
            where tau differs from 1 only for 0-0, 0-1, 1-0 and 1-1.
        """
        # Base Poisson probabilities
        p_home = poisson.pmf(home_goals, lambda_home)
        p_away = poisson.pmf(away_goals, lambda_away)

        # Dixon-Coles adjustment for low-scoring dependencies.
        # The 0-0 and 1-1 terms must be this way round: only then do the four
        # corrections cancel and the adjusted probabilities still sum to 1.
        tau = 1.0
        if home_goals == 0 and away_goals == 0:
            tau = 1 - rho * lambda_home * lambda_away
        elif home_goals == 0 and away_goals == 1:
            tau = 1 + rho * lambda_home
        elif home_goals == 1 and away_goals == 0:
            tau = 1 + rho * lambda_away
        elif home_goals == 1 and away_goals == 1:
            tau = 1 - rho

        return tau * p_home * p_away

    @staticmethod
    def dc_log_likelihood(params, matches, teams):
        """Penalised negative log-likelihood of ``matches`` under the Dixon-Coles model.

        Args:
            params: Flat parameter vector laid out as
                [home_advantage, rho, attack per team..., defense per team...],
                with the per-team entries in the same order as ``teams``.
            matches: Iterable of match dicts with 'home_team', 'away_team',
                'home_goals', 'away_goals' and optionally 'simulation_weight'.
            teams: Ordered sequence of team names matching ``params``.

        Returns:
            Negative weighted log-likelihood plus a quadratic penalty that
            pulls the sums of attack and of defense parameters towards
            ``len(teams)``.
        """
        # Add debug prints for first call
        if getattr(TeamModel.dc_log_likelihood, 'first_call', True):
            print(f"First likelihood call with {len(matches)} matches")
            print(f"First few params: {params[:5]}")
            TeamModel.dc_log_likelihood.first_call = False

        # Extract parameters
        home_advantage = params[0]
        rho = params[1]
        attack_params = params[2:2+len(teams)]
        defense_params = params[2+len(teams):]

        # Assign attack/defense parameters to teams
        attack = {team: attack_params[i] for i, team in enumerate(teams)}
        defense = {team: defense_params[i] for i, team in enumerate(teams)}

        # Initialise log likelihood
        log_likelihood = 0

        # Calculate log-likelihood for each match
        for match in matches:
            home_team = match['home_team']
            away_team = match['away_team']
            home_goals = match['home_goals']
            away_goals = match['away_goals']

            # Expected goals parameter
            lambda_home = attack[home_team] * defense[away_team] * home_advantage
            lambda_away = attack[away_team] * defense[home_team]

            # Calculate probability of this specific scoreline with rho adjustment
            probability = TeamModel.dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho)

            # Add safeguard to prevent log(0) errors (tau can also push the value below zero)
            if probability <= 0:
                probability = 1e-10  # Set to small positive value

            # Apply weight to log likelihood (1.0 for actual matches, simulation_weight for resimmed matches)
            weight = match.get('simulation_weight', 1.0)
            log_likelihood += np.log(probability) * weight

        # Soft identifiability constraint: mean attack and mean defense are pulled towards 1
        constraint_penalty = 0
        sum_attack = sum(attack.values())
        sum_defense = sum(defense.values())
        constraint_penalty += (sum_attack - len(teams)) ** 2
        constraint_penalty += (sum_defense - len(teams)) ** 2

        # Return negative log likelihood (to be minimized) with constraint penalty
        return -log_likelihood + constraint_penalty

    def fit_models(self, actual_matches, num_sims=10, rng=None):
        """Fit the standard and the resimmed Dixon-Coles models.

        Args:
            actual_matches: List of match dicts (see ``resim_matches`` and
                ``dc_log_likelihood`` for the keys that are read).
            num_sims: Number of resimulations per match for the resimmed model.
            rng: Optional random generator forwarded to ``resim_matches``.

        Returns:
            self, with the fitted parameters stored on the instance.
        """
        team_list = sorted(self._get_unique_teams(actual_matches))
        n_teams = len(team_list)

        # Fit standard model using actual match results
        standard_params = self._optimize_dc_parameters(actual_matches, team_list)

        # Rebuild the dicts rather than updating them in place, so teams from
        # an earlier fit on different data do not linger.
        self.home_advantage = standard_params[0]
        self.rho = standard_params[1]
        self.team_attack = {team: standard_params[2 + i] for i, team in enumerate(team_list)}
        self.team_defense = {team: standard_params[2 + n_teams + i] for i, team in enumerate(team_list)}

        # Generate resimulated matches
        resimulated_matches = self.resim_matches(actual_matches, num_sims=num_sims, rng=rng)

        # Fit resimulated model
        resim_params = self._optimize_dc_parameters(resimulated_matches, team_list)

        # Extract parameters for resimulated model
        self.resim_home_advantage = resim_params[0]
        self.resim_rho = resim_params[1]
        self.resim_team_attack = {team: resim_params[2 + i] for i, team in enumerate(team_list)}
        self.resim_team_defense = {team: resim_params[2 + n_teams + i] for i, team in enumerate(team_list)}

        return self

    def _optimize_dc_parameters(self, matches, team_list):
        """Minimise ``dc_log_likelihood`` with bounded L-BFGS-B.

        Args:
            matches: List of match dicts to fit.
            team_list: Ordered list of team names defining the parameter layout.

        Returns:
            The optimiser's solution vector
            [home_advantage, rho, attack..., defense...].
        """
        # Add debugging
        print(f"Optimizing for {len(matches)} matches with {len(team_list)} teams")

        # Check first few matches
        for i, match in enumerate(matches[:3]):
            print(f"Match {i}: {match}")

        # Initial parameter guesses
        initial_params = [1.3, 0.1]  # Home advantage, rho
        initial_params.extend([1.0] * len(team_list))  # Attack
        initial_params.extend([1.0] * len(team_list))  # Defense

        # Define bounds for parameters
        bounds = [(0.5, 2.0), (-0.3, 0.3)]  # Home advantage, rho
        bounds.extend([(0.1, 3.0)] * len(team_list))  # Attack
        bounds.extend([(0.1, 3.0)] * len(team_list))  # Defense

        # Minimize negative log-likelihood
        from scipy.optimize import minimize
        result = minimize(
            lambda params: TeamModel.dc_log_likelihood(params, matches, team_list),
            initial_params,
            method='L-BFGS-B',
            bounds=bounds
        )

        # Print optimization results
        print(f"Optimization success: {result.success}")
        print(f"Final function value: {result.fun}")
        print(f"Number of iterations: {result.nit}")

        return result.x

    def optimize_weights(self, training_matches, validation_matches,
                         n_calls=10, n_initial_points=10, random_state=42):
        """Search for the (xg_weight, model_weight) pair minimising validation RMSE.

        For each candidate pair both models are refitted on
        ``training_matches`` and the blended expected goals are scored against
        the actual goals in ``validation_matches``.

        Args:
            training_matches: Matches used to fit the models.
            validation_matches: Matches used to score the blended prediction.
                Every team in here must also appear in ``training_matches``.
            n_calls: Total number of objective evaluations for ``gp_minimize``.
            n_initial_points: Number of those evaluations used as initial
                points. NOTE(review): with the defaults (10 of 10) every
                evaluation is an initial point, so the surrogate model never
                guides the search; pass a larger ``n_calls`` to change that.
            random_state: Seed for ``gp_minimize``. When it is an int it also
                seeds the resimulation in every evaluation, so the objective
                is a deterministic function of the weights.

        Returns:
            self, with the best weights set, the models refitted at those
            weights, and the best RMSE stored in ``self.last_rmse``.
        """
        def make_resim_rng():
            # Same seed for every evaluation, so candidates are compared on
            # identical random draws instead of on sampling noise.
            if isinstance(random_state, (int, np.integer)):
                return np.random.default_rng(random_state)
            return None

        def objective(weights):
            # Unpack weights
            xg_weight, model_weight = weights

            # Set current weights
            self.xg_weight = xg_weight
            self.model_weight = model_weight

            # Fit both models
            self.fit_models(training_matches, rng=make_resim_rng())

            # Calculate error in goals prediction
            home_errors = []
            away_errors = []

            for match in validation_matches:
                home_team = match['home_team']
                away_team = match['away_team']

                # Standard DC model expected goals
                lambda_home_std = self.team_attack[home_team] * self.team_defense[away_team] * self.home_advantage
                lambda_away_std = self.team_attack[away_team] * self.team_defense[home_team]

                # Resimmed DC model expected goals
                lambda_home_resim = self.resim_team_attack[home_team] * self.resim_team_defense[away_team] * self.resim_home_advantage
                lambda_away_resim = self.resim_team_attack[away_team] * self.resim_team_defense[home_team]

                # Blend expected goals predictions
                lambda_home_blend = model_weight * lambda_home_std + (1 - model_weight) * lambda_home_resim
                lambda_away_blend = model_weight * lambda_away_std + (1 - model_weight) * lambda_away_resim

                # Calculate squared errors
                home_errors.append((match['home_goals'] - lambda_home_blend) ** 2)
                away_errors.append((match['away_goals'] - lambda_away_blend) ** 2)

            # Root mean squared error for goals prediction
            rmse = np.sqrt(np.mean(home_errors + away_errors))

            return rmse

        # Define the search space
        dimensions = [(0.0, 1.0), (0.0, 1.0)]  # xG weight, model weight

        print("Starting Bayesian optimization...")

        # Run Bayesian optimization
        from skopt import gp_minimize
        result = gp_minimize(
            objective,
            dimensions,
            n_calls=n_calls,
            n_initial_points=n_initial_points,
            random_state=random_state,
            verbose=True
        )

        # Store the best RMSE value for reference
        self.last_rmse = result.fun

        # Print optimization results
        print("\nOptimization Results:")
        print(f"Best parameters: xG={result.x[0]:.4f}, model={result.x[1]:.4f}")
        print(f"Best RMSE: {result.fun:.4f}")

        # Show top 5 weight combinations
        points_with_scores = [(result.x_iters[i][0], result.x_iters[i][1], result.func_vals[i])
                              for i in range(len(result.func_vals))]
        points_with_scores.sort(key=lambda x: x[2])  # Sort by RMSE

        print("\nTop 5 weight combinations:")
        for i, (xg_w, model_w, rmse) in enumerate(points_with_scores[:5]):
            print(f"{i+1}. xG={xg_w:.4f}, model={model_w:.4f}, RMSE={rmse:.4f}")

        # Set the optimal weights
        self.xg_weight, self.model_weight = result.x

        # The fitted parameters currently belong to the LAST point evaluated,
        # not the best one; refit so weights and parameters agree.
        self.fit_models(training_matches, rng=make_resim_rng())

        return self

    def print_team_strengths(self):
        """Print a table of standard, resimmed and blended strengths per team.

        Teams are ordered by log(blended attack) - log(blended defense),
        highest first, followed by the home-advantage, rho and blend-weight
        values of both models.
        """
        # Get all teams from both models
        all_teams = set(self.team_attack.keys()).union(set(self.resim_team_attack.keys()))

        # Create a list of team data
        team_data = []
        for team in all_teams:
            std_attack = self.team_attack.get(team, float('nan'))
            std_defense = self.team_defense.get(team, float('nan'))
            resim_attack = self.resim_team_attack.get(team, float('nan'))
            resim_defense = self.resim_team_defense.get(team, float('nan'))

            # Calculate blended attack and defense parameters
            blended_attack = self.model_weight * std_attack + (1 - self.model_weight) * resim_attack
            blended_defense = self.model_weight * std_defense + (1 - self.model_weight) * resim_defense

            # Calculate overall strength using the log scale (which is the natural scale for the DC model)
            # Higher attack and lower defense values are better
            overall_strength = np.log(blended_attack) - np.log(blended_defense)

            team_data.append({
                'team': team,
                'std_attack': std_attack,
                'std_defense': std_defense,
                'resim_attack': resim_attack,
                'resim_defense': resim_defense,
                'blended_attack': blended_attack,
                'blended_defense': blended_defense,
                'overall_strength': overall_strength
            })

        # Sort by overall strength (descending)
        team_data = sorted(team_data, key=lambda x: x['overall_strength'], reverse=True)

        # Print header
        print("\n{:<20} {:^20} {:^20} {:^20}".format('', 'Standard Model', 'Resimmed Model', 'Blended Model'))
        print("{:<20} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10}".format(
            'Team', 'Attack', 'Defense', 'Attack', 'Defense', 'Attack', 'Defense', 'Strength'))
        print("-" * 100)

        # Print team data
        for team in team_data:
            print("{:<20} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f}".format(
                team['team'],
                team['std_attack'],
                team['std_defense'],
                team['resim_attack'],
                team['resim_defense'],
                team['blended_attack'],
                team['blended_defense'],
                team['overall_strength']
            ))

        # Print model parameters
        print("\nModel Parameters:")
        print(f"Home Advantage: Standard={self.home_advantage:.3f}, Resimmed={self.resim_home_advantage:.3f}")
        print(f"Rho Parameter: Standard={self.rho:.3f}, Resimmed={self.resim_rho:.3f}")
        print(f"Blend Weights: xG/PSxG={self.xg_weight:.3f}, Models={self.model_weight:.3f}")

            


In [9]:
import pandas as pd
import numpy as np

# Load the shot-level data
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
SHOT_DATA_PATH = r"C:\Users\Owner\dev\team-model\shot_data_prem_2024.csv"
df = pd.read_csv(SHOT_DATA_PATH)

#df = df[df["match_date"] > '2025-01-01']

# Add a goal column (1 when the shot outcome is 'Goal', otherwise 0)
df['is_goal'] = (df['Outcome'] == 'Goal').astype(int)

# Columns that together identify one match
MATCH_KEYS = ['match_url', 'match_date', 'home_team', 'away_team']


def summarise_side(shots, side):
    """Sum goals, xG and PSxG per match for one side's shots.

    Args:
        shots: Shot-level DataFrame already filtered to one side.
        side: Column prefix, 'home' or 'away'.

    Returns:
        One row per match with columns MATCH_KEYS plus
        '<side>_goals', '<side>_xg' and '<side>_psxg'.
    """
    return (
        shots.groupby(MATCH_KEYS, as_index=False)
        .agg({'is_goal': 'sum', 'xG': 'sum', 'PSxG': 'sum'})
        .rename(columns={
            'is_goal': f'{side}_goals',
            'xG': f'{side}_xg',
            'PSxG': f'{side}_psxg',
        })
    )


# First, create separate DataFrames for home and away shots
home_shots = df[df['Team'] == df['home_team']]
away_shots = df[df['Team'] == df['away_team']]

# Group by match to get match-level aggregates
home_stats = summarise_side(home_shots, 'home')
away_stats = summarise_side(away_shots, 'away')

# Merge home and away stats. An outer merge keeps matches in which one side
# has no shot rows at all (an inner merge would silently drop them); the
# missing side's totals are then zero.
match_stats = pd.merge(home_stats, away_stats, on=MATCH_KEYS, how='outer')

stat_columns = ['home_goals', 'home_xg', 'home_psxg', 'away_goals', 'away_xg', 'away_psxg']
match_stats[stat_columns] = match_stats[stat_columns].fillna(0)
match_stats[['home_goals', 'away_goals']] = match_stats[['home_goals', 'away_goals']].astype(int)

matches = match_stats.to_dict('records')

# Split data (only needed for model.optimize_weights(train_matches, val_matches))
train_matches, val_matches = train_test_split(matches, test_size=0.2, random_state=42)

# Initialize model with its default blend weights
model = TeamModel()

# Fit both models on the full match list; this does NOT tune the blend weights
model.fit_models(matches)

# Print results
model.print_team_strengths()

Optimizing for 259 matches with 20 teams
Match 0: {'match_url': 'https://fbref.com/en/matches/01e63a1f/Bournemouth-Arsenal-October-19-2024-Premier-League', 'match_date': '2024-10-19', 'home_team': 'Bournemouth', 'away_team': 'Arsenal', 'home_goals': 2, 'home_xg': 1.83, 'home_psxg': 1.6400000000000001, 'away_goals': 0, 'away_xg': 0.71, 'away_psxg': 0.32}
Match 1: {'match_url': 'https://fbref.com/en/matches/038dfa98/Fulham-Arsenal-December-8-2024-Premier-League', 'match_date': '2024-12-08', 'home_team': 'Fulham', 'away_team': 'Arsenal', 'home_goals': 1, 'home_xg': 0.16, 'home_psxg': 0.44, 'away_goals': 1, 'away_xg': 2.0, 'away_psxg': 1.24}
Match 2: {'match_url': 'https://fbref.com/en/matches/03ac4a9c/West-Ham-United-Crystal-Palace-January-18-2025-Premier-League', 'match_date': '2025-01-18', 'home_team': 'West Ham', 'away_team': 'Crystal Palace', 'home_goals': 0, 'home_xg': 0.28, 'home_psxg': 0.0, 'away_goals': 2, 'away_xg': 1.32, 'away_psxg': 1.6300000000000001}
First likelihood call wit