In [7]:
import numpy as np
import pandas as pd
from scipy.stats import poisson
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from datetime import datetime, date

class PoiBin:
    """
    Poisson-Binomial distribution: the number of successes among Bernoulli
    trials that each have their own success probability.

    The complete PMF and CDF are computed once in the constructor (using the
    FFT method) and cached in ``pmf_list`` and ``cdf_list``; both arrays have
    ``number_trials + 1`` entries, one per possible success count.
    """

    def __init__(self, probabilities):
        """
        Build the distribution from the per-trial success probabilities.

        Raises
        ------
        ValueError
            If any probability is not inside the closed interval [0, 1].
        """
        probs = np.array(probabilities)
        self.success_probabilities = probs
        self.number_trials = probs.size

        # Every entry has to satisfy 0 <= p <= 1
        inside_unit_interval = (probs >= 0) & (probs <= 1)
        if not np.all(inside_unit_interval):
            raise ValueError("All probabilities must be between 0 and 1")

        # Angular step of the (number_trials + 1)-point transform, then the
        # cached probability tables
        self.omega = 2 * np.pi / (self.number_trials + 1)
        self.pmf_list = self.get_pmf_xi()
        self.cdf_list = self.get_cdf(self.pmf_list)

    def pmf(self, number_successes):
        """
        Probability of observing exactly ``number_successes`` successes.

        A list or ndarray of counts yields an ndarray of probabilities; any
        other argument is treated as a single count. Counts outside
        ``0..number_trials`` have probability 0.
        """
        def lookup(count):
            return self.pmf_list[count] if 0 <= count <= self.number_trials else 0

        if isinstance(number_successes, (list, np.ndarray)):
            return np.array([lookup(count) for count in number_successes])
        return lookup(number_successes)

    def get_cdf(self, event_probabilities):
        """Running sum of ``event_probabilities`` over the counts 0..number_trials."""
        cdf = np.empty(self.number_trials + 1)
        running_total = 0.0
        for count in range(self.number_trials + 1):
            running_total = running_total + event_probabilities[count]
            cdf[count] = running_total
        return cdf

    def get_pmf_xi(self):
        """
        Compute the PMF with the FFT method.

        Raises
        ------
        TypeError
            If the transform leaves an imaginary part of 1e-10 or more.
        """
        n = self.number_trials
        chi = np.empty(n + 1, dtype=complex)
        chi[0] = 1
        half = (n + 1) // 2  # n / 2 rounded up

        # First half is evaluated directly ...
        chi[1:half + 1] = self.get_chi(np.arange(1, half + 1))
        # ... the second half is the mirrored complex conjugate of the first
        chi[half + 1:n + 1] = np.conjugate(chi[1:n - half + 1][::-1])

        chi /= n + 1
        xi = np.fft.fft(chi)

        # The result must be real up to rounding noise
        if not np.all(np.abs(xi.imag) < 1e-10):
            raise TypeError("PMF values must be real numbers")

        # Remove tiny negative values left by rounding, then renormalise
        xi = np.maximum(xi.real, 0)
        xi /= xi.sum()
        return xi

    def get_chi(self, idx_array):
        """
        Chi values for the indices in ``idx_array``.

        For index l and trial j the factor is ``1 - p_j + p_j * exp(1j * omega * l)``;
        the product over trials is assembled from the summed arguments and
        the summed log-moduli of those factors.
        """
        probs = self.success_probabilities
        phase = np.exp(self.omega * idx_array * 1j)
        xy = 1 - probs + probs * phase[:, np.newaxis]

        # Argument of the product = sum of the principal arguments
        argz_sum = np.arctan2(xy.imag, xy.real).sum(axis=1)

        # Modulus of the product = exp(sum of the log-moduli)
        d_value = np.exp(np.log(np.abs(xy)).sum(axis=1))

        return d_value * np.exp(argz_sum * 1j)

In [8]:
class TeamModel:
    def __init__(self, xg_weight=0.5, model_weight=0.3):
        """
        Initialize the TeamModel.
        
        Parameters:
        -----------
        xg_weight : float
            Weight for blending xG and PSxG in resimming (0.0 to 1.0)
        model_weight : float
            Weight for blending standard and resimulated model predictions (0.0 to 1.0)
        """
        # Weight for blending xG and PSxG in resimming
        self.xg_weight = xg_weight

        # Weight for blending goals DC model and resimmed DC model predictions
        self.model_weight = model_weight

        # Team attack and defense strength parameters
        self.team_attack = {}
        self.team_defense = {}
        self.home_advantage = 0.0
        self.rho = 0.0  # Dixon-Coles parameter to account for low scoring games

        # Same parameters for resimmed model
        self.resim_team_attack = {}
        self.resim_team_defense = {}
        self.resim_home_advantage = 0.0
        self.resim_rho = 0.0

    def resim_matches(self, matches, num_sims=10):
        """
        Resimulate matches using Poisson distribution on match-level xG totals.
        
        Parameters:
        -----------
        matches : list of dict
            List of match dictionaries
        num_sims : int
            Number of simulations per match
            
        Returns:
        --------
        list of dict
            Resimulated matches
        """
        resimulated_matches = []

        for match in matches:
            home_team = match['home_team']
            away_team = match['away_team']

            # Blend xG and PSxG using weight parameter
            home_blended_xg = (self.xg_weight * match['home_xg'] +
                              (1 - self.xg_weight) * match['home_psxg'])
            away_blended_xg = (self.xg_weight * match['away_xg'] +
                              (1 - self.xg_weight) * match['away_psxg'])
            
            # Resimulate match num_sims times
            for _ in range(num_sims):
                # Generate random goals via Poisson distribution
                home_goals = np.random.poisson(home_blended_xg)
                away_goals = np.random.poisson(away_blended_xg)

                # Add resimmed match to list
                sim_match = match.copy()
                sim_match['home_goals'] = home_goals
                sim_match['away_goals'] = away_goals
                sim_match['is_simulation'] = True
                sim_match['simulation_weight'] = 1.0 / num_sims

                resimulated_matches.append(sim_match)
                
        return resimulated_matches

    def resim_matches_poibin(self, matches, shots_data, min_xg=0.0001, num_sims=10, max_goals=10):
        """
        Resimulate matches using Poisson-Binomial distribution for individual shots.
        
        Parameters:
        -----------
        matches : list of dict
            List of match dictionaries with match metadata
        shots_data : DataFrame
            Shot-by-shot data with xG values
        min_xg : float
            Minimum xG value to prevent underflow
        num_sims : int
            Number of simulations per match
        max_goals : int
            Maximum number of goals to consider in simulations
            
        Returns:
        --------
        list of dict
            Resimulated matches with simulated goal values
        """
        resimulated_matches = []
        
        # Group shots by match_url
        shot_groups = shots_data.groupby('match_url')
        
        for match in matches:
            match_url = match.get('match_url')
            home_team = match['home_team']
            away_team = match['away_team']
            
            # Get shots for this match
            try:
                match_shots = shot_groups.get_group(match_url)
                # Separate into home and away shots
                home_shots = match_shots[match_shots['Team'] == home_team]
                away_shots = match_shots[match_shots['Team'] == away_team]
            except (KeyError, ValueError):
                print(f"Warning: No shots found for match {match_url}")
                # Fall back to using match totals
                home_shots = pd.DataFrame()
                away_shots = pd.DataFrame()
            
            # Blend xG and PSxG for each shot
            if not home_shots.empty:
                home_shot_xgs = []
                for _, shot in home_shots.iterrows():
                    shot_xg = shot.get('xG', 0)
                    shot_psxg = shot.get('PSxG', shot_xg)  # Default to xG if PSxG is 0 or NaN
                    if pd.isna(shot_psxg) or shot_psxg == 0:
                        shot_psxg = shot_xg
                    blended_xg = max(self.xg_weight * shot_xg + (1 - self.xg_weight) * shot_psxg, min_xg)
                    home_shot_xgs.append(blended_xg)
            else:
                home_shot_xgs = []
            
            if not away_shots.empty:
                away_shot_xgs = []
                for _, shot in away_shots.iterrows():
                    shot_xg = shot.get('xG', 0)
                    shot_psxg = shot.get('PSxG', shot_xg)  # Default to xG if PSxG is 0 or NaN
                    if pd.isna(shot_psxg) or shot_psxg == 0:
                        shot_psxg = shot_xg
                    blended_xg = max(self.xg_weight * shot_xg + (1 - self.xg_weight) * shot_psxg, min_xg)
                    away_shot_xgs.append(blended_xg)
            else:
                away_shot_xgs = []
            
            # If no valid shots, use simple Poisson with match totals
            if not home_shot_xgs:
                home_total_xg = match['home_xg'] if 'home_xg' in match else 0
                home_pmf = np.array([poisson.pmf(i, home_total_xg) for i in range(max_goals+1)])
            else:
                # Calculate Poisson-Binomial PMF for home team
                try:
                    home_poibin = PoiBin(home_shot_xgs)
                    home_pmf = home_poibin.pmf(np.arange(min(len(home_shot_xgs)+1, max_goals+1)))
                    # Pad with zeros if needed
                    if len(home_pmf) < max_goals+1:
                        home_pmf = np.pad(home_pmf, (0, max_goals+1-len(home_pmf)), 'constant')
                except Exception as e:
                    print(f"Error calculating home PMF for {home_team}, using Poisson: {str(e)}")
                    home_total_xg = sum(home_shot_xgs)
                    home_pmf = np.array([poisson.pmf(i, home_total_xg) for i in range(max_goals+1)])
            
            if not away_shot_xgs:
                away_total_xg = match['away_xg'] if 'away_xg' in match else 0
                away_pmf = np.array([poisson.pmf(i, away_total_xg) for i in range(max_goals+1)])
            else:
                # Calculate Poisson-Binomial PMF for away team
                try:
                    away_poibin = PoiBin(away_shot_xgs)
                    away_pmf = away_poibin.pmf(np.arange(min(len(away_shot_xgs)+1, max_goals+1)))
                    # Pad with zeros if needed
                    if len(away_pmf) < max_goals+1:
                        away_pmf = np.pad(away_pmf, (0, max_goals+1-len(away_pmf)), 'constant')
                except Exception as e:
                    print(f"Error calculating away PMF for {away_team}, using Poisson: {str(e)}")
                    away_total_xg = sum(away_shot_xgs)
                    away_pmf = np.array([poisson.pmf(i, away_total_xg) for i in range(max_goals+1)])
            
            # Normalize PMFs
            home_pmf = home_pmf / home_pmf.sum()
            away_pmf = away_pmf / away_pmf.sum()
            
            # Generate simulations based on the calculated PMFs
            for _ in range(num_sims):
                # Sample from PMFs
                home_goals = np.random.choice(max_goals+1, p=home_pmf)
                away_goals = np.random.choice(max_goals+1, p=away_pmf)
                
                # Create simulated match
                sim_match = match.copy()
                sim_match['home_goals'] = int(home_goals)
                sim_match['away_goals'] = int(away_goals)
                sim_match['is_simulation'] = True
                sim_match['simulation_weight'] = 1.0 / num_sims
                sim_match['home_shots_count'] = len(home_shots)
                sim_match['away_shots_count'] = len(away_shots)
                
                resimulated_matches.append(sim_match)
        
        return resimulated_matches

    def _get_unique_teams(self, matches):
        """Extract unique teams from matches."""
        teams = set()
        for match in matches:
            teams.add(match['home_team'])
            teams.add(match['away_team'])
        return teams
    
    @staticmethod
    def dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho):
        """Calculate Dixon-Coles adjusted probability for a match outcome."""
        # Base Poisson probabilities
        p_home = poisson.pmf(home_goals, lambda_home)
        p_away = poisson.pmf(away_goals, lambda_away)
        
        # Dixon-Coles adjustment for low-scoring dependencies
        tau = 1.0
        if home_goals == 0 and away_goals == 0:
            tau = 1 - rho
        elif home_goals == 0 and away_goals == 1:
            tau = 1 + rho * lambda_home
        elif home_goals == 1 and away_goals == 0:
            tau = 1 + rho * lambda_away
        elif home_goals == 1 and away_goals == 1:
            tau = 1 - rho * lambda_home * lambda_away
        
        return tau * p_home * p_away
    
    @staticmethod
    def dc_log_likelihood(params, matches, teams, metadata, epsilon=0.01, season_penalty=0.75):
        """Optimized log-likelihood function with season penalty."""
        # Extract parameters
        home_advantage = params[0]
        rho = params[1]
        attack_params = params[2:2+len(teams)]
        defense_params = params[2+len(teams):]
        
        # Assign attack/defense parameters to teams
        attack = {team: attack_params[i] for i, team in enumerate(teams)}
        defense = {team: defense_params[i] for i, team in enumerate(teams)}
        
        # Initialize log likelihood
        log_likelihood = 0
        
        # Get reference values from metadata
        reference_date = metadata.get('reference_date')
        current_season = metadata.get('current_season')
            
        # Calculate log-likelihood for each match
        for match in matches:
            home_team = match['home_team']
            away_team = match['away_team']
            home_goals = match['home_goals']
            away_goals = match['away_goals']
            match_season = match.get('season', current_season)
            
            # Weight calculation
            # Time weight - days-based decay
            time_weight = 1.0
            if 'days_from_ref' in match:
                # If we've pre-calculated days from reference
                days_ago = match['days_from_ref']
                time_weight = 1.0 / (1.0 + epsilon * days_ago)
            elif reference_date and 'match_date' in match:
                # If we need to calculate it now
                match_date = match['match_date']
                if isinstance(match_date, str):
                    match_date = pd.Timestamp(match_date)
                days_ago = max(0, (reference_date - match_date).days)
                time_weight = 1.0 / (1.0 + epsilon * days_ago)
            
            # Apply season penalty if match is from a previous season
            seasons_ago = current_season - match_season if current_season and match_season else 0
            if seasons_ago > 0:
                time_weight *= season_penalty ** seasons_ago
            
            # Expected goals parameter
            lambda_home = attack[home_team] * defense[away_team] * home_advantage
            lambda_away = attack[away_team] * defense[home_team]
            
            # Calculate probability with rho adjustment
            probability = TeamModel.dc_probability(home_goals, away_goals, lambda_home, lambda_away, rho)
            
            # Safeguard against log(0)
            if probability <= 0:
                probability = 1e-10
                
            # Apply weights to log likelihood
            base_weight = match.get('simulation_weight', 1.0)
            combined_weight = base_weight * time_weight
            
            log_likelihood += np.log(probability) * combined_weight
        
        # Constraint penalty
        constraint_penalty = 0
        sum_attack = sum(attack.values())
        sum_defense = sum(defense.values())
        constraint_penalty += (sum_attack - len(teams)) ** 2
        constraint_penalty += (sum_defense - len(teams)) ** 2
        
        return -log_likelihood + constraint_penalty
    
    def _preprocess_matches(self, matches):
        """Preprocess matches to optimize calculations."""
        # Find reference date and current season
        dates = [m.get('match_date') for m in matches if m.get('match_date') is not None]
        seasons = [m.get('season', 0) for m in matches]
        
        reference_date = None
        if dates:
            reference_date = max(dates)
            
        current_season = max(seasons) if seasons else None
        
        # Precompute days from reference for each match
        for match in matches:
            if reference_date and 'match_date' in match:
                match_date = match['match_date']
                if isinstance(match_date, str):
                    match_date = pd.Timestamp(match_date)
                
                if isinstance(match_date, (pd.Timestamp, datetime, date)):
                    # Convert datetime to pandas Timestamp if it's not already
                    if not isinstance(match_date, pd.Timestamp):
                        match_date = pd.Timestamp(match_date)
                    
                    # Calculate and store days from reference
                    match['days_from_ref'] = max(0, (reference_date - match_date).days)
        
        # Return metadata for optimization
        return {
            'reference_date': reference_date,
            'current_season': current_season
        }
        
    def fit_models(self, actual_matches, shots_data=None, epsilon=0.0065, season_penalty=0.75, num_sims=10):
        """
        Fit both standard and resimulated models.
        
        Parameters:
        -----------
        actual_matches : list of dict
            List of actual match results
        shots_data : DataFrame, optional
            Shot-by-shot data for Poisson-Binomial resimulation
        epsilon : float
            Time decay parameter
        season_penalty : float
            Penalty for previous season matches (0 to 1)
        num_sims : int
            Number of simulations per match
        """
        # Preprocess matches
        matches_metadata = self._preprocess_matches(actual_matches)
        
        # Get unique teams
        teams = self._get_unique_teams(actual_matches)
        team_list = sorted(list(teams))
        
        # Fit standard model with season penalty
        standard_params = self._optimize_dc_parameters(
            actual_matches, team_list, matches_metadata, epsilon, season_penalty
        )
        
        # Extract parameters for standard model
        self.home_advantage = standard_params[0]
        self.rho = standard_params[1]
        for i, team in enumerate(team_list):
            self.team_attack[team] = standard_params[2+i]
            self.team_defense[team] = standard_params[2+len(team_list)+i]
        
        # Generate resimulated matches based on method
        if shots_data is not None:
            print(f"Using Poisson-Binomial resimulation with {len(shots_data)} shots")
            resimulated_matches = self.resim_matches_poibin(actual_matches, shots_data, num_sims=num_sims)
        else:
            print("Using traditional Poisson resimulation with match totals")
            resimulated_matches = self.resim_matches(actual_matches, num_sims=num_sims)
        
        # Copy time information to resimulated matches
        for i, sim_match in enumerate(resimulated_matches):
            orig_match_idx = i % len(actual_matches)
            # Copy all time-related fields
            sim_match['season'] = actual_matches[orig_match_idx].get('season')
            sim_match['match_date'] = actual_matches[orig_match_idx].get('match_date')
            if 'days_from_ref' in actual_matches[orig_match_idx]:
                sim_match['days_from_ref'] = actual_matches[orig_match_idx]['days_from_ref']
        
        # Fit resimulated model with the same parameters
        resim_params = self._optimize_dc_parameters(
            resimulated_matches, team_list, matches_metadata, epsilon, season_penalty
        )
        
        # Extract parameters for resimulated model
        self.resim_home_advantage = resim_params[0]
        self.resim_rho = resim_params[1]
        for i, team in enumerate(team_list):
            self.resim_team_attack[team] = resim_params[2+i]
            self.resim_team_defense[team] = resim_params[2+len(team_list)+i]
        
        return self

    def _optimize_dc_parameters(self, matches, team_list, metadata, epsilon=0.0065, season_penalty=0.75):
        """Optimize Dixon-Coles model parameters."""
        # Add debugging
        print(f"Optimizing for {len(matches)} matches with {len(team_list)} teams")
        
        # Check first few matches
        for i, match in enumerate(matches[:3]):
            print(f"Match {i}: {match}")

        # Initial parameter guesses
        initial_params = [1.3, 0.1]  # Home advantage, rho
        initial_params.extend([1.0] * len(team_list))  # Attack
        initial_params.extend([1.0] * len(team_list))  # Defense
        
        # Define bounds for parameters
        bounds = [(0.5, 2.0), (-0.3, 0.3)]  # Home advantage, rho
        bounds.extend([(0.1, 3.0)] * len(team_list))  # Attack
        bounds.extend([(0.1, 3.0)] * len(team_list))  # Defense
        
        # Minimize negative log-likelihood
        result = minimize(
            lambda params: TeamModel.dc_log_likelihood(
                params, matches, team_list, metadata, 
                epsilon=epsilon, season_penalty=season_penalty
            ),
            initial_params,
            method='L-BFGS-B',
            bounds=bounds
        )
        
        # Print optimization results
        print(f"Optimization success: {result.success}")
        print(f"Final function value: {result.fun}")
        print(f"Number of iterations: {result.nit}")

        return result.x
    
    def optimize_weights(self, training_matches, validation_matches, shots_data=None):
        """Optimize model weights using Bayesian optimization."""
        def objective(weights):
            # Unpack weights
            xg_weight, model_weight = weights
            
            # Set current weights
            self.xg_weight = xg_weight
            self.model_weight = model_weight
            
            # Fit both models
            self.fit_models(training_matches, shots_data=shots_data)
            
            # Calculate error in goals prediction
            home_errors = []
            away_errors = []
            
            for match in validation_matches:
                home_team = match['home_team']
                away_team = match['away_team']
                
                # Handle cases where a team might not be in the model
                if home_team not in self.team_attack or away_team not in self.team_attack:
                    continue
                
                # Standard DC model expected goals
                lambda_home_std = self.team_attack[home_team] * self.team_defense[away_team] * self.home_advantage
                lambda_away_std = self.team_attack[away_team] * self.team_defense[home_team]
                
                # Resimmed DC model expected goals
                lambda_home_resim = self.resim_team_attack[home_team] * self.resim_team_defense[away_team] * self.resim_home_advantage
                lambda_away_resim = self.resim_team_attack[away_team] * self.resim_team_defense[home_team]
                
                # Blend expected goals predictions
                lambda_home_blend = model_weight * lambda_home_std + (1 - model_weight) * lambda_home_resim
                lambda_away_blend = model_weight * lambda_away_std + (1 - model_weight) * lambda_away_resim
                
                # Calculate squared errors
                home_error = (match['home_goals'] - lambda_home_blend) ** 2
                away_error = (match['away_goals'] - lambda_away_blend) ** 2
                
                home_errors.append(home_error)
                away_errors.append(away_error)
            
            # Root mean squared error for goals prediction
            rmse = np.sqrt(np.mean(home_errors + away_errors))
            
            return rmse
        
        # Define the search space
        dimensions = [(0.0, 1.0), (0.0, 1.0)]  # xG weight, model weight
        
        print("Starting Bayesian optimization...")
        
        # Run Bayesian optimization
        try:
            from skopt import gp_minimize
            result = gp_minimize(
                objective, 
                dimensions, 
                n_calls=10,  # Increased number of calls for better optimization
                n_initial_points=10,  # More initial points for better exploration
                random_state=42, 
                verbose=True
            )
            
            # Store the best RMSE value for reference
            self.last_rmse = result.fun
            
            # Print optimization results
            print("\nOptimization Results:")
            print(f"Best parameters: xG={result.x[0]:.4f}, model={result.x[1]:.4f}")
            print(f"Best RMSE: {result.fun:.4f}")
            
            # Show top 5 weight combinations
            points_with_scores = [(result.x_iters[i][0], result.x_iters[i][1], result.func_vals[i]) 
                                for i in range(len(result.func_vals))]
            points_with_scores.sort(key=lambda x: x[2])  # Sort by RMSE
            
            print("\nTop 5 weight combinations:")
            for i, (xg_w, model_w, rmse) in enumerate(points_with_scores[:5]):
                print(f"{i+1}. xG={xg_w:.4f}, model={model_w:.4f}, RMSE={rmse:.4f}")
            
            # Set the optimal weights
            self.xg_weight, self.model_weight = result.x
        except ImportError:
            print("skopt not found, using grid search instead")
            best_rmse = float('inf')
            best_weights = (0.5, 0.3)
            
            # Simple grid search
            for xg_w in np.linspace(0.0, 1.0, 6):
                for model_w in np.linspace(0.0, 1.0, 6):
                    rmse = objective((xg_w, model_w))
                    print(f"xG={xg_w:.4f}, model={model_w:.4f}, RMSE={rmse:.4f}")
                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_weights = (xg_w, model_w)
            
            print(f"Best weights: xG={best_weights[0]:.4f}, model={best_weights[1]:.4f}, RMSE={best_rmse:.4f}")
            self.xg_weight, self.model_weight = best_weights
            self.last_rmse = best_rmse
        
        return self
    
    def print_team_strengths(self, exclude_teams=None):
        """Print team strength analysis in a formatted table."""
        if exclude_teams is None:
            exclude_teams = []

        # Get all teams from both models
        all_teams = set(self.team_attack.keys()).union(set(self.resim_team_attack.keys()))
        all_teams = [team for team in all_teams if team not in exclude_teams]
        
        # Create a list of team data
        team_data = []
        for team in all_teams:
            std_attack = self.team_attack.get(team, float('nan'))
            std_defense = self.team_defense.get(team, float('nan'))
            resim_attack = self.resim_team_attack.get(team, float('nan'))
            resim_defense = self.resim_team_defense.get(team, float('nan'))
            
            # Calculate blended attack and defense parameters
            blended_attack = self.model_weight * std_attack + (1 - self.model_weight) * resim_attack
            blended_defense = self.model_weight * std_defense + (1 - self.model_weight) * resim_defense
            
            # Calculate overall strength using the log scale (which is the natural scale for the DC model)
            # Higher attack and lower defense values are better
            overall_log_strength = np.log(blended_attack) - np.log(blended_defense)
            overall_abs_strength = blended_attack - blended_defense
            
            team_data.append({
                'team': team,
                'std_attack': std_attack,
                'std_defense': std_defense,
                'resim_attack': resim_attack,
                'resim_defense': resim_defense,
                'blended_attack': blended_attack,
                'blended_defense': blended_defense,
                'overall_log_strength': overall_log_strength,
                'overall_abs_strength': overall_abs_strength
            })
        
        # Sort by overall strength (descending)
        team_data = sorted(team_data, key=lambda x: x['overall_abs_strength'], reverse=True)
        
        # Print header
        print("\n{:<20} {:^20} {:^20} {:^20} {:^20}".format('', 'Standard Model', 'Resimmed Model', 'Blended Model', 'Strength'))
        print("{:<20} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10}".format(
            'Team', 'Attack', 'Defense', 'Attack', 'Defense', 'Attack', 'Defense', 'Log', 'Abs'))
        print("-" * 110)
        
        # Print team data
        # Print team data
        for team in team_data:
            print("{:<20} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f} {:^10.3f}".format(
                team['team'],
                team['std_attack'],
                team['std_defense'],
                team['resim_attack'],
                team['resim_defense'],
                team['blended_attack'],
                team['blended_defense'],
                team['overall_log_strength'],
                team['overall_abs_strength']
            ))
        # Print model parameters
        print("\nModel Parameters:")
        print(f"Home Advantage: Standard={self.home_advantage:.3f}, Resimmed={self.resim_home_advantage:.3f}")
        print(f"Rho Parameter: Standard={self.rho:.3f}, Resimmed={self.resim_rho:.3f}")
        print(f"Blend Weights: xG/PSxG={self.xg_weight:.3f}, Models={self.model_weight:.3f}")
        
        return team_data
    

    def predict_match(self, home_team, away_team, max_goals=10):
        """
        Predict the outcome of a match between home_team and away_team.
        
        Parameters:
        -----------
        home_team : str
            Name of the home team
        away_team : str
            Name of the away team
        max_goals : int
            Maximum number of goals to consider in the probability matrix
            
        Returns:
        --------
        dict
            Dictionary containing expected goals, win probabilities, and full score probability matrix
        """
        # Check if teams exist in the model
        if home_team not in self.team_attack or away_team not in self.team_attack:
            raise ValueError(f"Teams not found in the model. Available teams: {sorted(self.team_attack.keys())}")
        
        # Calculate expected goals from standard model
        lambda_home_std = self.team_attack[home_team] * self.team_defense[away_team] * self.home_advantage
        lambda_away_std = self.team_attack[away_team] * self.team_defense[home_team]
        
        # Calculate expected goals from resimmed model
        lambda_home_resim = self.resim_team_attack[home_team] * self.resim_team_defense[away_team] * self.resim_home_advantage
        lambda_away_resim = self.resim_team_attack[away_team] * self.resim_team_defense[home_team]
        
        # Blend expected goals using model_weight
        lambda_home = self.model_weight * lambda_home_std + (1 - self.model_weight) * lambda_home_resim
        lambda_away = self.model_weight * lambda_away_std + (1 - self.model_weight) * lambda_away_resim
        
        # Calculate match outcome probabilities
        home_win_prob = 0
        draw_prob = 0
        away_win_prob = 0
        
        # Create score probability matrix
        score_matrix = {}
        
        for i in range(max_goals + 1):
            for j in range(max_goals + 1):
                # Calculate Dixon-Coles adjusted probability
                prob = self.dc_probability(i, j, lambda_home, lambda_away, self.rho)
                
                # Add to outcome probabilities
                if i > j:
                    home_win_prob += prob
                elif i == j:
                    draw_prob += prob
                else:
                    away_win_prob += prob
                    
                # Store in score matrix
                score_matrix[f"{i}-{j}"] = prob
        
        # Calculate over/under probabilities
        over_under = {}
        for threshold in [0.5, 1.5, 2.5, 3.5, 4.5]:
            over_prob = sum(score_matrix[f"{i}-{j}"] for i in range(max_goals + 1) 
                            for j in range(max_goals + 1) if i + j > threshold)
            over_under[f"O{threshold}"] = over_prob
            over_under[f"U{threshold}"] = 1 - over_prob
        
        # Both teams to score probability
        btts_yes = sum(score_matrix[f"{i}-{j}"] for i in range(1, max_goals + 1) 
                    for j in range(1, max_goals + 1))
        btts_no = 1 - btts_yes
        
        # Return results
        return {
            'home_team': home_team,
            'away_team': away_team,
            'expected_goals': {
                'home': {
                    'standard': lambda_home_std,
                    'resimmed': lambda_home_resim,
                    'blended': lambda_home
                },
                'away': {
                    'standard': lambda_away_std,
                    'resimmed': lambda_away_resim,
                    'blended': lambda_away
                }
            },
            'win_probabilities': {
                'home': home_win_prob,
                'draw': draw_prob,
                'away': away_win_prob
            },
            'decimal_odds': {
                'home': 1 / home_win_prob if home_win_prob > 0 else float('inf'),
                'draw': 1 / draw_prob if draw_prob > 0 else float('inf'),
                'away': 1 / away_win_prob if away_win_prob > 0 else float('inf')
            },
            'over_under': over_under,
            'btts': {
                'yes': btts_yes,
                'no': btts_no
            },
            'score_matrix': {k: v for k, v in sorted(score_matrix.items(), key=lambda x: x[1], reverse=True)}
        }

In [9]:
# Load your shot data
def load_shot_data(paths=("shot_data_prem_2024.csv", "shot_data_prem_2023.csv"),
                   min_match_date='2024-02-27',
                   season_start='2024-08-01'):
    """Read the per-shot CSV files and return one cleaned DataFrame.

    Parameters
    ----------
    paths : iterable of str
        CSV files to read; they are concatenated in the given order.
    min_match_date : str, Timestamp or None
        Only shots with ``match_date`` strictly after this date are kept.
        Pass ``None`` to keep every row.
    season_start : str or Timestamp
        Shots dated strictly after this boundary are labelled with the
        boundary's year (2024 by default); all others get the previous year.

    Returns
    -------
    pandas.DataFrame
        The combined shots with ``match_date`` parsed to datetime plus two
        added columns: ``season`` and ``is_goal`` (1 when ``Outcome`` is
        ``'Goal'``, otherwise 0).
    """
    # ignore_index avoids duplicate row labels coming from the separate files.
    shots = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)

    # Process dates and seasons
    shots['match_date'] = pd.to_datetime(shots['match_date'])
    boundary = pd.Timestamp(season_start)
    shots['season'] = np.where(shots['match_date'] > boundary,
                               boundary.year, boundary.year - 1)

    # Filter on an explicit copy so the column assignment below does not write
    # into a slice of the unfiltered frame (SettingWithCopyWarning).
    if min_match_date is not None:
        shots = shots[shots['match_date'] > pd.Timestamp(min_match_date)].copy()

    # Add a goal column (vectorised; missing outcomes compare unequal -> 0)
    shots['is_goal'] = (shots['Outcome'] == 'Goal').astype(int)

    return shots

# Create match summaries from shot data
def create_match_summaries(shot_data):
    """Aggregate shot-level rows into one summary row per match.

    Parameters
    ----------
    shot_data : pandas.DataFrame
        One row per shot. Must contain ``match_url``, ``match_date``,
        ``home_team``, ``away_team``, ``season``, ``Team``, ``is_goal``,
        ``xG`` and ``PSxG``.

    Returns
    -------
    pandas.DataFrame
        One row per match, sorted by the match key columns, with the key
        columns followed by ``home_goals``, ``home_xg``, ``home_psxg``,
        ``away_goals``, ``away_xg`` and ``away_psxg``.

    Notes
    -----
    A match in which one side has no shot rows is kept, with that side's
    totals set to 0. (An inner join would silently drop such a match.)
    Shots whose ``Team`` matches neither ``home_team`` nor ``away_team``
    are ignored.
    """
    match_keys = ['match_url', 'match_date', 'home_team', 'away_team', 'season']
    stat_names = {'is_goal': 'goals', 'xG': 'xg', 'PSxG': 'psxg'}

    def side_totals(side):
        # Sum goals / xG / PSxG over the shots taken by the given side.
        side_shots = shot_data[shot_data['Team'] == shot_data[f'{side}_team']]
        totals = side_shots.groupby(match_keys, as_index=False).agg(
            {column: 'sum' for column in stat_names}
        )
        return totals.rename(columns={
            column: f'{side}_{short}' for column, short in stat_names.items()
        })

    # Outer join so a side without any shots does not remove the whole match.
    match_stats = pd.merge(
        side_totals('home'),
        side_totals('away'),
        on=match_keys,
        how='outer'
    )

    # A side missing from the join took no shots: its totals are zero.
    stat_columns = [col for col in match_stats.columns if col not in match_keys]
    match_stats = match_stats.fillna({col: 0 for col in stat_columns})
    # NaN padding turns the goal counts into floats; restore integer counts.
    match_stats = match_stats.astype({'home_goals': int, 'away_goals': int})

    return match_stats.sort_values(match_keys, ignore_index=True)


# Driver cell: load the shots, summarise them per match, and fit the model.
# The module-level names defined here (shot_data, match_stats, matches,
# model_poibin) are reused by later cells.

# Load shot data
shot_data = load_shot_data()
print(f"Loaded {len(shot_data)} shots")

# Create match summaries
match_stats = create_match_summaries(shot_data)
print(f"Created {len(match_stats)} match summaries")

# Convert to dictionaries for matches (one dict per match row)
matches = match_stats.to_dict('records')

# Optional weight tuning (left disabled): split the matches into train and
# validation sets, let the model optimise its weights, then read them back.
# train_matches, val_matches = train_test_split(matches, test_size=0.2, random_state=42)
# model_tune = TeamModel()
# model_tune.optimize_weights(train_matches, val_matches, shots_data=shot_data)
# optimal_xg_weight = model_tune.xg_weight
# optimal_model_weight = model_tune.model_weight

# But for final team strength ratings, use ALL data
print("\n=== Poisson-Binomial Shot-by-Shot Resimulation (All Data) ===")
# NOTE(review): the weights below are hand-picked constants, not the output of
# the disabled tuning step above.
model_poibin = TeamModel(xg_weight=1.0, model_weight=0.3)  # Use your preferred weights
# NOTE(review): epsilon and season_penalty are passed straight to fit_models;
# their exact meaning is defined by TeamModel and not visible here.
model_poibin.fit_models(matches, shots_data=shot_data, epsilon=0.0001, season_penalty=0.8)
# These three clubs are left out of the printed ratings (presumably the sides
# relegated after the 2023 season, so absent from the 2024 data - confirm).
model_poibin.print_team_strengths(exclude_teams=['Sheffield Utd', 'Luton Town', 'Burnley'])
    


Loaded 10738 shots
Created 381 match summaries

=== Poisson-Binomial Shot-by-Shot Resimulation (All Data) ===
Optimizing for 381 matches with 23 teams
Match 0: {'match_url': 'https://fbref.com/en/matches/00bcfc31/Arsenal-Bournemouth-May-4-2024-Premier-League', 'match_date': Timestamp('2024-05-04 00:00:00'), 'home_team': 'Arsenal', 'away_team': 'Bournemouth', 'season': 2023, 'home_goals': 3, 'home_xg': 3.39, 'home_psxg': 1.98, 'away_goals': 0, 'away_xg': 0.46, 'away_psxg': 0.33, 'days_from_ref': 294}
Match 1: {'match_url': 'https://fbref.com/en/matches/01e63a1f/Bournemouth-Arsenal-October-19-2024-Premier-League', 'match_date': Timestamp('2024-10-19 00:00:00'), 'home_team': 'Bournemouth', 'away_team': 'Arsenal', 'season': 2024, 'home_goals': 2, 'home_xg': 1.83, 'home_psxg': 1.6400000000000001, 'away_goals': 0, 'away_xg': 0.71, 'away_psxg': 0.32, 'days_from_ref': 126}
Match 2: {'match_url': 'https://fbref.com/en/matches/038dfa98/Fulham-Arsenal-December-8-2024-Premier-League', 'match_date'

[{'team': 'Liverpool',
  'std_attack': np.float64(1.6280416682702024),
  'std_defense': np.float64(0.7774865199581464),
  'resim_attack': np.float64(1.7302325263529497),
  'resim_defense': np.float64(0.7404905672836347),
  'blended_attack': np.float64(1.6995752689281254),
  'blended_defense': np.float64(0.7515893530859882),
  'overall_log_strength': np.float64(0.8159435552479456),
  'overall_abs_strength': np.float64(0.9479859158421372)},
 {'team': 'Manchester City',
  'std_attack': np.float64(1.750371398785961),
  'std_defense': np.float64(0.8285782816316213),
  'resim_attack': np.float64(1.488174940878347),
  'resim_defense': np.float64(0.8616599989122371),
  'blended_attack': np.float64(1.5668338782506313),
  'blended_defense': np.float64(0.8517354837280523),
  'overall_log_strength': np.float64(0.6095362106158623),
  'overall_abs_strength': np.float64(0.715098394522579)},
 {'team': 'Arsenal',
  'std_attack': np.float64(1.4024467065155681),
  'std_defense': np.float64(0.554845279530

In [16]:
# Predict a single fixture with the fitted model; the first argument is the
# home team and the second the away team.
prediction = model_poibin.predict_match("Liverpool", "Newcastle Utd")

# Bare last expression so the notebook displays the returned result dict.
prediction

{'home_team': 'Liverpool',
 'away_team': 'Newcastle Utd',
 'expected_goals': {'home': {'standard': np.float64(2.0306012053488174),
   'resimmed': np.float64(2.172066231748273),
   'blended': np.float64(2.1296267238284363)},
  'away': {'standard': np.float64(1.0739016097673761),
   'resimmed': np.float64(1.0336964837370728),
   'blended': np.float64(1.0457580215461637)}},
 'win_probabilities': {'home': np.float64(0.6035556286514338),
  'draw': np.float64(0.24989625421079326),
  'away': np.float64(0.15878643856311986)},
 'decimal_odds': {'home': np.float64(1.6568481056739863),
  'draw': np.float64(4.001660621757367),
  'away': np.float64(6.297767045152826)},
 'over_under': {'O0.5': np.float64(0.962322555870935),
  'U0.5': np.float64(0.03767744412906504),
  'O1.5': np.float64(0.8659079411985338),
  'U1.5': np.float64(0.13409205880146624),
  'O2.5': np.float64(0.6149205508661106),
  'U2.5': np.float64(0.38507944913388936),
  'O3.5': np.float64(0.391981709847093),
  'U3.5': np.float64(0.608