In [None]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import optuna
import os
import json 


conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

df = pd.read_sql_query("""
    SELECT DISTINCT
        match_date as date,
        team as home_team,
        opp_team as away_team,
        summary_goals as home_goals,
        opp_summary_goals as away_goals,
        summary_xg as home_xg,
        opp_summary_xg as away_xg,
        is_home
    FROM 
        fbref_match_all_columns

    WHERE 
        division = 'Premier League'
        AND is_home = 1
        AND season = '2024-2025'
                       """, conn)
df['date'] = pd.to_datetime(df['date'])


df

Unnamed: 0,date,home_team,away_team,home_goals,away_goals,home_xg,away_xg,is_home
0,2025-05-25,Tottenham,Brighton,1.0,4.0,2.0,2.2,1
1,2025-05-25,Bournemouth,Leicester City,2.0,0.0,1.6,0.3,1
2,2025-05-25,Newcastle Utd,Everton,0.0,1.0,1.2,1.2,1
3,2025-05-25,Fulham,Manchester City,0.0,2.0,1.3,3.1,1
4,2025-05-25,Nott'ham Forest,Chelsea,0.0,1.0,1.2,1.1,1
...,...,...,...,...,...,...,...,...
375,2024-08-17,Nott'ham Forest,Bournemouth,1.0,1.0,1.4,1.2,1
376,2024-08-17,Everton,Brighton,0.0,3.0,0.5,1.4,1
377,2024-08-17,Newcastle Utd,Southampton,1.0,0.0,0.3,1.8,1
378,2024-08-17,Ipswich Town,Liverpool,0.0,2.0,0.5,2.6,1


In [None]:

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import gammaln, loggamma

# Get unique teams
teams = sorted(set(df['home_team'].tolist() + df['away_team'].tolist()))
team_to_idx = {team: i for i, team in enumerate(teams)}
n_teams = len(teams)

def negative_log_likelihood_composite(params, data, teams, team_to_idx, decay_rate=0.00385):
    """
    Composite model: Single attack/defense ratings derived from Goals (Poisson) + xG (Gamma)
    
    params structure:
    [attack_1, ..., attack_n,                       # 0 to n_teams-1 (composite ratings)
     defense_1, ..., defense_n,                     # n_teams to 2*n_teams-1 (composite ratings)  
     xg_shape_1, ..., xg_shape_n,                   # 2*n_teams to 3*n_teams-1 (for xG consistency)
     goals_weight,                                  # -3 (weight for goals in composite)
     xg_weight,                                     # -2 (weight for xG in composite)  
     home_advantage]                                # -1
    """
    n_teams = len(teams)
    
    # Extract parameters
    attack = params[:n_teams]                    # composite attack ratings
    defense = params[n_teams:2*n_teams]         # composite defense ratings
    xg_shape = params[2*n_teams:3*n_teams]      # shape for xG gamma distribution
    goals_weight = params[-3]                    # weight for goals component
    xg_weight = params[-2]                       # weight for xG component  
    home_adv = params[-1]                        # home advantage
    
    ll = 0
    max_date = data['date'].max()
    
    for _, row in data.iterrows():
        home_idx = team_to_idx[row['home_team']]
        away_idx = team_to_idx[row['away_team']]
        
        # Time decay weight
        days_ago = (max_date - row['date']).days
        time_weight = np.exp(-decay_rate * days_ago)
        
        # Composite strength gives expected performance for BOTH metrics
        composite_home = np.exp(attack[home_idx] - defense[away_idx] + home_adv)
        composite_away = np.exp(attack[away_idx] - defense[home_idx])
        
        # === GOALS (Regular Poisson) ===
        # Expected goals from composite strength, weighted by goals_weight
        lambda_home_goals = composite_home * np.exp(goals_weight)
        lambda_away_goals = composite_away * np.exp(goals_weight)
        
        # Standard Poisson likelihood for goals
        ll_goals_home = (row['home_goals'] * np.log(lambda_home_goals) - 
                        lambda_home_goals - gammaln(row['home_goals'] + 1))
        ll_goals_away = (row['away_goals'] * np.log(lambda_away_goals) - 
                        lambda_away_goals - gammaln(row['away_goals'] + 1))
        
        # === xG (Gamma Distribution) ===
        # Expected xG from composite strength, weighted by xg_weight
        mu_xg_home = composite_home * np.exp(xg_weight)
        mu_xg_away = composite_away * np.exp(xg_weight)
        
        # Gamma parameters
        shape_home = np.exp(xg_shape[home_idx])  # ensure positive
        shape_away = np.exp(xg_shape[away_idx])
        scale_home = mu_xg_home / shape_home
        scale_away = mu_xg_away / shape_away
        
        # Gamma log-likelihood
        if row['home_xg'] > 0:
            ll_xg_home = ((shape_home - 1) * np.log(row['home_xg']) - 
                         row['home_xg'] / scale_home - 
                         shape_home * np.log(scale_home) - 
                         loggamma(shape_home))
        else:
            ll_xg_home = -1000  # penalty for xG = 0
            
        if row['away_xg'] > 0:
            ll_xg_away = ((shape_away - 1) * np.log(row['away_xg']) - 
                         row['away_xg'] / scale_away - 
                         shape_away * np.log(scale_away) - 
                         loggamma(shape_away))
        else:
            ll_xg_away = -1000
        
        # Combine all likelihoods
        ll += time_weight * (ll_goals_home + ll_goals_away + ll_xg_home + ll_xg_away)
    
    return -ll

# Constraints for identifiability
def constraint_attack_sum(params):
    return np.sum(params[:n_teams])

def constraint_defense_sum(params):
    return np.sum(params[n_teams:2*n_teams])

def constraint_weights_sum(params):
    # Goals weight + xG weight should sum to 0 for identifiability
    return params[-3] + params[-2]  # goals_weight + xg_weight = 0

constraints_composite = [
    {'type': 'eq', 'fun': constraint_attack_sum},
    {'type': 'eq', 'fun': constraint_defense_sum},
    {'type': 'eq', 'fun': constraint_weights_sum}
]

# Initial parameters (fewer parameters now - no zero inflation!)
np.random.seed(42)
initial_params_composite = np.concatenate([
    np.random.normal(0, 0.1, n_teams),      # attack (composite)
    np.random.normal(0, 0.1, n_teams),      # defense (composite) 
    np.random.normal(0, 0.1, n_teams),      # xg_shape
    [0.1, -0.1, 0.0]                        # goals_weight, xg_weight, home_advantage
])

print(f"Composite model parameters: {len(initial_params_composite)}")
print(f"That's {3 * n_teams + 3} parameters total")
print("\nModel structure:")
print("- Single composite attack/defense ratings")
print("- Goals: Regular Poisson (weighted)")
print("- xG: Gamma Distribution (weighted)")
print("- Learns optimal goals vs xG weighting")
print("\n" + "="*50)

# Convert date column if needed
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])

# Fit the model
print("Fitting composite model...")
result_composite = minimize(
    negative_log_likelihood_composite,
    initial_params_composite,
    args=(df, teams, team_to_idx),
    method='SLSQP',
    constraints=constraints_composite,
    options={'maxiter': 1000}
)

if result_composite.success:
    print("✅ Optimization successful!")
    
    # Extract fitted parameters
    attack = result_composite.x[:n_teams]
    defense = result_composite.x[n_teams:2*n_teams]
    xg_shape = result_composite.x[2*n_teams:3*n_teams]
    goals_weight = result_composite.x[-3]
    xg_weight = result_composite.x[-2]
    home_adv = result_composite.x[-1]
    
    # Create results dataframe
    results_composite = pd.DataFrame({
        'team': teams,
        'attack': np.exp(attack),
        'defense': np.exp(-defense),
        'overall': np.exp(attack) - np.exp(-defense),
        'xg_consistency': np.exp(xg_shape)
    })
    
    print("\n" + "="*50)
    print("TEAM RATINGS (Composite Model):")
    print(results_composite.sort_values('overall', ascending=False)[
        ['team', 'attack', 'defense', 'overall', 'xg_consistency']
    ].round(3))
    
    print(f"\nModel Weights:")
    print(f"Goals weight: {goals_weight:.3f} (relative importance: {np.exp(goals_weight):.3f})")
    print(f"xG weight: {xg_weight:.3f} (relative importance: {np.exp(xg_weight):.3f})")
    print(f"Home Advantage: {np.exp(home_adv):.3f}")
    
    # The model learns whether goals or xG is more predictive!
    if goals_weight > xg_weight:
        print("📈 Model says: Actual goals are more predictive than xG")
    else:
        print("📊 Model says: xG is more predictive than actual goals")
        
else:
    print("❌ Optimization failed!")
    print(f"Message: {result_composite.message}")

Composite model parameters: 63
That's 63 parameters total

Model structure:
- Single composite attack/defense ratings
- Goals: Regular Poisson (weighted)
- xG: Gamma Distribution (weighted)
- Learns optimal goals vs xG weighting

Fitting composite model...
✅ Optimization successful!

TEAM RATINGS (Composite Model):
               team  attack  defense  overall  xg_consistency
11        Liverpool   1.589    0.827    0.761           6.650
0           Arsenal   1.190    0.658    0.532           4.765
12  Manchester City   1.322    0.861    0.462           4.494
14    Newcastle Utd   1.291    0.848    0.443           4.354
5           Chelsea   1.190    0.824    0.366           4.280
2       Bournemouth   1.160    0.921    0.240           4.913
6    Crystal Palace   1.179    0.948    0.231           3.165
1       Aston Villa   1.081    0.893    0.188           3.409
3         Brentford   1.157    1.026    0.131           3.857
4          Brighton   1.184    1.054    0.129           6.845
7

In [37]:
# Predict a match (now gives single expected goals value!)
def predict_match(home_team, away_team, result_params, team_to_idx):
    home_idx = team_to_idx[home_team]
    away_idx = team_to_idx[away_team]
    
    n_teams = len(team_to_idx)
    
    # Extract parameters
    attack = result_params[:n_teams]
    defense = result_params[n_teams:2*n_teams]
    home_adv = result_params[-1]
    
    # Single composite expected goals
    expected_goals_home = np.exp(attack[home_idx] - defense[away_idx] + home_adv)
    expected_goals_away = np.exp(attack[away_idx] - defense[home_idx])
    
    return expected_goals_home, expected_goals_away

# Example prediction
if 'result_composite' in locals() and result_composite.success:
    home_team = 'Aston Villa'
    away_team = 'Newcastle Utd'
    
    if home_team in team_to_idx and away_team in team_to_idx:
        home_exp, away_exp = predict_match(home_team, away_team, result_composite.x, team_to_idx)
        print(f"\n{home_team} vs {away_team} prediction:")
        print(f"{home_team} expected goals: {home_exp:.2f}")
        print(f"{away_team} expected goals: {away_exp:.2f}")
    else:
        print(f"\nTeam not found. Available teams: {list(teams)[:5]}...")


Aston Villa vs Newcastle Utd prediction:
Aston Villa expected goals: 1.07
Newcastle Utd expected goals: 1.15


In [38]:
# Model validation functions
def validate_predictions(df, result_params, team_to_idx):
    """Check if model predictions match actual data patterns"""
    
    predictions = []
    actuals = []
    
    for _, row in df.iterrows():
        # Get prediction
        pred_home, pred_away = predict_match(row['home_team'], row['away_team'], 
                                           result_params, team_to_idx)
        predictions.extend([pred_home, pred_away])
        actuals.extend([row['home_goals'], row['away_goals']])
    
    predictions = np.array(predictions)
    actuals = np.array(actuals)
    
    print("🔍 MODEL VALIDATION:")
    print(f"Average predicted goals per team: {predictions.mean():.2f}")
    print(f"Average actual goals per team: {actuals.mean():.2f}")
    print(f"Prediction vs actual correlation: {np.corrcoef(predictions, actuals)[0,1]:.3f}")
    
    print(f"\nGoals distribution:")
    print(f"Predicted: min={predictions.min():.2f}, max={predictions.max():.2f}")
    print(f"Actual: min={actuals.min():.0f}, max={actuals.max():.0f}")
    
    # Check if predictions are reasonable
    if predictions.mean() < 0.8:
        print("⚠️  Predictions seem LOW - check your data scaling")
    elif predictions.mean() > 2.5:
        print("⚠️  Predictions seem HIGH - check for data issues")
    else:
        print("✅ Prediction range looks reasonable")

def check_league_averages(df):
    """Show what your data looks like"""
    print("📈 YOUR DATA SUMMARY:")
    print(f"Average goals per team per match: {df[['home_goals', 'away_goals']].values.mean():.2f}")
    print(f"Average xG per team per match: {df[['home_xg', 'away_xg']].values.mean():.2f}")
    print(f"Home advantage in goals: {df['home_goals'].mean() - df['away_goals'].mean():.2f}")
    print(f"Home advantage in xG: {df['home_xg'].mean() - df['away_xg'].mean():.2f}")

# Run validation if model fitted successfully
if 'result_composite' in locals() and result_composite.success:
    print("\n" + "="*60)
    check_league_averages(df)
    validate_predictions(df, result_composite.x, team_to_idx)



📈 YOUR DATA SUMMARY:
Average goals per team per match: 1.47
Average xG per team per match: 1.45
Home advantage in goals: 0.09
Home advantage in xG: 0.15
🔍 MODEL VALIDATION:
Average predicted goals per team: 1.15
Average actual goals per team: 1.47
Prediction vs actual correlation: 0.398

Goals distribution:
Predicted: min=0.37, max=3.00
Actual: min=0, max=7
✅ Prediction range looks reasonable
