# Load Data

In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import poisson
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import sqlite3

conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

df = pd.read_sql_query("""
    SELECT DISTINCT
       f.match_date,
       f.team,
       f.team_rolling_summary_goals,
       f.team_rolling_summary_xg,
       f.team_rolling_summary_npxg,
       f.team_rolling_keeper_psxg,
       f.team_rolling_conceded_summary_goals,
       f.team_rolling_conceded_summary_xg,
       f.team_rolling_conceded_summary_npxg,
       f.team_rolling_conceded_keeper_psxg,
       fb.summary_goals as goals,
       f.opp_team,
       f.opp_team_rolling_summary_goals,
       f.opp_team_rolling_summary_xg,
       f.opp_team_rolling_summary_npxg,
       f.opp_team_rolling_keeper_psxg,
       f.opp_team_rolling_conceded_summary_goals,
       f.opp_team_rolling_conceded_summary_xg,
       f.opp_team_rolling_conceded_summary_npxg,
       f.opp_team_rolling_conceded_keeper_psxg,
       fb.opp_summary_goals as opp_goals
    FROM 
        team_all_features_365_00050 f
    JOIN
        fbref_match_all_columns fb
            ON fb.match_url = f.match_url AND fb.team = f.team
    WHERE f.division = 'Premier League'
            AND f.season = '2024-2025'
            AND team_rolling_summary_goals IS NOT NULL
            AND f.is_home = 1
                        """, conn)

conn.close()

df

Unnamed: 0,match_date,team,team_rolling_summary_goals,team_rolling_summary_xg,team_rolling_summary_npxg,team_rolling_keeper_psxg,team_rolling_conceded_summary_goals,team_rolling_conceded_summary_xg,team_rolling_conceded_summary_npxg,team_rolling_conceded_keeper_psxg,...,opp_team,opp_team_rolling_summary_goals,opp_team_rolling_summary_xg,opp_team_rolling_summary_npxg,opp_team_rolling_keeper_psxg,opp_team_rolling_conceded_summary_goals,opp_team_rolling_conceded_summary_xg,opp_team_rolling_conceded_summary_npxg,opp_team_rolling_conceded_keeper_psxg,opp_goals
0,2025-05-25 00:00:00,Tottenham,1.500408,1.419083,1.338374,1.413362,1.838188,1.755202,1.720501,1.805191,...,Brighton,1.706913,1.605220,1.414512,1.829008,1.582692,1.398281,1.220604,1.299603,4.0
1,2025-05-25 00:00:00,Bournemouth,1.608621,1.726044,1.600831,1.742328,1.334089,1.331399,1.220682,1.460762,...,Leicester City,0.807435,0.817617,0.765214,0.826943,2.064578,1.948015,1.838669,1.811285,0.0
2,2025-05-25 00:00:00,Newcastle Utd,1.895577,1.739969,1.610403,1.693188,1.345745,1.226171,1.155783,1.312582,...,Everton,1.206227,1.169747,1.117807,1.162522,1.096034,1.188877,1.149149,1.251694,1.0
3,2025-05-25 00:00:00,Fulham,1.445887,1.245589,1.178877,1.433139,1.480788,1.251556,1.160657,1.317218,...,Manchester City,1.855950,1.747285,1.681626,1.732629,1.065178,1.212474,1.128386,1.117500,2.0
4,2025-05-25 00:00:00,Nott'ham Forest,1.578072,1.179114,1.133311,1.428674,1.311202,1.418029,1.339274,1.384869,...,Chelsea,1.562918,1.703940,1.582041,1.705193,1.074680,1.149933,1.043757,1.091591,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2024-08-17 00:00:00,Nott'ham Forest,1.296371,1.424365,1.418664,1.215678,1.822696,1.378234,1.337462,1.181650,...,Bournemouth,1.470426,1.540680,1.459409,1.204111,1.752710,1.526086,1.373257,1.656592,1.0
376,2024-08-17 00:00:00,Everton,1.068233,1.453716,1.360224,1.158816,1.353958,1.555156,1.398703,1.456195,...,Brighton,0.996871,1.365028,1.244065,1.401611,1.662358,1.523636,1.445034,1.406198,3.0
377,2024-08-17 00:00:00,Newcastle Utd,2.211730,2.023807,1.848153,1.832712,1.693598,1.686495,1.570390,1.615495,...,Southampton,1.746484,1.640530,1.565486,1.708088,1.195353,1.029815,0.972503,0.988693,0.0
378,2024-08-17 00:00:00,Ipswich Town,1.987621,1.652822,1.601648,1.665831,1.218097,1.005455,0.949889,1.026538,...,Liverpool,2.356665,2.474557,2.301927,2.187080,1.220762,1.213207,1.208176,1.128665,2.0


In [28]:
class MatchPredictionModel(nn.Module):
    """
    Neural network that takes home and away team features and predicts match goals.
    Also outputs interpretable team ratings (attack/defense strengths).
    """
    
    def __init__(self, n_team_features):
        super().__init__()
        
        # Shared feature processing
        self.feature_encoder = nn.Sequential(
            nn.Linear(n_team_features, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
        )
        
        # Team strength heads (for interpretable ratings)
        self.attack_head = nn.Sequential(
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
        
        self.defense_head = nn.Sequential(
            nn.Linear(16, 8),
            nn.ReLU(), 
            nn.Linear(8, 1)
        )
        
        # Learnable home advantage
        self.home_advantage = nn.Parameter(torch.tensor(0.2))
    
    def get_team_strengths(self, team_features):
        """Get interpretable attack and defense ratings for teams"""
        encoded = self.feature_encoder(team_features)
        
        # Use softplus to ensure positive values
        attack = torch.nn.functional.softplus(self.attack_head(encoded))
        defense = torch.nn.functional.softplus(self.defense_head(encoded))
        
        return attack.squeeze(), defense.squeeze()
    
    def forward(self, home_features, away_features):
        """
        Predict expected goals using Dixon-Coles style team strengths
        
        Args:
            home_features: Tensor of home team's rolling stats
            away_features: Tensor of away team's rolling stats
            
        Returns:
            home_lambda, away_lambda: Expected goals for home and away teams
        """
        # Get team strengths
        home_attack, home_defense = self.get_team_strengths(home_features)
        away_attack, away_defense = self.get_team_strengths(away_features)
        
        # Dixon-Coles style calculation
        home_lambda = home_attack * away_defense * torch.exp(self.home_advantage)
        away_lambda = away_attack * home_defense
        
        return home_lambda, away_lambda

def prepare_training_data(df):
   """
   Prepare your dataframe for training the neural network
   
   Args:
       df: Your dataframe with rolling features and match results
       
   Returns:
       Features, targets, scaler, and column info for training
   """
   
   print(f"📊 Preparing {len(df)} matches for training...")
   
   # Define feature columns split into attack and defense
   attack_feature_cols = [
       'team_rolling_summary_goals', 
       'team_rolling_summary_xg',
       #'team_rolling_summary_npxg',
       #'team_rolling_keeper_psxg'
   ]
   
   defense_feature_cols = [
       'team_rolling_conceded_summary_goals',
       'team_rolling_conceded_summary_xg', 
       #'team_rolling_conceded_summary_npxg',
       #'team_rolling_conceded_keeper_psxg'
   ]
   
   # Combine for full team features
   team_feature_cols = attack_feature_cols + defense_feature_cols
   
   # Opponent features (same structure)
   opp_attack_feature_cols = [
       'opp_team_rolling_summary_goals', 
       'opp_team_rolling_summary_xg',
       #'opp_team_rolling_summary_npxg',
       #'opp_team_rolling_keeper_psxg'
   ]
   
   opp_defense_feature_cols = [
       'opp_team_rolling_conceded_summary_goals',
       'opp_team_rolling_conceded_summary_xg',
       #'opp_team_rolling_conceded_summary_npxg', 
       #'opp_team_rolling_conceded_keeper_psxg'
   ]
   
   opp_feature_cols = opp_attack_feature_cols + opp_defense_feature_cols
   
   print(f"🔧 Using {len(team_feature_cols)} features per team:")
   print("⚔️ Attack features:")
   for col in attack_feature_cols:
       print(f"   - {col}")
   print("🛡️ Defense features:")
   for col in defense_feature_cols:
       print(f"   - {col}")
   
   # Extract features - home team is 'team', away team is 'opp_team'
   home_features = df[team_feature_cols].fillna(0).values  # Fill NaN with 0
   away_features = df[opp_feature_cols].fillna(0).values
   
   # Extract targets
   home_goals = df['goals'].values
   away_goals = df['opp_goals'].values
   
   # Scale features (important for neural networks)
   scaler = StandardScaler()
   
   # Fit scaler on all features combined for consistency
   all_features = np.vstack([home_features, away_features])
   scaler.fit(all_features)
   
   # Transform features
   home_features_scaled = scaler.transform(home_features)
   away_features_scaled = scaler.transform(away_features)
   
   print(f"✅ Prepared {len(home_features_scaled)} training examples")
   print(f"📈 Feature shape: {home_features_scaled.shape}")
   print(f"⚽ Average goals - Home: {home_goals.mean():.2f}, Away: {away_goals.mean():.2f}")
   
   return (home_features_scaled, away_features_scaled, 
           home_goals, away_goals, scaler, team_feature_cols)

def train_model(df, epochs=100000, learning_rate=0.001):
    """
    Train the match prediction model
    
    Args:
        df: Your prepared dataframe
        epochs: Number of training epochs
        learning_rate: Learning rate for optimizer
    
    Returns:
        Trained model, scaler, and feature columns
    """
    
    # Prepare data
    (home_features, away_features, home_goals, away_goals, 
     scaler, feature_cols) = prepare_training_data(df)
    
    # Convert to tensors
    home_tensor = torch.FloatTensor(home_features)
    away_tensor = torch.FloatTensor(away_features)
    home_goals_tensor = torch.FloatTensor(home_goals)
    away_goals_tensor = torch.FloatTensor(away_goals)
    
    # Create model
    n_features = home_features.shape[1]
    model = MatchPredictionModel(n_features)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    print(f"\n🧠 Training model for {epochs} epochs...")
    
    # Training loop
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        
        # Forward pass
        pred_home_goals, pred_away_goals = model(home_tensor, away_tensor)
        
        # Poisson negative log-likelihood loss
        home_loss = torch.nn.PoissonNLLLoss()(pred_home_goals, home_goals_tensor)
        away_loss = torch.nn.PoissonNLLLoss()(pred_away_goals, away_goals_tensor)
        total_loss = home_loss + away_loss
        
        # Backward pass
        total_loss.backward()
        optimizer.step()
        
        # Print progress
        if epoch % 200 == 0 or epoch == epochs - 1:
            model.eval()
            with torch.no_grad():
                pred_h, pred_a = model(home_tensor, away_tensor)
                mae_home = torch.mean(torch.abs(pred_h - home_goals_tensor))
                mae_away = torch.mean(torch.abs(pred_a - away_goals_tensor))
            
            print(f"Epoch {epoch:4d}: Loss={total_loss.item():.4f}, "
                  f"MAE Home={mae_home:.3f}, MAE Away={mae_away:.3f}")
            model.train()
    
    print(f"\n✅ Training complete!")
    print(f"🏠 Learned home advantage: +{model.home_advantage.item():.3f} goals")
    
    return model, scaler, feature_cols

def extract_team_ratings(model, df, scaler, feature_cols):
    """
    Extract interpretable team attack and defense ratings from the trained model
    
    Args:
        model: Trained MatchPredictionModel
        df: Your dataframe with team data
        scaler: Fitted StandardScaler
        feature_cols: List of feature column names
    
    Returns:
        DataFrame with team ratings
    """
    
    model.eval()
    
    print("🏆 Extracting Team Ratings...")
    
    # Get unique teams and their latest features
    team_ratings = []
    
    # Get all unique teams from both home and away
    home_teams = df[['team'] + feature_cols].drop_duplicates('team')
    away_teams = df[['opp_team'] + [col.replace('team_rolling_', 'opp_team_rolling_') 
                                   for col in feature_cols]].drop_duplicates('opp_team')
    
    # Process home teams
    for _, row in home_teams.iterrows():
        team_name = row['team']
        team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
        team_features_scaled = scaler.transform(team_features)
        
        with torch.no_grad():
            features_tensor = torch.FloatTensor(team_features_scaled)
            attack, defense = model.get_team_strengths(features_tensor)
        
        team_ratings.append({
            'team': team_name,
            'attack_rating': attack.item(),
            'defense_rating': defense.item()
        })
    
    # Process away teams (if different features)
    opp_feature_cols = [col.replace('team_rolling_', 'opp_team_rolling_') for col in feature_cols]
    
    for _, row in away_teams.iterrows():
        team_name = row['opp_team']
        
        # Skip if already processed as home team
        if team_name in [r['team'] for r in team_ratings]:
            continue
            
        team_features = row[opp_feature_cols].fillna(0).values.reshape(1, -1)
        team_features_scaled = scaler.transform(team_features)
        
        with torch.no_grad():
            features_tensor = torch.FloatTensor(team_features_scaled)
            attack, defense = model.get_team_strengths(features_tensor)
        
        team_ratings.append({
            'team': team_name,
            'attack_rating': attack.item(),
            'defense_rating': defense.item()
        })
    
    # Convert to DataFrame and sort
    ratings_df = pd.DataFrame(team_ratings)
    
    # Display rankings
    attack_rankings = ratings_df.sort_values('attack_rating', ascending=False).reset_index(drop=True)
    defense_rankings = ratings_df.sort_values('defense_rating', ascending=False).reset_index(drop=True)
    
    print("\n⚔️ ATTACK RATINGS (Higher = Better Attack)")
    print("=" * 50)
    for i, row in attack_rankings.head(20).iterrows():
        print(f"{i+1:2d}. {row['team']:20s} {row['attack_rating']:.3f}")
    
    print("\n🛡️ DEFENSE RATINGS (Higher = Better Defense)")
    print("=" * 50)
    for i, row in defense_rankings.head(20).iterrows():
        print(f"{i+1:2d}. {row['team']:20s} {row['defense_rating']:.3f}")
    
    return ratings_df
def predict_match(model, scaler, feature_cols, home_team_stats, away_team_stats):
    """
    Predict a match using the trained model
    
    Args:
        model: Trained MatchPredictionModel
        scaler: Fitted StandardScaler
        feature_cols: List of feature column names
        home_team_stats: Dict with home team's rolling stats
        away_team_stats: Dict with away team's rolling stats
    
    Returns:
        Expected goals for home and away teams
    """
    
    model.eval()
    
    # Convert stats to arrays in correct order
    home_array = np.array([[home_team_stats.get(col.replace('team_rolling_', ''), 0) 
                           for col in feature_cols]])
    away_array = np.array([[away_team_stats.get(col.replace('opp_team_rolling_', ''), 0) 
                           for col in feature_cols]])
    
    # Scale features
    home_scaled = scaler.transform(home_array)
    away_scaled = scaler.transform(away_array)
    
    # Predict
    with torch.no_grad():
        home_tensor = torch.FloatTensor(home_scaled)
        away_tensor = torch.FloatTensor(away_scaled)
        pred_home, pred_away = model(home_tensor, away_tensor)
    
    return pred_home.item(), pred_away.item()

# Example usage with your data:
if __name__ == "__main__":
    print("🚀 Training Match Prediction Model")
    print("=" * 50)
    
    # Train the model (assuming df is your loaded dataframe)
    model, scaler, feature_cols = train_model(df)
    
    # Extract team ratings
    ratings_df = extract_team_ratings(model, df, scaler, feature_cols)


🚀 Training Match Prediction Model
📊 Preparing 380 matches for training...
🔧 Using 4 features per team:
⚔️ Attack features:
   - team_rolling_summary_goals
   - team_rolling_summary_xg
🛡️ Defense features:
   - team_rolling_conceded_summary_goals
   - team_rolling_conceded_summary_xg
✅ Prepared 380 training examples
📈 Feature shape: (380, 4)
⚽ Average goals - Home: 1.51, Away: 1.42

🧠 Training model for 100000 epochs...
Epoch    0: Loss=1.8368, MAE Home=1.229, MAE Away=1.178
Epoch  200: Loss=1.7001, MAE Home=1.255, MAE Away=1.189
Epoch  400: Loss=1.6642, MAE Home=1.255, MAE Away=1.173
Epoch  600: Loss=1.6322, MAE Home=1.261, MAE Away=1.176
Epoch  800: Loss=1.6230, MAE Home=1.259, MAE Away=1.180
Epoch 1000: Loss=1.5924, MAE Home=1.252, MAE Away=1.173
Epoch 1200: Loss=1.5728, MAE Home=1.253, MAE Away=1.174
Epoch 1400: Loss=1.5750, MAE Home=1.254, MAE Away=1.175
Epoch 1600: Loss=1.5670, MAE Home=1.251, MAE Away=1.175
Epoch 1800: Loss=1.5687, MAE Home=1.263, MAE Away=1.186
Epoch 2000: Loss=

  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).values.reshape(1, -1)
  team_features = row[feature_cols].fillna(0).va