In [208]:
import pymc as pm
import numpy as np
import pandas as pd
import os
import requests
import io
from datetime import datetime, timedelta
import multiprocessing
import arviz as az
import logging
import tqdm

In [209]:
# get data
API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}

# Function to fetch data for a specific division and season
def fetch_data(division, season):
    params = {
        'division': division,
        'season': season
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return pd.read_json(io.StringIO(response.content.decode('utf-8')))
    else:
        print(f"Error fetching {division} {season}: {response.status_code}")
        print(response.content.decode('utf-8'))
        return pd.DataFrame()

# Fetch data for all combinations
seasons = ['2024_2025', '2023_2024']
divisions = ['Premier League', 'Championship']
dataframes = []

for division in divisions:
    for season in seasons:
        df = fetch_data(division, season)
        if not df.empty:
            dataframes.append(df)

# Combine all dataframes
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
    
    # Convert match_date to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])

In [210]:
df = df[(df['home_red'] == 0) & (df['away_red'] == 0)].copy()

In [211]:
# Separates out the home vs away and allows every team to have an individual row of data. Allows us to feature engineer home effect easier.
home_df = df[["division", "season", "match_date",
             "home_team", "home_avg_market_value", "home_goals", "home_shots", "home_xgoals",  "home_deep", "home_ppda", "home_red",
             "away_team", "away_avg_market_value", "away_goals", "away_shots", "away_xgoals", "away_deep", "away_ppda", "away_red",
              "bet365_home_odds", "bet365_draw_odds", "bet365_away_odds"]].copy()
home_df["home?"] = 1
home_df = home_df.rename(columns={"home_team":"team", "home_avg_market_value":"avg_market_value", "home_goals":"goals",
             "home_shots":"shots", "home_xgoals":"xG",  "home_deep":"deep", "home_ppda":"ppda", "home_red": "red",
             "away_team":"opponent_team", "away_avg_market_value":"opponent_avg_market_value", "away_goals":"opponent_goals", 
             "away_shots": "opponent_shots", "away_xgoals": "opponent_xG", "away_deep":"opponent_deep", "away_ppda":"opponent_ppda", "away_red": "opponent_red"})

away_df = df[["division", "season", "match_date",
                "away_team", "away_avg_market_value", "away_goals", "away_shots", "away_xgoals",  "away_deep", "away_ppda", "away_red",
                "home_team", "home_avg_market_value", "home_goals", "home_shots", "home_xgoals", "home_deep", "home_ppda", "home_red",
                 "bet365_home_odds", "bet365_draw_odds", "bet365_away_odds"]].copy()
away_df["home?"] = 0
away_df = away_df.rename(columns={"away_team":"team", "away_avg_market_value":"avg_market_value", "away_goals":"goals",
             "away_shots":"shots", "away_xgoals":"xG", "away_deep":"deep", "away_ppda":"ppda", "away_red": "red",
             "home_team":"opponent_team", "home_avg_market_value":"opponent_avg_market_value", "home_goals":"opponent_goals", 
             "home_shots": "opponent_shots", "home_xgoals": "opponent_xG", "home_deep":"opponent_deep", "home_ppda":"opponent_ppda", "home_red": "opponent_red"})

df = pd.concat([away_df, home_df])
df["prem?"] = df["division"].apply(lambda x: 1 if x == "Premier League" else 0)
df.sort_values(["match_date", "division"], inplace=True)

In [212]:
df["prem?"]

1067    0
1067    0
1068    0
1069    0
1070    0
       ..
1061    0
1062    0
1063    0
1064    0
1066    0
Name: prem?, Length: 2676, dtype: int64

In [221]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

def penalized_ema(group_df, column_name, span=38):
    """
    Calculates EMA for each row based only on prior data (not including current row).
    """
    # Create a copy to avoid modifying the original
    data = group_df[column_name].astype(float).copy()
    
    # Initialize empty series for results
    result = pd.Series(index=data.index, dtype=float)
    
    # For each row, calculate EMA using only prior data
    for i in range(len(data)):
        if i == 0:
            # First row has no prior data, so use NaN
            result.iloc[i] = np.nan
        else:
            # Calculate EMA using only data up to the previous row
            prior_data = data.iloc[:i]
            if len(prior_data) > 0:
                result.iloc[i] = prior_data.ewm(span=span, adjust=False, min_periods=1).mean().iloc[-1]
    
    return result

# Calculate rolling stats for each team as in your original code
df.sort_values(by=["team", "match_date"], inplace=True)
stat_columns = ['goals', 'xG', 'shots', 'red']

# Calculate team attacking stats (what they produce)
for col in stat_columns:
    df[f"team_attack_{col}"] = df.groupby("team", group_keys=False).apply(
        lambda x: penalized_ema(x, col), include_groups=False
    )

# Calculate team defensive stats (what they concede)
for col in stat_columns:
    df[f"team_defense_{col}"] = df.groupby("team", group_keys=False).apply(
        lambda x: penalized_ema(x, f'opponent_{col}'), include_groups=False
    )

df.reset_index(drop=True, inplace=True)

# Now do the same for opponent teams to get their stats at the time of match
df.sort_values(by=["opponent_team", "match_date"], inplace=True)

# Calculate opponent attacking stats (what they produce)
for col in stat_columns:
    df[f"opponent_attack_{col}"] = df.groupby("opponent_team", group_keys=False).apply(
        lambda x: penalized_ema(x, f'opponent_{col}'), include_groups=False
    )

# Calculate opponent defensive stats (what they concede)
for col in stat_columns:
    df[f"opponent_defense_{col}"] = df.groupby("opponent_team", group_keys=False).apply(
        lambda x: penalized_ema(x, col), include_groups=False
    )

df.reset_index(drop=True, inplace=True)

# For prediction, combine attack and defense
# For team's expected goals: team's attack vs opponent's defense
# For opponent's expected goals: opponent's attack vs team's defense

# Filter data for recent matches
df = df[df["match_date"] >= "2024-01-01"]

# Features for predicting team's xG (combines team attack with opponent defense)
X = df[[
    "home?",  # Home advantage is still important
    "prem?", # Division is important for differences in playstyle
    # Team attack features
    "team_attack_shots", "team_attack_goals", "team_attack_xG", #"team_attack_red",
    # Opponent defense features (what they concede)
    "opponent_defense_shots", "opponent_defense_goals", "opponent_defense_xG", #"opponent_defense_red",
    # Market values if available
    "avg_market_value", "opponent_avg_market_value"
]]

y = df["goals"]  # Target is still the team's xG

# Split by time
match_year = df['match_date'].dt.year
X_train = X[match_year < 2025]
X_test = X[match_year >= 2025]
y_train = y[match_year < 2025]
y_test = y[match_year >= 2025]

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Train an XGBoost model
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=2,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=26,
    eta=0.06
)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


Training set size: 1432 samples
Test set size: 398 samples
RMSE: 1.1764
MAE: 0.8979


In [242]:
def predict_match(model, home_team, away_team, team_stats, market_values):
    """
    Predict the expected goals for a match between two teams using attack vs defense approach.
    
    Parameters:
    model: Trained XGBoost model
    home_team (str): Name of the home team
    away_team (str): Name of the away team
    team_stats (dict): Dictionary containing team statistics
    market_values (dict): Dictionary with team market values {team_name: value}
    
    Returns:
    tuple: (predicted_home_xG, predicted_away_xG)
    """
    # Get team statistics
    home_stats = team_stats.get(home_team, {})
    away_stats = team_stats.get(away_team, {})
    
    # Create feature array for home team (home attack vs away defense)
    home_features = {
        "home?": 1,  # Home team
        "prem?": home_stats.get('prem?', 0),
        # Home team attack features
        "team_attack_shots": home_stats.get('rolling_shots', 0),
        "team_attack_goals": home_stats.get('rolling_goals', 0),
        "team_attack_xG": home_stats.get('rolling_xG', 0),
        #"team_attack_red": home_stats.get('rolling_red', 0),
        # Away team defense features
        "opponent_defense_shots": away_stats.get('rolling_shots_conceded', 0),
        "opponent_defense_goals": away_stats.get('rolling_goals_conceded', 0),
        "opponent_defense_xG": away_stats.get('rolling_xG_conceded', 0),
        #'opponent_defense_red": away_stats.get('rolling_red_conceded', 0),
        # Market values
        "avg_market_value": market_values.get(home_team, 0),
        "opponent_avg_market_value": market_values.get(away_team, 0)
    }
    
    # Create feature array for away team (away attack vs home defense)
    away_features = {
        "home?": 0,  # Away team
        "prem?": away_stats.get('prem?', 0),
        # Away team attack features
        "team_attack_shots": away_stats.get('rolling_shots', 0),
        "team_attack_goals": away_stats.get('rolling_goals', 0),
        "team_attack_xG": away_stats.get('rolling_xG', 0),
        #"team_attack_red": away_stats.get('rolling_red', 0),
        # Home team defense features
        "opponent_defense_shots": home_stats.get('rolling_shots_conceded', 0),
        "opponent_defense_goals": home_stats.get('rolling_goals_conceded', 0),
        "opponent_defense_xG": home_stats.get('rolling_xG_conceded', 0),
        #"opponent_defense_red": home_stats.get('rolling_red_conceded', 0),
        # Market values
        "avg_market_value": market_values.get(away_team, 0),
        "opponent_avg_market_value": market_values.get(home_team, 0)
    }
    
    # Convert to DataFrame for prediction
    home_X = pd.DataFrame([home_features])
    away_X = pd.DataFrame([away_features])
    
    # Ensure columns are in the exact order expected by the model
    expected_features = [
        'home?', "prem?",
        'team_attack_shots', 'team_attack_goals', 'team_attack_xG', #'team_attack_red',
        'opponent_defense_shots', 'opponent_defense_goals', 'opponent_defense_xG', #'opponent_defense_red',
        'avg_market_value', 'opponent_avg_market_value'
    ]
    
    home_X = home_X[expected_features]
    away_X = away_X[expected_features]
    
    # Predict xG for both teams
    home_xG = model.predict(home_X)[0]
    away_xG = model.predict(away_X)[0]
    
    return home_xG, away_xG

# Updated function to extract team statistics from DataFrame
def extract_team_stats(df):
    """Extract the latest statistics for each team with attack and defense metrics."""
    team_stats = {}
    market_values = {}
    
    # Group by team and get the most recent stats
    for team in df['team'].unique():
        team_data = df[df['team'] == team].sort_values('match_date', ascending=False).iloc[0]
        
        # Store both attacking and defensive metrics
        team_stats[team] = {
            'prem?': team_data.get('prem?', 0),

            # Attack metrics (what they produce)
            'rolling_shots': team_data.get('team_attack_shots', 0),
            'rolling_goals': team_data.get('team_attack_goals', 0),
            'rolling_xG': team_data.get('team_attack_xG', 0),
            #'rolling_red': team_data.get('team_attack_red', 0),
            
            # Defense metrics (what they concede)
            'rolling_shots_conceded': team_data.get('team_defense_shots', 0),
            'rolling_goals_conceded': team_data.get('team_defense_goals', 0),
            'rolling_xG_conceded': team_data.get('team_defense_xG', 0),
            #'rolling_red_conceded': team_data.get('team_defense_red', 0)
        }
        
        market_values[team] = team_data.get('avg_market_value', 0)
    
    return team_stats, market_values


# Example usage
team_stats, market_values = extract_team_stats(df)
home_team = "Newcastle"
away_team = "West Ham"
home_xG, away_xG = predict_match(model, home_team, away_team, team_stats, market_values)
print(f"Predicted xG: {home_team} {home_xG:.2f} - {away_xG:.2f} {away_team}")

df[df["team"] == home_team].sort_values("match_date", ascending=False).iloc[0]

Predicted xG: Newcastle 1.46 - 1.19 West Ham


division                          Premier League
season                                  20242025
match_date                   2025-02-26 00:00:00
team                                   Newcastle
avg_market_value                        18847143
goals                                          0
shots                                        3.0
xG                                       0.45882
deep                                         3.0
ppda                                        19.5
red                                            0
opponent_team                          Liverpool
opponent_avg_market_value               34307407
opponent_goals                                 2
opponent_shots                              12.0
opponent_xG                              1.40027
opponent_deep                                8.0
opponent_ppda                           6.777778
opponent_red                                   0
bet365_home_odds                             1.5
bet365_draw_odds    