In [21]:
import numpy as np
import pandas as pd
import os
import requests
import io
from datetime import datetime, timedelta
import multiprocessing
import arviz as az
import logging
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb

In [3]:
# get data
API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}

# Function to fetch data for a specific division and season
def fetch_data(division, season):
    params = {
        'division': division,
        'season': season
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return pd.read_json(io.StringIO(response.content.decode('utf-8')))
    else:
        print(f"Error fetching {division} {season}: {response.status_code}")
        print(response.content.decode('utf-8'))
        return pd.DataFrame()

# Fetch data for all combinations
seasons = ['2024_2025', '2023_2024']
divisions = ['Premier League', 'Championship']
dataframes = []

for division in divisions:
    for season in seasons:
        df = fetch_data(division, season)
        if not df.empty:
            dataframes.append(df)

# Combine all dataframes
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
    
    # Convert match_date to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])

df

Unnamed: 0,division_id,division,season_id,season,match_date,match_teams,home_team_id,home_team,away_team_id,away_team,...,away_xgoals,home_deep,away_deep,home_ppda,away_ppda,bet365_home_odds,bet365_draw_odds,bet365_away_odds,bet365_u25_odds,bet365_o25_odds
0,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-16,Man United - Fulham,f2b82cdbdadf9d3ec47c3a6be66dcfad,Man United,8cd5e94668b139c1f42a89a1e130f3cf,Fulham,...,0.418711,7.0,3.0,7.379310,10.833333,1.60,4.20,5.25,2.50,1.53
1,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Everton - Brighton,6414a61d98ab23b6d757e888ab17a66a,Everton,0d84883ca72c88cb53c8a38262efdcbc,Brighton,...,1.790830,4.0,5.0,18.333333,7.916667,2.63,3.30,2.63,2.00,1.80
2,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Ipswich - Liverpool,e4f63bf6d6d2cd121e6c8e59bef68209,Ipswich,afce84ff226407a47c9782a742ba02f7,Liverpool,...,3.929060,2.0,13.0,18.777778,8.739130,8.50,5.50,1.33,3.00,1.40
3,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Nott'm Forest - Bournemouth,9a8e1e9fad8766fc3d69a0c26d98b928,Nott'm Forest,b436d55f36cfbe8a085c8b75fb7fe98a,Bournemouth,...,1.909150,10.0,4.0,8.653846,9.954545,2.45,3.50,2.80,2.10,1.73
4,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Newcastle - Southampton,78e9266876e7649e0a12e3840f5be006,Newcastle,5a884401673693b0bdf379fefb7ec2b2,Southampton,...,1.954830,4.0,13.0,16.250000,3.789474,1.36,5.25,8.00,3.00,1.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Sunderland - Sheffield Weds,1820e986a8a213df782c5cf5ad0b65e1,Sunderland,ec5c8ee83966ae3a295abeee6ab46871,Sheffield Weds,...,1.300000,,,,,2.70,3.40,2.55,1.80,2.00
1653,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Rotherham - Cardiff,095ea4179483dfdd7fd4ee61a25954f4,Rotherham,288a6430a6f57a76e823d74e0dd750d2,Cardiff,...,1.300000,,,,,3.40,3.40,2.20,1.73,2.10
1654,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Leicester - Blackburn,a06360acafe964d244e424cbd50862e8,Leicester,8178de69a5e17f7bbe3a74331977feb1,Blackburn,...,1.700000,,,,,1.70,3.75,5.25,2.50,1.53
1655,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Plymouth Argyle - Hull,a0111c6b8f3812f2e36ababf2c032d8c,Plymouth Argyle,63b0fc998685327831f96bbe7b07b7e2,Hull,...,0.500000,,,,,3.50,3.75,2.00,2.30,1.62


In [4]:
# Separates out the home vs away and allows every team to have an individual row of data. Allows us to feature engineer home effect easier.
home_df = df[["division", "season", "match_date",
             "home_team", "home_avg_market_value", "home_goals", "home_shots", "home_xgoals",  "home_deep", "home_ppda", "home_red",
             "away_team", "away_avg_market_value", "away_goals", "away_shots", "away_xgoals", "away_deep", "away_ppda", "away_red",
              "bet365_home_odds", "bet365_draw_odds", "bet365_away_odds"]].copy()
home_df["home?"] = 1
home_df = home_df.rename(columns={"home_team":"team", "home_avg_market_value":"avg_market_value", "home_goals":"goals",
             "home_shots":"shots", "home_xgoals":"xG",  "home_deep":"deep", "home_ppda":"ppda", "home_red": "red",
             "away_team":"opponent_team", "away_avg_market_value":"opponent_avg_market_value", "away_goals":"opponent_goals", 
             "away_shots": "opponent_shots", "away_xgoals": "opponent_xG", "away_deep":"opponent_deep", "away_ppda":"opponent_ppda", "away_red": "opponent_red"})

away_df = df[["division", "season", "match_date",
                "away_team", "away_avg_market_value", "away_goals", "away_shots", "away_xgoals",  "away_deep", "away_ppda", "away_red",
                "home_team", "home_avg_market_value", "home_goals", "home_shots", "home_xgoals", "home_deep", "home_ppda", "home_red",
                 "bet365_home_odds", "bet365_draw_odds", "bet365_away_odds"]].copy()
away_df["home?"] = 0
away_df = away_df.rename(columns={"away_team":"team", "away_avg_market_value":"avg_market_value", "away_goals":"goals",
             "away_shots":"shots", "away_xgoals":"xG", "away_deep":"deep", "away_ppda":"ppda", "away_red": "red",
             "home_team":"opponent_team", "home_avg_market_value":"opponent_avg_market_value", "home_goals":"opponent_goals", 
             "home_shots": "opponent_shots", "home_xgoals": "opponent_xG", "home_deep":"opponent_deep", "home_ppda":"opponent_ppda", "home_red": "opponent_red"})

df = pd.concat([away_df, home_df])
df["prem?"] = df["division"].apply(lambda x: 1 if x == "Premier League" else 0)
df.sort_values(["match_date", "division"], inplace=True)

df

Unnamed: 0,division,season,match_date,team,avg_market_value,goals,shots,xG,deep,ppda,...,opponent_shots,opponent_xG,opponent_deep,opponent_ppda,opponent_red,bet365_home_odds,bet365_draw_odds,bet365_away_odds,home?,prem?
1105,Championship,20232024,2023-08-04,Southampton,13127586,2,23.0,1.4,,,...,8.0,0.5,,,0,3.10,3.40,2.30,0,0
1105,Championship,20232024,2023-08-04,Sheffield Weds,602381,1,8.0,0.5,,,...,23.0,1.4,,,0,3.10,3.40,2.30,1,0
1106,Championship,20232024,2023-08-05,Preston,1371667,1,11.0,1.3,,,...,5.0,0.9,,,0,2.05,3.50,3.60,0,0
1107,Championship,20232024,2023-08-05,QPR,1563636,0,4.0,0.4,,,...,23.0,2.9,,,0,1.85,3.60,4.20,0,0
1108,Championship,20232024,2023-08-05,Millwall,1615909,1,12.0,1.2,,,...,16.0,0.8,,,0,1.91,3.30,4.33,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1100,Championship,20242025,2025-03-12,FC Portsmouth,515357,1,16.0,1.4,,,...,6.0,0.3,,,1,1.67,3.70,5.00,1,0
1101,Championship,20242025,2025-03-12,Leeds,8399000,2,18.0,0.8,,,...,4.0,0.2,,,0,1.29,5.25,10.00,1,0
1102,Championship,20242025,2025-03-12,Watford,1941071,1,9.0,0.6,,,...,12.0,0.7,,,0,2.30,3.25,3.10,1,0
1103,Championship,20242025,2025-03-12,Stoke,1480357,1,8.0,1.4,,,...,10.0,0.5,,,0,2.60,3.30,2.70,1,0


In [15]:
def apply_weighted_avg(col, match_date, match_red, decay_rate=0.001, time_window=365):
    # Create a mask for non-NaN values
    valid_mask = ~pd.isna(col)
    
    # If all values are NaN, return NaN
    if not valid_mask.any():
        return np.nan
    
    # Filter out NaN values
    valid_col = col[valid_mask]
    valid_dates = match_date[valid_mask]
    valid_red = match_red[valid_mask]
    
    # Get most recent date
    recent_date = max(valid_dates)
    
    # Create a time window mask (only include matches within time_window days)
    time_window_mask = (recent_date - valid_dates).dt.days <= time_window
    
    # If no matches in the time window, return NaN
    if not time_window_mask.any():
        return np.nan
    
    # Apply time window filter
    valid_col = valid_col[time_window_mask]
    valid_dates = valid_dates[time_window_mask]
    valid_red = valid_red[time_window_mask]
    
    # Calculate weights for matches within the time window
    match_weight = np.exp(-(recent_date - valid_dates).dt.days * decay_rate)
    
    # Reduce weight for matches with red cards
    match_weight = np.where(valid_red == 1, match_weight * 0.5, match_weight)

    # Calculate weighted average
    weighted_avg = sum(match_weight * valid_col) / sum(match_weight) 

    return weighted_avg

In [None]:
# Assuming df is already sorted by match_date
df = df.sort_values(['team', 'match_date'])

# Function to calculate metrics for a specific team up to a specific date
def calc_team_metrics_up_to_date(team, current_date, decay_rate=0.001, time_window=365):
    # Get all matches for the team up to but not including the current date
    team_matches = df[(df['team'] == team) & (df['match_date'] < current_date)]
    
    # If no prior matches, return NaN for all metrics
    if len(team_matches) == 0:
        return pd.Series({
            'rolling_goals_for': np.nan,
            'rolling_goals_against': np.nan,
            'rolling_xg_for': np.nan,
            'rolling_xg_against': np.nan,
            'rolling_shots_for': np.nan,
            'rolling_shots_against': np.nan
        })
    
    # Calculate metrics
    metrics = {
        'rolling_goals_for': apply_weighted_avg(
            team_matches['goals'], team_matches['match_date'], team_matches['red'], 
            decay_rate, time_window
        ),
        'rolling_goals_against': apply_weighted_avg(
            team_matches['opponent_goals'], team_matches['match_date'], team_matches['red'], 
            decay_rate, time_window
        ),
        'rolling_xg_for': apply_weighted_avg(
            team_matches['xG'], team_matches['match_date'], team_matches['red'], 
            decay_rate, time_window
        ),
        'rolling_xg_against': apply_weighted_avg(
            team_matches['opponent_xG'], team_matches['match_date'], team_matches['red'], 
            decay_rate, time_window
        ),
        'rolling_shots_for': apply_weighted_avg(
            team_matches['shots'], team_matches['match_date'], team_matches['red'], 
            decay_rate, time_window
        ),
        'rolling_shots_against': apply_weighted_avg(
            team_matches['opponent_shots'], team_matches['match_date'], team_matches['red'], 
            decay_rate, time_window
        )
    }
    
    return pd.Series(metrics)

# Apply the function to each row for team stats
team_results = []
opponent_results = []

for idx, row in df.iterrows():
    # Calculate team metrics
    team_metrics = calc_team_metrics_up_to_date(row['team'], row['match_date'])
    
    # Calculate opponent metrics
    opponent_metrics = calc_team_metrics_up_to_date(row['opponent_team'], row['match_date'])
    
    # Rename opponent metrics with a clearer naming convention
    opponent_metrics = pd.Series({
        'opponent_rolling_goals_for': opponent_metrics['rolling_goals_for'],
        'opponent_rolling_goals_against': opponent_metrics['rolling_goals_against'],
        'opponent_rolling_xg_for': opponent_metrics['rolling_xg_for'],
        'opponent_rolling_xg_against': opponent_metrics['rolling_xg_against'],
        'opponent_rolling_shots_for': opponent_metrics['rolling_shots_for'],
        'opponent_rolling_shots_against': opponent_metrics['rolling_shots_against']
    })
    
    team_results.append(team_metrics)
    opponent_results.append(opponent_metrics)

# Create DataFrames from the results
team_metrics_df = pd.DataFrame(team_results, index=df.index)
opponent_metrics_df = pd.DataFrame(opponent_results, index=df.index)

# Check for duplicated columns before concatenation
print("Team metrics columns:", team_metrics_df.columns.tolist())
print("Opponent metrics columns:", opponent_metrics_df.columns.tolist())

# Combine everything into the final DataFrame
final_df = pd.concat([df, team_metrics_df, opponent_metrics_df], axis=1)

Team metrics columns: ['rolling_goals_for', 'rolling_goals_against', 'rolling_xg_for', 'rolling_xg_against', 'rolling_shots_for', 'rolling_shots_against']
Opponent metrics columns: ['opponent_rolling_goals_for', 'opponent_rolling_goals_against', 'opponent_rolling_xg_for', 'opponent_rolling_xg_against', 'opponent_rolling_shots_for', 'opponent_rolling_shots_against']
Final columns: ['division', 'season', 'match_date', 'team', 'avg_market_value', 'goals', 'shots', 'xG', 'deep', 'ppda', 'red', 'opponent_team', 'opponent_avg_market_value', 'opponent_goals', 'opponent_shots', 'opponent_xG', 'opponent_deep', 'opponent_ppda', 'opponent_red', 'bet365_home_odds', 'bet365_draw_odds', 'bet365_away_odds', 'home?', 'prem?', 'rolling_goals_for', 'rolling_goals_against', 'rolling_xg_for', 'rolling_xg_against', 'rolling_shots_for', 'rolling_shots_against', 'rolling_goals_for', 'rolling_goals_against', 'rolling_xg_for', 'rolling_xg_against', 'rolling_shots_for', 'rolling_shots_against', 'opponent_rollin

In [None]:
dupe_cols = final_df.columns[final_df.columns.duplicated()]
print(f"Found {len(dupe_cols)} duplicated columns: {dupe_cols.tolist()}")

# Keep only the first occurrence of each column name
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

Found 6 duplicated columns: ['rolling_goals_for', 'rolling_goals_against', 'rolling_xg_for', 'rolling_xg_against', 'rolling_shots_for', 'rolling_shots_against']


In [46]:
final_df

Unnamed: 0,division,season,match_date,team,avg_market_value,goals,shots,xG,deep,ppda,...,rolling_xg_for,rolling_xg_against,rolling_shots_for,rolling_shots_against,opponent_rolling_goals_for,opponent_rolling_goals_against,opponent_rolling_xg_for,opponent_rolling_xg_against,opponent_rolling_shots_for,opponent_rolling_shots_against
5,Premier League,20242025,2024-08-17,Arsenal,50869565,2,18.0,1.6283,14.0,7.769231,...,2.231364,0.83386,17.465263,8.411686,1.334983,1.709133,1.31473,1.948875,11.366872,14.829223
13,Premier League,20242025,2024-08-24,Arsenal,50869565,2,9.0,1.41399,10.0,5.833333,...,2.243243,0.822149,17.539658,8.486633,2.03241,1.505483,1.801775,1.712855,13.418436,12.200154
21,Premier League,20242025,2024-08-31,Arsenal,50869565,1,11.0,2.41798,6.0,9.25,...,2.216477,0.835562,17.299782,8.506546,1.334251,1.595141,1.543126,1.541787,14.222505,12.199876
38,Premier League,20242025,2024-09-15,Arsenal,50869565,1,7.0,1.1209,3.0,13.588235,...,2.196566,0.851548,17.156138,8.742333,1.940669,1.670741,1.917198,1.75278,15.216359,12.163516
48,Premier League,20242025,2024-09-22,Arsenal,50869565,2,5.0,1.28411,1.0,13.071429,...,2.167226,0.846961,16.82075,8.922419,2.566976,0.907457,2.382639,0.999531,18.361947,7.793665
51,Premier League,20242025,2024-09-28,Arsenal,50869565,4,36.0,6.05305,21.0,4.633333,...,2.184141,0.892405,16.71001,9.351164,1.845722,1.005418,1.759478,1.059139,13.480346,10.879117
66,Premier League,20242025,2024-10-05,Arsenal,50869565,3,29.0,3.25165,14.0,8.103448,...,2.310732,0.843047,17.445884,9.119769,1.729018,1.298358,1.74089,1.123271,14.640245,11.239142
76,Premier League,20242025,2024-10-19,Arsenal,50869565,0,6.0,0.740758,4.0,8.9,...,2.301471,0.862079,17.890258,9.108122,1.496242,1.643579,1.82003,1.563105,15.018023,12.907342
87,Premier League,20242025,2024-10-27,Arsenal,50869565,2,9.0,1.0883,6.0,8.071429,...,2.319729,0.884813,17.825019,9.294763,2.182565,0.885741,2.525454,1.088166,20.382693,9.899571
92,Premier League,20242025,2024-11-02,Arsenal,50869565,0,10.0,0.911305,9.0,5.24,...,2.307379,0.881193,17.637195,9.244543,1.799588,1.631757,1.968873,1.825126,14.130136,15.000117


In [50]:
final_df = final_df[final_df["match_date"] > '2024-08-01']

X = final_df[["division", "home?", "team", "opponent_team", "rolling_goals_for", "rolling_xg_for", "rolling_shots_for",
             "opponent_rolling_goals_against", "opponent_rolling_xg_against", "opponent_rolling_shots_against"]]

# Convert categorical columns to category type
cat_cols = ["division", "team", "opponent_team"]
for col in cat_cols:
    X[col] = X[col].astype('category')


y = final_df["goals"] 

feb_2025_cutoff = pd.to_datetime('2025-02-01')
X_train = X[final_df['match_date'] < feb_2025_cutoff]
y_train = y[final_df['match_date'] < feb_2025_cutoff]
X_test = X[final_df['match_date'] >= feb_2025_cutoff]
y_test = y[final_df['match_date'] >= feb_2025_cutoff]

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Train an XGBoost model
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=2,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=26,
    enable_categorical=True  # Enable categorical feature support
)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

X_test_has_nan = X_test.isna().any(axis=1)
y_test_has_nan = y_test.isna()

print(f"X_test rows with NaN: {X_test_has_nan.sum()}")
print(f"y_test rows with NaN: {y_test_has_nan.sum()}")

Training set size: 1156 samples
Test set size: 294 samples
RMSE: 1.1346
MAE: 0.8539
X_test rows with NaN: 0
y_test rows with NaN: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')


In [62]:
# Reset index before splitting to avoid duplicate indices
X_with_reset_index = X.reset_index(drop=True)
y_with_reset_index = y.reset_index(drop=True)
final_df_reset = final_df.reset_index(drop=True)

# Now split using the reset data
X_train = X_with_reset_index[final_df_reset['match_date'] < feb_2025_cutoff]
y_train = y_with_reset_index[final_df_reset['match_date'] < feb_2025_cutoff]
X_test = X_with_reset_index[final_df_reset['match_date'] >= feb_2025_cutoff]
y_test = y_with_reset_index[final_df_reset['match_date'] >= feb_2025_cutoff]

# Train and predict as before
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Now creating the results DataFrame is simple
test_results = pd.DataFrame({
    'actual_goals': y_test,
    'predicted_goals': y_pred,
    'error': y_test - y_pred,
    'abs_error': abs(y_test - y_pred)
})

# Add match information - this will align correctly since indices are reset
test_results['match_date'] = final_df_reset.loc[y_test.index, 'match_date']
test_results['team'] = final_df_reset.loc[y_test.index, 'team']
test_results['opponent_team'] = final_df_reset.loc[y_test.index, 'opponent_team']
test_results['home'] = final_df_reset.loc[y_test.index, 'home?']
test_results['division'] = final_df_reset.loc[y_test.index, 'division']

# Sort by date
test_results = test_results.sort_values('match_date')

test_results[test_results["division"] == 'Premier League']

Unnamed: 0,actual_goals,predicted_goals,error,abs_error,match_date,team,opponent_team,home,division
173,0,1.502293,-1.502293,1.502293,2025-02-01,Brighton,Nott'm Forest,0,Premier League
861,1,1.3594,-0.3594,0.3594,2025-02-01,Newcastle,Fulham,1,Premier League
664,2,2.100195,-0.100195,0.100195,2025-02-01,Liverpool,Bournemouth,0,Premier League
117,0,1.404286,-1.404286,1.404286,2025-02-01,Bournemouth,Liverpool,1,Premier League
637,0,0.915196,-0.915196,0.915196,2025-02-01,Leicester,Everton,0,Premier League
507,2,1.220185,0.779815,0.779815,2025-02-01,Fulham,Newcastle,0,Premier League
51,0,1.880747,-1.880747,1.880747,2025-02-01,Aston Villa,Wolves,0,Premier League
572,1,1.602085,-0.602085,0.602085,2025-02-01,Ipswich,Southampton,1,Premier League
926,7,1.510604,5.489396,5.489396,2025-02-01,Nott'm Forest,Brighton,1,Premier League
1445,2,1.79259,0.20741,0.20741,2025-02-01,Wolves,Aston Villa,1,Premier League
