In [6]:
import numpy as np
import pandas as pd
import os
import requests
import io
from datetime import datetime, timedelta
import multiprocessing
import arviz as az
import logging
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
from scipy.stats import poisson
import sqlite3
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, brier_score_loss


In [11]:
conn = sqlite3.connect(r"C:\Users\Owner\dev\team-eval\match_db.db")
df = pd.read_sql_query("""SELECT * FROM btb_matches""", conn)

final_df = pd.read_sql_query("""SELECT * FROM xgb_match_features_0065_001 WHERE division = 'Premier League'""", conn)

final_df = final_df.dropna()

final_df['match_outcome'] = np.where(final_df['goals'] > final_df['opponent_goals'], 0,  # Home win
                                     np.where(final_df['goals'] == final_df['opponent_goals'], 1, 2))  # Draw or Away win

final_df['team_prob'] = 1 / final_df['odds']
final_df['draw_prob'] = 1 / final_df['bet365_draw_odds']
final_df['opponent_prob'] = 1 / final_df['opponent_odds']
prob_sum = final_df[["team_prob", "draw_prob", "opponent_prob"]].sum(axis=1)
final_df["team_prob_norm"] = final_df["team_prob"] / prob_sum
final_df["draw_prob_norm"] = final_df["draw_prob"] / prob_sum
final_df["opponent_prob_norm"] = final_df["opponent_prob"] / prob_sum

final_df["match_date"] = pd.to_datetime(final_df['match_date'])

final_df

Unnamed: 0,division,season,match_date,match_id,team,avg_market_value,goals,shots,shots_on_target,corners,...,opponent_rolling_ppda_against_30d,opponent_rolling_odds_for_30d,opponent_rolling_odds_against_30d,match_outcome,team_prob,draw_prob,opponent_prob,team_prob_norm,draw_prob_norm,opponent_prob_norm
39,Premier League,20212022,2021-08-22,Arsenal - Chelsea_20210822,Arsenal,19888889,0,6.0,3,9,...,71.000000,1.250000,13.000000,2,0.210526,0.266667,0.581395,0.198875,0.251908,0.549218
40,Premier League,20212022,2021-08-28,Man City - Arsenal_20210828,Arsenal,19888889,0,1.0,0,0,...,22.450435,1.339220,15.780750,2,0.083333,0.166667,0.800000,0.079365,0.158730,0.761905
41,Premier League,20212022,2021-09-11,Arsenal - Norwich_20210911,Arsenal,21096154,1,30.0,7,8,...,9.316791,13.156062,1.411286,0,0.666667,0.217391,0.166667,0.634483,0.206897,0.158621
42,Premier League,20212022,2021-09-18,Burnley - Arsenal_20210918,Arsenal,21096154,1,13.0,3,3,...,8.768627,6.043155,1.893619,0,0.512821,0.277778,0.263158,0.486660,0.263607,0.249733
43,Premier League,20212022,2021-09-26,Arsenal - Tottenham_20210926,Arsenal,21096154,3,12.0,7,4,...,11.425531,2.826007,4.577895,0,0.465116,0.303030,0.285714,0.441345,0.287543,0.271112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3613,Premier League,20242025,2025-02-16,Liverpool - Wolves_20250216,Wolves,10972727,1,16.0,4,3,...,14.780322,1.446936,7.793515,2,0.083333,0.125000,0.869565,0.077311,0.115966,0.806723
3614,Premier League,20242025,2025-02-22,Bournemouth - Wolves_20250222,Wolves,10972727,1,13.0,5,7,...,8.240788,3.252346,3.187490,0,0.181818,0.222222,0.653595,0.171910,0.210112,0.617978
3615,Premier League,20242025,2025-02-25,Wolves - Fulham_20250225,Wolves,10972727,1,18.0,5,7,...,18.219634,2.819511,2.954952,2,0.380228,0.303030,0.380228,0.357530,0.284940,0.357530
3616,Premier League,20242025,2025-03-08,Wolves - Everton_20250308,Wolves,10972727,1,11.0,3,5,...,10.016210,3.491567,2.698088,1,0.384615,0.322581,0.347222,0.364765,0.305932,0.329302


In [20]:
y = final_df['match_outcome']
feature_cols = ["home?", "avg_market_value", "opponent_avg_market_value", #"rolling_finishing_ability", "rolling_opponent_finishing_ability", 
             "rolling_goals_for", "rolling_xg_for", "rolling_shots_for", "rolling_shots_on_target_for", "rolling_corners_for", "rolling_deep_for", "rolling_ppda_for", 'rolling_goals_for_30d', 'rolling_goals_against_30d', 'rolling_xg_for_30d', 'rolling_xg_against_30d', 'rolling_shots_for_30d', 'rolling_shots_against_30d', 'rolling_shots_on_target_for_30d', 'rolling_shots_on_target_against_30d', 'rolling_corners_for_30d', 'rolling_corners_against_30d', 'rolling_deep_for_30d', 'rolling_deep_against_30d', 'rolling_ppda_for_30d', 'rolling_ppda_against_30d', 
             "opponent_rolling_goals_against", "opponent_rolling_xg_against", "opponent_rolling_shots_against", "opponent_rolling_shots_on_target_against", "opponent_rolling_corners_against", "opponent_rolling_deep_against", "opponent_rolling_ppda_against", 'opponent_rolling_goals_for_30d', 'opponent_rolling_goals_against_30d', 'opponent_rolling_xg_for_30d', 'opponent_rolling_xg_against_30d', 'opponent_rolling_shots_for_30d', 'opponent_rolling_shots_against_30d', 'opponent_rolling_shots_on_target_for_30d', 'opponent_rolling_shots_on_target_against_30d', 'opponent_rolling_corners_for_30d', 'opponent_rolling_corners_against_30d', 'opponent_rolling_deep_for_30d', 'opponent_rolling_deep_against_30d', 'opponent_rolling_ppda_for_30d', 'opponent_rolling_ppda_against_30d']
X = final_df[feature_cols]

test_cutoff = pd.to_datetime('2024-08-01')
y_train = y[final_df['match_date'] < test_cutoff]
y_test = y[final_df['match_date'] >= test_cutoff]
X_train = X[final_df['match_date'] < test_cutoff]
X_test = X[final_df['match_date'] >= test_cutoff]


model_params = {
    'n_estimators': 84,
    'learning_rate':  0.039,
    'max_depth': 3,
    'subsample': 0.761,
    'colsample_bytree': 0.861,
    'min_child_weight': 3,
    'gamma': 0.166,
    'alpha': 1.845,
    '_lambda': 0.494,
    'random_state': 26,
    'enable_categorical': True
}

# Use multi:softprob objective for classification with probabilities
base_model = xgb.XGBClassifier(objective='multi:softprob', num_class=3, **model_params)
calibrated_model = CalibratedClassifierCV(base_model, method='sigmoid', cv=5)
calibrated_model.fit(X_train, y_train)

y_probs = calibrated_model.predict_proba(X_test)
ll_score = log_loss(y_test, y_probs)
print(f"Log Loss: {ll_score:.4f}")

# Extract test set market probabilities
market_probs_test = final_df.loc[final_df['match_date'] >= test_cutoff, 
                               ['team_prob_norm', 'draw_prob_norm', 'opponent_prob_norm']].values

# Calculate market log loss
market_ll = log_loss(y_test, market_probs_test)
print(f"Market Log Loss: {market_ll:.4f}")


Parameters: { "_lambda" } are not used.

Parameters: { "_lambda" } are not used.

Parameters: { "_lambda" } are not used.

Parameters: { "_lambda" } are not used.

Parameters: { "_lambda" } are not used.



Log Loss: 1.0074
Market Log Loss: 0.9774


In [19]:
import optuna

def objective(trial):
    # Define the hyperparameters to optimize
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'gamma': trial.suggest_float('gamma', 0, 0.3),
        'alpha': trial.suggest_float('alpha', 0, 2),
        'lambda': trial.suggest_float('lambda', 0, 2),
    }
    
    # Create and train model
    model = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=3,
        enable_categorical=True,
        random_state=26,
        **params
    )
    
    # Use cross-validation to evaluate
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(model, X_train, y_train, 
                            cv=5, scoring='neg_log_loss')
    
    # Return the mean log loss (negated because Optuna minimizes)
    return -scores.mean()

# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Print the best parameters
print(f"Best parameters: {study.best_params}")
print(f"Best log loss: {-study.best_value:.4f}")

# Train the final model with best parameters
best_params = study.best_params
final_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    enable_categorical=True,
    random_state=26,
    **best_params
)

# Create calibrated model with best parameters
final_calibrated_model = CalibratedClassifierCV(final_model, method='isotonic', cv=5)
final_calibrated_model.fit(X_train, y_train)

# Evaluate on test set
y_probs = final_calibrated_model.predict_proba(X_test)
ll_score = log_loss(y_test, y_probs)
print(f"Test Log Loss with optimized model: {ll_score:.4f}")

[I 2025-04-05 06:08:03,460] A new study created in memory with name: no-name-79ce003a-827d-4ec8-bf8f-0d87d8faf01c
[I 2025-04-05 06:08:10,766] Trial 0 finished with value: 1.0837471952944366 and parameters: {'n_estimators': 246, 'learning_rate': 0.04088708464607427, 'max_depth': 7, 'subsample': 0.6782379456793828, 'colsample_bytree': 0.7689304237738486, 'min_child_weight': 4, 'gamma': 0.25656434505429093, 'alpha': 0.6097044417876625, 'lambda': 0.06834311303192053}. Best is trial 0 with value: 1.0837471952944366.
[I 2025-04-05 06:08:13,163] Trial 1 finished with value: 0.9830622122768269 and parameters: {'n_estimators': 178, 'learning_rate': 0.01724252450513234, 'max_depth': 4, 'subsample': 0.6781714902546335, 'colsample_bytree': 0.8596137926138414, 'min_child_weight': 3, 'gamma': 0.07923085970694439, 'alpha': 1.857791083077166, 'lambda': 1.5101921802188183}. Best is trial 1 with value: 0.9830622122768269.
[I 2025-04-05 06:08:14,038] Trial 2 finished with value: 1.0039460824181472 and pa

Best parameters: {'n_estimators': 84, 'learning_rate': 0.039479646681735425, 'max_depth': 3, 'subsample': 0.7614970001136591, 'colsample_bytree': 0.8610964918370265, 'min_child_weight': 3, 'gamma': 0.1660195348295222, 'alpha': 1.8452767647035433, 'lambda': 0.4944176053188355}
Best log loss: -0.9777
Test Log Loss with optimized model: 1.0082
