In [5]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import optuna
import os
import json 


conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

df = pd.read_sql_query("""
                        SELECT DISTINCT
                            f.*,
                            ms.summary_goals as goals
                        FROM 
                            team_all_features_365_005 f
                        JOIN
                            fbref_match_all_columns ms
                                ON ms.match_url = f.match_url
                                AND ms.team = f.team
                        WHERE 
                            f.team_rolling_summary_minutes IS NOT NULL
                            AND f.division = 'Premier League'
                       --AND NOT (f.`is_promoted?` = 1 AND f.`is_early_season?` = 1)
                       """, conn)

df

Unnamed: 0,match_url,match_date,season,division,team,opp_team,is_home,gw,is_promoted?,is_early_season?,...,opp_team_rolling_conceded_defense_challenges,opp_team_rolling_conceded_defense_challenge_tackles_pct,opp_team_rolling_conceded_defense_challenges_lost,opp_team_rolling_conceded_defense_blocked_shots,opp_team_rolling_conceded_defense_blocked_passes,opp_team_rolling_conceded_defense_tackles_interceptions,opp_team_rolling_conceded_defense_clearances,opp_team_rolling_conceded_defense_errors,opp_team_rolling_conceded_keeper_psxg,goals
0,https://fbref.com/en/matches/e4bb1c35/Tottenha...,2025-05-25 00:00:00,2024-2025,Premier League,Tottenham,Brighton,1,38,0,0,...,18.309716,50.720617,8.796946,3.393454,7.020603,26.403010,27.177737,0.862971,1.299603,1.0
1,https://fbref.com/en/matches/e4bb1c35/Tottenha...,2025-05-25 00:00:00,2024-2025,Premier League,Brighton,Tottenham,0,38,0,0,...,19.336254,57.889854,7.949406,3.378971,9.378588,32.038948,23.558919,0.911136,1.805191,4.0
2,https://fbref.com/en/matches/1ff370e8/Bournemo...,2025-05-25 00:00:00,2024-2025,Premier League,Bournemouth,Leicester City,1,38,0,0,...,14.039872,53.893970,6.606014,2.506894,6.904780,22.901258,21.697324,0.636969,1.811285,2.0
3,https://fbref.com/en/matches/1ff370e8/Bournemo...,2025-05-25 00:00:00,2024-2025,Premier League,Leicester City,Bournemouth,0,38,1,0,...,14.891199,49.279905,7.520887,4.153870,6.746572,23.732322,26.770789,1.451179,1.460762,0.0
4,https://fbref.com/en/matches/36844e73/Newcastl...,2025-05-25 00:00:00,2024-2025,Premier League,Newcastle Utd,Everton,1,38,0,0,...,14.492786,53.064874,6.895930,2.600829,6.075265,24.063503,26.142993,0.567784,1.251694,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5956,https://fbref.com/en/matches/21b58926/Liverpoo...,2020-09-12 00:00:00,2020-2021,Premier League,Leeds United,Liverpool,0,1,1,1,...,17.351282,39.725765,10.328149,4.258163,7.257168,27.029413,29.952466,0.750397,0.852680,3.0
5957,https://fbref.com/en/matches/db261cb0/Crystal-...,2020-09-12 00:00:00,2020-2021,Premier League,Crystal Palace,Southampton,1,1,0,1,...,17.968617,37.087656,11.181068,3.952485,7.942267,26.825346,21.945053,0.489783,1.307413,1.0
5958,https://fbref.com/en/matches/db261cb0/Crystal-...,2020-09-12 00:00:00,2020-2021,Premier League,Southampton,Crystal Palace,0,1,0,1,...,17.930510,38.966705,10.953040,2.684821,8.033599,32.571362,18.209727,0.459216,1.642488,0.0
5959,https://fbref.com/en/matches/bf52349b/Fulham-A...,2020-09-12 00:00:00,2020-2021,Premier League,Fulham,Arsenal,1,1,1,1,...,17.279478,40.691854,10.211788,2.561171,6.396202,26.695362,18.161359,0.624812,1.269339,0.0


In [6]:
# Check for duplicate columns (identical values)
def find_duplicate_columns(df):
    duplicate_pairs = []
    cols = df.columns.tolist()
    
    for i in range(len(cols)):
        for j in range(i+1, len(cols)):
            if df[cols[i]].equals(df[cols[j]]):
                duplicate_pairs.append((cols[i], cols[j]))
    
    return duplicate_pairs

# Find duplicates in your FEATURE MATRIX (not the full df)
duplicates = find_duplicate_columns(df)
print("Duplicate column pairs:")
for pair in duplicates:
    print(f"  {pair[0]} == {pair[1]}")

# More sophisticated duplicate removal logic
columns_to_drop = []
for pair in duplicates:
    col1, col2 = pair
    
    # Priority rules for which column to keep:
    # 1. Keep shorter names
    # 2. Prefer "passing" over "pass_types" 
    # 3. Prefer "summary" over other categories
    # 4. Prefer team stats over opp_team stats (less processing)
    
    keep_col1 = True
    
    # Rule 1: Shorter name
    if len(col1) > len(col2):
        keep_col1 = False
    elif len(col1) < len(col2):
        keep_col1 = True
    else:
        # Same length, apply other rules
        # Rule 2: Prefer passing over pass_types
        if "pass_types" in col1 and "passing" in col2:
            keep_col1 = False
        elif "passing" in col1 and "pass_types" in col2:
            keep_col1 = True
        # Rule 3: Prefer summary stats
        elif "summary" in col1 and "summary" not in col2:
            keep_col1 = True
        elif "summary" in col2 and "summary" not in col1:
            keep_col1 = False
        # Rule 4: Prefer team over opp_team (less nested)
        elif col1.startswith("team_") and col2.startswith("opp_team_"):
            keep_col1 = True
        elif col2.startswith("team_") and col1.startswith("opp_team_"):
            keep_col1 = False
    
    # Add the column to drop
    if keep_col1:
        columns_to_drop.append(col2)
        print(f"  Keeping: {col1}")
        print(f"  Dropping: {col2}")
    else:
        columns_to_drop.append(col1)
        print(f"  Keeping: {col2}")
        print(f"  Dropping: {col1}")
    print()

print(f"Columns to drop: {len(columns_to_drop)}")
print(columns_to_drop)

# Create cleaned feature matrix
df = df.drop(columns=columns_to_drop)

Duplicate column pairs:
  team_rolling_summary_minutes == team_rolling_pass_types_minutes
  team_rolling_summary_passes_completed == team_rolling_pass_types_passes_completed
  team_rolling_summary_passes == team_rolling_pass_types_passes
  team_rolling_misc_offsides == team_rolling_pass_types_passes_offsides
  team_rolling_misc_offsides == team_rolling_passing_passes_offsides
  team_rolling_misc_crosses == team_rolling_pass_types_crosses
  team_rolling_pass_types_passes_live == team_rolling_passing_passes_live
  team_rolling_pass_types_passes_dead == team_rolling_passing_passes_dead
  team_rolling_pass_types_passes_free_kicks == team_rolling_passing_passes_free_kicks
  team_rolling_pass_types_through_balls == team_rolling_passing_through_balls
  team_rolling_pass_types_passes_switches == team_rolling_passing_passes_switches
  team_rolling_pass_types_throw_ins == team_rolling_passing_throw_ins
  team_rolling_pass_types_corner_kicks == team_rolling_passing_corner_kicks
  team_rolling_pas

In [7]:
X = df.drop(columns=["team", "opp_team", "goals", "match_url", "match_date", "division"])
X = pd.get_dummies(X, columns=["season"], drop_first=True)

#X = df[["is_home", "team_rolling_summary_goals", "opp_team_rolling_conceded_summary_goals"]] # goals only

y = df["goals"]

# Use matches after a certain date as test
cutoff_date = '2024-08-01'
df['match_date'] = pd.to_datetime(df['match_date'])

train_mask = df['match_date'] < cutoff_date
test_mask = df['match_date'] >= cutoff_date

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

# Create DMatrix objects BEFORE using them in eval_list
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'count:poisson',
    'max_depth': 7,
    'eta': 0.1,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
    'eval_metric': 'poisson-nloglik'
}

num_rounds = 100
eval_list = [(dtrain, 'train'), (dtest, 'eval')]  # Now dtrain and dtest are defined

model = xgb.train(
    params, 
    dtrain, 
    num_rounds, 
    eval_list, 
    early_stopping_rounds=10,
    verbose_eval=10
)

# Make predictions
preds = model.predict(dtest)

rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")



[0]	train-poisson-nloglik:1.55351	eval-poisson-nloglik:1.54975
[10]	train-poisson-nloglik:1.42637	eval-poisson-nloglik:1.51450
[20]	train-poisson-nloglik:1.35398	eval-poisson-nloglik:1.50968
[30]	train-poisson-nloglik:1.30347	eval-poisson-nloglik:1.50562
[40]	train-poisson-nloglik:1.26598	eval-poisson-nloglik:1.50342
[50]	train-poisson-nloglik:1.23947	eval-poisson-nloglik:1.50331
[55]	train-poisson-nloglik:1.22843	eval-poisson-nloglik:1.50359
RMSE: 1.1970
MAE: 0.9583


In [8]:
# Get feature importance
importance = model.get_score(importance_type='gain')
feature_names = X_train.columns.tolist()

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(importance_df.head(15))

Top 10 Most Important Features:
                                               feature  importance
25                        team_rolling_summary_carries   18.513756
33        team_rolling_possession_touches_att_pen_area   13.144480
191        opp_team_rolling_possession_touches_att_3rd   12.994490
156      team_rolling_conceded_defense_challenges_lost   12.385612
10                team_rolling_summary_shots_on_target   10.003927
183        opp_team_rolling_summary_progressive_passes    9.794761
21               team_rolling_summary_passes_completed    9.411849
40    team_rolling_possession_carries_into_final_third    8.350016
198  opp_team_rolling_possession_carries_into_final...    8.059331
61                  team_rolling_passing_through_balls    7.868461
17                           team_rolling_summary_npxg    7.781621
105   team_rolling_conceded_summary_progressive_passes    7.462657
144            team_rolling_conceded_passing_throw_ins    7.208937
113   team_rolling_conceded_po

# Hyperparameter Tuning

In [9]:
# Set to True to tune new parameters, False to load existing
TUNE_NEW_PARAMS = True  # Change this to False to load saved params

# Tuning settings
N_TRIALS = 100
PARAMS_FILE = "best_xgb_params_365.json"

In [10]:
if TUNE_NEW_PARAMS or not os.path.exists(PARAMS_FILE):
    print("Tuning new hyperparameters...")
    
    def objective(trial):
        # Define search space
        params = {
            'objective': 'count:poisson',
            'eval_metric': 'poisson-nloglik',
            'max_depth': trial.suggest_int('max_depth', 4, 12),  # Deeper trees for more data
            'eta': trial.suggest_float('eta', 0.005, 0.2, log=True),  # Wider range, log scale
            'subsample': trial.suggest_float('subsample', 0.6, 0.95),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),  # More aggressive for 350 cols
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),  # Additional sampling
            'min_child_weight': trial.suggest_int('min_child_weight', 3, 15),  # Higher for larger dataset
            'gamma': trial.suggest_float('gamma', 0, 5),  # Wider regularization range
            'alpha': trial.suggest_float('alpha', 0, 10),  # L1 reg - good for feature selection
            'lambda': trial.suggest_float('lambda', 1, 10),  # L2 reg - important with many features
            'max_leaves': trial.suggest_int('max_leaves', 0, 200),  # Control tree complexity
            'random_state': 42
        }
        
        n_estimators = trial.suggest_int('n_estimators', 100, 500)
        
        # Train model
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=n_estimators,
            evals=[(dtrain, 'train'), (dtest, 'eval')],
            early_stopping_rounds=20,
            verbose_eval=False
        )
        
        # Calculate RMSE
        preds = model.predict(dtest)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        
        return rmse
    
    # Run optimization
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)
    
    # Get best parameters
    best_params = study.best_params.copy()
    n_estimators = best_params.pop('n_estimators')
    
    print(f"Best RMSE: {study.best_value:.4f}")
    print("Best parameters:")
    for key, value in best_params.items():
        if key not in ['objective', 'eval_metric', 'random_state']:
            print(f"  {key}: {value}")
    print(f"  n_estimators: {n_estimators}")
    
    # Save parameters
    params_to_save = {
        'best_params': best_params,
        'n_estimators': n_estimators,
        'best_rmse': study.best_value
    }
    
    with open(PARAMS_FILE, 'w') as f:
        json.dump(params_to_save, f, indent=2)
    print(f"\nParameters saved to {PARAMS_FILE}")
    
else:
    print("Loading saved hyperparameters...")
    
    with open(PARAMS_FILE, 'r') as f:
        saved_params = json.load(f)
    
    best_params = saved_params['best_params']
    n_estimators = saved_params['n_estimators']
    
    print("Loaded parameters:")
    for key, value in best_params.items():
        if key not in ['objective', 'eval_metric', 'random_state']:
            print(f"  {key}: {value}")
    print(f"  n_estimators: {n_estimators}")
    print(f"Previous best RMSE: {saved_params['best_rmse']:.4f}")

[I 2025-08-05 07:57:33,108] A new study created in memory with name: no-name-428d21dd-b66c-4bc6-acd6-27fe129c5f5a


Tuning new hyperparameters...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-05 07:57:34,815] Trial 0 finished with value: 1.181246250703139 and parameters: {'max_depth': 11, 'eta': 0.1258835806704184, 'subsample': 0.8320633613133379, 'colsample_bytree': 0.5700170367804736, 'colsample_bylevel': 0.5497904681717609, 'min_child_weight': 10, 'gamma': 2.733018787769079, 'alpha': 2.3716021748757496, 'lambda': 6.812002395361921, 'max_leaves': 24, 'n_estimators': 477}. Best is trial 0 with value: 1.181246250703139.
[I 2025-08-05 07:57:36,715] Trial 1 finished with value: 1.18758339571922 and parameters: {'max_depth': 4, 'eta': 0.01857178925164573, 'subsample': 0.7163163320933111, 'colsample_bytree': 0.6779209628614776, 'colsample_bylevel': 0.7510876880344842, 'min_child_weight': 3, 'gamma': 0.3905458332609635, 'alpha': 6.821523925528719, 'lambda': 7.164711611619235, 'max_leaves': 7, 'n_estimators': 139}. Best is trial 0 with value: 1.181246250703139.
[I 2025-08-05 07:57:42,923] Trial 2 finished with value: 1.1742605737523697 and parameters: {'max_depth': 12,

In [11]:
print("\nTraining final model...")

# Prepare data
dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

# Train with best parameters
final_model = xgb.train(
    best_params,
    dtrain_final,
    num_boost_round=n_estimators,
    evals=[(dtrain_final, 'train'), (dtest_final, 'eval')],
    early_stopping_rounds=20,
    verbose_eval=10
)

# Make predictions and evaluate
preds_final = final_model.predict(dtest_final)
rmse_final = np.sqrt(mean_squared_error(y_test, preds_final))
mae_final = mean_absolute_error(y_test, preds_final)

print(f"\nFinal Model Performance:")
print(f"RMSE: {rmse_final:.4f}")
print(f"MAE: {mae_final:.4f}")

# Save the trained model
final_model.save_model("best_xgb_model_365.json")
print("Model saved as 'best_xgb_model_365.json'")


Training final model...
[0]	train-rmse:1.28176	eval-rmse:1.23298
[10]	train-rmse:1.23019	eval-rmse:1.21918
[20]	train-rmse:1.18451	eval-rmse:1.20864
[30]	train-rmse:1.14391	eval-rmse:1.20095
[40]	train-rmse:1.10638	eval-rmse:1.19591
[50]	train-rmse:1.07244	eval-rmse:1.19083
[60]	train-rmse:1.04066	eval-rmse:1.18739
[70]	train-rmse:1.01136	eval-rmse:1.18399
[80]	train-rmse:0.98459	eval-rmse:1.18230
[90]	train-rmse:0.95932	eval-rmse:1.18258
[100]	train-rmse:0.93488	eval-rmse:1.18219
[110]	train-rmse:0.91253	eval-rmse:1.18224
[120]	train-rmse:0.89124	eval-rmse:1.18335
[130]	train-rmse:0.87050	eval-rmse:1.18380
[131]	train-rmse:0.86904	eval-rmse:1.18351

Final Model Performance:
RMSE: 1.1835
MAE: 0.9502
Model saved as 'best_xgb_model_365.json'


In [12]:
print("\nTop 10 Feature Importance (Gain):")
importance = final_model.get_score(importance_type='gain')
importance_df = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values('importance', ascending=False)

print(importance_df.head(15).to_string(index=False))


Top 10 Feature Importance (Gain):
                                                     feature  importance
                team_rolling_possession_touches_att_pen_area   50.053787
                        team_rolling_summary_shots_on_target   42.916401
                                     team_rolling_summary_xg   37.828583
                                team_rolling_summary_touches   29.921967
                                    team_rolling_keeper_psxg   28.650259
                                team_rolling_summary_carries   28.178934
                              team_rolling_summary_xg_assist   27.358438
                 opp_team_rolling_possession_passes_received   22.579401
team_rolling_conceded_possession_progressive_passes_received   21.472067
            team_rolling_possession_carries_into_final_third   21.094284
                 opp_team_rolling_possession_touches_att_3rd   20.673637
                                    team_rolling_summary_sca   20.607796
            team