In [26]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import optuna
import os
import json 


conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

df = pd.read_sql_query("""
                        SELECT DISTINCT
                            f.*,
                            ms.summary_goals as goals
                        FROM 
                            team_all_features_180_0077 f
                        JOIN
                            fbref_match_all_columns ms
                                ON ms.match_url = f.match_url
                                AND ms.team = f.team
                        WHERE 
                            f.team_rolling_summary_minutes IS NOT NULL
                       --AND NOT (f.`is_promoted?` = 1 AND f.`is_early_season?` = 1)
                       """, conn)

df

Unnamed: 0,match_url,match_date,season,division,team,opp_team,is_home,gw,is_promoted?,is_early_season?,...,opp_team_rolling_conceded_defense_challenges,opp_team_rolling_conceded_defense_challenge_tackles_pct,opp_team_rolling_conceded_defense_challenges_lost,opp_team_rolling_conceded_defense_blocked_shots,opp_team_rolling_conceded_defense_blocked_passes,opp_team_rolling_conceded_defense_tackles_interceptions,opp_team_rolling_conceded_defense_clearances,opp_team_rolling_conceded_defense_errors,opp_team_rolling_conceded_keeper_psxg,goals
0,https://fbref.com/en/matches/e4bb1c35/Tottenha...,2025-05-25 00:00:00,2024-2025,Premier League,Tottenham,Brighton,1,38,0,0,...,17.626817,48.357654,8.825405,3.079800,6.920357,25.259719,29.267004,0.816211,1.269035,1.0
1,https://fbref.com/en/matches/e4bb1c35/Tottenha...,2025-05-25 00:00:00,2024-2025,Premier League,Brighton,Tottenham,0,38,0,0,...,18.967924,58.906421,7.531723,3.214026,8.753274,31.687899,22.755722,0.782216,1.890708,4.0
2,https://fbref.com/en/matches/1ff370e8/Bournemo...,2025-05-25 00:00:00,2024-2025,Premier League,Bournemouth,Leicester City,1,38,0,0,...,12.973336,53.287432,6.198606,2.647510,6.934026,21.619704,22.187666,0.665972,1.738255,2.0
3,https://fbref.com/en/matches/1ff370e8/Bournemo...,2025-05-25 00:00:00,2024-2025,Premier League,Leicester City,Bournemouth,0,38,1,0,...,14.549176,48.578723,7.431018,4.033535,6.855790,23.310313,27.477550,1.633190,1.354963,0.0
4,https://fbref.com/en/matches/36844e73/Newcastl...,2025-05-25 00:00:00,2024-2025,Premier League,Newcastle Utd,Everton,1,38,0,0,...,13.494185,54.012499,6.318674,2.577873,5.406179,23.600076,27.815311,0.509854,1.247472,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40589,https://fbref.com/en/matches/e8e75a75/Como-Cag...,2022-08-13 00:00:00,2022-2023,Serie B,Como,Cagliari,1,1,0,1,...,13.865806,51.140007,6.749611,4.156406,6.419369,24.242237,19.279783,0.205490,1.159798,1.0
40590,https://fbref.com/en/matches/e8e75a75/Como-Cag...,2022-08-13 00:00:00,2022-2023,Serie B,Cagliari,Como,0,1,0,1,...,15.974251,43.253169,9.110522,1.830221,6.919110,23.896380,14.582846,0.000000,1.668266,1.0
40591,https://fbref.com/en/matches/99979e25/Cittadel...,2022-08-13 00:00:00,2022-2023,Serie B,Cittadella,Pisa,1,1,0,1,...,14.470084,50.660840,6.980518,3.145323,6.960094,27.732300,19.685275,0.323261,1.194049,4.0
40592,https://fbref.com/en/matches/99979e25/Cittadel...,2022-08-13 00:00:00,2022-2023,Serie B,Pisa,Cittadella,0,1,0,1,...,11.689373,56.400894,5.332239,3.975192,9.716700,27.966362,19.311856,0.176218,0.765247,3.0


In [27]:
# Check for duplicate columns (identical values)
def find_duplicate_columns(df):
    duplicate_pairs = []
    cols = df.columns.tolist()
    
    for i in range(len(cols)):
        for j in range(i+1, len(cols)):
            if df[cols[i]].equals(df[cols[j]]):
                duplicate_pairs.append((cols[i], cols[j]))
    
    return duplicate_pairs

# Find duplicates in your FEATURE MATRIX (not the full df)
duplicates = find_duplicate_columns(df)
print("Duplicate column pairs:")
for pair in duplicates:
    print(f"  {pair[0]} == {pair[1]}")

# More sophisticated duplicate removal logic
columns_to_drop = []
for pair in duplicates:
    col1, col2 = pair
    
    # Priority rules for which column to keep:
    # 1. Keep shorter names
    # 2. Prefer "passing" over "pass_types" 
    # 3. Prefer "summary" over other categories
    # 4. Prefer team stats over opp_team stats (less processing)
    
    keep_col1 = True
    
    # Rule 1: Shorter name
    if len(col1) > len(col2):
        keep_col1 = False
    elif len(col1) < len(col2):
        keep_col1 = True
    else:
        # Same length, apply other rules
        # Rule 2: Prefer passing over pass_types
        if "pass_types" in col1 and "passing" in col2:
            keep_col1 = False
        elif "passing" in col1 and "pass_types" in col2:
            keep_col1 = True
        # Rule 3: Prefer summary stats
        elif "summary" in col1 and "summary" not in col2:
            keep_col1 = True
        elif "summary" in col2 and "summary" not in col1:
            keep_col1 = False
        # Rule 4: Prefer team over opp_team (less nested)
        elif col1.startswith("team_") and col2.startswith("opp_team_"):
            keep_col1 = True
        elif col2.startswith("team_") and col1.startswith("opp_team_"):
            keep_col1 = False
    
    # Add the column to drop
    if keep_col1:
        columns_to_drop.append(col2)
        print(f"  Keeping: {col1}")
        print(f"  Dropping: {col2}")
    else:
        columns_to_drop.append(col1)
        print(f"  Keeping: {col2}")
        print(f"  Dropping: {col1}")
    print()

print(f"Columns to drop: {len(columns_to_drop)}")
print(columns_to_drop)

# Create cleaned feature matrix
df = df.drop(columns=columns_to_drop)

Duplicate column pairs:
  team_rolling_summary_passes_completed == team_rolling_pass_types_passes_completed
  team_rolling_summary_passes == team_rolling_pass_types_passes
  team_rolling_misc_crosses == team_rolling_pass_types_crosses
  team_rolling_pass_types_passes_live == team_rolling_passing_passes_live
  team_rolling_pass_types_passes_dead == team_rolling_passing_passes_dead
  team_rolling_pass_types_passes_free_kicks == team_rolling_passing_passes_free_kicks
  team_rolling_pass_types_through_balls == team_rolling_passing_through_balls
  team_rolling_pass_types_passes_switches == team_rolling_passing_passes_switches
  team_rolling_pass_types_throw_ins == team_rolling_passing_throw_ins
  team_rolling_pass_types_corner_kicks == team_rolling_passing_corner_kicks
  team_rolling_pass_types_corner_kicks_in == team_rolling_passing_corner_kicks_in
  team_rolling_pass_types_corner_kicks_out == team_rolling_passing_corner_kicks_out
  team_rolling_pass_types_corner_kicks_straight == team_rol

In [28]:
X = df.drop(columns=["team", "opp_team", "goals", "match_url", "match_date"])
X = pd.get_dummies(X, columns=["season", "division"], drop_first=True)

#X = df[["is_home", "team_rolling_summary_goals", "opp_team_rolling_conceded_summary_goals"]] # goals only

y = df["goals"]

# Use matches after a certain date as test
cutoff_date = '2024-08-01'
df['match_date'] = pd.to_datetime(df['match_date'])

train_mask = df['match_date'] < cutoff_date
test_mask = df['match_date'] >= cutoff_date

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

# Create DMatrix objects BEFORE using them in eval_list
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'count:poisson',
    'max_depth': 7,
    'eta': 0.1,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
    'eval_metric': 'poisson-nloglik'
}

num_rounds = 100
eval_list = [(dtrain, 'train'), (dtest, 'eval')]  # Now dtrain and dtest are defined

model = xgb.train(
    params, 
    dtrain, 
    num_rounds, 
    eval_list, 
    early_stopping_rounds=10,
    verbose_eval=10
)

# Make predictions
preds = model.predict(dtest)

rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")



[0]	train-poisson-nloglik:1.50430	eval-poisson-nloglik:1.49440
[10]	train-poisson-nloglik:1.44200	eval-poisson-nloglik:1.45455
[20]	train-poisson-nloglik:1.41107	eval-poisson-nloglik:1.44125
[30]	train-poisson-nloglik:1.39128	eval-poisson-nloglik:1.43732
[40]	train-poisson-nloglik:1.37611	eval-poisson-nloglik:1.43506
[50]	train-poisson-nloglik:1.36302	eval-poisson-nloglik:1.43415
[60]	train-poisson-nloglik:1.35160	eval-poisson-nloglik:1.43395
[70]	train-poisson-nloglik:1.34141	eval-poisson-nloglik:1.43385
[71]	train-poisson-nloglik:1.34052	eval-poisson-nloglik:1.43395
RMSE: 1.1223
MAE: 0.8814


In [29]:
# Get feature importance
importance = model.get_score(importance_type='gain')
feature_names = X_train.columns.tolist()

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(importance_df.head(15))

Top 10 Most Important Features:
                                               feature  importance
59                    team_rolling_passing_passes_live   82.381851
33        team_rolling_possession_touches_att_pen_area   52.050701
17                           team_rolling_summary_npxg   37.707771
21               team_rolling_summary_passes_completed   32.820927
16                             team_rolling_summary_xg   21.786440
279  opp_team_rolling_conceded_possession_touches_a...   21.773891
34           team_rolling_possession_touches_live_ball   19.757393
45   team_rolling_possession_progressive_passes_rec...   19.206972
264        opp_team_rolling_conceded_summary_xg_assist   18.982849
263             opp_team_rolling_conceded_summary_npxg   18.011448
0                                              is_home   13.927182
332                                   division_Ligue 1   13.794147
83                            team_rolling_keeper_psxg   13.290424
262               opp_team_rol

# Hyperparameter Tuning

In [30]:
# Set to True to tune new parameters, False to load existing
TUNE_NEW_PARAMS = True  # Change this to False to load saved params

# Tuning settings
N_TRIALS = 100
PARAMS_FILE = "best_xgb_params.json"

In [31]:
if TUNE_NEW_PARAMS or not os.path.exists(PARAMS_FILE):
    print("Tuning new hyperparameters...")
    
    def objective(trial):
        # Define search space
        params = {
            'objective': 'count:poisson',
            'eval_metric': 'poisson-nloglik',
            'max_depth': trial.suggest_int('max_depth', 4, 12),  # Deeper trees for more data
            'eta': trial.suggest_float('eta', 0.005, 0.2, log=True),  # Wider range, log scale
            'subsample': trial.suggest_float('subsample', 0.6, 0.95),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),  # More aggressive for 350 cols
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),  # Additional sampling
            'min_child_weight': trial.suggest_int('min_child_weight', 3, 15),  # Higher for larger dataset
            'gamma': trial.suggest_float('gamma', 0, 5),  # Wider regularization range
            'alpha': trial.suggest_float('alpha', 0, 10),  # L1 reg - good for feature selection
            'lambda': trial.suggest_float('lambda', 1, 10),  # L2 reg - important with many features
            'max_leaves': trial.suggest_int('max_leaves', 0, 200),  # Control tree complexity
            'random_state': 42
        }
        
        n_estimators = trial.suggest_int('n_estimators', 100, 500)
        
        # Train model
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=n_estimators,
            evals=[(dtrain, 'train'), (dtest, 'eval')],
            early_stopping_rounds=20,
            verbose_eval=False
        )
        
        # Calculate RMSE
        preds = model.predict(dtest)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        
        return rmse
    
    # Run optimization
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)
    
    # Get best parameters
    best_params = study.best_params.copy()
    n_estimators = best_params.pop('n_estimators')
    
    print(f"Best RMSE: {study.best_value:.4f}")
    print("Best parameters:")
    for key, value in best_params.items():
        if key not in ['objective', 'eval_metric', 'random_state']:
            print(f"  {key}: {value}")
    print(f"  n_estimators: {n_estimators}")
    
    # Save parameters
    params_to_save = {
        'best_params': best_params,
        'n_estimators': n_estimators,
        'best_rmse': study.best_value
    }
    
    with open(PARAMS_FILE, 'w') as f:
        json.dump(params_to_save, f, indent=2)
    print(f"\nParameters saved to {PARAMS_FILE}")
    
else:
    print("Loading saved hyperparameters...")
    
    with open(PARAMS_FILE, 'r') as f:
        saved_params = json.load(f)
    
    best_params = saved_params['best_params']
    n_estimators = saved_params['n_estimators']
    
    print("Loaded parameters:")
    for key, value in best_params.items():
        if key not in ['objective', 'eval_metric', 'random_state']:
            print(f"  {key}: {value}")
    print(f"  n_estimators: {n_estimators}")
    print(f"Previous best RMSE: {saved_params['best_rmse']:.4f}")

[I 2025-08-04 08:59:37,015] A new study created in memory with name: no-name-ca2a9f4a-3183-46c1-98f1-853d9973272c


Tuning new hyperparameters...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-04 08:59:43,485] Trial 0 finished with value: 1.1217046694822748 and parameters: {'max_depth': 8, 'eta': 0.14387335213037086, 'subsample': 0.6585146876269514, 'colsample_bytree': 0.449926767193743, 'colsample_bylevel': 0.8707069736123407, 'min_child_weight': 4, 'gamma': 1.3110368536023886, 'alpha': 9.987137217823067, 'lambda': 1.6454575309146338, 'max_leaves': 53, 'n_estimators': 491}. Best is trial 0 with value: 1.1217046694822748.
[I 2025-08-04 08:59:48,984] Trial 1 finished with value: 1.1183080904698435 and parameters: {'max_depth': 4, 'eta': 0.14589124639255763, 'subsample': 0.7124403386516207, 'colsample_bytree': 0.7733684205496411, 'colsample_bylevel': 0.8462070311506342, 'min_child_weight': 10, 'gamma': 0.3951562858074964, 'alpha': 0.5938581369843521, 'lambda': 2.6807480636274903, 'max_leaves': 180, 'n_estimators': 189}. Best is trial 1 with value: 1.1183080904698435.
[I 2025-08-04 08:59:56,088] Trial 2 finished with value: 1.1179510530607564 and parameters: {'max_de

In [None]:
print("\nTraining final model...")

# Prepare data
dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

# Train with best parameters
final_model = xgb.train(
    best_params,
    dtrain_final,
    num_boost_round=n_estimators,
    evals=[(dtrain_final, 'train'), (dtest_final, 'eval')],
    early_stopping_rounds=20,
    verbose_eval=10
)

# Make predictions and evaluate
preds_final = final_model.predict(dtest_final)
rmse_final = np.sqrt(mean_squared_error(y_test, preds_final))
mae_final = mean_absolute_error(y_test, preds_final)

print(f"\nFinal Model Performance:")
print(f"RMSE: {rmse_final:.4f}")
print(f"MAE: {mae_final:.4f}")

# Save the trained model
final_model.save_model("best_xgb_model.json")
print("Model saved as 'best_xgb_model.json'")


Training final model...
[0]	train-rmse:1.20870	eval-rmse:1.18961
[10]	train-rmse:1.16761	eval-rmse:1.15648
[20]	train-rmse:1.14045	eval-rmse:1.13757
[30]	train-rmse:1.12197	eval-rmse:1.12813
[40]	train-rmse:1.10924	eval-rmse:1.12377
[50]	train-rmse:1.09898	eval-rmse:1.12173
[60]	train-rmse:1.09051	eval-rmse:1.12084
[70]	train-rmse:1.08318	eval-rmse:1.11966
[80]	train-rmse:1.07658	eval-rmse:1.11929
[90]	train-rmse:1.07013	eval-rmse:1.11902
[100]	train-rmse:1.06373	eval-rmse:1.11943
[106]	train-rmse:1.06033	eval-rmse:1.12007

Final Model Performance:
RMSE: 1.1201
MAE: 0.8820
Model saved as 'best_xgb_model.json'


In [None]:
print("\nTop 10 Feature Importance (Gain):")
importance = final_model.get_score(importance_type='gain')
importance_df = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values('importance', ascending=False)

print(importance_df.head(10).to_string(index=False))


Top 10 Feature Importance (Gain):
                                                  feature  importance
                             team_rolling_summary_touches  179.686142
                         team_rolling_passing_passes_live  160.319351
                  team_rolling_possession_passes_received  126.460159
                           team_rolling_summary_xg_assist  100.019966
                                team_rolling_summary_npxg   99.978600
                              team_rolling_summary_passes   93.963310
             team_rolling_possession_touches_att_pen_area   93.568680
                                                  is_home   82.524269
                  team_rolling_possession_touches_att_3rd   79.989845
                    team_rolling_summary_passes_completed   60.474033
                                  team_rolling_summary_xg   57.729107
                team_rolling_possession_touches_live_ball   55.937267
opp_team_rolling_conceded_possession_touches_att_pen_ar