In [1]:
import numpy as np
import pandas as pd
import os
import requests
import io
from datetime import datetime, timedelta
import multiprocessing
import arviz as az
import logging
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
from scipy.stats import poisson
import sqlite3

# Data-service connection settings.
# The endpoint is fixed; the API key is read from the environment (it is None
# when the API_KEY variable is not set) and is sent in the "x-api-key" header.
url = 'https://data-service.beatthebookie.blog/data'
API_KEY = os.environ.get("API_KEY")
headers = {"x-api-key": API_KEY}

# Function to fetch data for a specific division and season
def fetch_data(division, season, timeout=30):
    """Fetch the matches of one division/season from the data service.

    Issues a GET request against the module-level ``url`` using the
    module-level ``headers`` and parses the JSON body into a DataFrame.

    Args:
        division: Value sent as the ``division`` query parameter.
        season: Value sent as the ``season`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block the
            notebook indefinitely.

    Returns:
        The parsed DataFrame on HTTP 200; otherwise an empty DataFrame (the
        status code and response body are printed).
    """
    params = {
        'division': division,
        'season': season
    }
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code == 200:
        return pd.read_json(io.StringIO(response.content.decode('utf-8')))

    # Non-200: report what the service said and let the caller skip this combination
    print(f"Error fetching {division} {season}: {response.status_code}")
    print(response.content.decode('utf-8'))
    return pd.DataFrame()

# Fetch data for all division/season combinations, keeping non-empty responses only
seasons = ['2024_2025', '2023_2024', '2022_2023', '2021_2022']
divisions = ['Premier League', 'Championship', 'Bundesliga', 'Serie A', 'La Liga', 'Ligue 1']
dataframes = []

for division in divisions:
    for season in seasons:
        # Use a dedicated name so a failed fetch can never masquerade as the final `df`
        season_df = fetch_data(division, season)
        if not season_df.empty:
            dataframes.append(season_df)

# Fail fast: every later cell needs real match rows, and continuing with nothing
# fetched would only surface as a confusing error further down.
if not dataframes:
    raise RuntimeError("No match data could be fetched for any division/season.")

# Combine all dataframes
df = pd.concat(dataframes, ignore_index=True)

# Convert match_date to datetime
df['match_date'] = pd.to_datetime(df['match_date'])

# match_id = "<match_teams>_<YYYYMMDD>", built column-wise instead of row by row
df['match_id'] = df['match_teams'].astype(str) + '_' + df['match_date'].dt.strftime('%Y%m%d')

df

Unnamed: 0,division_id,division,season_id,season,match_date,match_teams,home_team_id,home_team,away_team_id,away_team,...,home_deep,away_deep,home_ppda,away_ppda,bet365_home_odds,bet365_draw_odds,bet365_away_odds,bet365_u25_odds,bet365_o25_odds,match_id
0,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-16,Man United - Fulham,f2b82cdbdadf9d3ec47c3a6be66dcfad,Man United,8cd5e94668b139c1f42a89a1e130f3cf,Fulham,...,7.0,3.0,7.379310,10.833333,1.60,4.20,5.25,2.50,1.53,Man United - Fulham_20240816
1,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Newcastle - Southampton,78e9266876e7649e0a12e3840f5be006,Newcastle,5a884401673693b0bdf379fefb7ec2b2,Southampton,...,4.0,13.0,16.250000,3.789474,1.36,5.25,8.00,3.00,1.40,Newcastle - Southampton_20240817
2,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Arsenal - Wolves,4fc9baf210346939946d5a49f255588b,Arsenal,31f3ecbc5c48590ccc7dabaedd49a4ff,Wolves,...,14.0,2.0,7.769231,10.818182,1.18,7.50,13.00,2.75,1.44,Arsenal - Wolves_20240817
3,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Nott'm Forest - Bournemouth,9a8e1e9fad8766fc3d69a0c26d98b928,Nott'm Forest,b436d55f36cfbe8a085c8b75fb7fe98a,Bournemouth,...,10.0,4.0,8.653846,9.954545,2.45,3.50,2.80,2.10,1.73,Nott'm Forest - Bournemouth_20240817
4,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Ipswich - Liverpool,e4f63bf6d6d2cd121e6c8e59bef68209,Ipswich,afce84ff226407a47c9782a742ba02f7,Liverpool,...,2.0,13.0,18.777778,8.739130,8.50,5.50,1.33,3.00,1.40,Ipswich - Liverpool_20240817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8838,ef393faadcf7d00912ef326a1cd0f103,Ligue 1,06b84148ed1c6a06340478bef065f5bc,20212022,2022-05-21,Marseille - Strasbourg,8377de0f1845174610bd8b815b9a285e,Marseille,c651bda18ca0a5704727ed63d2bd3e8b,Strasbourg,...,9.0,5.0,9.850000,13.860000,2.05,3.60,3.60,2.10,1.72,Marseille - Strasbourg_20220521
8839,ef393faadcf7d00912ef326a1cd0f103,Ligue 1,06b84148ed1c6a06340478bef065f5bc,20212022,2022-05-21,Paris SG - Metz,541dd69b8e5c6a19f50d3dbf3819addf,Paris SG,a71f82f01730198a712591ceabb74238,Metz,...,14.0,2.0,8.560000,18.400000,1.20,8.50,10.00,4.33,1.22,Paris SG - Metz_20220521
8840,ef393faadcf7d00912ef326a1cd0f103,Ligue 1,06b84148ed1c6a06340478bef065f5bc,20212022,2022-05-21,Angers - Montpellier,eeaba7c0c0cd90d3286ae34bf3131fc1,Angers,375c7d274d5b84410e7b2bf0a00a9465,Montpellier,...,6.0,4.0,8.340000,16.470000,2.00,3.75,3.50,2.30,1.61,Angers - Montpellier_20220521
8841,ef393faadcf7d00912ef326a1cd0f103,Ligue 1,06b84148ed1c6a06340478bef065f5bc,20212022,2022-05-21,Lorient - Troyes,d14a6b82c2ad05082f0d7bf372b9994f,Lorient,bd3ff00cf2a68c244fdbb4fb0e513bc8,Troyes,...,8.0,4.0,8.190000,15.810000,2.15,3.50,3.40,2.10,1.72,Lorient - Troyes_20220521


In [2]:
def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQLite identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def append_df_to_sqlite_table(df_new, db_path, table_name, key_columns=None, batch_size=500):
    """Append the rows of ``df_new`` that are not yet stored in a SQLite table.

    If the table does not exist it is created from ``df_new``. Otherwise a row
    counts as already present when its tuple of ``key_columns`` values equals
    the key tuple of a stored row; only the remaining rows are appended.
    Rows of ``df_new`` are not de-duplicated against each other.

    Args:
        df_new: DataFrame holding the candidate rows (it is not modified).
        db_path: Path of the SQLite database file.
        table_name: Name of the target table.
        key_columns: Columns identifying a row. ``None`` means every column of
            ``df_new``. Columns missing from the frame or from the table are
            ignored; if none remain, all rows are appended with a warning.
        batch_size: Upper bound on the number of key values bound in one
            lookup query (rows per query = ``batch_size // len(key_columns)``,
            at least 1), keeping each statement small.

    Returns:
        None. A one-line summary is printed.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Check if table exists
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
        if cursor.fetchone() is None:
            # If table doesn't exist, create it with the new data
            df_new.to_sql(table_name, conn, index=False)
            print(f"Created new table {table_name} with {len(df_new)} records")
            return

        # If key_columns not specified, use all columns
        if key_columns is None:
            key_columns = list(df_new.columns)

        # Identifiers cannot be bound as parameters, so they are quoted instead;
        # this also keeps names such as "home?" from being read as placeholders.
        quoted_table = _quote_identifier(table_name)
        cursor.execute(f"PRAGMA table_info({quoted_table})")
        existing_columns = [row[1] for row in cursor.fetchall()]

        # Keep only key columns present in both the dataframe and the table
        key_columns = [col for col in key_columns if col in df_new.columns and col in existing_columns]

        if not key_columns:
            # If no valid key columns, append all data (may create duplicates)
            df_new.to_sql(table_name, conn, if_exists='append', index=False)
            print(f"Warning: No valid key columns. Added all {len(df_new)} records to table {table_name}")
            return

        quoted_keys = [_quote_identifier(col) for col in key_columns]
        select_keys = ', '.join(quoted_keys)
        # Each row binds one parameter per key column; scale the rows per query
        # so the total number of bound values stays within batch_size.
        rows_per_query = max(1, batch_size // len(key_columns))

        new_batches = []
        for start_idx in range(0, len(df_new), rows_per_query):
            batch_df = df_new.iloc[start_idx:start_idx + rows_per_query]

            where_clauses = []
            params = []
            for col, quoted in zip(key_columns, quoted_keys):
                values = batch_df[col].tolist()
                placeholders = ','.join('?' * len(values))
                where_clauses.append(f"{quoted} IN ({placeholders})")
                params.extend(values)

            # The OR of per-column IN lists fetches a superset of the possible
            # matches; the exact comparison happens on whole key tuples below.
            cursor.execute(
                f"SELECT {select_keys} FROM {quoted_table} WHERE {' OR '.join(where_clauses)}",
                params,
            )
            existing_keys = set(cursor.fetchall())

            # Select by position so duplicate index labels in df_new are harmless
            batch_keys = batch_df[key_columns].apply(tuple, axis=1)
            new_positions = [pos for pos, key in enumerate(batch_keys) if key not in existing_keys]
            if new_positions:
                new_batches.append(batch_df.iloc[new_positions])

        if new_batches:
            df_filtered = pd.concat(new_batches, ignore_index=True)
            df_filtered.to_sql(table_name, conn, if_exists='append', index=False)
            print(f"Added {len(df_filtered)} new records to table {table_name}")
        else:
            print("No new records to append.")
    finally:
        # Release the database handle even when a query or insert fails
        conn.close()

# Store the raw matches; rows whose match_id is already in the table are skipped
append_df_to_sqlite_table(
    df,
    db_path='match_db.db',
    table_name='btb_matches',
    key_columns=['match_id'],
)

Created new table btb_matches with 8843 records


## Creating XGBoost Model Features

In [3]:
# Reshape the match-level frame into one row per team per match (a home view and
# an away view of every fixture), which makes the home effect easy to engineer.
_META_COLS = ["division", "season", "match_date", "match_id"]
_ODDS_COLS = ["bet365_home_odds", "bet365_draw_odds", "bet365_away_odds"]
# raw per-side column suffix -> column name used in the per-team frame
_STAT_NAMES = {
    "team": "team",
    "avg_market_value": "avg_market_value",
    "goals": "goals",
    "shots": "shots",
    "shots_on_target": "shots_on_target",
    "corners": "corners",
    "xgoals": "xG",
    "deep": "deep",
    "ppda": "ppda",
    "red": "red",
}


def _team_view(matches, side, other_side, home_flag):
    """Return `matches` seen from `side`: own stats unprefixed, the other side's as opponent_*."""
    own_cols = [f"{side}_{stat}" for stat in _STAT_NAMES]
    other_cols = [f"{other_side}_{stat}" for stat in _STAT_NAMES]
    view = matches[_META_COLS + own_cols + other_cols + _ODDS_COLS].copy()
    view["home?"] = home_flag

    renames = {}
    for stat, new_name in _STAT_NAMES.items():
        renames[f"{side}_{stat}"] = new_name
        renames[f"{other_side}_{stat}"] = f"opponent_{new_name}"
    # The draw odds keep their original column name
    renames[f"bet365_{side}_odds"] = "odds"
    renames[f"bet365_{other_side}_odds"] = "opponent_odds"
    return view.rename(columns=renames)


home_df = _team_view(df, "home", "away", 1)
away_df = _team_view(df, "away", "home", 0)

# Away rows first, so the combined frame follows the away view's column order
df = pd.concat([away_df, home_df])
df["prem?"] = (df["division"] == "Premier League").astype("int64")
season_suffix = "_" + df["season"].astype(str)
df["team_season"] = df["team"] + season_suffix
df["opponent_team_season"] = df["opponent_team"] + season_suffix
df = df.sort_values(["match_date", "division"])

df

Unnamed: 0,division,season,match_date,match_id,team,avg_market_value,goals,shots,shots_on_target,corners,...,opponent_deep,opponent_ppda,opponent_red,opponent_odds,bet365_draw_odds,odds,home?,prem?,team_season,opponent_team_season
2989,Championship,20212022,2021-08-06,Bournemouth - West Brom_20210806,West Brom,4487500,2,15.0,5.0,8.0,...,,,0.0,2.30,3.25,3.20,0,0,West Brom_20212022,Bournemouth_20212022
2989,Championship,20212022,2021-08-06,Bournemouth - West Brom_20210806,Bournemouth,5313636,2,7.0,4.0,4.0,...,,,0.0,3.20,3.25,2.30,1,0,Bournemouth_20212022,West Brom_20212022
8462,Ligue 1,20212022,2021-08-06,Monaco - Nantes_20210806,Nantes,2745161,1,5.0,1.0,2.0,...,13.0,3.960000,0.0,1.44,4.33,7.00,0,0,Nantes_20212022,Monaco_20212022
8462,Ligue 1,20212022,2021-08-06,Monaco - Nantes_20210806,Monaco,10194595,1,15.0,3.0,5.0,...,2.0,26.000000,0.0,7.00,4.33,1.44,1,0,Monaco_20212022,Nantes_20212022
2990,Championship,20212022,2021-08-07,Sheffield United - Birmingham_20210807,Birmingham,1458621,1,9.0,2.0,2.0,...,,,0.0,1.75,3.50,5.00,0,0,Birmingham_20212022,Sheffield United_20212022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4980,Serie A,20242025,2025-03-16,Bologna - Lazio_20250316,Bologna,7433333,5,13.0,7.0,7.0,...,2.0,20.066667,0.0,3.25,3.10,2.40,1,0,Bologna_20242025,Lazio_20242025
4981,Serie A,20242025,2025-03-16,Fiorentina - Juventus_20250316,Fiorentina,8556667,3,9.0,3.0,1.0,...,6.0,10.947368,0.0,2.25,3.20,3.40,1,0,Fiorentina_20242025,Juventus_20242025
4982,Serie A,20242025,2025-03-16,Atalanta - Inter_20250316,Atalanta,17344000,0,16.0,3.0,2.0,...,9.0,10.304348,1.0,2.80,3.30,2.55,1,0,Atalanta_20242025,Inter_20242025
6401,La Liga,20242025,2025-03-27,Barcelona - Osasuna_20250327,Osasuna,4072000,0,4.0,0.0,4.0,...,,,0.0,1.29,7.00,7.00,0,0,Osasuna_20242025,Barcelona_20242025


In [4]:
def apply_weighted_avg(col, match_date, match_red, division=None, decay_rate=0.005, time_window=365):
    """Time-decayed weighted average of one metric over a team's matches.

    The inputs are parallel sequences (one entry per match) and are matched up
    by position, so repeated index labels are harmless. Entries whose ``col``
    value is missing are ignored.

    Weighting, all relative to the most recent match that has a value:
      * only matches at most ``time_window`` days older than it are used;
      * each match gets weight ``exp(-age_in_days * decay_rate)``;
      * matches with at least one red card (``match_red >= 1``) count 0.3x;
      * when ``division`` is given, matches played in a different division
        than the most recent match count 0.5x.

    Args:
        col: Metric values (numeric, may contain NaN).
        match_date: Match dates (datetime-like) aligned with ``col``.
        match_red: Red-card counts aligned with ``col``.
        division: Optional division labels aligned with ``col``.
        decay_rate: Exponential decay per day of age.
        time_window: Maximum age in days of a match to be included.

    Returns:
        The weighted average, or ``np.nan`` when no usable value exists.
    """
    has_value = np.asarray(pd.notna(col), dtype=bool)
    if not has_value.any():
        return np.nan

    values = np.asarray(col, dtype=float)[has_value]
    reds = np.asarray(match_red, dtype=float)[has_value]
    dates = pd.Series(match_date).reset_index(drop=True)[has_value].reset_index(drop=True)
    divisions = None
    if division is not None:
        divisions = np.asarray(division, dtype=object)[has_value]

    # Age of every match relative to the latest match that has a value
    recent_date = dates.max()
    age_days = (recent_date - dates).dt.days.to_numpy()

    in_window = age_days <= time_window
    if not in_window.any():
        return np.nan

    values = values[in_window]
    reds = reds[in_window]
    age_days = age_days[in_window]

    weights = np.exp(-age_days * decay_rate)

    # Matches played with red cards are less representative of normal strength
    weights = np.where(reds >= 1, weights * 0.3, weights)

    if divisions is not None:
        divisions = divisions[in_window]
        # Division of the most recent match (first one if several share that date)
        current_division = divisions[int(np.argmin(age_days))]
        # Results from another division (promotion/relegation) count half
        weights = np.where(divisions != current_division, weights * 0.5, weights)

    return np.sum(weights * values) / np.sum(weights)

In [5]:
# Order the rows by team and, within each team, chronologically
df = df.sort_values(by=['team', 'match_date'])

def calc_team_metrics_up_to_date(team, current_date, decay_rate=0.0065, time_window=365):
    """Rolling weighted metrics of `team` from its matches strictly before `current_date`.

    Reads the module-level per-team frame `df` and feeds each source column to
    `apply_weighted_avg` together with the match dates, the team's red cards
    and the division.

    Args:
        team: Team name as it appears in df['team'].
        current_date: Only matches with match_date < current_date are used.
        decay_rate: Passed through to apply_weighted_avg.
        time_window: Passed through to apply_weighted_avg.

    Returns:
        pd.Series with the 16 'rolling_*' metrics; all NaN when the team has
        no earlier match.
    """
    # output metric name -> source column in df
    metric_sources = {
        'rolling_goals_for': 'goals',
        'rolling_goals_against': 'opponent_goals',
        'rolling_xg_for': 'xG',
        'rolling_xg_against': 'opponent_xG',
        'rolling_shots_for': 'shots',
        'rolling_shots_against': 'opponent_shots',
        'rolling_shots_on_target_for': 'shots_on_target',
        'rolling_shots_on_target_against': 'opponent_shots_on_target',
        'rolling_corners_for': 'corners',
        'rolling_corners_against': 'opponent_corners',
        'rolling_deep_for': 'deep',
        'rolling_deep_against': 'opponent_deep',
        'rolling_ppda_for': 'ppda',
        'rolling_ppda_against': 'opponent_ppda',
        'rolling_odds_for': 'odds',
        'rolling_odds_against': 'opponent_odds',
    }

    # Strictly earlier matches only, so a match never contributes to its own features
    is_earlier_match = (df['team'] == team) & (df['match_date'] < current_date)
    history = df[is_earlier_match]

    if len(history) == 0:
        return pd.Series({name: np.nan for name in metric_sources})

    return pd.Series({
        name: apply_weighted_avg(
            history[source], history['match_date'], history['red'], history['division'],
            decay_rate, time_window
        )
        for name, source in metric_sources.items()
    })

def calc_team_metrics_multi_window(team, current_date, windows_and_decays=[(365, 0.0065), (30, 0.001)]):
    """Rolling team metrics for several (time window, decay rate) settings at once.

    Calls calc_team_metrics_up_to_date once per (window, decay) pair and merges
    the results into one Series. Metrics of the 365-day window keep their plain
    names; every other window gets a "_<window>d" suffix.

    Args:
        team: Team name.
        current_date: Only matches before this date are considered.
        windows_and_decays: Iterable of (window_in_days, decay_rate) pairs
            (only read, never modified).

    Returns:
        pd.Series holding every metric of every window.
    """
    combined = {}

    for window, decay in windows_and_decays:
        suffix = "" if window == 365 else f"_{window}d"
        per_window = calc_team_metrics_up_to_date(
            team,
            current_date,
            decay_rate=decay,
            time_window=window
        )
        combined.update({f"{metric}{suffix}": value for metric, value in per_window.items()})

    return pd.Series(combined)

# Compute the rolling features of every row's team and of its opponent
team_results = []
opponent_results = []

# Define window sizes and decay rates
windows_and_decays = [(365, 0.0065), (30, 0.001)]

# A (team, date) pair is needed once as "team" and once as "opponent" of the
# other side's row, so each result is computed once and reused.
# Rebuilt on every run of this cell because the results depend on df.
metrics_cache = {}


def _metrics_for(team, match_date):
    """Return the multi-window metrics of `team` before `match_date`, computing them at most once."""
    key = (team, match_date)
    if key not in metrics_cache:
        metrics_cache[key] = calc_team_metrics_multi_window(
            team,
            match_date,
            windows_and_decays=windows_and_decays
        )
    return metrics_cache[key]


for team, opponent, match_date in zip(df['team'], df['opponent_team'], df['match_date']):
    team_results.append(_metrics_for(team, match_date))

    # Same metrics for the opponent, exposed under an "opponent_" prefix
    opponent_metrics_raw = _metrics_for(opponent, match_date)
    opponent_results.append(pd.Series(
        {f"opponent_{metric}": value for metric, value in opponent_metrics_raw.items()}
    ))

# Create DataFrames from the results (rows are in the same order as df)
team_metrics_df = pd.DataFrame(team_results, index=df.index)
opponent_metrics_df = pd.DataFrame(opponent_results, index=df.index)

# Check for duplicated columns before concatenation
print("Team metrics columns:", team_metrics_df.columns.tolist())
print("Opponent metrics columns:", opponent_metrics_df.columns.tolist())

# Combine everything into the final DataFrame
final_df = pd.concat([df, team_metrics_df, opponent_metrics_df], axis=1)


final_df

Team metrics columns: ['rolling_goals_for', 'rolling_goals_against', 'rolling_xg_for', 'rolling_xg_against', 'rolling_shots_for', 'rolling_shots_against', 'rolling_shots_on_target_for', 'rolling_shots_on_target_against', 'rolling_corners_for', 'rolling_corners_against', 'rolling_deep_for', 'rolling_deep_against', 'rolling_ppda_for', 'rolling_ppda_against', 'rolling_odds_for', 'rolling_odds_against', 'rolling_goals_for_30d', 'rolling_goals_against_30d', 'rolling_xg_for_30d', 'rolling_xg_against_30d', 'rolling_shots_for_30d', 'rolling_shots_against_30d', 'rolling_shots_on_target_for_30d', 'rolling_shots_on_target_against_30d', 'rolling_corners_for_30d', 'rolling_corners_against_30d', 'rolling_deep_for_30d', 'rolling_deep_against_30d', 'rolling_ppda_for_30d', 'rolling_ppda_against_30d', 'rolling_odds_for_30d', 'rolling_odds_against_30d']
Opponent metrics columns: ['opponent_rolling_goals_for', 'opponent_rolling_goals_against', 'opponent_rolling_xg_for', 'opponent_rolling_xg_against', 'opp

Unnamed: 0,division,season,match_date,match_id,team,avg_market_value,goals,shots,shots_on_target,corners,...,opponent_rolling_shots_on_target_for_30d,opponent_rolling_shots_on_target_against_30d,opponent_rolling_corners_for_30d,opponent_rolling_corners_against_30d,opponent_rolling_deep_for_30d,opponent_rolling_deep_against_30d,opponent_rolling_ppda_for_30d,opponent_rolling_ppda_against_30d,opponent_rolling_odds_for_30d,opponent_rolling_odds_against_30d
8082,Ligue 1,20222023,2022-08-05,Lyon - Ajaccio_20220805,Ajaccio,710938,1,8.0,4.0,3.0,...,7.068052,4.737273,5.822685,5.123348,11.158575,7.036508,13.680607,12.016902,1.857353,5.246322
8096,Ligue 1,20222023,2022-08-14,Ajaccio - Lens_20220814,Ajaccio,710938,0,9.0,2.0,1.0,...,7.000000,4.000000,6.000000,3.000000,7.000000,4.000000,5.920000,18.710000,1.570000,5.250000
8105,Ligue 1,20222023,2022-08-21,Rennes - Ajaccio_20220821,Ajaccio,735000,1,10.0,3.0,8.0,...,7.003000,3.003000,6.498500,1.498500,10.991000,5.006000,6.873335,23.631320,2.307835,5.315475
8111,Ligue 1,20222023,2022-08-26,Ajaccio - Lille_20220826,Ajaccio,735000,1,13.0,2.0,6.0,...,7.341912,4.018368,6.652069,3.670437,9.640758,8.361629,12.455353,17.948796,3.377359,2.992995
8122,Ligue 1,20222023,2022-08-31,Montpellier - Ajaccio_20220831,Ajaccio,735000,0,15.0,0.0,8.0,...,6.258456,5.465609,3.509624,5.988240,6.013523,6.984163,12.480771,11.365481,7.309676,2.443786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,Premier League,20242025,2025-02-16,Liverpool - Wolves_20250216,Wolves,10972727,1,16.0,4.0,3.0,...,6.785002,3.693686,7.156575,2.239583,15.678170,4.090837,7.230244,14.780322,1.446936,7.793515
254,Premier League,20242025,2025-02-22,Bournemouth - Wolves_20250222,Wolves,10972727,1,13.0,5.0,7.0,...,7.485164,4.998216,4.502694,5.733792,5.481596,10.479800,7.312345,8.240788,3.252346,3.187490
264,Premier League,20242025,2025-02-25,Wolves - Fulham_20250225,Wolves,10972727,1,18.0,5.0,7.0,...,4.249590,3.010004,6.264898,3.773559,5.518628,5.989220,15.824290,18.219634,2.819511,2.954952
275,Premier League,20242025,2025-03-08,Wolves - Everton_20250308,Wolves,10972727,1,11.0,3.0,5.0,...,6.018758,3.308347,4.562368,5.553719,6.620515,5.726807,18.604640,10.016210,3.491567,2.698088


In [6]:
# Store the engineered features; rows whose match_id is already in the table are skipped.
# NOTE(review): the table-name suffix presumably mirrors the decay rates 0.0065 / 0.001 — confirm.
append_df_to_sqlite_table(
    final_df,
    db_path='match_db.db',
    table_name='xgb_match_features_0065_001',
    key_columns=['match_id'],
)

final_df

Created new table xgb_match_features_0065_001 with 17686 records


Unnamed: 0,division,season,match_date,match_id,team,avg_market_value,goals,shots,shots_on_target,corners,...,opponent_rolling_shots_on_target_for_30d,opponent_rolling_shots_on_target_against_30d,opponent_rolling_corners_for_30d,opponent_rolling_corners_against_30d,opponent_rolling_deep_for_30d,opponent_rolling_deep_against_30d,opponent_rolling_ppda_for_30d,opponent_rolling_ppda_against_30d,opponent_rolling_odds_for_30d,opponent_rolling_odds_against_30d
8082,Ligue 1,20222023,2022-08-05,Lyon - Ajaccio_20220805,Ajaccio,710938,1,8.0,4.0,3.0,...,7.068052,4.737273,5.822685,5.123348,11.158575,7.036508,13.680607,12.016902,1.857353,5.246322
8096,Ligue 1,20222023,2022-08-14,Ajaccio - Lens_20220814,Ajaccio,710938,0,9.0,2.0,1.0,...,7.000000,4.000000,6.000000,3.000000,7.000000,4.000000,5.920000,18.710000,1.570000,5.250000
8105,Ligue 1,20222023,2022-08-21,Rennes - Ajaccio_20220821,Ajaccio,735000,1,10.0,3.0,8.0,...,7.003000,3.003000,6.498500,1.498500,10.991000,5.006000,6.873335,23.631320,2.307835,5.315475
8111,Ligue 1,20222023,2022-08-26,Ajaccio - Lille_20220826,Ajaccio,735000,1,13.0,2.0,6.0,...,7.341912,4.018368,6.652069,3.670437,9.640758,8.361629,12.455353,17.948796,3.377359,2.992995
8122,Ligue 1,20222023,2022-08-31,Montpellier - Ajaccio_20220831,Ajaccio,735000,0,15.0,0.0,8.0,...,6.258456,5.465609,3.509624,5.988240,6.013523,6.984163,12.480771,11.365481,7.309676,2.443786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,Premier League,20242025,2025-02-16,Liverpool - Wolves_20250216,Wolves,10972727,1,16.0,4.0,3.0,...,6.785002,3.693686,7.156575,2.239583,15.678170,4.090837,7.230244,14.780322,1.446936,7.793515
254,Premier League,20242025,2025-02-22,Bournemouth - Wolves_20250222,Wolves,10972727,1,13.0,5.0,7.0,...,7.485164,4.998216,4.502694,5.733792,5.481596,10.479800,7.312345,8.240788,3.252346,3.187490
264,Premier League,20242025,2025-02-25,Wolves - Fulham_20250225,Wolves,10972727,1,18.0,5.0,7.0,...,4.249590,3.010004,6.264898,3.773559,5.518628,5.989220,15.824290,18.219634,2.819511,2.954952
275,Premier League,20242025,2025-03-08,Wolves - Everton_20250308,Wolves,10972727,1,11.0,3.0,5.0,...,6.018758,3.308347,4.562368,5.553719,6.620515,5.726807,18.604640,10.016210,3.491567,2.698088
