In [6]:
import pandas as pd
import requests
import os
import io



# The key is read from the environment so it never has to be written into the notebook.
API_KEY = os.environ.get("API_KEY")

# Endpoint queried by fetch_data (one request per division/season pair).
url = "https://data-service.beatthebookie.blog/data"

# The key is sent with every request in the x-api-key header.
headers = dict([("x-api-key", API_KEY)])

# Function to fetch data for a specific division and season
def fetch_data(division, season, timeout=30):
    """Download the match data for one division/season pair.

    Parameters
    ----------
    division : str
        Value sent as the ``division`` query parameter.
    season : str
        Value sent as the ``season`` query parameter.
    timeout : float or tuple, optional
        Passed to ``requests.get``; bounds how long the call may wait on the
        server so a stalled connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        The response body parsed as JSON when the server answers with
        HTTP 200. For any other status code, or when the request itself
        fails (connection error, timeout, ...), the problem is printed and
        an empty DataFrame is returned.
    """
    params = {
        'division': division,
        'season': season
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the same best-effort contract as the non-200 path below.
        print(f"Error fetching {division} {season}: {exc}")
        return pd.DataFrame()

    if response.status_code != 200:
        print(f"Error fetching {division} {season}: {response.status_code}")
        print(response.content.decode('utf-8'))
        return pd.DataFrame()

    return pd.read_json(io.StringIO(response.content.decode('utf-8')))

# Fetch data for every division/season combination
seasons = ['2024_2025']
divisions = ['Premier League']
dataframes = []

for division in divisions:
    for season in seasons:
        # A dedicated name keeps a single response from being mistaken for
        # the combined `df` built below.
        season_df = fetch_data(division, season)
        if not season_df.empty:
            dataframes.append(season_df)

if dataframes:
    # Combine all responses into one frame with a fresh 0..n-1 index
    df = pd.concat(dataframes, ignore_index=True)

    # Later cells do date arithmetic on match_date, so convert it to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])
else:
    # Always define `df` so the failure is visible here instead of
    # surfacing later as a NameError or a missing-column error.
    print("No match data was fetched; `df` is empty.")
    df = pd.DataFrame()

df.columns

Index(['division_id', 'division', 'season_id', 'season', 'match_date',
       'match_teams', 'home_team_id', 'home_team', 'away_team_id', 'away_team',
       'home_num_players', 'home_market_value', 'home_avg_market_value',
       'away_num_players', 'away_market_value', 'away_avg_market_value',
       'home_goals', 'away_goals', 'home_shots', 'away_shots',
       'home_shots_on_target', 'away_shots_on_target', 'home_corners',
       'away_corners', 'home_yellow', 'away_yellow', 'home_red', 'away_red',
       'home_xgoals', 'away_xgoals', 'home_deep', 'away_deep', 'home_ppda',
       'away_ppda', 'bet365_home_odds', 'bet365_draw_odds', 'bet365_away_odds',
       'bet365_u25_odds', 'bet365_o25_odds'],
      dtype='object')

In [7]:
import datetime
import numpy as np

# Output metric name -> column of the per-team game log it is averaged from.
_WEIGHTED_METRIC_SOURCES = {
    'weighted_xg': 'team_xg',
    'weighted_goals': 'team_goals',
    'weighted_shots': 'team_shots',
    'weighted_xg_against': 'opponent_xg',
    'weighted_goals_against': 'opponent_goals',
    'weighted_shots_against': 'opponent_shots',
}

# Raw column suffix (after "home_"/"away_") -> suffix used in the game log.
_GAME_LOG_STATS = {'xgoals': 'xg', 'goals': 'goals', 'shots': 'shots'}

# (side the team plays on, side the opponent plays on); home is emitted first.
_SIDES = (("home", "away"), ("away", "home"))


def _time_weighted_metrics(history, current_date, halflife):
    """Average a team's earlier games with exponentially decaying weights.

    A game played ``halflife`` days before ``current_date`` gets half the
    weight of a game played on ``current_date`` itself.
    """
    days_ago = (current_date - history["match_date"]).dt.total_seconds() / (24 * 3600)
    weights = np.exp(-np.log(2) * days_ago / halflife)
    weights = weights / weights.sum()
    return pd.Series({
        metric: np.average(history[source], weights=weights)
        for metric, source in _WEIGHTED_METRIC_SOURCES.items()
    })


def _team_game_log(df, team):
    """Collect every game of ``team`` in team/opponent terms, sorted by date."""
    frames = []
    for side, other in _SIDES:
        renames = {f"{side}_{raw}": f"team_{short}" for raw, short in _GAME_LOG_STATS.items()}
        renames.update(
            {f"{other}_{raw}": f"opponent_{short}" for raw, short in _GAME_LOG_STATS.items()}
        )
        games = df.loc[df[f"{side}_team"] == team, ["match_date", *renames]]
        frames.append(games.rename(columns=renames))
    return pd.concat(frames).sort_values("match_date")


def _team_metric_history(team_games, halflife):
    """Build the per-date weighted metrics of one team (None if it has no dated game)."""
    metrics_by_date = {}
    # Missing dates cannot be ordered and repeated dates would make the index
    # ambiguous, so each valid date is handled exactly once.
    for match_date in team_games["match_date"].dropna().drop_duplicates():
        earlier_games = team_games[team_games["match_date"] < match_date]
        if earlier_games.empty:
            # First dated game: there is no history yet, so the game's own
            # stats stand in for the averages. NOTE: these features therefore
            # contain the outcome of that same match.
            opener = team_games[team_games["match_date"] == match_date].iloc[0]
            metrics_by_date[match_date] = pd.Series({
                metric: opener[source]
                for metric, source in _WEIGHTED_METRIC_SOURCES.items()
            })
        else:
            metrics_by_date[match_date] = _time_weighted_metrics(
                earlier_games, match_date, halflife
            )

    if not metrics_by_date:
        return None
    return pd.DataFrame(list(metrics_by_date.values()), index=list(metrics_by_date.keys()))


def _side_metrics(metrics_df, match, side, other):
    """Weighted metrics for the team playing on ``side`` in ``match``.

    Falls back to that team's own stats from the match when no metrics are
    recorded for the match date.
    """
    match_date = match['match_date']
    if metrics_df is not None and not pd.isna(match_date) and match_date in metrics_df.index:
        return {
            metric: metrics_df.loc[match_date, metric]
            for metric in _WEIGHTED_METRIC_SOURCES
        }
    return {
        'weighted_xg': match[f'{side}_xgoals'],
        'weighted_goals': match[f'{side}_goals'],
        'weighted_shots': match[f'{side}_shots'],
        'weighted_xg_against': match[f'{other}_xgoals'],
        'weighted_goals_against': match[f'{other}_goals'],
        'weighted_shots_against': match[f'{other}_shots'],
    }


def preprocess_data(df, halflife=90):
    """Turn one-row-per-match data into one-row-per-team features.

    Parameters
    ----------
    df : pandas.DataFrame
        Match data with the columns ``match_date`` (datetime), ``home_team``,
        ``away_team`` and, for both the ``home_`` and ``away_`` prefix,
        ``goals``, ``xgoals``, ``shots`` and ``red``. The frame is not modified.
    halflife : float, optional
        Age in days at which an earlier game counts half as much as a game
        played on the current match date. Must be positive.

    Returns
    -------
    pandas.DataFrame
        Two rows per input match, home team first. Each row holds the match
        date, team, opponent, ``is_home`` (1 for the home team, 0 for the away
        team), the actual goals and red cards of both sides, the team's six
        time-weighted metrics and the same six metrics for the opponent
        (prefixed with ``opponent_``).

    Raises
    ------
    ValueError
        If ``halflife`` is not positive.
    """
    if halflife <= 0:
        raise ValueError("halflife must be a positive number of days")

    # Weighted history for every team that appears on either side.
    team_metrics = {}
    for team in pd.concat([df["home_team"], df["away_team"]]).unique():
        history = _team_metric_history(_team_game_log(df, team), halflife)
        if history is not None:
            team_metrics[team] = history

    feature_rows = []
    for _, match in df.iterrows():
        for side, other in _SIDES:
            team = match[f"{side}_team"]
            opponent = match[f"{other}_team"]

            row = {
                'match_date': match['match_date'],
                'team': team,
                'opponent': opponent,
                'is_home': 1 if side == "home" else 0,
                # Actual match stats
                'team_goals': match[f"{side}_goals"],
                'opponent_goals': match[f"{other}_goals"],
                'team_red': match[f"{side}_red"],
                'opponent_red': match[f"{other}_red"],
            }
            # Team stats
            row.update(_side_metrics(team_metrics.get(team), match, side, other))
            # Opponent stats, seen from the opponent's own side of the match
            opponent_stats = _side_metrics(team_metrics.get(opponent), match, other, side)
            row.update({f"opponent_{metric}": value for metric, value in opponent_stats.items()})

            feature_rows.append(row)

    return pd.DataFrame(feature_rows)


# Build the two-rows-per-match feature table from the fetched matches.
df_processed = preprocess_data(df)

# Show the last 20 rows as a quick sanity check of the result.
df_processed.tail(n=20)

Unnamed: 0,match_date,team,opponent,is_home,team_goals,opponent_goals,team_red,opponent_red,weighted_xg,weighted_goals,weighted_shots,weighted_xg_against,weighted_goals_against,weighted_shots_against,opponent_weighted_xg,opponent_weighted_goals,opponent_weighted_shots,opponent_weighted_xg_against,opponent_weighted_goals_against,opponent_weighted_shots_against
438,2025-01-25,Brighton,Everton,1,0,1,0,0,1.526115,1.567166,13.747298,1.503567,1.345606,11.523477,1.136631,0.839355,11.053859,1.654501,1.192451,13.001135
439,2025-01-25,Everton,Brighton,0,1,0,0,0,1.136631,0.839355,11.053859,1.654501,1.192451,13.001135,1.526115,1.567166,13.747298,1.503567,1.345606,11.523477
440,2025-01-25,Southampton,Newcastle,1,1,3,0,0,1.176651,0.708524,9.341829,2.828527,2.389309,18.680362,1.870725,1.914523,14.720948,1.435757,1.220325,12.613749
441,2025-01-25,Newcastle,Southampton,0,3,1,0,0,1.870725,1.914523,14.720948,1.435757,1.220325,12.613749,1.176651,0.708524,9.341829,2.828527,2.389309,18.680362
442,2025-01-25,Man City,Chelsea,1,3,1,0,0,2.057934,2.057514,16.781185,1.688453,1.327805,10.725323,2.250647,1.949861,17.073794,1.533524,1.247602,10.120215
443,2025-01-25,Chelsea,Man City,0,1,3,0,0,2.250647,1.949861,17.073794,1.533524,1.247602,10.120215,2.057934,2.057514,16.781185,1.688453,1.327805,10.725323
444,2025-01-25,Liverpool,Ipswich,1,4,1,0,0,2.795517,2.50766,19.315519,1.028941,1.072385,9.693167,1.109312,0.882123,9.143042,2.282213,2.015288,15.32475
445,2025-01-25,Ipswich,Liverpool,0,1,4,0,0,1.109312,0.882123,9.143042,2.282213,2.015288,15.32475,2.795517,2.50766,19.315519,1.028941,1.072385,9.693167
446,2025-01-25,Bournemouth,Nott'm Forest,1,5,0,0,0,2.121397,1.754783,16.148324,1.371913,1.124931,13.711851,1.527424,1.618907,12.167483,1.342608,1.012213,13.419905
447,2025-01-25,Nott'm Forest,Bournemouth,0,0,5,0,0,1.527424,1.618907,12.167483,1.342608,1.012213,13.419905,2.121397,1.754783,16.148324,1.371913,1.124931,13.711851


In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
import xgboost as xgb

# Columns of the preprocessed frame that are fed to the goal models.
features = [
    # the team's own time-weighted attacking numbers
    "weighted_xg",
    "weighted_goals",
    "weighted_shots",
    # what the team has been conceding
    "weighted_xg_against",
    "weighted_goals_against",
    "weighted_shots_against",
    # the opponent's time-weighted attacking numbers
    "opponent_weighted_xg",
    "opponent_weighted_goals",
    "opponent_weighted_shots",
    # what the opponent has been conceding
    "opponent_weighted_xg_against",
    "opponent_weighted_goals_against",
    "opponent_weighted_shots_against",
    # match context
    "team_red",
    "opponent_red",
    "is_home",
]


def goals_poisson_model(df, features):
    """Fit Poisson regressions for goals scored and goals conceded.

    The columns named in ``features`` are standardised with a
    ``StandardScaler`` fitted on every row of ``df``. Two ``PoissonRegressor``
    models with default settings are then trained on that same scaled matrix:
    one with ``team_goals`` as the target and one with ``opponent_goals``.

    Returns
    -------
    tuple
        ``(model_goals_for, model_goals_against, scaler)``. The fitted scaler
        is returned so new rows can be transformed the same way before
        predicting.
    """
    scaler = StandardScaler()
    scaled_inputs = scaler.fit_transform(df[features])

    # fit() returns the estimator itself, so each model is built and trained in one step
    model_goals_for = PoissonRegressor().fit(scaled_inputs, df["team_goals"])
    model_goals_against = PoissonRegressor().fit(scaled_inputs, df["opponent_goals"])

    return model_goals_for, model_goals_against, scaler
