In [41]:
import pandas as pd
import requests
import os
import io



# The API key is read from the environment so it is never written into the
# notebook; os.getenv returns None when the API_KEY variable is not set.
API_KEY = os.getenv("API_KEY")
# Endpoint that fetch_data sends its GET requests to.
url = 'https://data-service.beatthebookie.blog/data'
# Headers sent with every request; the key travels in the x-api-key header.
headers = {"x-api-key": API_KEY}

# Function to fetch data for a specific division and season
def fetch_data(division, season, timeout=30):
    """Fetch match data for one division/season pair from the data service.

    Sends a GET request to the module-level ``url`` with the module-level
    ``headers`` and parses the JSON body into a DataFrame.

    Parameters
    ----------
    division : str
        Value sent as the ``division`` query parameter.
    season : str
        Value sent as the ``season`` query parameter.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled server cannot hang the
        notebook indefinitely.  Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        The parsed response on HTTP 200.  On any other status code, or when
        the request itself fails (connection error, timeout, ...), the
        problem is printed and an empty DataFrame is returned so that the
        caller can skip this combination.
    """
    params = {
        'division': division,
        'season': season
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Transport-level failures follow the same "report and return empty"
        # convention as non-200 responses instead of aborting the whole run.
        print(f"Error fetching {division} {season}: {exc}")
        return pd.DataFrame()

    body = response.content.decode('utf-8')
    if response.status_code == 200:
        # Wrap in StringIO: pd.read_json expects a path or file-like object.
        return pd.read_json(io.StringIO(body))

    print(f"Error fetching {division} {season}: {response.status_code}")
    print(body)
    return pd.DataFrame()

# Fetch data for all combinations
seasons = ['2024_2025']
divisions = ['Premier League']
dataframes = []

for division in divisions:
    for season in seasons:
        # Keep the per-request frame under its own name so it can never be
        # mistaken for (or silently stand in for) the combined `df` below.
        season_df = fetch_data(division, season)
        if not season_df.empty:
            dataframes.append(season_df)

# Combine all dataframes
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)

    # Convert match_date to datetime
    df['match_date'] = pd.to_datetime(df['match_date'])

    # Filter for dates after January 11, 2024 (currently disabled)
    #df = df[df["match_date"] > '2024-01-11']
else:
    # Nothing was fetched: define `df` explicitly so later cells see an empty
    # frame rather than a leftover loop variable or an undefined name.
    df = pd.DataFrame()
    print("No match data was fetched; df is empty.")

df.columns

Index(['division_id', 'division', 'season_id', 'season', 'match_date',
       'match_teams', 'home_team_id', 'home_team', 'away_team_id', 'away_team',
       'home_num_players', 'home_market_value', 'home_avg_market_value',
       'away_num_players', 'away_market_value', 'away_avg_market_value',
       'home_goals', 'away_goals', 'home_shots', 'away_shots',
       'home_shots_on_target', 'away_shots_on_target', 'home_corners',
       'away_corners', 'home_yellow', 'away_yellow', 'home_red', 'away_red',
       'home_xgoals', 'away_xgoals', 'home_deep', 'away_deep', 'home_ppda',
       'away_ppda', 'bet365_home_odds', 'bet365_draw_odds', 'bet365_away_odds',
       'bet365_u25_odds', 'bet365_o25_odds'],
      dtype='object')

In [None]:
import datetime
import numpy as np

def preprocess_data(df, halflife=90):
    """Build a per-match table of time-weighted team form features.

    For every team, each of its match dates gets an exponentially
    time-weighted average of the team's *earlier* matches (xG, goals and
    shots, both for and against).  Before normalisation a match played
    ``halflife`` days earlier has weight 0.5, one played ``2 * halflife``
    days earlier has weight 0.25, and so on.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match.  Must contain ``match_date`` (datetime dtype),
        ``home_team``, ``away_team`` and the ``home_*`` / ``away_*`` variants
        of ``xgoals``, ``goals`` and ``shots``.  The input is not modified.
    halflife : float, optional
        Half-life of the exponential decay, in days.  Must be positive.
        Defaults to 90.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with ``match_date``,
        ``home_team``, ``away_team`` followed by six ``home_weighted_*`` and
        six ``away_weighted_*`` columns.

    Raises
    ------
    ValueError
        If ``halflife`` is not positive.

    Notes
    -----
    * A team's first match has no history, so its features are that match's
      own statistics.  NOTE(review): if these features feed a predictive
      model this leaks the match outcome into its own features -- confirm
      this fallback is intended.
    * Match dates are de-duplicated per team before computing metrics, so
      several rows sharing a date (e.g. duplicated input rows) still yield
      scalar feature values instead of a Series per cell.
    """
    if halflife <= 0:
        raise ValueError("halflife must be a positive number of days")

    # (output metric, per-team column, raw column when home, raw column when away)
    metric_table = [
        ('weighted_xg', 'team_xg', 'home_xgoals', 'away_xgoals'),
        ('weighted_goals', 'team_goals', 'home_goals', 'away_goals'),
        ('weighted_shots', 'team_shots', 'home_shots', 'away_shots'),
        ('weighted_xg_against', 'opponent_xg', 'away_xgoals', 'home_xgoals'),
        ('weighted_goals_against', 'opponent_goals', 'away_goals', 'home_goals'),
        ('weighted_shots_against', 'opponent_shots', 'away_shots', 'home_shots'),
    ]
    team_column = {metric: team_col for metric, team_col, _, _ in metric_table}
    # Raw column -> team/opponent column, from each side's point of view.
    home_rename = {home_col: team_col for _, team_col, home_col, _ in metric_table}
    away_rename = {away_col: team_col for _, team_col, _, away_col in metric_table}
    # Output metric -> raw column used when no computed metric is available.
    fallback_columns = {
        'home': {metric: home_col for metric, _, home_col, _ in metric_table},
        'away': {metric: away_col for metric, _, _, away_col in metric_table},
    }

    def calculate_time_weighted_metrics(group, current_date, halflife=halflife):
        # Exponential decay on the age of each match relative to current_date:
        # a match `halflife` days old gets weight 0.5 (before normalisation).
        days_ago = (current_date - group["match_date"]).dt.total_seconds() / (24 * 3600)
        weights = np.exp(-np.log(2) * days_ago / halflife)
        weights = weights / weights.sum()
        return pd.Series({
            metric: np.average(group[column], weights=weights)
            for metric, column in team_column.items()
        })

    team_metrics = {}

    # Re-express every match from the point of view of each team
    # (team_* = the team itself, opponent_* = the other side).
    for team in pd.concat([df["home_team"], df["away_team"]]).unique():
        home_games = df[df["home_team"] == team]
        away_games = df[df["away_team"] == team]

        team_games = pd.concat([
            home_games[["match_date", *home_rename]].rename(columns=home_rename),
            away_games[["match_date", *away_rename]].rename(columns=away_rename),
        ]).sort_values('match_date')

        # One metrics row per distinct match date; iterating over every row
        # would create duplicate index entries when dates repeat.
        metrics = []
        for match_date in team_games["match_date"].drop_duplicates():
            historical_games = team_games[team_games['match_date'] < match_date]

            if not historical_games.empty:
                metrics.append(
                    (match_date, calculate_time_weighted_metrics(historical_games, match_date))
                )
            elif not metrics:
                # No earlier match exists: fall back to this match's own stats.
                current_game = team_games[team_games['match_date'] == match_date].iloc[0]
                metrics.append((match_date, pd.Series({
                    metric: current_game[column]
                    for metric, column in team_column.items()
                })))

        if metrics:
            team_metrics[team] = pd.DataFrame(
                [values for _, values in metrics],
                index=[date for date, _ in metrics],
            )

    # Create final feature matrix
    feature_matrix = []

    for _, match in df.iterrows():
        match_date = match['match_date']
        match_features = {
            'match_date': match_date,
            'home_team': match['home_team'],
            'away_team': match['away_team'],
        }

        for side in ('home', 'away'):
            history = team_metrics.get(match[f'{side}_team'])
            has_metrics = history is not None and match_date in history.index
            for metric, raw_column in fallback_columns[side].items():
                # Use the computed metric when present, else the raw match value.
                match_features[f'{side}_{metric}'] = (
                    history.loc[match_date, metric] if has_metrics else match[raw_column]
                )

        feature_matrix.append(match_features)

    return pd.DataFrame(feature_matrix)

# Build the per-match feature table from the combined match data.
df_processed = preprocess_data(df)

# Display the last rows (tail() defaults to five) as a quick check.
df_processed.tail()

Unnamed: 0,match_date,home_team,away_team,home_weighted_xg,home_weighted_goals,home_weighted_shots,home_weighted_xg_against,home_weighted_goals_against,home_weighted_shots_against,away_weighted_xg,away_weighted_goals,away_weighted_shots,away_weighted_xg_against,away_weighted_goals_against,away_weighted_shots_against
224,2025-01-25,Wolves,Arsenal,1.136024,1.431693,10.833201,1.86845,2.238081,14.050353,2.152061,1.991657,13.993792,0.946929,0.96796,9.5172
225,2025-01-26,Tottenham,Leicester,1.851806,1.999967,13.638911,1.973517,1.775592,13.240317,1.183211,0.921663,9.222847,2.250792,2.266296,16.697698
226,2025-01-26,Crystal Palace,Brentford,1.701459,1.261889,13.602939,1.551647,1.198665,12.998012,1.783754,1.77814,11.148676,1.938484,1.738553,19.100524
227,2025-01-26,Fulham,Man United,1.718749,1.609673,14.040803,1.244132,1.399859,10.639072,1.60601,1.293302,13.625785,1.697145,1.580308,11.022146
228,2025-01-26,Aston Villa,West Ham,1.749424,1.464968,12.460014,1.472289,1.552279,12.286367,1.358391,1.205322,13.356695,2.134728,2.037547,16.666992
