In [21]:
import warnings
import requests
import pandas as pd
import io
import os

from sklearn.preprocessing import StandardScaler


### Data Loading

In [22]:
# Suppress divide by zero warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, message="divide by zero encountered in log")


# The API key is read from the environment so it never lands in the notebook.
API_KEY = os.getenv("API_KEY")
url = 'https://data-service.beatthebookie.blog/data'
headers = {"x-api-key": API_KEY}
# requests has no default timeout; without one a stalled connection blocks the
# kernel forever. The value bounds connect/read waits, not the total download.
REQUEST_TIMEOUT_SECONDS = 60


def fetch_division(division):
    """Download the match data for one division and parse it into a DataFrame.

    Parameters
    ----------
    division : str
        Value sent as the ``division`` query parameter, e.g. 'Premier League'.

    Returns
    -------
    pandas.DataFrame
        The UTF-8 decoded JSON response body as parsed by ``pd.read_json``.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status (for example when the
        API key is missing or rejected), instead of parsing the error body
        as if it were match data.
    requests.Timeout
        If the service does not respond within ``REQUEST_TIMEOUT_SECONDS``.
    """
    response = requests.get(
        url,
        headers=headers,
        params={'division': division},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    json_str = response.content.decode('utf-8')
    return pd.read_json(io.StringIO(json_str))


prem_df = fetch_division('Premier League')

# Every club that appears (home or away) in Premier League season 20242025.
prem_matches_25 = prem_df[prem_df["season"] == 20242025]
prem_teams_25 = pd.concat([prem_matches_25['home_team'], prem_matches_25['away_team']]).unique()

champ_df = fetch_division('Championship')

# Both divisions stacked into one frame. Each frame keeps its own index labels
# (no ignore_index), so labels can repeat between the two divisions.
df = pd.concat([champ_df, prem_df])
#df = df[(df['home_team'].isin(prem_teams_25)) | (df['away_team'].isin(prem_teams_25))]

print(df[["season", "match_date", "home_team", "away_team", "home_goals", "home_xgoals", "away_goals", "away_xgoals"]].tail())

        season  match_date    home_team  away_team  home_goals  home_xgoals  \
3835  20242025  2024-09-14       Fulham   West Ham           1     2.886010   
3836  20242025  2024-09-14  Aston Villa    Everton           3     3.158760   
3837  20242025  2024-09-14     Brighton    Ipswich           0     1.756960   
3838  20242025  2024-09-15    Tottenham    Arsenal           0     0.792595   
3839  20242025  2024-09-15       Wolves  Newcastle           1     1.469690   

      away_goals  away_xgoals  
3835           1     0.682719  
3836           2     0.852158  
3837           0     0.721323  
3838           1     1.120900  
3839           2     1.483430  


### Data Cleansing

In [23]:
# Parse the match dates so the column holds real datetimes.
df["match_date"] = pd.to_datetime(df["match_date"])

# Columns listed below are converted to a numeric datatype
numeric_cols = [
    "home_num_players", "home_market_value", "home_avg_market_value",
    "away_num_players", "away_market_value", "away_avg_market_value",
    "home_goals", "away_goals",
    "home_shots", "away_shots",
    "home_shots_on_target", "away_shots_on_target",
    "home_corners", "away_corners",
    "home_red", "away_red",
    "home_yellow", "away_yellow",
    "home_deep", "away_deep",
    "home_ppda", "away_ppda",
]
# errors="coerce" turns any value that cannot be parsed into NaN instead of raising.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")


def _null_rows_and_counts(frame, column):
    """Return the rows of `frame` where `column` is null, plus a table
    counting those rows per (division, season)."""
    missing_rows = frame[frame[column].isnull()]
    per_group = missing_rows.groupby(["division", "season"]).size()
    return missing_rows, per_group.reset_index(name='count')


# Null count for every column
null_totals = df.isnull().sum()
print(null_totals)

# Which division/season combinations are missing home_shots?
home_shot_nulls, home_shot_null_counts = _null_rows_and_counts(df, "home_shots")
print(home_shot_null_counts)

# Which division/season combinations are missing deep/ppda?
# (home_ppda is the column actually probed here.)
ppda_nulls, ppda_null_counts = _null_rows_and_counts(df, "home_ppda")
print(ppda_null_counts)


division_id                 0
division                    0
season_id                   0
season                      0
match_date                  0
match_teams                 0
home_team_id                0
home_team                   0
away_team_id                0
away_team                   0
home_num_players            0
home_market_value           0
home_avg_market_value       0
away_num_players            0
away_market_value           0
away_avg_market_value       0
home_goals                  0
away_goals                  0
home_shots                430
away_shots                  0
home_shots_on_target        0
away_shots_on_target        0
home_corners                0
away_corners                0
home_yellow                 0
away_yellow                 0
home_red                    0
away_red                    0
home_xgoals                 0
away_xgoals                 0
home_deep                3912
away_deep                3912
home_ppda                3912
away_ppda 

The analysis above shows that the Championship is missing deep/ppda data and that some seemingly random seasons of both the Premier League and the Championship are missing home_shots data. I will not handle these missing values directly, because I am going to use an XGBoost model, which handles missing values itself during training.

In [24]:
# Separates out the home vs away sides so that every team has an individual row
# of data per match. This makes the home effect easier to feature engineer.
#
# NOTE: this cell rebinds `df` from match-level to team-level rows. Re-running
# it without re-running the cells above fails, because the home_*/away_*
# columns it selects no longer exist after the rename.

# Columns that describe the match itself and are shared by both sides.
_MATCH_KEY_COLS = ["division", "season", "match_date"]

# Per-side stat suffix in the match-level data -> name in the team-level data.
# Order matters: it fixes the column order of the reshaped frame.
_SIDE_STAT_NAMES = {
    "team": "team",
    "avg_market_value": "avg_market_value",
    "goals": "goals",
    "shots": "shots",
    "xgoals": "xG",
    "deep": "deep",
    "ppda": "ppda",
    "shots_on_target": "shots_on_target",
    "corners": "corners",
    "yellow": "yellow",
    "red": "red",
}


def _team_perspective(matches, side):
    """Return one row per match as seen from one side of the fixture.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match-level frame holding the `_MATCH_KEY_COLS` plus a `home_<stat>`
        and an `away_<stat>` column for every key of `_SIDE_STAT_NAMES`.
    side : str
        "home" or "away": which side's stats become the unprefixed columns.
        The other side's stats are renamed with an `opponent_` prefix.

    Returns
    -------
    pandas.DataFrame
        A copy with the match keys, the side's own stats, the opponent's
        stats, and a trailing "home?" flag (1 for the home side, 0 for away).

    Raises
    ------
    ValueError
        If `side` is neither "home" nor "away".
    """
    if side not in ("home", "away"):
        raise ValueError(f"side must be 'home' or 'away', got {side!r}")
    other = "away" if side == "home" else "home"

    own_cols = [f"{side}_{stat}" for stat in _SIDE_STAT_NAMES]
    opponent_cols = [f"{other}_{stat}" for stat in _SIDE_STAT_NAMES]

    team_rows = matches[_MATCH_KEY_COLS + own_cols + opponent_cols].copy()
    team_rows["home?"] = int(side == "home")

    renames = {}
    for stat, new_name in _SIDE_STAT_NAMES.items():
        renames[f"{side}_{stat}"] = new_name
        renames[f"{other}_{stat}"] = f"opponent_{new_name}"
    return team_rows.rename(columns=renames)


home_df = _team_perspective(df, "home")
away_df = _team_perspective(df, "away")

# Away rows are stacked before home rows, then ordered chronologically
# (and by division within the same date).
df = pd.concat([away_df, home_df]).sort_values(["match_date", "division"])