# Statsbomb Data Preparation


In [2]:
import os
import warnings
import pandas as pd
import numpy as np
from scipy.spatial import distance
import requests
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from statsbombpy import sb

In [3]:
pd.set_option("display.max_columns", None)
data_dir = f"{os.path.abspath(os.path.join(os.getcwd(), '..'))}/data/"

# Suppress the NoAuthWarning from statsbombpy
warnings.filterwarnings("ignore", category=UserWarning, module="statsbombpy")

In [3]:
comps = sb.competitions()
comps = comps[comps["competition_gender"] == "male"]
comps[["competition_name", "competition_id", "season_id", "season_name"]]
comps_list_of_tuples = list(
    comps[["competition_id", "competition_name", "season_id", "season_name"]].itertuples(index=False, name=None)
)



## Download Data


In [4]:
# URL template for lineups
lineups_url_template = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/lineups/{match_id}.json"


# Example function to get match_ids
def get_match_ids(competition_id, season_id):
    # URL for matches in a specific competition and season
    matches_url = (
        f"https://raw.githubusercontent.com/statsbomb/open-data/master/data/matches/{competition_id}/{season_id}.json"
    )

    # Fetch the matches data
    response = requests.get(matches_url)
    matches_data = response.json()

    # Extract match_ids
    match_ids = [match["match_id"] for match in matches_data]
    return match_ids

In [15]:
for comp in tqdm(comps_list_of_tuples[2:], desc="Processing Competitions"):
    competition_id = comp[0]
    competition_name = comp[1]
    season_id = comp[2]
    season_name = comp[3]

    match_ids = get_match_ids(competition_id, season_id)

    required_columns = [
        "location",
        "minute",
        "play_pattern",
        "player",
        "player_id",
        "position",
        "possession",
        "possession_team",
        "possession_team_id",
        "second",
        "shot_aerial_won",
        "shot_body_part",
        "shot_first_time",
        "shot_freeze_frame",
        "shot_one_on_one",
        "shot_outcome",
        "shot_statsbomb_xg",
        "shot_technique",
        "shot_type",
        "team",
        "team_id",
        "timestamp",
        "type",
        "under_pressure",
    ]

    # Initialize a list to store all events data
    all_events_data = []

    for match_id in match_ids:
        events = sb.events(match_id=match_id)
        for col in required_columns:
            if col not in events.columns:
                events[col] = np.nan

        shot_events = events[events["type"] == "Shot"][required_columns]

        # Add match_id to each event
        shot_events["match_id"] = match_id

        # Append to the list
        all_events_data.append(shot_events)

    # Concatenate all events data into a single DataFrame
    all_events_df = pd.concat(all_events_data, ignore_index=True)
    # Add competition and season information
    all_events_df["competition_id"] = competition_id
    all_events_df["competition_name"] = competition_name
    all_events_df["season_id"] = season_id
    all_events_df["season_name"] = season_name

    all_events_df.to_parquet(f"{data_dir}/{competition_id}_{season_id}.parquet", index=False)


Processing Competitions:   0%|          | 0/65 [00:00<?, ?it/s]

## Read Data


In [88]:
files = os.listdir(data_dir)
parquet_files = [f for f in files if f.endswith('.parquet')]

In [89]:
# Read and concatenate all parquet files
df_list = [pd.read_parquet(os.path.join(data_dir, f)) for f in parquet_files]
df = pd.concat(df_list, ignore_index=True)

In [90]:
len(df)

73874

In [100]:
df.groupby(['competition_name', 'season_name']).size().head(15)

competition_name        season_name
1. Bundesliga           2015/2016      7831
                        2023/2024       916
African Cup of Nations  2023           1244
Champions League        1970/1971        34
                        1971/1972        35
                        1972/1973        42
                        1999/2000        27
                        2003/2004        13
                        2004/2005        49
                        2006/2007        20
                        2008/2009        23
                        2009/2010        34
                        2010/2011        26
                        2011/2012        59
                        2012/2013        30
dtype: int64

In [91]:
df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,location,minute,play_pattern,player,player_id,position,possession,possession_team,possession_team_id,second,shot_aerial_won,shot_body_part,shot_first_time,shot_freeze_frame,shot_one_on_one,shot_outcome,shot_statsbomb_xg,shot_technique,shot_type,team,team_id,timestamp,type,under_pressure,match_id,competition_id,competition_name,season_id,season_name
0,"[92.6, 48.8]",1,From Free Kick,Marcial Manuel Pina Morales,397772.0,Left Center Midfield,5,Barcelona,217,13,,Left Foot,True,"[{'location': [106.1, 33.4], 'player': {'id': ...",,Blocked,0.023486,Normal,Open Play,Barcelona,217,00:01:13.997,Shot,,3888713,11,La Liga,278,1973/1974
1,"[97.6, 32.4]",4,From Free Kick,Jesus Antonio de la Cruz Gallego,397769.0,Left Back,12,Barcelona,217,41,,Left Foot,,"[{'location': [106.2, 35.8], 'player': {'id': ...",,Off T,0.035229,Normal,Open Play,Barcelona,217,00:04:41.465,Shot,,3888713,11,La Liga,278,1973/1974
2,"[110.7, 32.5]",8,Regular Play,José Martínez Sánchez,397763.0,Center Defensive Midfield,20,Real Madrid,220,28,True,Head,,"[{'location': [105.6, 45.0], 'player': {'id': ...",,Off T,0.058368,Normal,Open Play,Real Madrid,220,00:08:28.882,Shot,True,3888713,11,La Liga,278,1973/1974


## Calculate distance to goal


In [60]:
def calculate_distance_to_goal(location):
    goal_center = np.array([120, 40])
    shot_location = np.array(location)
    distance_to_goal = np.linalg.norm(shot_location - goal_center)
    return distance_to_goal

In [61]:
df["distance_to_goal"] = df["location"].apply(calculate_distance_to_goal)

In [62]:
df[["location", "distance_to_goal"]].head(3)

Unnamed: 0,location,distance_to_goal
0,"[92.6, 48.8]",28.778464
1,"[97.6, 32.4]",23.654175
2,"[110.7, 32.5]",11.947385


## Calculate angle to goal


In [63]:
def calculate_shot_angle(location):
    left_post = np.array([120, 36])
    right_post = np.array([120, 44])
    shot_location = np.array(location)

    # Exclude shots taken from the same x-coordinate as the goal line
    if shot_location[0] == 120:
        return np.nan

    # Calculate distances using the Euclidean distance
    a = np.linalg.norm(left_post - right_post)
    b = np.linalg.norm(shot_location - left_post)
    c = np.linalg.norm(shot_location - right_post)

    # Use the cosine rule to calculate the angle
    # cos(C) = (a^2 + b^2 - c^2) / (2ab)
    cos_angle = (b**2 + c**2 - a**2) / (2 * b * c)
    shot_angle_radians = np.arccos(np.clip(cos_angle, -1.0, 1.0))  # Clip to handle floating point errors

    # Convert the angle from radians to degrees
    shot_angle_degrees = np.degrees(shot_angle_radians)

    return shot_angle_degrees


In [64]:
df["shot_angle"] = df["location"].apply(calculate_shot_angle)

In [65]:
df[["location", "distance_to_goal", "shot_angle"]].head(3)

Unnamed: 0,location,distance_to_goal,shot_angle
0,"[92.6, 48.8]",28.778464,15.103394
1,"[97.6, 32.4]",23.654175,18.247526
2,"[110.7, 32.5]",11.947385,30.414151


## Number of close opponents


In [66]:
def find_opponents_within_radius(row, radius=1.5):
    try:
        freeze_frame_df = pd.json_normalize(row["shot_freeze_frame"])
        opponents_df = freeze_frame_df[freeze_frame_df["teammate"] == False]
    except:
        return np.nan

    if len(opponents_df) == 0:
        return 0

    shot_location = np.array(row["location"])
    opponents_locations = opponents_df["location"].values
    opponents_locations = [item for item in opponents_locations]

    # Calculate distances
    distances = [distance.euclidean(opponent, shot_location) for opponent in opponents_locations]

    # Count the number of opponents within the specified radius
    num_opponents_within_radius = sum(d <= radius for d in distances)

    return num_opponents_within_radius

In [67]:
df["opponents_within_radius"] = df.apply(find_opponents_within_radius, axis=1)

In [68]:
df[["location", "opponents_within_radius"]].head(3)

Unnamed: 0,location,opponents_within_radius
0,"[92.6, 48.8]",0.0
1,"[97.6, 32.4]",0.0
2,"[110.7, 32.5]",1.0


## Calculate GK distance to goal


In [69]:
def calculate_gk_distance_to_goal(row):
    try:
        freeze_frame_df = pd.json_normalize(row["shot_freeze_frame"])
        # Filter for the goalkeeper
        gk_df = freeze_frame_df[freeze_frame_df["position.name"] == "Goalkeeper"]
        if gk_df.empty:
            return np.nan
        gk_location = np.array(gk_df.iloc[0]["location"])
    except:
        return np.nan

    goal_center = np.array([120, 40])
    gk_distance_to_goal = np.linalg.norm(gk_location - goal_center)
    return gk_distance_to_goal

In [70]:
df["gk_distance_to_goal"] = df.apply(calculate_gk_distance_to_goal, axis=1)

In [71]:
df[["location", "gk_distance_to_goal"]].head(3)

Unnamed: 0,location,gk_distance_to_goal
0,"[92.6, 48.8]",0.781025
1,"[97.6, 32.4]",2.308679
2,"[110.7, 32.5]",2.284732


## Calculate number of players in shot triangle


In [72]:
from shapely.geometry import Point, Polygon

In [73]:
def calculate_players_in_shot_triangle(row):
    try:
        freeze_frame_df = pd.json_normalize(row["shot_freeze_frame"])
        # Exclude the shooter
        non_shooters_df = freeze_frame_df[freeze_frame_df["teammate"] == False]
    except:
        return np.nan

    if non_shooters_df.empty:
        return 0

    # Define the goalposts and shot location
    left_post = np.array([120, 36])
    right_post = np.array([120, 44])
    shot_location = np.array(row["location"])

    # Create a triangle using the shot location and the goalposts
    shot_triangle = Polygon([shot_location, left_post, right_post])

    # Count players inside the triangle
    num_players_in_triangle = 0
    for location in non_shooters_df["location"]:
        player_point = Point(location)
        if shot_triangle.contains(player_point):
            num_players_in_triangle += 1

    return num_players_in_triangle


df["players_in_shot_triangle"] = df.apply(calculate_players_in_shot_triangle, axis=1)

In [74]:
df["players_in_shot_triangle"].value_counts()

players_in_shot_triangle
1.0    34551
2.0    23806
3.0     7665
4.0     2956
0.0     2290
5.0     1169
6.0      354
7.0       84
8.0       15
9.0        8
Name: count, dtype: int64

## Convert specific position to general position


In [78]:
position_map = {
    "Center Forward": "Forward",
    "Left Wing": "Forward",
    "Right Wing": "Forward",
    "Left Center Forward": "Forward",
    "Right Center Forward": "Forward",
    "Secondary Striker": "Forward",
    "Center Attacking Midfield": "Midfielder",
    "Left Midfield": "Midfielder",
    "Right Midfield": "Midfielder",
    "Center Midfield": "Midfielder",
    "Defensive Midfield": "Midfielder",
    "Left Center Midfield": "Midfielder",
    "Right Center Midfield": "Midfielder",
    "Right Defensive Midfield": "Midfielder",
    "Left Defensive Midfield": "Midfielder",
    "Center Defensive Midfield": "Midfielder",
    "Left Attacking Midfield": "Midfielder",
    "Right Attacking Midfield": "Midfielder",
    "Center Midfield": "Midfielder",
    "Left Back": "Defender",
    "Right Back": "Defender",
    "Center Back": "Defender",
    "Left Center Back": "Defender",
    "Right Center Back": "Defender",
    "Left Wing Back": "Defender",
    "Right Wing Back": "Defender",
    "Goalkeeper": "Goalkeeper",
}

In [79]:
df["position_simplified"] = df["position"].map(position_map)

In [80]:
df[["position", "position_simplified"]].head(3)

Unnamed: 0,position,position_simplified
0,Left Center Midfield,Midfielder
1,Left Back,Defender
2,Center Defensive Midfield,Midfielder


# Convert boolean columns to int


In [81]:
boolean_columns = ['shot_aerial_won', 'shot_first_time', 'shot_one_on_one', 'under_pressure']

In [82]:
df[boolean_columns] = df[boolean_columns].fillna(0).astype(int)

## Convert outcome_id to target

The value 97 in outcome_id is a goal, anything esle is not a goal.


In [83]:
df["shot_outcome"].value_counts()

shot_outcome
Off T               24032
Blocked             18143
Saved               17472
Goal                 8185
Wayward              4034
Post                 1514
Saved Off Target      260
Saved to Post         234
Name: count, dtype: int64

In [84]:
df["target"] = df["shot_outcome"].apply(lambda x: 1 if x == 'Goal' else 0)
df[["shot_outcome", "target"]].head(3)

Unnamed: 0,shot_outcome,target
0,Blocked,0
1,Off T,0
2,Off T,0


## Fill NaN values with statistically derived values


In [85]:
opponents_within_radius_median = df['opponents_within_radius'].median()
gk_distance_to_goal_median = df['gk_distance_to_goal'].median()
players_in_shot_triangle_median = df['players_in_shot_triangle'].median()

df['opponents_within_radius'] = df['opponents_within_radius'].fillna(opponents_within_radius_median)
df['gk_distance_to_goal'] = df['gk_distance_to_goal'].fillna(gk_distance_to_goal_median)
df['players_in_shot_triangle'] = df['players_in_shot_triangle'].fillna(players_in_shot_triangle_median)


## Save data to parquet

In [86]:
len(df)

73874

In [87]:
df.to_parquet(f"{data_dir}/processed_data/complete_data.parquet", index=False)

## TODO: Convert shot body part to prefered/weak foot. Requires checking pass data and counting number of passes with each foot.

```python
# TODO: Get preferred foot from Statsbomb

preferred_foot = {
    # player_id: 'left' or 'right'
    1: 'right',
    2: 'left',
    # Add more player_id and preferred foot mappings
}

def convert_to_foot_type(row):
    player_id = row['player_id']
    shot_body_part = row['shot_body_part']

    # Check if the player's preferred foot is known
    if player_id in preferred_foot:
        if shot_body_part == 'left foot' and preferred_foot[player_id] == 'left':
            return 'strong foot'
        elif shot_body_part == 'right foot' and preferred_foot[player_id] == 'right':
            return 'strong foot'
        else:
            return 'weak foot'
    else:
        return np.nan  # Return NaN if preferred foot is unknown

df['foot_type'] = df.apply(convert_to_foot_type, axis=1)
```