## Introduction to Data Science Project ##

### Data Loading ###

The first part of the code loads the data and constructs the features that will later be used to train our models.

We are working with StatsBomb Open Data 360, which is accessible through the mplsoccer Python library. This is event-stream data that also includes the positions of all other players on the pitch at the time of any action.

In [1]:
from mplsoccer import Sbopen
import pandas as pd
import numpy as np

# Open-data parser and the table of available competitions.
parser = Sbopen()
df_competition = parser.competition()

# One boolean mask per selection criterion; a row must satisfy all of them.
# To widen the selection, extend the lists below,
# e.g. ["1. Bundesliga", "FIFA World Cup", "UEFA Euro"].
has_360_data = df_competition['match_available_360'].notna()
is_selected_competition = df_competition["competition_name"].isin(["UEFA Euro"])
is_selected_season = df_competition["season_name"].isin(["2020"])

competitions360 = df_competition[has_360_data & is_selected_competition & is_selected_season]

summary_columns = ['competition_name', 'season_name', 'competition_id', 'season_id']
print(competitions360[summary_columns])

   competition_name season_name  competition_id  season_id
69        UEFA Euro        2020              55         43


Now all matches from the selected competitions are concatenated into one data frame.

In [2]:
# Fetch the match table of every selected (competition, season) pair.
matches_360_list = [
    parser.match(competition_id=competition_id, season_id=season_id)
    for competition_id, season_id in zip(
        competitions360['competition_id'], competitions360['season_id']
    )
]

# Stack the per-competition match tables into a single frame.
all_matches_360 = pd.concat(matches_360_list, ignore_index=True)

print(all_matches_360.shape[0])

51


And then all events from the selected matches are also concatenated into one data frame.

In [3]:
# Collect one event frame per match and concatenate once at the end.
# Concatenating inside the loop would re-copy all previously loaded events
# for every additional match.
event_frames = []

for _, row in competitions360.iterrows():
    comp_id = row['competition_id']
    season_id = row['season_id']

    try:
        df_matches = parser.match(competition_id=comp_id, season_id=season_id)

        for match_id in df_matches['match_id']:
            try:
                # parser.event returns four frames; only the first (events) is used here.
                df_event, _, _, _ = parser.event(match_id)
                df_event['match_id'] = match_id
                df_event['competition_name'] = row['competition_name']
                df_event['season_name'] = row['season_name']

                event_frames.append(df_event)
            except Exception as e:
                # Best effort: a match that fails to load is reported and skipped.
                print(f"Skipping match {match_id}: {e}")

    except Exception as e:
        print(f"Error retrieving matches for competition {comp_id}, season {season_id}: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame when nothing loaded.
all_events = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()

### Data Cleaning ###

For the sake of simplicity, we exclude some action types from the data, either because they are off-ball actions, are not relevant to the model, are very rare, or are already captured by the model. In particular, we neglect goalkeeper and defensive actions, since those are primarily off-ball and cannot really be captured by our data.

We also exclude penalty shootouts from the model; these are denoted by period = 5.

In [4]:
# Event types that are kept for modelling.
relevant_events = ['Pass', 'Carry', 'Shot', 'Clearance', 'Dribble', 'Duel', 'Foul Committed', 'Interception',
                   'Miscontrol', 'Ball Recovery', 'Own Goal Against']

# Row-wise masks, combined into a single filter:
#   - the event type must be one of the relevant ones,
#   - duels are kept only when their sub type is "Tackle",
#   - period 5 (penalty shootout) is dropped.
is_relevant_type = all_events['type_name'].isin(relevant_events)
is_non_tackle_duel = (all_events['type_name'] == "Duel") & (all_events['sub_type_name'] != "Tackle")
outside_shootout = all_events['period'] != 5

all_events = all_events[is_relevant_type & ~is_non_tackle_duel & outside_shootout]

def _flag_is_set(value):
    """Return True only for a present, truthy flag (missing/NaN counts as not set)."""
    return bool(pd.notna(value) and value)


def parse_action_row(row):
    """Map one raw event row to a simplified action description.

    Args:
        row: A mapping-like object supporting ``.get`` (e.g. a DataFrame row).
            The keys read are ``type_name``, ``outcome_name``,
            ``body_part_name``, ``sub_type_name``, ``pass_cross`` and
            ``ball_recovery_recovery_failure``.

    Returns:
        A ``pd.Series`` with three positional values:
        the action type, the result and the simplified body part
        ("Foot", "Head" or "Other").
    """
    type_name = row.get("type_name", "")
    outcome = row.get("outcome_name", "")
    subtype = row.get("sub_type_name", "")
    raw_body_part = row.get("body_part_name", "")

    # Collapse left/right foot into one category; anything else that is not
    # a header (including missing values) becomes "Other".
    if raw_body_part in ("Right Foot", "Left Foot"):
        body_part_name = "Foot"
    elif raw_body_part == "Head":
        body_part_name = "Head"
    else:
        body_part_name = "Other"

    # Defaults for event types not handled below.
    action_type = "non_action"
    result = "success"

    if type_name == "Pass":
        # The cross flag may be missing (NaN). NaN never compares equal to
        # anything, so it has to be tested with pd.notna rather than `==`.
        is_cross = _flag_is_set(row.get("pass_cross", False))

        # Set-piece sub types take precedence over the cross flag.
        set_piece_passes = {
            "Free Kick": "Free Kick Pass",
            "Corner": "Corner",
            "Goal Kick": "Goal Kick",
            "Throw-in": "Throw In",
        }
        if subtype in set_piece_passes:
            action_type = set_piece_passes[subtype]
        elif is_cross:
            action_type = "Cross"
        else:
            action_type = "Pass"

        if outcome in ("Incomplete", "Out"):
            result = "fail"
        elif outcome == "Pass Offside":
            result = "offside"

    elif type_name == "Shot":
        if subtype == "Free Kick":
            action_type = "Free Kick Shot"
        elif subtype == "Penalty":
            action_type = "Penalty Shot"
        else:
            action_type = "Shot"

        # Only a goal counts as a successful shot.
        result = "success" if outcome == "Goal" else "fail"

    elif type_name == "Dribble":
        # Dribbles are folded into the "Carry" action type.
        action_type = "Carry"
        result = "fail" if outcome == "Incomplete" else "success"

    elif type_name == "Carry":
        action_type = "Carry"
        result = "success"

    elif type_name == "Foul Committed":
        action_type = "Foul"
        result = "fail"

    elif type_name == "Duel":
        action_type = "Tackle"
        result = "fail" if outcome in ("Lost In Play", "Lost Out") else "success"

    elif type_name == "Interception":
        action_type = "Interception"
        result = "fail" if outcome in ("Lost In Play", "Lost Out") else "success"

    elif type_name == "Own Goal Against":
        action_type = "Own Goal"
        result = "owngoal"

    elif type_name == "Clearance":
        action_type = "Clearance"
        result = "success"

    elif type_name == "Miscontrol":
        action_type = "Miscontrol"
        result = "fail"

    elif type_name == "Ball Recovery":
        action_type = "Ball Recovery"
        failed = _flag_is_set(row.get("ball_recovery_recovery_failure"))
        result = "fail" if failed else "success"

    return pd.Series([action_type, result, body_part_name])

# Work on an independent copy: all_events is the result of boolean filtering,
# so adding columns to it directly would modify that filtered frame in place
# and trigger pandas' SettingWithCopyWarning.
all_events_cleaned = all_events.copy()

# parse_action_row returns three values per row; they are assigned by position
# to the three columns below (body_part_name is overwritten with its simplified form).
all_events_cleaned[["action_type", "result", "body_part_name"]] = all_events_cleaned.apply(parse_action_row, axis=1)

### Feature Construction ###

As the initial features we add distance to goal, angle to goal and time elapsed to every game state.

Note: The data is already normalized such that the team in possession always attacks from x = 0 towards x = 120, so the distance to goal can be calculated in the same way for both teams regardless of possession changes. The x and y dimensions of the pitch (as well as the individual player positions) are also already normalized to x = [0, 120] and y = [0, 80], which deals with the different pitch dimensions found in football. The goal width is set to 7.32 m.

In [5]:
# Goal geometry in StatsBomb pitch coordinates (x in [0, 120], y in [0, 80]).
# These units are yards, so the 7.32 m wide goal spans 8 units, with the posts
# at y = 36 and y = 44. Using 3.66 (the half-width in metres) as the offset
# would mix metres into a yard-based coordinate system.
GOAL_X = 120
GOAL_Y = 40
GOAL_WIDTH = 8  # 8 yards == 7.32 m
LEFT_POST = GOAL_Y + GOAL_WIDTH / 2
RIGHT_POST = GOAL_Y - GOAL_WIDTH / 2

def calculate_distance_to_goal(df_event):
    """Add a ``distance_to_goal`` column and return the same frame.

    The value is the Euclidean distance between the event location
    (columns ``x`` and ``y``) and the point (GOAL_X, GOAL_Y).
    The input frame is modified in place.
    """
    offset_x = df_event['x'] - GOAL_X
    offset_y = df_event['y'] - GOAL_Y
    df_event['distance_to_goal'] = np.sqrt(offset_x**2 + offset_y**2)
    return df_event

def calculate_angle_to_goal(df_event):
    """Add an ``angle_to_goal`` column (in degrees) and return the same frame.

    The angle is the absolute difference between the directions from the
    event location (columns ``x`` and ``y``) to the two goal posts at
    (GOAL_X, LEFT_POST) and (GOAL_X, RIGHT_POST).
    The input frame is modified in place.
    """
    x_gap = GOAL_X - df_event['x']

    # Direction (in radians) from the event location to each post.
    direction_left = np.arctan2(LEFT_POST - df_event['y'], x_gap)
    direction_right = np.arctan2(RIGHT_POST - df_event['y'], x_gap)

    df_event['angle_to_goal'] = np.degrees(np.abs(direction_left - direction_right))
    return df_event

def calculate_time_elapsed(df_event):
    """Add a ``time_elapsed`` column and return the same frame.

    The value is ``minute * 60 + second``, computed from the ``minute``
    and ``second`` columns. The input frame is modified in place.
    """
    df_event['time_elapsed'] = df_event['minute'].mul(60).add(df_event['second'])
    return df_event

# Add the three initial features one after another; each helper adds its
# column to the frame it receives and returns that frame.
all_events_cleaned = calculate_distance_to_goal(all_events_cleaned)
all_events_cleaned = calculate_angle_to_goal(all_events_cleaned)
all_events_cleaned = calculate_time_elapsed(all_events_cleaned)

### Target Label Construction ###

Now we search for all goals in each match. For every goal, we look at the last k = 10 actions up to and including the goal itself, within the same match period. An action in this window gets label_team_goal = 1 if it was performed by the team that scored, and label_opponent_goal = 1 if it was performed by the other team. All other game states keep the default value of 0 for both labels.

In [6]:
def assign_goal_labels(df_all_events, lookback=10):
    """Label the events leading up to each goal.

    A goal is any event whose ``outcome_name`` equals ``'Goal'``. For every
    goal, the window consists of the goal event itself and the events
    directly before it, ``lookback`` events in total, restricted to the same
    ``(match_id, period)`` group. Inside that window, events whose
    ``team_id`` equals the scorer's get ``label_team_goal = 1``; all other
    events in the window get ``label_opponent_goal = 1``.

    Args:
        df_all_events: Event frame with the columns ``match_id``, ``period``,
            ``team_id`` and ``outcome_name``. Rows are processed in their
            existing order. The frame itself is not modified.
        lookback: Window size per goal, counting the goal event.

    Returns:
        A copy of ``df_all_events`` with the two integer label columns added
        (0 unless set as described above).
    """
    df_all_events = df_all_events.copy()

    n_events = len(df_all_events)
    label_team_goal = np.zeros(n_events, dtype=np.int64)
    label_opponent_goal = np.zeros(n_events, dtype=np.int64)

    # Plain arrays addressed by row position: this avoids a label-based
    # .loc/.at lookup per cell and also works when the index is not unique.
    team_ids = df_all_events['team_id'].to_numpy()
    is_goal = (df_all_events['outcome_name'] == 'Goal').to_numpy()

    # Maps each (match_id, period) key to the row positions of that group,
    # in their original order.
    group_positions = df_all_events.groupby(['match_id', 'period']).indices

    for positions in group_positions.values():
        group_teams = team_ids[positions]

        for goal_pos in np.flatnonzero(is_goal[positions]):
            window_start = max(0, goal_pos - lookback + 1)
            window_rows = positions[window_start:goal_pos + 1]
            by_scoring_team = group_teams[window_start:goal_pos + 1] == group_teams[goal_pos]

            label_team_goal[window_rows[by_scoring_team]] = 1
            label_opponent_goal[window_rows[~by_scoring_team]] = 1

    df_all_events['label_team_goal'] = label_team_goal
    df_all_events['label_opponent_goal'] = label_opponent_goal

    return df_all_events

# Attach the goal labels (default look-back window), then write the final
# table, including its index, to disk.
all_events_cleaned = all_events_cleaned.pipe(assign_goal_labels)
all_events_cleaned.to_csv(path_or_buf="data_cleaned.csv")