In [136]:
import os
import pandas as pd

# --- Path to your data folder ---
base_dir = "../../data/matches"   # adjust to your actual structure

# Collect all relevant CSV file paths
csv_files = []
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith(".csv") and "dynamic_events" in file.lower():
            csv_files.append(os.path.join(root, file))

print(f"Found {len(csv_files)} dynamic_events files")

# --- Read and concatenate them all ---
all_dfs = []
for path in csv_files:
    try:
        df = pd.read_csv(path)
        df["source_file"] = os.path.basename(path)  # optional: keep track of source
        all_dfs.append(df)
    except Exception as e:
        print(f"⚠️ Error reading {path}: {e}")

# Combine into one DataFrame
if all_dfs:
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print(f"✅ Combined shape: {combined_df.shape}")
else:
    print("❌ No dynamic_events CSVs found")


Found 10 dynamic_events files


  df = pd.read_csv(path)


✅ Combined shape: (47853, 295)


  df = pd.read_csv(path)


In [137]:
combined_df

Unnamed: 0,event_id,index,match_id,frame_start,frame_end,frame_physical_start,time_start,time_end,minute_start,second_start,...,xloss_player_possession_max,xshot_player_possession_start,xshot_player_possession_end,xshot_player_possession_max,is_player_possession_start_matched,is_player_possession_end_matched,is_previous_pass_matched,is_pass_reception_matched,fully_extrapolated,source_file
0,8_0,0,1886347,28,28,,00:01.8,00:01.8,0,1,...,,,,,True,True,,True,False,1886347_dynamic_events.csv
1,8_1,1,1886347,48,58,,00:03.8,00:04.8,0,3,...,,,,,True,True,True,True,False,1886347_dynamic_events.csv
2,7_0,2,1886347,48,53,,00:03.8,00:04.3,0,3,...,,,,,True,True,,,False,1886347_dynamic_events.csv
3,7_1,3,1886347,48,58,,00:03.8,00:04.8,0,3,...,,,,,True,True,,True,False,1886347_dynamic_events.csv
4,9_0,4,1886347,56,58,34.0,00:02.4,00:04.8,0,2,...,,,,,True,True,True,True,,1886347_dynamic_events.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47848,9_820,4183,2017461,69853,69858,69853.0,95:18.3,95:18.8,95,18,...,0.099,0.815,0.815,0.815,True,True,True,,,2017461_dynamic_events.csv
47849,8_838,4184,2017461,69858,69858,,95:18.8,95:18.8,95,18,...,,,,,True,True,True,,False,2017461_dynamic_events.csv
47850,7_2065,4185,2017461,69858,69858,,95:18.8,95:18.8,95,18,...,,,,,True,True,,,False,2017461_dynamic_events.csv
47851,7_2066,4186,2017461,69858,69858,,95:18.8,95:18.8,95,18,...,,,,,True,True,,,False,2017461_dynamic_events.csv


In [138]:
cols_with_threat = [col for col in combined_df.columns if 'threat' in col.lower()]
cols_with_threat

['player_targeted_xthreat',
 'xthreat',
 'affected_line_breaking_passing_option_xthreat']

In [139]:
# Removing passing_options, off ball runs to make this easier
# We also don't care about them, they don't contribute to this analysis since we are using the raw tracking data for that

combined_df = combined_df[~combined_df['event_type'].isin(['passing_option', 'off_ball_run'])]

combined_df["source_file"] = combined_df["source_file"].str.replace(r"_.*", "", regex=True)

# only want recoveries and interceptions, this is for testing though

recoveries_further = combined_df[combined_df['start_type'].isin(['recovery', 'pass_interception'])]

# these are the baseline columns that give us enough information to identify what the column means

basic_columns = ['phase_index', 'player_possession_phase_index', 'event_type', 'event_subtype', 'start_type', 'end_type', 'pass_outcome', 'player_in_possession_name', 'player_in_possession_id', 'team_shortname',
                 'first_player_possession_in_team_possession', 'last_player_possession_in_team_possession', 'index', 'player_targeted_name', 'player_targeted_id', 'frame_start', 'frame_end']

recoveries_further = recoveries_further[basic_columns + ['source_file']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df["source_file"] = combined_df["source_file"].str.replace(r"_.*", "", regex=True)


In [140]:
# these are the start of the recovery possession
starts = combined_df[
    combined_df["start_type"].isin(["pass_interception", "recovery"]) &
    (combined_df["first_player_possession_in_team_possession"] == True) &
    (combined_df['last_player_possession_in_team_possession'] == False)
][["phase_index", "index", 'source_file', 'game_state', 'n_teammates_ahead_start', 'n_opponents_ahead_start', 'team_out_of_possession_phase_type']]
starts = starts.rename(columns={"index": "start_row", 
                                "phase_index": "start_phase"})

# these are the end of the recovery possession
ends = combined_df[
    combined_df["last_player_possession_in_team_possession"] == True
][['phase_index', "index", 'source_file']]
ends = ends.rename(columns={"index": "end_row",
                            "phase_index": "end_phase"})

# these are the possessions that start and end with the same event. A pass interception/recovery where the player's next action is to pass which is an interception, for example. 
starts_and_ends = combined_df[
    combined_df["start_type"].isin(["pass_interception", "recovery"]) &
    (combined_df["first_player_possession_in_team_possession"] == True) &
    (combined_df['last_player_possession_in_team_possession'] == True)
][["phase_index", "index", 'source_file']]
starts_and_ends = starts_and_ends.rename(columns={"index": "start_row", 
                                "phase_index": "start_phase"})

In [141]:
ends_shifted = ends.copy()
ends_shifted["start_phase"] = ends_shifted["end_phase"] - 1

In [142]:
possessions = starts.merge(
    ends_shifted,
    on=["start_phase", 'source_file'],
    how="inner",
    suffixes=("_start", "_end")
)

possessions

Unnamed: 0,start_phase,start_row,source_file,game_state,n_teammates_ahead_start,n_opponents_ahead_start,team_out_of_possession_phase_type,end_phase,end_row
0,6,42,1886347,drawing,9.0,10.0,high_block,7,46
1,20,170,1886347,drawing,0.0,0.0,chaotic,21,193
2,39,452,1886347,drawing,3.0,5.0,chaotic,40,466
3,53,536,1886347,drawing,4.0,5.0,medium_block,54,558
4,84,882,1886347,drawing,5.0,7.0,chaotic,85,893
...,...,...,...,...,...,...,...,...,...
515,420,3921,2017461,winning,5.0,9.0,defending_set_play,421,3947
516,420,3933,2017461,winning,6.0,9.0,defending_set_play,421,3947
517,423,3967,2017461,losing,10.0,10.0,medium_block,424,3969
518,425,3973,2017461,winning,7.0,9.0,chaotic,426,3992


In [143]:
starts_and_ends['end_phase'] = starts_and_ends['start_phase']
starts_and_ends['end_row'] = starts_and_ends['start_row']

full_possessions = pd.concat([possessions, starts_and_ends], ignore_index=True)

In [144]:
full_possessions

Unnamed: 0,start_phase,start_row,source_file,game_state,n_teammates_ahead_start,n_opponents_ahead_start,team_out_of_possession_phase_type,end_phase,end_row
0,6,42,1886347,drawing,9.0,10.0,high_block,7,46
1,20,170,1886347,drawing,0.0,0.0,chaotic,21,193
2,39,452,1886347,drawing,3.0,5.0,chaotic,40,466
3,53,536,1886347,drawing,4.0,5.0,medium_block,54,558
4,84,882,1886347,drawing,5.0,7.0,chaotic,85,893
...,...,...,...,...,...,...,...,...,...
1119,403,3747,2017461,,,,,403,3747
1120,406,3782,2017461,,,,,406,3782
1121,414,3837,2017461,,,,,414,3837
1122,420,3932,2017461,,,,,420,3932


In [None]:
segments = []

# this is getting the rows in between the start and end of possession and creating a unique identifier for that possession
for _, p in full_possessions.iterrows():
    s = p["start_row"]
    e = p["end_row"]
    file  = p["source_file"]
    

    seg = combined_df[(combined_df["index"] >= s) & (combined_df["index"] <= e) & (combined_df['source_file'] == file)].copy()

    seg["possession_id"] = f"{s}_{e}"

    start_row = combined_df[
        (combined_df["index"] == s) &
        (combined_df["source_file"] == file)
    ].iloc[0]

    game_state_start = start_row["game_state"]
    oop_phase_start = start_row["team_out_of_possession_phase_type"]
    teammates_start = start_row["n_teammates_ahead_start"]
    opponents_start = start_row["n_opponents_ahead_start"]
    max_player_targeted = seg["player_targeted_xthreat"].max()

    # ---- 3. Assign the start-row values to the entire segment ----
    seg["game_state"] = game_state_start
    seg["team_out_of_possession_phase_type"] = oop_phase_start
    seg["n_teammates_ahead_start"] = teammates_start
    seg["n_opponents_ahead_start"] = opponents_start
    seg["max_player_targeted_xthreat"] = max_player_targeted

    segments.append(seg[basic_columns + ["source_file", "possession_id", 'game_state', 'team_out_of_possession_phase_type', 'n_teammates_ahead_start', 'n_opponents_ahead_start', 
                                         'max_player_targeted_xthreat']])

all_possessions = pd.concat(segments, ignore_index=True)

all_possessions['Unique ID'] = all_possessions['source_file'] + "_" + all_possessions['possession_id'].astype(str)

In [146]:
all_possessions["possession_start"] = (
    all_possessions.groupby("Unique ID")["frame_start"].transform("min")
)

all_possessions["possession_end"] = (
    all_possessions.groupby("Unique ID")["frame_end"].transform("max")
)

In [147]:
all_possessions

Unnamed: 0,phase_index,player_possession_phase_index,event_type,event_subtype,start_type,end_type,pass_outcome,player_in_possession_name,player_in_possession_id,team_shortname,...,source_file,possession_id,game_state,team_out_of_possession_phase_type,n_teammates_ahead_start,n_opponents_ahead_start,player_targeted_xthreat,Unique ID,possession_start,possession_end
0,6,1.0,player_possession,,recovery,pass,successful,,,Newcastle,...,1886347,42_46,drawing,high_block,9.0,10.0,,1886347_42_46,387,504
1,6,2.0,player_possession,,pass_reception,pass,successful,,,Newcastle,...,1886347,42_46,drawing,high_block,9.0,10.0,0.0000,1886347_42_46,387,504
2,7,1.0,player_possession,,pass_reception,pass,unsuccessful,,,Newcastle,...,1886347,42_46,drawing,high_block,9.0,10.0,0.0024,1886347_42_46,387,504
3,20,1.0,player_possession,,recovery,pass,successful,,,Newcastle,...,1886347,170_193,drawing,chaotic,0.0,0.0,0.0091,1886347_170_193,1492,1606
4,20,2.0,player_possession,,pass_reception,pass,successful,,,Newcastle,...,1886347,170_193,drawing,chaotic,0.0,0.0,0.0071,1886347_170_193,1492,1606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5850,403,1.0,player_possession,,recovery,pass,unsuccessful,,,Auckland FC,...,2017461,3747_3747,winning,chaotic,4.0,8.0,,2017461_3747_3747,65560,65560
5851,406,2.0,player_possession,,pass_interception,pass,unsuccessful,,,Melbourne V FC,...,2017461,3782_3782,losing,chaotic,7.0,8.0,,2017461_3782_3782,65920,65920
5852,414,8.0,player_possession,,recovery,pass,unsuccessful,,,Auckland FC,...,2017461,3837_3837,winning,chaotic,7.0,10.0,0.0133,2017461_3837_3837,66806,66806
5853,420,1.0,player_possession,,pass_interception,pass,unsuccessful,,,Melbourne V FC,...,2017461,3932_3932,losing,disruption,3.0,6.0,,2017461_3932_3932,67716,67716


In [148]:
all_possessions.to_csv('../../Our Datasets/processed_recoverys_and_interceptions_dynamic_events.csv', index=False)

## My Notes: 

So we would want the start of the possession to be 
start_type to be pass_interception or recovery
First_player_possession_in_team_possession == True
Include this row

We would the end of the possession to be
Last_player_possession_in_team_possession == True
Phase_index = start of possession phase_index + 1
Include this row
