In [110]:
import os
import pandas as pd

# --- Path to your data folder ---
base_dir = "../../data/matches"   # adjust to your actual structure

# Collect all relevant CSV file paths
csv_files = []
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith(".csv") and "dynamic_events" in file.lower():
            csv_files.append(os.path.join(root, file))

print(f"Found {len(csv_files)} dynamic_events files")

# --- Read and concatenate them all ---
all_dfs = []
for path in csv_files:
    try:
        df = pd.read_csv(path)
        df["source_file"] = os.path.basename(path)  # optional: keep track of source
        all_dfs.append(df)
    except Exception as e:
        print(f"⚠️ Error reading {path}: {e}")

# Combine into one DataFrame
if all_dfs:
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print(f"✅ Combined shape: {combined_df.shape}")
else:
    print("❌ No dynamic_events CSVs found")


Found 10 dynamic_events files


  df = pd.read_csv(path)


✅ Combined shape: (47853, 295)


  df = pd.read_csv(path)


In [111]:
combined_df

Unnamed: 0,event_id,index,match_id,frame_start,frame_end,frame_physical_start,time_start,time_end,minute_start,second_start,...,xloss_player_possession_max,xshot_player_possession_start,xshot_player_possession_end,xshot_player_possession_max,is_player_possession_start_matched,is_player_possession_end_matched,is_previous_pass_matched,is_pass_reception_matched,fully_extrapolated,source_file
0,8_0,0,1886347,28,28,,00:01.8,00:01.8,0,1,...,,,,,True,True,,True,False,1886347_dynamic_events.csv
1,8_1,1,1886347,48,58,,00:03.8,00:04.8,0,3,...,,,,,True,True,True,True,False,1886347_dynamic_events.csv
2,7_0,2,1886347,48,53,,00:03.8,00:04.3,0,3,...,,,,,True,True,,,False,1886347_dynamic_events.csv
3,7_1,3,1886347,48,58,,00:03.8,00:04.8,0,3,...,,,,,True,True,,True,False,1886347_dynamic_events.csv
4,9_0,4,1886347,56,58,34.0,00:02.4,00:04.8,0,2,...,,,,,True,True,True,True,,1886347_dynamic_events.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47848,9_820,4183,2017461,69853,69858,69853.0,95:18.3,95:18.8,95,18,...,0.099,0.815,0.815,0.815,True,True,True,,,2017461_dynamic_events.csv
47849,8_838,4184,2017461,69858,69858,,95:18.8,95:18.8,95,18,...,,,,,True,True,True,,False,2017461_dynamic_events.csv
47850,7_2065,4185,2017461,69858,69858,,95:18.8,95:18.8,95,18,...,,,,,True,True,,,False,2017461_dynamic_events.csv
47851,7_2066,4186,2017461,69858,69858,,95:18.8,95:18.8,95,18,...,,,,,True,True,,,False,2017461_dynamic_events.csv


In [112]:
# Removing passing_options, off ball runs to make this easier

combined_df = combined_df[~combined_df['event_type'].isin(['passing_option', 'off_ball_run'])]

recoveries_further = combined_df[combined_df['start_type'].isin(['recovery', 'pass_interception'])]

basic_columns = ['phase_index', 'player_possession_phase_index', 'event_type', 'event_subtype', 'start_type', 'end_type', 'pass_outcome', 'player_in_possession_name', 'team_shortname',
                 'first_player_possession_in_team_possession', 'last_player_possession_in_team_possession', 'index', 'player_targeted_name', 'frame_start', 'frame_end', ]

recoveries_further = recoveries_further[basic_columns + ['source_file']]


recoveries_further

Unnamed: 0,phase_index,player_possession_phase_index,event_type,event_subtype,start_type,end_type,pass_outcome,player_in_possession_name,team_shortname,first_player_possession_in_team_possession,last_player_possession_in_team_possession,index,player_targeted_name,frame_start,frame_end,source_file
24,4,1.0,player_possession,,pass_interception,pass,unsuccessful,,Newcastle,True,True,24,C. Timmins,283,299,1886347_dynamic_events.csv
31,5,1.0,player_possession,,pass_interception,pass,successful,,Auckland FC,True,False,31,F. Gallegos,301,337,1886347_dynamic_events.csv
42,6,1.0,player_possession,,recovery,pass,successful,,Newcastle,True,False,42,M. Natta,387,430,1886347_dynamic_events.csv
53,8,1.0,player_possession,,pass_interception,pass,successful,,Auckland FC,True,False,53,F. Gallegos,526,526,1886347_dynamic_events.csv
168,19,1.0,player_possession,,recovery,pass,unsuccessful,,Auckland FC,True,True,168,L. Rogerson,1463,1463,1886347_dynamic_events.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47690,428,9.0,player_possession,,pass_interception,pass,successful,,Melbourne V FC,True,False,4025,L. Jackson,68466,68466,2017461_dynamic_events.csv
47760,432,1.0,player_possession,,pass_interception,pass,unsuccessful,,Auckland FC,True,True,4095,G. May,69308,69308,2017461_dynamic_events.csv
47762,432,1.0,player_possession,,recovery,pass,successful,,Melbourne V FC,True,False,4097,R. Piscopo,69341,69349,2017461_dynamic_events.csv
47783,433,1.0,player_possession,,pass_interception,pass,successful,,Auckland FC,True,False,4118,G. May,69416,69423,2017461_dynamic_events.csv


In [113]:
source_file_want = ['1886347_dynamic_events.csv']

indexes_want = [23, 24, 25, 26, 27, 28]
indexes_want = list(range(15, 51))

test_one = combined_df[combined_df['index'].isin(indexes_want) & combined_df['source_file'].isin(source_file_want)]

# Removing passing_options, off ball runs to make this easier

test_one = test_one[~test_one['event_type'].isin(['passing_option', 'off_ball_run'])]

test_one = test_one[basic_columns + ['source_file']]

test_one

Unnamed: 0,phase_index,player_possession_phase_index,event_type,event_subtype,start_type,end_type,pass_outcome,player_in_possession_name,team_shortname,first_player_possession_in_team_possession,last_player_possession_in_team_possession,index,player_targeted_name,frame_start,frame_end,source_file
18,3,,on_ball_engagement,pressure,,indirect_regain,,F. De Vries,Newcastle,,,18,F. De Vries,235,251,1886347_dynamic_events.csv
19,3,1.0,player_possession,,pass_reception,pass,unsuccessful,,Auckland FC,False,True,19,L. Gillion,243,251,1886347_dynamic_events.csv
23,4,,on_ball_engagement,pressure,,direct_regain,,D. Ingham,Auckland FC,,,23,D. Ingham,253,299,1886347_dynamic_events.csv
24,4,1.0,player_possession,,pass_interception,pass,unsuccessful,,Newcastle,True,True,24,C. Timmins,283,299,1886347_dynamic_events.csv
31,5,1.0,player_possession,,pass_interception,pass,successful,,Auckland FC,True,False,31,F. Gallegos,301,337,1886347_dynamic_events.csv
35,5,,on_ball_engagement,pressure,,,,L. Gillion,Newcastle,,,35,L. Gillion,314,319,1886347_dynamic_events.csv
37,5,,on_ball_engagement,recovery_press,,direct_disruption,,F. Gallegos,Newcastle,,,37,F. Gallegos,345,350,1886347_dynamic_events.csv
39,5,2.0,player_possession,,pass_reception,pass,unsuccessful,,Auckland FC,False,True,39,G. May,350,350,1886347_dynamic_events.csv
42,6,1.0,player_possession,,recovery,pass,successful,,Newcastle,True,False,42,M. Natta,387,430,1886347_dynamic_events.csv
43,6,2.0,player_possession,,pass_reception,pass,successful,,Newcastle,False,False,43,A. Šušnjar,451,460,1886347_dynamic_events.csv


In [114]:
starts = combined_df[
    combined_df["start_type"].isin(["pass_interception", "recovery"]) &
    (combined_df["first_player_possession_in_team_possession"] == True)
][["phase_index", "index", 'source_file']]
starts = starts.rename(columns={"index": "start_row", 
                                "phase_index": "start_phase"})

ends = combined_df[
    combined_df["last_player_possession_in_team_possession"] == True
][['phase_index', "index", 'source_file']]
ends = ends.rename(columns={"index": "end_row",
                            "phase_index": "end_phase"})

In [115]:
ends_shifted = ends.copy()
ends_shifted["start_phase"] = ends_shifted["end_phase"] - 1

In [116]:
possessions = starts.merge(
    ends_shifted,
    on=["start_phase", 'source_file'],
    how="inner",
    suffixes=("_start", "_end")
)

In [117]:
segments = []

for _, p in possessions.iterrows():
    s = p["start_row"]
    e = p["end_row"]

    seg = combined_df[(combined_df["index"] >= s) & (combined_df["index"] <= e)].copy()

    seg["possession_id"] = f"{s}_{e}"

    segments.append(seg[basic_columns + ["source_file", "possession_id"]])

#########################################
# 6. Combine into final possession dataset
#########################################

all_possessions = pd.concat(segments, ignore_index=True)

all_possessions

Unnamed: 0,phase_index,player_possession_phase_index,event_type,event_subtype,start_type,end_type,pass_outcome,player_in_possession_name,team_shortname,first_player_possession_in_team_possession,last_player_possession_in_team_possession,index,player_targeted_name,frame_start,frame_end,source_file,possession_id
0,4,1.0,player_possession,,pass_interception,pass,unsuccessful,,Newcastle,True,True,24,C. Timmins,283,299,1886347_dynamic_events.csv,24_39
1,5,1.0,player_possession,,pass_interception,pass,successful,,Auckland FC,True,False,31,F. Gallegos,301,337,1886347_dynamic_events.csv,24_39
2,5,,on_ball_engagement,pressure,,,,L. Gillion,Newcastle,,,35,L. Gillion,314,319,1886347_dynamic_events.csv,24_39
3,5,,on_ball_engagement,recovery_press,,direct_disruption,,F. Gallegos,Newcastle,,,37,F. Gallegos,345,350,1886347_dynamic_events.csv,24_39
4,5,2.0,player_possession,,pass_reception,pass,unsuccessful,,Auckland FC,False,True,39,G. May,350,350,1886347_dynamic_events.csv,24_39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62875,433,1.0,player_possession,,pass_interception,pass,successful,,Auckland FC,True,False,4118,G. May,69416,69423,2017461_dynamic_events.csv,4097_4130
62876,433,,on_ball_engagement,counter_press,,,,G. May,Melbourne V FC,,,4123,G. May,69425,69432,2017461_dynamic_events.csv,4097_4130
62877,433,2.0,player_possession,,pass_reception,pass,successful,,Auckland FC,False,False,4125,N. Moreno,69432,69453,2017461_dynamic_events.csv,4097_4130
62878,433,,on_ball_engagement,counter_press,,,,G. May,Melbourne V FC,,,4127,G. May,69435,69453,2017461_dynamic_events.csv,4097_4130


## My Notes: 

So we would want the start of the possession to be 
start_type to be pass_interception or recovery
First_player_possession_in_team_possession == True
Include this row

We would the end of the possession to be
Last_player_possession_in_team_possession == True
Phase_index = start of possession phase_index + 1
Include this row
