In [12]:
import pandas as pd
from statsbombpy import sb
from tqdm import tqdm
import os

In [13]:
# Set working directory to your desired folder
os.chdir("/Users/tirdodbehbehani/Desktop/BSE/Term 2/Geospatial Data Science and Economic Data Models/Football_Analysis")

# Confirm the current working directory
print("Current working directory:", os.getcwd())

Current working directory: /Users/tirdodbehbehani/Desktop/BSE/Term 2/Geospatial Data Science and Economic Data Models/Football_Analysis


In [3]:
# --- Step 1: Filter Competitions ---
comps = sb.competitions()

# Filter for male, non-youth competitions, and exclude specific competitions
comps_clean = comps[(comps['competition_gender'] == 'male') &
                    (comps['competition_youth'] == False) &
                    (~comps['competition_name'].isin([
                        'FIFA U20 World Cup',
                        'Indian Super league',
                        'Major League Soccer',
                        'North American League'
                    ]))]



In [4]:
# --- Step 2: Retrieve Matches ---
matches_list = []
for _, row in comps_clean.iterrows():
    try:
        m = sb.matches(competition_id=row['competition_id'], season_id=row['season_id'])
        matches_list.append(m)
    except Exception as e:
        print(f"Error fetching matches for {row['competition_name']} ({row['season_name']}): {e}")

matches_shots = pd.concat(matches_list, ignore_index=True)



In [5]:
# --- Step 3: Split Matches for Shots and Passes ---
# Convert match_date to datetime
matches_shots['match_date'] = pd.to_datetime(matches_shots['match_date'])
# For passes, filter matches from year >=2000
matches_passes = matches_shots[matches_shots['match_date'].dt.year >= 2000]

In [6]:
# --- Step 4: Download Events in Chunks ---
def get_events(match_ids):
    events_list = []
    for match_id in tqdm(match_ids, desc="Downloading events"):
        try:
            events = sb.events(match_id=match_id)
            events_list.append(events)
        except Exception as e:
            print(f"Error downloading events for match {match_id}: {e}")
    return pd.concat(events_list, ignore_index=True)

In [7]:
# For shots: get events for all match IDs in matches_shots
shot_match_ids = matches_shots['match_id'].unique()
events_shots = get_events(shot_match_ids)

Downloading events: 100%|██████████| 2801/2801 [36:02<00:00,  1.30it/s]


In [None]:
print(events_shots['type'].value_counts())

shots = events_shots[events_shots['type'] == "Shot"]


type
Pass                 2788229
Ball Receipt*        2622528
Carry                2161380
Pressure              900670
Ball Recovery         287691
Duel                  212490
Clearance             126833
Block                 105429
Dribble                98989
Foul Committed         85739
Goal Keeper            85372
Foul Won               81627
Miscontrol             77991
Dispossessed           70670
Shot                   70553
Interception           63729
Dribbled Past          61622
Substitution           17160
Half Start             11396
Half End               11396
Injury Stoppage        10373
50/50                   8910
Tactical Shift          7175
Starting XI             5602
Shield                  3664
Referee Ball-Drop       2738
Player Off              2626
Player On               2601
Bad Behaviour           2318
Camera On               1837
Error                   1255
Offside                 1030
Camera off               452
Own Goal For             262
Own Goal 

In [9]:
shots.shape

(70553, 121)

In [10]:
# For passes: get events for matches_passes
pass_match_ids = matches_passes['match_id'].unique()
events_passes = get_events(pass_match_ids)
# Filter events for passes only
passes = events_passes[events_passes['type'] == "Pass"]

Downloading events: 100%|██████████| 2768/2768 [39:57<00:00,  1.15it/s]


In [11]:
passes.shape

(2760335, 121)

In [22]:
import pandas as pd

# --- Step 1: Merge match-level data from Matches_Shots ---
# (Matches_Shots has columns: 'match_id', 'match_date', 'competition', 'season', ...)

shots = pd.merge(
    shots, 
    matches_shots[['match_id', 'match_date', 'competition', 'season']],
    on='match_id',
    how='left'
)

# --- Step 2: Merge competition info from comps_clean ---
# Here we merge on the competition and season names.
# Ensure that the columns in comps_clean to merge on are 'competition_name' and 'season_name'
shots = pd.merge(
    shots,
    comps_clean[['competition_id', 'competition_name', 'season_id', 'season_name']],
    left_on=['competition', 'season'],
    right_on=['competition_name', 'season_name'],
    how='left'
)

# Optionally, drop redundant columns from the merge if desired:
#shots.drop(columns=['competition_name', 'season_name'], inplace=True)

# --- Step 3: Convert match_date to datetime ---
shots['match_date'] = pd.to_datetime(shots['match_date'])

# --- Step 4: Clean the DataFrame ---
# Drop unwanted columns similar to the R select() statement:
cols_to_drop = ['carry_end_location', 'goalkeeper_end_location', 'tactics_lineup',
                'related_events', 'shot_freeze_frame', 'pass_end_location']
shots_clean = shots.drop(columns=cols_to_drop, errors='ignore')

# --- Step 5: "Unnest" shot.end_location ---
# We assume shot.end_location is a column with list-like values (e.g., [x, y]).
if 'shot_end_location' in shots_clean.columns:
    # Expand the list into two columns: shot.end_x and shot.end_y.
    # Make sure each element is a list or tuple of length 2.
    shots_clean[['shot.end_x', 'shot.end_y']] = pd.DataFrame(
        shots_clean['shot_end_location'].tolist(), 
        index=shots_clean.index
    )
    # Drop the original shot_end_location column.
    shots_clean.drop(columns=['shot_end_location'], inplace=True)
else:
    print("Column 'shot_end_location' not found in shots_clean.")

# At this point, shots_clean mimics the R workflow:
# - It has match_date (as datetime),
# - It has competition info merged via the names,
# - And the shot.end_location column has been expanded into shot.end_x and shot.end_y.

KeyError: 'match_date'

In [15]:
# Export the DataFrames as CSV files
shots.to_csv("shots_new.csv", index=False)
passes.to_csv("passes_new.csv", index=False)