In [98]:
import pandas as pd
import os
import datetime
import kaggle
import zipfile


from directory import KAGGLE_DATASET_SLUG, KAGGLE_DATA_ROOT, BASE_DATA_DIR, PLAYS_DATA_DIR
from directory import KEY_EVENTS_DATA_DIR


# Fetch the Kaggle dataset on first use, then load every base lookup table.
# A data directory that exists but is empty (for example, one created before a
# download that never wrote anything) is treated as missing, so the reads
# below are not attempted against a dataset that was never fetched.
data_root_ready = os.path.isdir(KAGGLE_DATA_ROOT) and bool(os.listdir(KAGGLE_DATA_ROOT))

if not data_root_ready:
    print(f"Data directory '{KAGGLE_DATA_ROOT}' not found or empty. Downloading from Kaggle...")

    print(f"Downloading dataset '{KAGGLE_DATASET_SLUG}'...")
    kaggle.api.dataset_download_files(
        KAGGLE_DATASET_SLUG,
        path=KAGGLE_DATA_ROOT,
        unzip=True
    )
    print("✅ Download and unzip complete.")
else:
    print(f"Data directory '{KAGGLE_DATA_ROOT}' already exists. Skipping download.")

# Base lookup tables; each CSV is read from BASE_DATA_DIR.
df_leagues = pd.read_csv(os.path.join(BASE_DATA_DIR, "leagues.csv"))
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df_players = pd.read_csv(os.path.join(BASE_DATA_DIR, "players.csv"), low_memory=False)
df_teams = pd.read_csv(os.path.join(BASE_DATA_DIR, "teams.csv"))
df_fixtures = pd.read_csv(os.path.join(BASE_DATA_DIR, "fixtures.csv"))
df_teamStats = pd.read_csv(os.path.join(BASE_DATA_DIR, "teamStats.csv"))
df_teamRoster = pd.read_csv(os.path.join(BASE_DATA_DIR, "teamRoster.csv"))
df_keyEventDescription = pd.read_csv(os.path.join(BASE_DATA_DIR, "keyEventDescription.csv"))
df_status = pd.read_csv(os.path.join(BASE_DATA_DIR, "status.csv"))
df_venues = pd.read_csv(os.path.join(BASE_DATA_DIR, "venues.csv"))
df_standings = pd.read_csv(os.path.join(BASE_DATA_DIR, "standings.csv"))

Data directory './kaggle_data' already exists. Skipping download.


In [99]:

# Ask the user which 'Key Events' file to analyze.
# Surrounding whitespace and a trailing ".csv" are tolerated so the extension
# is never appended twice. os.path.splitext is deliberately not used: names
# such as "keyEvents_2024_ENG.1" contain a dot that is part of the name.
keyEvents_filename = input("Enter the 'Key Events' file name (e.g., keyEvents_2024_ENG.1): ").strip()
if keyEvents_filename.lower().endswith(".csv"):
    keyEvents_filename = keyEvents_filename[:-len(".csv")]

# Construct the full file path
full_keyEvents_path = KEY_EVENTS_DATA_DIR + '/' + keyEvents_filename + ".csv"

print(f"--- Loading plays data from: {full_keyEvents_path} ---")

# Load the specified CSV file.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
# Failures are reported and then re-raised: swallowing them would leave
# df_keyEvents undefined (or stale from a previous run) while the cells that
# use it carry on.
try:
    df_keyEvents = pd.read_csv(full_keyEvents_path, low_memory=False)
except FileNotFoundError:
    print(f"❌ ERROR: File not found at {full_keyEvents_path}")
    print("Please check the filename and try again.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("✅ Plays file loaded successfully!")

# Loading lives in its own cell, so df_keyEvents can be inspected right after it is read

--- Loading plays data from: ./kaggle_data/keyEvents_data/keyEvents_2024_ENG.1.csv ---
✅ Plays file loaded successfully!


In [100]:
# Attach the team table to every key event. The left join keeps events whose
# teamId has no match, and the team table's `name` column is exposed as `team`.
df_keyEvents = (
    df_keyEvents
    .merge(df_teams, how="left", on="teamId")
    .rename(columns={"name": "team"})
)

df_keyEvents


Unnamed: 0,seasonType,eventId,keyEventOrder,playId,keyEventTypeId,period,clockValue,clockDisplayValue,scoringPlay,shootout,...,location,team,abbreviation,displayName,shortDisplayName,color,alternateColor,logoURL,venueId,slug
0,12654,704279,1,43556191,80,1,0,,0,0,...,,,,,,,,,,
1,12654,704279,2,43556383,94,1,1038,18',0,0,...,Manchester United,Manchester United,MAN,Manchester United,Man United,da020e,144992,https://a.espncdn.com/i/teamlogos/soccer/500/3...,250.0,eng.man_utd
2,12654,704279,3,43556498,94,1,1497,25',0,0,...,Fulham,Fulham,FUL,Fulham,Fulham,ffffff,d11317,https://a.espncdn.com/i/teamlogos/soccer/500/3...,279.0,eng.fulham
3,12654,704279,4,43556737,94,1,2394,40',0,0,...,Manchester United,Manchester United,MAN,Manchester United,Man United,da020e,144992,https://a.espncdn.com/i/teamlogos/soccer/500/3...,250.0,eng.man_utd
4,12654,704279,5,43556797,81,1,2700,45'+1',0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14699,12654,704658,18,45319232,76,2,4990,84',0,0,...,Aston Villa,Aston Villa,AVL,Aston Villa,Aston Villa,660e36,ffffff,https://a.espncdn.com/i/teamlogos/soccer/500/3...,307.0,eng.aston_villa
14700,12654,704658,18,45319232,76,2,4990,84',0,0,...,Aston Villa,Aston Villa,AVL,Aston Villa,Aston Villa,660e36,ffffff,https://a.espncdn.com/i/teamlogos/soccer/500/3...,307.0,eng.aston_villa
14701,12654,704658,19,45319290,98,2,5205,87',1,0,...,Manchester United,Manchester United,MAN,Manchester United,Man United,da020e,144992,https://a.espncdn.com/i/teamlogos/soccer/500/3...,250.0,eng.man_utd
14702,12654,704658,21,45319509,83,2,5400,90'+9',0,0,...,,,,,,,,,,


In [101]:
# Look up each event's type in the description table (left join on
# keyEventTypeId) and expose `keyEventName` as `keyEventDescription`.
df_keyEvents = (
    df_keyEvents
    .merge(df_keyEventDescription, how="left", on="keyEventTypeId")
    .rename(columns={"keyEventName": "keyEventDescription"})
)


In [102]:
# Enrich the events with player attributes (by athleteId) and league/season
# attributes (by seasonType). Both are left joins, so unmatched events stay.
# No rename is applied: any non-key column name present on both sides of a
# merge receives pandas' default `_x` (left) / `_y` (right) suffix.
df_keyEvents = (
    df_keyEvents
    .merge(df_players, how="left", on="athleteId")
    .merge(df_leagues, how="left", on="seasonType")
)

df_keyEvents



Unnamed: 0,seasonType,eventId,keyEventOrder,playId,keyEventTypeId,period,clockValue,clockDisplayValue,scoringPlay,shootout,...,headshotUrl,headshot_alt,timestamp,year,seasonName,seasonSlug,leagueId,midsizeName,leagueName,leagueShortName
0,12654,704279,1,43556191,80,1,0,,0,0,...,,,,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
1,12654,704279,2,43556383,94,1,1038,18',0,0,...,,,2025-07-31 04:40:44,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
2,12654,704279,3,43556498,94,1,1497,25',0,0,...,https://a.espncdn.com/i/headshots/soccer/playe...,Calvin Bassey,2025-10-14 05:15:02,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
3,12654,704279,4,43556737,94,1,2394,40',0,0,...,,,2025-07-31 04:40:44,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
4,12654,704279,5,43556797,81,1,2700,45'+1',0,0,...,,,,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14699,12654,704658,18,45319232,76,2,4990,84',0,0,...,https://a.espncdn.com/i/headshots/soccer/playe...,Donyell Malen,2025-11-08 06:11:08,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
14700,12654,704658,18,45319232,76,2,4990,84',0,0,...,,,2025-11-09 14:57:16,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
14701,12654,704658,19,45319290,98,2,5205,87',1,0,...,,,2025-11-08 06:04:17,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
14702,12654,704658,21,45319509,83,2,5400,90'+9',0,0,...,,,,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League


In [103]:
# Columns kept for the final key-events table, listed in output order.
df_keyEvents_columns = [
    # match / event identification
    "seasonName", "eventId", "keyEventOrder", "playId",
    # what happened and when
    "keyEventDescription", "period", "clockDisplayValue",
    "scoringPlay", "shootout",
    "keyEventText", "keyEventShortText",
    # who was involved; `displayName_y` is the `_y`-suffixed displayName column
    "team", "participantOrder", "displayName_y", "positionName",
]

# Label-based selection; raises KeyError if any listed column is missing.
df_keyEvents = df_keyEvents.loc[:, df_keyEvents_columns]

df_keyEvents

Unnamed: 0,seasonName,eventId,keyEventOrder,playId,keyEventDescription,period,clockDisplayValue,scoringPlay,shootout,keyEventText,keyEventShortText,team,participantOrder,displayName_y,positionName
0,2024-25 English Premier League,704279,1,43556191,Kickoff,1,,0,0,First Half begins.,,,,,
1,2024-25 English Premier League,704279,2,43556383,Yellow Card,1,18',0,0,Mason Mount (Manchester United) is shown the y...,Mason Mount Yellow Card,Manchester United,1.0,Mason Mount,Midfielder
2,2024-25 English Premier League,704279,3,43556498,Yellow Card,1,25',0,0,Calvin Bassey (Fulham) is shown the yellow car...,Calvin Bassey Yellow Card,Fulham,1.0,Calvin Bassey,Defender
3,2024-25 English Premier League,704279,4,43556737,Yellow Card,1,40',0,0,Harry Maguire (Manchester United) is shown the...,Harry Maguire Yellow Card,Manchester United,1.0,Harry Maguire,Defender
4,2024-25 English Premier League,704279,5,43556797,Halftime,1,45'+1',0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14699,2024-25 English Premier League,704658,18,45319232,Substitution,2,84',0,0,"Substitution, Aston Villa. Donyell Malen repla...",Donyell Malen Substitution,Aston Villa,1.0,Donyell Malen,Forward
14700,2024-25 English Premier League,704658,18,45319232,Substitution,2,84',0,0,"Substitution, Aston Villa. Donyell Malen repla...",Donyell Malen Substitution,Aston Villa,2.0,Morgan Rogers,Forward
14701,2024-25 English Premier League,704658,19,45319290,Penalty - Scored,2,87',1,0,"Goal! Manchester United 2, Aston Villa 0. Chri...",Christian Eriksen Penalty - Scor,Manchester United,1.0,Christian Eriksen,Midfielder
14702,2024-25 English Premier League,704658,21,45319509,End Regular Time,2,90'+9',0,0,"Second Half ends, Manchester United 2, Aston V...",,,,,


In [104]:

# Write the assembled table to a CSV named after the season of its first row.
if df_keyEvents.empty:
    print("\nDataFrame was empty. No file was saved.")
else:
    # Build the file stem from the first row's seasonName, mapping spaces to
    # underscores and slashes to hyphens in a single pass.
    season_name = df_keyEvents['seasonName'].iloc[0].translate(
        str.maketrans({' ': '_', '/': '-'})
    )
    file_name = f"{season_name}_keyEvents_data.csv"

    # index=False leaves the row index out of the file.
    df_keyEvents.to_csv(file_name, index=False)

    print(f"\nSuccessfully saved data to: {file_name}")


Successfully saved data to: 2024-25_English_Premier_League_keyEvents_data.csv
