In [222]:
from unittest.mock import inplace

import pandas as pd
import os
import datetime
import kaggle
import zipfile


from directory import KAGGLE_DATASET_SLUG, KAGGLE_DATA_ROOT, BASE_DATA_DIR, PLAYS_DATA_DIR


# Fetch the Kaggle dataset only when the local data root is missing, then load
# each base CSV into its own DataFrame.
if not os.path.exists(KAGGLE_DATA_ROOT):
    print(f"Data directory '{KAGGLE_DATA_ROOT}' not found. Downloading from Kaggle...")

    print(f"Downloading dataset '{KAGGLE_DATASET_SLUG}'...")
    kaggle.api.dataset_download_files(
        KAGGLE_DATASET_SLUG,
        # Download into the same directory the existence check above tests.
        # The previous argument (`data_dir`) was never defined, so a first run
        # without local data failed with NameError before downloading anything.
        path=KAGGLE_DATA_ROOT,
        unzip=True
    )
    print("✅ Download and unzip complete.")
else:
    print(f"Data directory '{KAGGLE_DATA_ROOT}' already exists. Skipping download.")

# One DataFrame per base table; all files are read from BASE_DATA_DIR.
df_leagues = pd.read_csv(BASE_DATA_DIR + "/leagues.csv")
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df_players = pd.read_csv(BASE_DATA_DIR + "/players.csv", low_memory=False)
df_teams = pd.read_csv(BASE_DATA_DIR + "/teams.csv")
df_fixtures = pd.read_csv(BASE_DATA_DIR + "/fixtures.csv")
df_teamStats = pd.read_csv(BASE_DATA_DIR + "/teamStats.csv")
df_teamRoster = pd.read_csv(BASE_DATA_DIR + "/teamRoster.csv")
df_keyEventDescription = pd.read_csv(BASE_DATA_DIR + "/keyEventDescription.csv")
df_status = pd.read_csv(BASE_DATA_DIR + "/status.csv")
df_venues = pd.read_csv(BASE_DATA_DIR + "/venues.csv")
df_standings = pd.read_csv(BASE_DATA_DIR + "/standings.csv")

Data directory './kaggle_data' already exists. Skipping download.


In [223]:
# Ask the user for the specific 'plays' file they want to analyze.
# Surrounding whitespace and an accidentally typed ".csv" suffix are removed so
# the extension appended below is never doubled.
plays_filename = input("Enter the 'plays' file name (e.g., plays_2024_ENG.1): ").strip()
if plays_filename.lower().endswith(".csv"):
    plays_filename = plays_filename[:-len(".csv")]

# Construct the full file path
full_plays_path = PLAYS_DATA_DIR + '/' + plays_filename + ".csv"

print(f"--- Loading plays data from: {full_plays_path} ---")

# Load the specified CSV file.
# low_memory=False is used because 'plays' files can be very large.
try:
    df_plays = pd.read_csv(full_plays_path, low_memory=False)
    print("✅ Plays file loaded successfully!")
except FileNotFoundError:
    print(f"❌ ERROR: File not found at {full_plays_path}")
    print("Please check the filename and try again.")
    # Re-raise so execution stops here: otherwise later cells would run against
    # an undefined df_plays, or one left over from an earlier run.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

# This cell is separate from the preview, so df_plays can be inspected after loading.

--- Loading plays data from: ./kaggle_data/plays_data/plays_2024_UEFA.EUROPA.csv ---
✅ Plays file loaded successfully!


In [224]:
# Preview the plays frame exactly as loaded from the CSV, before any merges.
df_plays

Unnamed: 0,seasonType,eventId,playOrder,playId,typeId,text,shortText,period,clockValue,clockDisplayValue,...,wallclock,goalPositionX,goalPositionY,fieldpositionX,fieldPositionY,fieldPosition2X,fieldPosition2Y,athleteId,participant,updateDateTime
0,12891,732201,1,44689443,80,First Half begins.,,1,0,,...,2025-02-14 13:29:55,0.0,0.0,0.000,0.000,0.000,0.000,,,2025-02-15 05:51:25
1,12891,732201,2,44689453,68,"Offside, AZ. Sven Mijnans is caught offside.",Troy Parrott Offside,1,71,2',...,2025-02-14 13:30:10,0.0,0.0,0.316,0.363,0.202,0.330,279790.0,Troy Parrott,2025-02-15 05:51:25
2,12891,732201,3,44689481,68,"Offside, AZ. Mayckel Lahdo is caught offside.",Troy Parrott Offside,1,127,3',...,2025-02-14 13:30:10,0.0,0.0,0.742,0.149,0.236,0.192,279790.0,Troy Parrott,2025-02-15 05:51:25
3,12891,732201,4,44689489,66,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,198,4',...,2025-02-14 13:30:10,0.0,0.0,0.908,0.960,0.000,0.000,292252.0,Berkan Kutlu,2025-02-15 05:51:25
4,12891,732201,4,44689489,66,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,198,4',...,2025-02-14 13:30:10,0.0,0.0,0.908,0.960,0.000,0.000,,Ernest Poku,2025-02-15 05:51:25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23858,12892,720608,79,44401418,66,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,5400,90'+1',...,2024-12-20 16:31:47,0.0,0.0,0.124,0.977,0.000,0.000,219284.0,Kasper Dolberg,2024-12-21 06:01:07
23859,12892,720608,80,44401460,94,Kasper Dolberg (RSC Anderlecht) is shown the y...,Kasper Dolberg Yellow Card,2,5400,90'+2',...,2024-12-20 16:31:47,0.0,0.0,0.000,0.000,0.000,0.000,219284.0,Kasper Dolberg,2024-12-21 06:01:07
23860,12892,720608,81,44401462,117,Attempt missed. Mojmír Chytil (Slavia Prague) ...,Mojmír Chytil Shot Off Target,2,5400,90'+2',...,2024-12-20 16:31:47,0.0,0.0,0.222,0.530,0.134,0.554,343733.0,Mojmír Chytil,2024-12-21 06:01:07
23861,12892,720608,79,44401418,66,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,5400,90'+1',...,2024-12-20 16:31:47,0.0,0.0,0.124,0.977,0.000,0.000,,Matej Jurásek,2024-12-21 06:01:07


In [225]:
#df_keyevents = pd.merge(df_keyevents, df_players, how = "inner", left_on = ["athleteId"], right_on = ["athleteId"])

#df_keyevents = pd.merge(df_keyevents, df_players, how = "inner", left_on = ["athleteId"], right_on = ["athleteId"])

In [226]:
# Columns kept once team information has been attached to each play.
df_plays_columns = [
    "seasonType", "eventId", "playOrder", "playId", "typeId",
    "text", "shortText", "period", "clockValue", "clockDisplayValue",
    "team", "scoringPlay", "shootout", "wallclock",
    "goalPositionX", "goalPositionY",
    "fieldpositionX", "fieldPositionY",
    "fieldPosition2X", "fieldPosition2Y",
    "athleteId", "participant", "updateDateTime",
]

# Inner-join plays to teams on the shared "teamId" key (plays without a matching
# team are dropped), expose the teams table's "name" column as "team", then
# narrow the frame to the columns listed above.
df_plays = (
    df_plays
    .merge(df_teams, how="inner", on="teamId")
    .rename(columns={"name": "team"})
)[df_plays_columns]
df_plays

Unnamed: 0,seasonType,eventId,playOrder,playId,typeId,text,shortText,period,clockValue,clockDisplayValue,...,wallclock,goalPositionX,goalPositionY,fieldpositionX,fieldPositionY,fieldPosition2X,fieldPosition2Y,athleteId,participant,updateDateTime
0,12891,732201,2,44689453,68,"Offside, AZ. Sven Mijnans is caught offside.",Troy Parrott Offside,1,71,2',...,2025-02-14 13:30:10,0.0,0.0,0.316,0.363,0.202,0.330,279790.0,Troy Parrott,2025-02-15 05:51:25
1,12891,732201,3,44689481,68,"Offside, AZ. Mayckel Lahdo is caught offside.",Troy Parrott Offside,1,127,3',...,2025-02-14 13:30:10,0.0,0.0,0.742,0.149,0.236,0.192,279790.0,Troy Parrott,2025-02-15 05:51:25
2,12891,732201,4,44689489,66,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,198,4',...,2025-02-14 13:30:10,0.0,0.0,0.908,0.960,0.000,0.000,292252.0,Berkan Kutlu,2025-02-15 05:51:25
3,12891,732201,4,44689489,66,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,198,4',...,2025-02-14 13:30:10,0.0,0.0,0.908,0.960,0.000,0.000,,Ernest Poku,2025-02-15 05:51:25
4,12891,732201,5,44689510,106,Attempt saved. Ernest Poku (AZ) right footed s...,Ernest Poku Shot On Target,1,261,5',...,2025-02-14 13:30:10,0.0,0.0,0.488,0.489,0.066,0.502,318534.0,Ernest Poku,2025-02-15 05:51:25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23452,12892,720608,79,44401418,66,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,5400,90'+1',...,2024-12-20 16:31:47,0.0,0.0,0.124,0.977,0.000,0.000,219284.0,Kasper Dolberg,2024-12-21 06:01:07
23453,12892,720608,80,44401460,94,Kasper Dolberg (RSC Anderlecht) is shown the y...,Kasper Dolberg Yellow Card,2,5400,90'+2',...,2024-12-20 16:31:47,0.0,0.0,0.000,0.000,0.000,0.000,219284.0,Kasper Dolberg,2024-12-21 06:01:07
23454,12892,720608,81,44401462,117,Attempt missed. Mojmír Chytil (Slavia Prague) ...,Mojmír Chytil Shot Off Target,2,5400,90'+2',...,2024-12-20 16:31:47,0.0,0.0,0.222,0.530,0.134,0.554,343733.0,Mojmír Chytil,2024-12-21 06:01:07
23455,12892,720608,79,44401418,66,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,5400,90'+1',...,2024-12-20 16:31:47,0.0,0.0,0.124,0.977,0.000,0.000,,Matej Jurásek,2024-12-21 06:01:07


In [227]:
# Columns kept after the numeric typeId has been replaced by its description.
df_plays_columns = [
    "seasonType", "eventId", "playOrder", "playId", "playDescription",
    "text", "shortText", "period", "clockValue", "clockDisplayValue",
    "team", "scoringPlay", "shootout", "wallclock",
    "goalPositionX", "goalPositionY",
    "fieldpositionX", "fieldPositionY",
    "fieldPosition2X", "fieldPosition2Y",
    "athleteId", "participant", "updateDateTime",
]

# Left-join so every play is kept even when its typeId has no entry in the
# key-event lookup; the lookup's "keyEventName" becomes "playDescription".
df_plays = (
    df_plays
    .merge(
        df_keyEventDescription,
        how="left",
        left_on="typeId",
        right_on="keyEventTypeId",
    )
    .rename(columns={"keyEventName": "playDescription"})
)[df_plays_columns]
df_plays

Unnamed: 0,seasonType,eventId,playOrder,playId,playDescription,text,shortText,period,clockValue,clockDisplayValue,...,wallclock,goalPositionX,goalPositionY,fieldpositionX,fieldPositionY,fieldPosition2X,fieldPosition2Y,athleteId,participant,updateDateTime
0,12891,732201,2,44689453,Offside,"Offside, AZ. Sven Mijnans is caught offside.",Troy Parrott Offside,1,71,2',...,2025-02-14 13:30:10,0.0,0.0,0.316,0.363,0.202,0.330,279790.0,Troy Parrott,2025-02-15 05:51:25
1,12891,732201,3,44689481,Offside,"Offside, AZ. Mayckel Lahdo is caught offside.",Troy Parrott Offside,1,127,3',...,2025-02-14 13:30:10,0.0,0.0,0.742,0.149,0.236,0.192,279790.0,Troy Parrott,2025-02-15 05:51:25
2,12891,732201,4,44689489,Foul,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,198,4',...,2025-02-14 13:30:10,0.0,0.0,0.908,0.960,0.000,0.000,292252.0,Berkan Kutlu,2025-02-15 05:51:25
3,12891,732201,4,44689489,Foul,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,198,4',...,2025-02-14 13:30:10,0.0,0.0,0.908,0.960,0.000,0.000,,Ernest Poku,2025-02-15 05:51:25
4,12891,732201,5,44689510,Shot On Target,Attempt saved. Ernest Poku (AZ) right footed s...,Ernest Poku Shot On Target,1,261,5',...,2025-02-14 13:30:10,0.0,0.0,0.488,0.489,0.066,0.502,318534.0,Ernest Poku,2025-02-15 05:51:25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23452,12892,720608,79,44401418,Foul,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,5400,90'+1',...,2024-12-20 16:31:47,0.0,0.0,0.124,0.977,0.000,0.000,219284.0,Kasper Dolberg,2024-12-21 06:01:07
23453,12892,720608,80,44401460,Yellow Card,Kasper Dolberg (RSC Anderlecht) is shown the y...,Kasper Dolberg Yellow Card,2,5400,90'+2',...,2024-12-20 16:31:47,0.0,0.0,0.000,0.000,0.000,0.000,219284.0,Kasper Dolberg,2024-12-21 06:01:07
23454,12892,720608,81,44401462,Shot Off Target,Attempt missed. Mojmír Chytil (Slavia Prague) ...,Mojmír Chytil Shot Off Target,2,5400,90'+2',...,2024-12-20 16:31:47,0.0,0.0,0.222,0.530,0.134,0.554,343733.0,Mojmír Chytil,2024-12-21 06:01:07
23455,12892,720608,79,44401418,Foul,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,5400,90'+1',...,2024-12-20 16:31:47,0.0,0.0,0.124,0.977,0.000,0.000,,Matej Jurásek,2024-12-21 06:01:07


In [228]:
# Captures the name that follows "Assisted by" in the play commentary, stopping
# at the first ", ", ".", " with a " or whitespace + "following".
ASSIST_PATTERN = r"Assisted by (.*?)(?:, |\.| with a |\s+following)"

# Summary columns taken from the merged frame (the Assister column is derived below).
df_plays_columns = [
    "seasonName",
    "eventId",
    "playId",
    "playDescription",
    "text",
    "shortText",
    "period",
    "clockDisplayValue",
    "team",
    "participant",
]

# Build the summary as one non-mutating chain. The earlier version called
# dropna/drop_duplicates with inplace=True and assigned a new column on a frame
# produced by column slicing, which is the pattern that triggers pandas'
# SettingWithCopyWarning. Every step here returns a new frame instead.
df_plays = (
    pd.merge(df_plays, df_leagues, how="inner", on="seasonType")[df_plays_columns]
    # Rows missing any summary field are discarded BEFORE Assister exists, so
    # plays without an assist (Assister is NaN) are intentionally kept.
    .dropna()
    .assign(
        # Extract the assisting player's name and trim surrounding whitespace.
        Assister=lambda plays: plays["text"]
        .str.extract(ASSIST_PATTERN, expand=False)
        .str.strip()
    )
    # Keep only the first row for each playId.
    .drop_duplicates(subset=["playId"])
)

# Keep the column list in sync with the final frame layout.
df_plays_columns = df_plays_columns + ["Assister"]
df_plays




Unnamed: 0,seasonName,eventId,playId,playDescription,text,shortText,period,clockDisplayValue,team,participant,Assister
0,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689453,Offside,"Offside, AZ. Sven Mijnans is caught offside.",Troy Parrott Offside,1,2',AZ Alkmaar,Troy Parrott,
1,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689481,Offside,"Offside, AZ. Mayckel Lahdo is caught offside.",Troy Parrott Offside,1,3',AZ Alkmaar,Troy Parrott,
2,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689489,Foul,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,4',Galatasaray,Berkan Kutlu,
4,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689510,Shot On Target,Attempt saved. Ernest Poku (AZ) right footed s...,Ernest Poku Shot On Target,1,5',AZ Alkmaar,Ernest Poku,Troy Parrott
6,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689554,Shot Off Target,Attempt missed. Álvaro Morata (Galatasaray) ri...,Álvaro Morata Shot Off Target,1,5',Galatasaray,Álvaro Morata,
...,...,...,...,...,...,...,...,...,...,...,...
23449,"2024-25 UEFA Europa League, League Phase",720608,44401356,Yellow Card,Ondrej Lingr (Slavia Prague) is shown the yell...,Ondrej Lingr Yellow Card,2,89',Slavia Prague,Ondrej Lingr,
23450,"2024-25 UEFA Europa League, League Phase",720608,44401398,Foul,Mojmír Chytil (Slavia Prague) wins a free kick...,Jan-Carlo Simic Foul,2,90',Anderlecht,Jan-Carlo Simic,
23452,"2024-25 UEFA Europa League, League Phase",720608,44401418,Foul,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,90'+1',Anderlecht,Kasper Dolberg,
23453,"2024-25 UEFA Europa League, League Phase",720608,44401460,Yellow Card,Kasper Dolberg (RSC Anderlecht) is shown the y...,Kasper Dolberg Yellow Card,2,90'+2',Anderlecht,Kasper Dolberg,


In [229]:
# Write the summary frame to a CSV named after its season, if there is anything to write.
if not df_plays.empty:
    # Spaces become "_" and "/" becomes "-", as before. The remaining characters
    # that are not allowed in Windows file names (\ : * ? " < > |) are now also
    # mapped to "-" so an unusual season name cannot make the write fail.
    unsafe_to_safe = str.maketrans({**dict.fromkeys('/\\:*?"<>|', "-"), " ": "_"})

    # NOTE(review): only the first row's seasonName is used for the file name,
    # although the frame can contain several season names — confirm this is intended.
    season_name = str(df_plays['seasonName'].iloc[0]).translate(unsafe_to_safe)
    file_name = f"{season_name}_plays_data.csv"

    # index=False: the row index is not written to the file.
    df_plays.to_csv(file_name, index=False)

    print(f"\nSuccessfully saved data to: {file_name}")
else:
    print("\nDataFrame was empty. No file was saved.")
df_plays


Successfully saved data to: 2024-25_UEFA_Europa_League,_Knockout_Round_Playoffs_plays_data.csv


Unnamed: 0,seasonName,eventId,playId,playDescription,text,shortText,period,clockDisplayValue,team,participant,Assister
0,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689453,Offside,"Offside, AZ. Sven Mijnans is caught offside.",Troy Parrott Offside,1,2',AZ Alkmaar,Troy Parrott,
1,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689481,Offside,"Offside, AZ. Mayckel Lahdo is caught offside.",Troy Parrott Offside,1,3',AZ Alkmaar,Troy Parrott,
2,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689489,Foul,Foul by Berkan Kutlu (Galatasaray).,Berkan Kutlu Foul,1,4',Galatasaray,Berkan Kutlu,
4,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689510,Shot On Target,Attempt saved. Ernest Poku (AZ) right footed s...,Ernest Poku Shot On Target,1,5',AZ Alkmaar,Ernest Poku,Troy Parrott
6,"2024-25 UEFA Europa League, Knockout Round Pla...",732201,44689554,Shot Off Target,Attempt missed. Álvaro Morata (Galatasaray) ri...,Álvaro Morata Shot Off Target,1,5',Galatasaray,Álvaro Morata,
...,...,...,...,...,...,...,...,...,...,...,...
23449,"2024-25 UEFA Europa League, League Phase",720608,44401356,Yellow Card,Ondrej Lingr (Slavia Prague) is shown the yell...,Ondrej Lingr Yellow Card,2,89',Slavia Prague,Ondrej Lingr,
23450,"2024-25 UEFA Europa League, League Phase",720608,44401398,Foul,Mojmír Chytil (Slavia Prague) wins a free kick...,Jan-Carlo Simic Foul,2,90',Anderlecht,Jan-Carlo Simic,
23452,"2024-25 UEFA Europa League, League Phase",720608,44401418,Foul,Matej Jurásek (Slavia Prague) wins a free kick...,Kasper Dolberg Foul,2,90'+1',Anderlecht,Kasper Dolberg,
23453,"2024-25 UEFA Europa League, League Phase",720608,44401460,Yellow Card,Kasper Dolberg (RSC Anderlecht) is shown the y...,Kasper Dolberg Yellow Card,2,90'+2',Anderlecht,Kasper Dolberg,
