In [1]:
import pandas as pd
import os
import datetime
import kaggle
import zipfile


from directory import KAGGLE_DATASET_SLUG, KAGGLE_DATA_ROOT, BASE_DATA_DIR, PLAYER_STATS_DATA_DIR


if not os.path.exists(KAGGLE_DATA_ROOT):
    print(f"Data directory '{KAGGLE_DATA_ROOT}' not found. Downloading from Kaggle.")

    print(f"Downloading dataset '{KAGGLE_DATASET_SLUG}'...")
    kaggle.api.dataset_download_files(
        KAGGLE_DATASET_SLUG,
        path=data_dir,
        unzip=True
    )
    print(f" Download and unzip complete.")
else:
    print(f"Data directory '{KAGGLE_DATA_ROOT}' already exists. Skipping download.")

print("Loading Files")
df_leagues = pd.read_csv(BASE_DATA_DIR + "/leagues.csv")
df_players = pd.read_csv(BASE_DATA_DIR + "/players.csv", low_memory=False)
df_teams = pd.read_csv(BASE_DATA_DIR + "/teams.csv")
df_fixtures = pd.read_csv(BASE_DATA_DIR + "/fixtures.csv")
df_teamStats = pd.read_csv(BASE_DATA_DIR + "/teamStats.csv")
df_teamRoster = pd.read_csv(BASE_DATA_DIR + "/teamRoster.csv")
df_keyEventDescription = pd.read_csv(BASE_DATA_DIR + "/keyEventDescription.csv")
df_status = pd.read_csv(BASE_DATA_DIR + "/status.csv")
df_venues = pd.read_csv(BASE_DATA_DIR + "/venues.csv")
df_standings = pd.read_csv(BASE_DATA_DIR + "/standings.csv")
print("Base files loaded.")

Data directory './kaggle_data' already exists. Skipping download.
Loading Files
Base files loaded.


In [2]:
stats_file_1 = input("Enter the first player stats file (e.g., playerstats_2024_ENG.1): ")

df_playerstats =  PLAYER_STATS_DATA_DIR + '/' + stats_file_1 + ".csv"


try:
    df_playerstats = pd.read_csv(df_playerstats, low_memory=False)
    print(f" Loaded {stats_file_1}.csv")

except FileNotFoundError as e:
    print(f"❌ ERROR: File not found. Check your filenames. {e}")
except Exception as e:
    print(f"An error occurred: {e}")



 Loaded playerstats_2024_ENG.LEAGUE_CUP.csv


In [3]:
df_playerstats = pd.merge(df_playerstats, df_players, how = "inner", left_on = ["athleteId"], right_on = ["athleteId"])

In [4]:
df_playerstats = pd.merge(df_playerstats, df_teams, how = "inner", left_on = ["teamId"], right_on = ["teamId"])


In [5]:
df_playerstats = pd.merge(df_playerstats, df_leagues, how = "inner", left_on = ["seasonType"], right_on = ["seasonType"])


In [6]:
df_playerstats

Unnamed: 0,seasonType,year_x,league,teamId,athleteId,appearances_value,subIns_value,foulsCommitted_value,foulsSuffered_value,yellowCards_value,...,logoURL,venueId,slug_y,year_y,seasonName,seasonSlug,leagueId,midsizeName,leagueName,leagueShortName
0,12733,2024,ENG.LEAGUE_CUP,368,4946,3,0,3,3,1,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,10318,eng.everton,2024,"2024-25 English Carabao Cup, First Round",first-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
1,12733,2024,ENG.LEAGUE_CUP,350,4948,0,0,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,3809,eng.wigan,2024,"2024-25 English Carabao Cup, First Round",first-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
2,12733,2024,ENG.LEAGUE_CUP,368,4948,0,0,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,10318,eng.everton,2024,"2024-25 English Carabao Cup, First Round",first-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
3,12733,2024,ENG.LEAGUE_CUP,374,5527,1,1,1,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,6289,eng.derby,2024,"2024-25 English Carabao Cup, First Round",first-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
4,12733,2024,ENG.LEAGUE_CUP,302,5882,0,0,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,6294,eng.shrewsbury,2024,"2024-25 English Carabao Cup, First Round",first-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7896,12730,2024,ENG.LEAGUE_CUP,376,390476,1,1,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,4850,eng.southampton,2024,"2024-25 English Carabao Cup, Fourth Round",fourth-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
7897,12730,2024,ENG.LEAGUE_CUP,376,390477,1,1,0,1,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,4850,eng.southampton,2024,"2024-25 English Carabao Cup, Fourth Round",fourth-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
7898,12730,2024,ENG.LEAGUE_CUP,346,390981,0,0,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,4248,eng.blackpool,2024,"2024-25 English Carabao Cup, Fourth Round",fourth-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup
7899,12730,2024,ENG.LEAGUE_CUP,334,392106,0,0,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,4089,eng.qpr,2024,"2024-25 English Carabao Cup, Fourth Round",fourth-round,3920,ENG.LEAGUE_CUP,English Carabao Cup,Carabao Cup


In [7]:

dropped_columns = ['seasonType', 'teamId', 'athleteId', 'timestamp_x', 'firstName', 'middleName', 'lastName',
                   'fullName', 'shortName', 'nickName', 'slug_x', 'gender', 'birthPlaceCountry', 'positionAbbreviation',
                   'headshotUrl', 'headshot_alt', 'timestamp_y', 'location', 'name', 'abbreviation', 'shortDisplayName',
                   'color', 'alternateColor', 'logoURL', 'venueId', 'slug_y', 'seasonSlug', 'leagueId', 'midsizeName',
                   'leagueName', 'leagueShortName']

df_playerstats = df_playerstats.drop(columns=dropped_columns)
df_playerstats = df_playerstats.drop(columns={"league", })

new_order = ['seasonName', 'displayName_y', 'positionName', 'displayName_x', 'jersey', 'age', 'citizenship',
             'displayHeight', 'displayWeight', 'appearances_value', 'subIns_value', 'totalGoals_value',
             'goalAssists_value', 'totalShots_value', 'shotsOnTarget_value', 'offsides_value', 'ownGoals_value',
             'shotsFaced_value', 'saves_value', 'goalsConceded_value', 'foulsCommitted_value', 'foulsSuffered_value',
             'yellowCards_value', 'redCards_value']

df_playerstats = df_playerstats[new_order]

df_playerstats = df_playerstats.rename(
    columns={'displayName_y': "Team", 'positionName': 'Position', "displayName_x": "Player Name", "age": "Age",
             "citizenship": "Country", "displayHeight": "Height", 'displayWeight': 'Weight', 'jersey': 'Shirt #',
             "appearances_value": "Appearances", 'subIns_value': "Substitute Appearances", "totalGoals_value": "Goals",
             'goalAssists_value': 'Assists', "totalShots_value": 'Shots', 'shotsOnTarget_value': 'Shots On Target',
             'shotsFaced_value': 'Shots Faced', 'saves_value': 'Saves', 'offsides_value': 'Offsides',
             'ownGoals_value': 'Own Goals', 'goalsConceded_value': 'Goals Conceded',
             'foulsCommitted_value': 'Fouls Committed', 'foulsSuffered_value': 'Fouls Suffered',
             'yellowCards_value': 'Yellow Cards', 'redCards_value': 'Red Cards'})

df_playerstats['Shirt #'] = df_playerstats['Shirt #'].astype(int)
df_playerstats['Age'] = df_playerstats['Age'].astype(int)

position_order = ['Goalkeeper', 'Defender', 'Midfielder', 'Forward']
df_playerstats['Position'] = pd.Categorical(df_playerstats['Position'], categories=position_order, ordered=True)
df_playerstats = df_playerstats.sort_values(by=['Team', 'Position'], ascending=True)

teams_to_drop = ['Burnley', 'Luton Town', 'Sheffield United']
df_playerstats = df_playerstats[~df_playerstats['Team'].isin(teams_to_drop)]

if not df_playerstats.empty:
    season_name = df_playerstats['seasonName'].iloc[0].replace(' ', '_').replace('/', '-')
    file_name_1 = f"{season_name}_player_stats.csv"
    df_playerstats.to_csv(file_name_1, index=False)
    print(f"\nSuccessfully saved data to: {file_name_1}")
else:
    print("\nDataFrame 1 was empty. No file was saved.")

df_playerstats


Successfully saved data to: 2024-25_English_Carabao_Cup,_First_Round_player_stats.csv


Unnamed: 0,seasonName,Team,Position,Player Name,Shirt #,Age,Country,Height,Weight,Appearances,...,Shots On Target,Offsides,Own Goals,Shots Faced,Saves,Goals Conceded,Fouls Committed,Fouls Suffered,Yellow Cards,Red Cards
8,"2024-25 English Carabao Cup, First Round",AFC Bournemouth,Goalkeeper,Mark Travers,-1,26,Republic of Ireland,"6' 3""",181 lbs,1,...,0,1,0,11,3,1,0,1,1,0
175,"2024-25 English Carabao Cup, First Round",AFC Bournemouth,Goalkeeper,Neto,22,36,Brazil,"6' 3""",183 lbs,0,...,0,0,0,0,0,0,0,0,0,0
222,"2024-25 English Carabao Cup, First Round",AFC Bournemouth,Goalkeeper,Andrei Radu,13,28,Romania,"6' 2""",152 lbs,3,...,0,0,0,30,5,4,0,0,0,0
1947,"2024-25 English Carabao Cup, Second Round",AFC Bournemouth,Goalkeeper,Mark Travers,-1,26,Republic of Ireland,"6' 3""",181 lbs,0,...,0,0,0,0,0,0,0,0,0,0
2120,"2024-25 English Carabao Cup, Second Round",AFC Bournemouth,Goalkeeper,Neto,22,36,Brazil,"6' 3""",183 lbs,1,...,0,0,0,14,3,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6685,"2024-25 English Carabao Cup, Fourth Round",Wycombe Wanderers,Forward,Daniel Udoh,23,29,Nigeria,"6' 0""",183 lbs,2,...,1,1,0,0,0,0,1,3,0,0
7019,"2024-25 English Carabao Cup, Fourth Round",Wycombe Wanderers,Forward,Beryly Lubula,30,27,Congo DR,"5' 10""",168 lbs,1,...,2,0,0,0,0,1,1,2,0,0
7598,"2024-25 English Carabao Cup, Fourth Round",Wycombe Wanderers,Forward,Jaiden Bartolo,-1,19,Gibraltar,"6' 0""",,0,...,0,0,0,0,0,0,0,0,0,0
7673,"2024-25 English Carabao Cup, Fourth Round",Wycombe Wanderers,Forward,Gideon Kodua,30,21,England,,,1,...,0,0,0,0,0,0,0,2,0,0


In [8]:
# df_playerstats2 = pd.merge(df_playerstats2, df_players, how = "inner", left_on = ["athleteId"], right_on = ["athleteId"])
# df_playerstats2 = pd.merge(df_playerstats2, df_teams, how = "inner", left_on = ["teamId"], right_on = ["teamId"])
# df_playerstats2 = pd.merge(df_playerstats2, df_leagues, how = "inner", left_on = ["seasonType"], right_on = ["seasonType"])
#
# df_playerstats2

In [9]:
# dropped_columns = [
#     'seasonType', 'teamId', 'athleteId', 'timestamp_x', 'firstName',
#     'middleName', 'lastName', 'fullName', 'shortName', 'nickName', 'slug_x',
#     'gender', 'birthPlaceCountry', 'positionAbbreviation', 'headshotUrl',
#     'headshot_alt', 'timestamp_y', 'location', 'name', 'abbreviation',
#     'shortDisplayName', 'color', 'alternateColor', 'logoURL', 'venueId',
#     'slug_y', 'seasonSlug', 'leagueId', 'midsizeName', 'leagueName',
#     'leagueShortName'
# ]
# df_playerstats2 = df_playerstats2.drop(columns=dropped_columns)
# df_playerstats2 = df_playerstats2.drop(columns={"league", })
#
# new_order = [
#     'seasonName', 'displayName_y', 'positionName', 'displayName_x',
#     'jersey', 'age', 'citizenship', 'displayHeight', 'displayWeight',
#     'appearances_value', 'subIns_value', 'totalGoals_value', 'goalAssists_value',
#     'totalShots_value', 'shotsOnTarget_value', 'offsides_value', 'ownGoals_value',
#     'shotsFaced_value', 'saves_value', 'goalsConceded_value', 'foulsCommitted_value',
#     'foulsSuffered_value', 'yellowCards_value', 'redCards_value'
# ]
# df_playerstats2 = df_playerstats2[new_order]
#
# df_playerstats2 = df_playerstats2.rename(columns={
#     'year_x': 'Year',
#     'displayName_y': "Team",
#     'positionName': 'Position',
#     "displayName_x": "Player Name",
#     "age": "Age",
#     "citizenship": "Country",
#     "displayHeight": "Height",
#     'displayWeight': 'Weight',
#     'jersey': 'Shirt #',
#     "appearances_value": "Appearances",
#     'subIns_value': "Substitute Appearances",
#     "totalGoals_value": "Goals",
#     'goalAssists_value': 'Assists',
#     "totalShots_value": 'Shots',
#     'shotsOnTarget_value': 'Shots On Target',
#     'shotsFaced_value': 'Shots Faced',
#     'saves_value': 'Saves',
#     'offsides_value': 'Offsides',
#     'ownGoals_value': 'Own Goals',
#     'goalsConceded_value': 'Goals Conceded',
#     'foulsCommitted_value': 'Fouls Committed',
#     'foulsSuffered_value': 'Fouls Suffered',
#     'yellowCards_value': 'Yellow Cards',
#     'redCards_value': 'Red Cards'
# })
#
# df_playerstats2['Shirt #'] = df_playerstats2['Shirt #'].astype(int)
# df_playerstats2['Age'] = df_playerstats2['Age'].astype(int)
#
# position_order = ['Goalkeeper', 'Defender', 'Midfielder', 'Forward']
# df_playerstats2['Position'] = pd.Categorical(df_playerstats2['Position'], categories=position_order, ordered=True)
# df_playerstats2 = df_playerstats2.sort_values(by=['Team', 'Position'], ascending=True)
#
# teams_to_drop = ['Burnley', 'Luton Town', 'Sheffield United']
# df_playerstats2 = df_playerstats2[~df_playerstats2['Team'].isin(teams_to_drop)]
#
# if not df_playerstats2.empty:
#     season_name = df_playerstats2['seasonName'].iloc[0].replace(' ', '_').replace('/', '-')
#     file_name_2 = f"{season_name}_player_stats.csv"
#     df_playerstats2.to_csv(file_name_2, index=False)
#     print(f"\nSuccessfully saved data to: {file_name_2}")
# else:
#     print("\nDataFrame 2 was empty. No file was saved.")
#
# df_playerstats2