In [11]:
import pandas as pd
import os
import datetime
import kaggle
import zipfile


from directory import KAGGLE_DATASET_SLUG, KAGGLE_DATA_ROOT, BASE_DATA_DIR, PLAYER_STATS_DATA_DIR


# Fetch the Kaggle dataset on first use, then load the base CSV tables into
# module-level DataFrames. The download is skipped when KAGGLE_DATA_ROOT exists.
if not os.path.exists(KAGGLE_DATA_ROOT):
    print(f"Data directory '{KAGGLE_DATA_ROOT}' not found. Downloading from Kaggle.")

    print(f"Downloading dataset '{KAGGLE_DATASET_SLUG}'...")
    # Download into the directory the existence check above tests, so a
    # later run finds the data and skips this branch.
    kaggle.api.dataset_download_files(
        KAGGLE_DATASET_SLUG,
        path=KAGGLE_DATA_ROOT,
        unzip=True
    )
    print(" Download and unzip complete.")
else:
    print(f"Data directory '{KAGGLE_DATA_ROOT}' already exists. Skipping download.")

print("Loading Files")
df_leagues = pd.read_csv(BASE_DATA_DIR + "/leagues.csv")
# low_memory=False reads the file in one pass instead of chunk-wise dtype inference.
df_players = pd.read_csv(BASE_DATA_DIR + "/players.csv", low_memory=False)
df_teams = pd.read_csv(BASE_DATA_DIR + "/teams.csv")
df_fixtures = pd.read_csv(BASE_DATA_DIR + "/fixtures.csv")
df_teamStats = pd.read_csv(BASE_DATA_DIR + "/teamStats.csv")
df_teamRoster = pd.read_csv(BASE_DATA_DIR + "/teamRoster.csv")
df_keyEventDescription = pd.read_csv(BASE_DATA_DIR + "/keyEventDescription.csv")
df_status = pd.read_csv(BASE_DATA_DIR + "/status.csv")
# NOTE(review): unlike the other tables, this path adds an extra "base_data"
# segment (and a doubled slash). The recorded run loaded it successfully, so it
# is kept unchanged -- confirm against the actual dataset layout.
df_venues = pd.read_csv(BASE_DATA_DIR + "/base_data/" + "/venues.csv")
df_standings = pd.read_csv(BASE_DATA_DIR + "/standings.csv")
print("Base files loaded.")

Data directory './kaggle_data' already exists. Skipping download.
Loading Files
Base files loaded.


In [12]:
# Ask which player-stats CSV to load and read it into `df_playerstats`.
# Surrounding whitespace in the typed name is ignored.
stats_file_1 = input("Enter the first player stats file (e.g., playerstats_2024_ENG.1): ").strip()

# Keep the path under its own name so `df_playerstats` only ever refers to a
# DataFrame, never to a path string.
stats_path_1 = PLAYER_STATS_DATA_DIR + stats_file_1 + ".csv"

try:
    df_playerstats = pd.read_csv(stats_path_1, low_memory=False)
except FileNotFoundError as e:
    print(f"❌ ERROR: File not found. Check your filenames. {e}")
    # Re-raise so execution stops here; the following cells need a loaded frame.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print(f" Loaded {stats_file_1}.csv")



 Loaded playerstats_2024_ENG.1.csv


In [13]:
# Attach player attributes with an inner join on the shared `athleteId` key;
# stats rows with no matching player are dropped.
df_playerstats = df_playerstats.merge(df_players, on="athleteId", how="inner")

In [14]:
# Attach team attributes with an inner join on the shared `teamId` key.
# Non-key columns present on both sides receive pandas' default _x/_y suffixes.
df_playerstats = df_playerstats.merge(df_teams, on="teamId", how="inner")


In [15]:
# Attach league/season attributes with an inner join on the shared `seasonType` key.
df_playerstats = df_playerstats.merge(df_leagues, on="seasonType", how="inner")


In [16]:
# Display the merged frame as the cell's output to inspect the result of the three joins.
df_playerstats

Unnamed: 0,seasonType,year_x,league,teamId,athleteId,appearances_value,subIns_value,foulsCommitted_value,foulsSuffered_value,yellowCards_value,...,logoURL,venueId,slug_y,year_y,seasonName,seasonSlug,leagueId,midsizeName,leagueName,leagueShortName
0,12654,2024,ENG.1,368,4946,14,3,8,0,5,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,10318,eng.everton,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
1,12654,2024,ENG.1,367,6327,17,3,17,7,5,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,7827,eng.tottenham,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
2,12654,2024,ENG.1,349,7441,5,0,0,1,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,6020,eng.bournemouth,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
3,12654,2024,ENG.1,373,9426,23,9,26,6,4,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,257,eng.ipswich,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
4,12654,2024,ENG.1,331,17828,14,3,7,2,1,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,4440,eng.brighton,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,12654,2024,ENG.1,367,396748,0,0,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,7827,eng.tottenham,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
796,12654,2024,ENG.1,349,397213,4,4,0,1,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,6020,eng.bournemouth,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
797,12654,2024,ENG.1,349,398103,0,0,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,6020,eng.bournemouth,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League
798,12654,2024,ENG.1,349,398123,1,1,0,0,0,...,https://a.espncdn.com/i/teamlogos/soccer/500/3...,6020,eng.bournemouth,2024,2024-25 English Premier League,2024-25-english-premier-league,700,ENG.1,English Premier League,Premier League


In [17]:

# Shape the merged frame into the final report layout and save it as a CSV.
# NOTE: this cell rebinds `df_playerstats`, so it cannot be re-run on its own;
# re-run the load and merge cells first.

# Columns to keep, in output order. Selecting them explicitly discards every
# other merged column, so no separate drop step is needed (and an unrelated
# column missing upstream can no longer raise KeyError here).
new_order = ['seasonName', 'displayName_y', 'positionName', 'displayName_x', 'jersey', 'age', 'citizenship',
             'displayHeight', 'displayWeight', 'appearances_value', 'subIns_value', 'totalGoals_value',
             'goalAssists_value', 'totalShots_value', 'shotsOnTarget_value', 'offsides_value', 'ownGoals_value',
             'shotsFaced_value', 'saves_value', 'goalsConceded_value', 'foulsCommitted_value', 'foulsSuffered_value',
             'yellowCards_value', 'redCards_value']

# Raw column name -> report heading. `seasonName` is intentionally left as is.
column_labels = {
    'displayName_y': "Team", 'positionName': 'Position', "displayName_x": "Player Name", "age": "Age",
    "citizenship": "Country", "displayHeight": "Height", 'displayWeight': 'Weight', 'jersey': 'Shirt #',
    "appearances_value": "Appearances", 'subIns_value': "Substitute Appearances", "totalGoals_value": "Goals",
    'goalAssists_value': 'Assists', "totalShots_value": 'Shots', 'shotsOnTarget_value': 'Shots On Target',
    'shotsFaced_value': 'Shots Faced', 'saves_value': 'Saves', 'offsides_value': 'Offsides',
    'ownGoals_value': 'Own Goals', 'goalsConceded_value': 'Goals Conceded',
    'foulsCommitted_value': 'Fouls Committed', 'foulsSuffered_value': 'Fouls Suffered',
    'yellowCards_value': 'Yellow Cards', 'redCards_value': 'Red Cards',
}

# Ordered categories make the sort below follow pitch order rather than
# alphabetical order.
position_order = ['Goalkeeper', 'Defender', 'Midfielder', 'Forward']

# NOTE(review): hard-coded team list; presumably specific to the season being
# processed -- confirm before using this cell with another stats file.
teams_to_drop = ['Burnley', 'Luton Town', 'Sheffield United']

df_playerstats = (
    df_playerstats[new_order]
    .rename(columns=column_labels)
    # A plain int cast raises if either column contains missing values.
    .astype({'Shirt #': int, 'Age': int})
    .assign(Position=lambda frame: pd.Categorical(frame['Position'], categories=position_order, ordered=True))
    .sort_values(by=['Team', 'Position'], ascending=True)
)
df_playerstats = df_playerstats[~df_playerstats['Team'].isin(teams_to_drop)]

if not df_playerstats.empty:
    # Build the file name from the first row's season name: spaces become
    # underscores and '/' becomes '-' so it is a valid single path component.
    season_name = df_playerstats['seasonName'].iloc[0].replace(' ', '_').replace('/', '-')
    file_name_1 = f"{season_name}_player_stats.csv"
    df_playerstats.to_csv(file_name_1, index=False)
    print(f"\nSuccessfully saved data to: {file_name_1}")
else:
    print("\nDataFrame 1 was empty. No file was saved.")

df_playerstats


Successfully saved data to: 2024-25_English_Premier_League_player_stats.csv


Unnamed: 0,seasonName,Team,Position,Player Name,Shirt #,Age,Country,Height,Weight,Appearances,...,Shots On Target,Offsides,Own Goals,Shots Faced,Saves,Goals Conceded,Fouls Committed,Fouls Suffered,Yellow Cards,Red Cards
2,2024-25 English Premier League,AFC Bournemouth,Goalkeeper,Mark Travers,-1,26,Republic of Ireland,"6' 3""",181 lbs,5,...,0,0,0,77,20,5,0,1,0,0
59,2024-25 English Premier League,AFC Bournemouth,Goalkeeper,Neto,22,36,Brazil,"6' 3""",183 lbs,2,...,0,1,0,25,11,2,0,2,0,0
132,2024-25 English Premier League,AFC Bournemouth,Goalkeeper,Kepa Arrizabalaga,13,31,Spain,"6' 2""",183 lbs,31,...,0,4,0,325,97,39,2,6,3,0
506,2024-25 English Premier League,AFC Bournemouth,Goalkeeper,William Dennis,40,25,England,"6' 2""",183 lbs,0,...,0,0,0,0,0,0,0,0,0,0
719,2024-25 English Premier League,AFC Bournemouth,Goalkeeper,Callan Mckenna,46,18,Scotland,"6' 2""",168 lbs,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,2024-25 English Premier League,Wolverhampton Wanderers,Forward,Matheus Cunha,10,26,Brazil,"6' 0""",168 lbs,33,...,39,9,0,0,1,51,40,71,4,0
537,2024-25 English Premier League,Wolverhampton Wanderers,Forward,Tawanda Chirewa,-1,22,Zimbabwe,"5' 11""",159 lbs,0,...,0,0,0,0,0,0,0,0,0,0
619,2024-25 English Premier League,Wolverhampton Wanderers,Forward,Chiquinho,10,25,Portugal,"5' 10""",161 lbs,1,...,0,0,0,0,0,0,1,0,0,0
706,2024-25 English Premier League,Wolverhampton Wanderers,Forward,Tom Edozie,-1,19,England,"5' 10""",150 lbs,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
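# NOTE: disabled draft. These are the same three merges as the cells above, applied to a
# second frame, `df_playerstats2`, which no visible cell creates; load it before re-enabling.
# df_playerstats2 = pd.merge(df_playerstats2, df_players, how = "inner", left_on = ["athleteId"], right_on = ["athleteId"])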
# df_playerstats2 = pd.merge(df_playerstats2, df_teams, how = "inner", left_on = ["teamId"], right_on = ["teamId"])
# df_playerstats2 = pd.merge(df_playerstats2, df_leagues, how = "inner", left_on = ["seasonType"], right_on = ["seasonType"])
#
# df_playerstats2

In [19]:
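# NOTE: disabled draft. This repeats the cleaning/export cell above for `df_playerstats2`.
# The `'year_x': 'Year'` entry in the rename map below has no effect, because `year_x`
# is not among the columns selected by `new_order`.
# dropped_columns = [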
#     'seasonType', 'teamId', 'athleteId', 'timestamp_x', 'firstName',
#     'middleName', 'lastName', 'fullName', 'shortName', 'nickName', 'slug_x',
#     'gender', 'birthPlaceCountry', 'positionAbbreviation', 'headshotUrl',
#     'headshot_alt', 'timestamp_y', 'location', 'name', 'abbreviation',
#     'shortDisplayName', 'color', 'alternateColor', 'logoURL', 'venueId',
#     'slug_y', 'seasonSlug', 'leagueId', 'midsizeName', 'leagueName',
#     'leagueShortName'
# ]
# df_playerstats2 = df_playerstats2.drop(columns=dropped_columns)
# df_playerstats2 = df_playerstats2.drop(columns={"league", })
#
# new_order = [
#     'seasonName', 'displayName_y', 'positionName', 'displayName_x',
#     'jersey', 'age', 'citizenship', 'displayHeight', 'displayWeight',
#     'appearances_value', 'subIns_value', 'totalGoals_value', 'goalAssists_value',
#     'totalShots_value', 'shotsOnTarget_value', 'offsides_value', 'ownGoals_value',
#     'shotsFaced_value', 'saves_value', 'goalsConceded_value', 'foulsCommitted_value',
#     'foulsSuffered_value', 'yellowCards_value', 'redCards_value'
# ]
# df_playerstats2 = df_playerstats2[new_order]
#
# df_playerstats2 = df_playerstats2.rename(columns={
#     'year_x': 'Year',
#     'displayName_y': "Team",
#     'positionName': 'Position',
#     "displayName_x": "Player Name",
#     "age": "Age",
#     "citizenship": "Country",
#     "displayHeight": "Height",
#     'displayWeight': 'Weight',
#     'jersey': 'Shirt #',
#     "appearances_value": "Appearances",
#     'subIns_value': "Substitute Appearances",
#     "totalGoals_value": "Goals",
#     'goalAssists_value': 'Assists',
#     "totalShots_value": 'Shots',
#     'shotsOnTarget_value': 'Shots On Target',
#     'shotsFaced_value': 'Shots Faced',
#     'saves_value': 'Saves',
#     'offsides_value': 'Offsides',
#     'ownGoals_value': 'Own Goals',
#     'goalsConceded_value': 'Goals Conceded',
#     'foulsCommitted_value': 'Fouls Committed',
#     'foulsSuffered_value': 'Fouls Suffered',
#     'yellowCards_value': 'Yellow Cards',
#     'redCards_value': 'Red Cards'
# })
#
# df_playerstats2['Shirt #'] = df_playerstats2['Shirt #'].astype(int)
# df_playerstats2['Age'] = df_playerstats2['Age'].astype(int)
#
# position_order = ['Goalkeeper', 'Defender', 'Midfielder', 'Forward']
# df_playerstats2['Position'] = pd.Categorical(df_playerstats2['Position'], categories=position_order, ordered=True)
# df_playerstats2 = df_playerstats2.sort_values(by=['Team', 'Position'], ascending=True)
#
# teams_to_drop = ['Burnley', 'Luton Town', 'Sheffield United']
# df_playerstats2 = df_playerstats2[~df_playerstats2['Team'].isin(teams_to_drop)]
#
# if not df_playerstats2.empty:
#     season_name = df_playerstats2['seasonName'].iloc[0].replace(' ', '_').replace('/', '-')
#     file_name_2 = f"{season_name}_player_stats.csv"
#     df_playerstats2.to_csv(file_name_2, index=False)
#     print(f"\nSuccessfully saved data to: {file_name_2}")
# else:
#     print("\nDataFrame 2 was empty. No file was saved.")
#
# df_playerstats2