In [7]:
from understatapi import UnderstatClient
import pandas as pd

understat = UnderstatClient()

#player_shot_data = understat.player(player='8260').get_shot_data()


In [32]:
league_2022 = understat.league(league="EPL").get_player_data(season="2022")
league_2023 = understat.league(league="EPL").get_player_data(season="2023")
league_2024 = understat.league(league="EPL").get_player_data(season="2024")

league_2022_df = pd.DataFrame(league_2022)
league_2023_df = pd.DataFrame(league_2023)
league_2024_df = pd.DataFrame(league_2024)

league_data = pd.concat([league_2022_df, league_2023_df, league_2024_df])

league_data['time'] = pd.to_numeric(league_data['time'])
league_data['goals'] = pd.to_numeric(league_data['goals'])

league_data = league_data.groupby('id').agg({
    'time': 'sum',
    'goals': 'sum',
    'player_name': 'first',
    'position': 'first'
}).reset_index()

league_data = league_data[league_data["position"].str.contains('F')]
league_data = league_data[league_data["time"] > 1000]
league_data = league_data[league_data["goals"] > 10]

In [35]:
league_data

Unnamed: 0,id,time,goals,player_name,position
6,10048,3463,19,Nicolas Jackson,F M S
18,10177,2382,12,Evan Ferguson,F S
33,10408,4185,17,Luis Díaz,F M S
52,10720,3927,21,Darwin Núñez,F S
73,10760,5916,16,Brennan Johnson,F M S
90,10846,4098,20,Julián Álvarez,F M S
105,11055,2384,11,Rasmus Højlund,F S
127,11296,3386,15,Cody Gakpo,F M S
140,11363,3081,12,Antoine Semenyo,F M S
174,11717,2879,11,Carlton Morris,F M S


In [49]:
unique_ids = league_data['id'].unique()
shot_df = pd.DataFrame()

# Loop through each ID
for player_id in unique_ids:
    try:
        # Get shot data for current player
        player_shot_data = understat.player(player=str(player_id)).get_shot_data()
        
        # Convert to DataFrame
        player_df = pd.DataFrame(player_shot_data)
        
        # Append to main DataFrame
        shot_df = pd.concat([shot_df, player_df], ignore_index=True)
        
        # Print progress
        print(f"Processed player ID: {player_id}")
        
        
    except Exception as e:
        print(f"Error processing player ID {player_id}: {e}")
        continue

# Now shot_df contains all the shot data
print("\nFinal DataFrame shape:", shot_df.shape)
print("\nFirst few rows:")
print(shot_df.head())


Processed player ID: 10048
Processed player ID: 10177
Processed player ID: 10408
Processed player ID: 10720
Processed player ID: 10760
Processed player ID: 10846
Processed player ID: 11055
Processed player ID: 11296
Processed player ID: 11363
Processed player ID: 11717
Processed player ID: 1250
Processed player ID: 1679
Processed player ID: 1776
Processed player ID: 2381
Processed player ID: 239
Processed player ID: 3697
Processed player ID: 4105
Processed player ID: 4456
Processed player ID: 453
Processed player ID: 468
Processed player ID: 482
Processed player ID: 501
Processed player ID: 5220
Processed player ID: 5221
Processed player ID: 5232
Processed player ID: 531
Processed player ID: 5543
Processed player ID: 5555
Processed player ID: 556
Processed player ID: 5735
Processed player ID: 5786
Processed player ID: 6026
Processed player ID: 6055
Processed player ID: 618
Processed player ID: 647
Processed player ID: 6552
Processed player ID: 6681
Processed player ID: 6691
Processed p

In [50]:
#shot_df = shot_df[shot_df["date"] > '2022-07-01']
print(shot_df.shape)

(19624, 20)


In [53]:
# First filter for goals and select columns
shot_df = shot_df[["id", "player", "result", "xG"]]
shot_df = shot_df[shot_df["result"] == "Goal"]

# Convert xG to numeric if it's not already
shot_df['xG'] = pd.to_numeric(shot_df['xG'])

# Create new DataFrame with the counts, grouping just by player and id
analysis_df = shot_df.groupby(['player']).agg({
    'result': 'count',  # Total goals
    'xG': lambda x: [sum(x > 0.3), sum(x <= 0.3)],  # Goals by xG threshold
    'id': 'first'  # Keep one ID for reference
}).reset_index()

# Rename and split the xG column into two
analysis_df = analysis_df.assign(
    total_goals=analysis_df['result'],
    high_xg_goals=analysis_df['xG'].str[0],
    low_xg_goals=analysis_df['xG'].str[1]
).drop(['result', 'xG'], axis=1)

# Reorder columns
analysis_df = analysis_df[['id', 'player', 'total_goals', 'high_xg_goals', 'low_xg_goals']]
analysis_df["low_chance_%"] = analysis_df["low_xg_goals"] /  analysis_df["total_goals"]
analysis_df["high_chance_%"] = analysis_df["high_xg_goals"] /  analysis_df["total_goals"]

# Sort by total goals descending
analysis_df = analysis_df.sort_values('low_chance_%', ascending=True)

print(analysis_df.head(50))

        id                 player  total_goals  high_xg_goals  low_xg_goals  \
32  485208         Julián Álvarez           23             19             4   
8   532070         Carlton Morris           11              9             2   
22  119020          Gabriel Jesus           73             59            14   
27  428714             Ivan Toney           36             29             7   
44  480598        Nicolas Jackson           31             24             7   
7    57294          Callum Wilson           88             68            20   
56  363053          Taiwo Awoniyi           37             28             9   
16  119919  Dominic Calvert-Lewin           56             42            14   
20  354876         Erling Haaland          135            100            35   
0    58777    Aleksandar Mitrovic           38             28            10   
30  221525   Jean-Philippe Mateta           50             36            14   
5   431116           Bryan Mbeumo           28      