Start data analysis with La Liga 2015/16 season


In [1]:
import json
import pandas as pd

# Define your path (with UTF-8 encoding)
file_path = r'C:\Users\nacho\open-data\data\matches\11\27.json'

# Load La Liga 2015/16 matches
with open(file_path, 'r', encoding='utf-8') as f:
    matches = json.load(f)

# Extract match IDs
match_ids = [match['match_id'] for match in matches]

print(f"Found {len(match_ids)} La Liga 2015/16 matches.")

Found 380 La Liga 2015/16 matches.


In [2]:
'''
Checks every game for each team is in the dataset
'''
home_teams = [match['home_team']['home_team_name'] for match in matches]
away_teams = [match['away_team']['away_team_name'] for match in matches]

all_teams = home_teams + away_teams

games_per_team = pd.Series(all_teams).value_counts()

print(games_per_team)

Levante UD                38
Las Palmas                38
RC Deportivo La Coruña    38
Málaga                    38
Espanyol                  38
Sporting Gijón            38
Rayo Vallecano            38
Real Betis                38
Athletic Club             38
Atlético Madrid           38
Valencia                  38
Eibar                     38
Getafe                    38
Villarreal                38
Sevilla                   38
Granada                   38
Real Sociedad             38
Celta Vigo                38
Real Madrid               38
Barcelona                 38
Name: count, dtype: int64


In [3]:
import os
from tqdm import tqdm 

'''
to extract all match events from each match
'''

# Base folder where event files are stored
events_folder = r'C:\Users\nacho\open-data\data\events'

# List to hold all events
all_events = {}

for match_id in tqdm(match_ids):
    event_file = os.path.join(events_folder, f"{match_id}.json")
    with open(event_file, 'r', encoding='utf-8') as f:
        match_events = json.load(f)
        all_events[match_id] = match_events  # Store events under the match_id key

print(f"Loaded events for {len(all_events)} matches.")
total_events = sum(len(ev) for ev in all_events.values())
print(f"Loaded a total of {total_events} events from {len(all_events)} matches.")

100%|████████████████████████████████████████████████████████████████████████████████| 380/380 [00:19<00:00, 19.17it/s]

Loaded events for 380 matches.
Loaded a total of 1295354 events from 380 matches.





Extracting the line up of one game to check accuracy and make sure it works

In [4]:
first_match = matches[12]
match_id = first_match['match_id']

print(f"Match ID: {match_id}")
print(f"Home team: {first_match['home_team']['home_team_name']}")
print(f"Away team: {first_match['away_team']['away_team_name']}")
print(f"Date: {first_match['match_date']}")

Match ID: 3825849
Home team: Valencia
Away team: Real Sociedad
Date: 2016-05-13


In [5]:

events_for_match = all_events[match_id]

# Find the Starting XI event(s)
starting_xi_events = [e for e in events_for_match if e['type']['id'] == 35]

for event in starting_xi_events:
    print(f"Team: {event['team']['name']}, Formation: {event['tactics']['formation']}")
    lineup = event['tactics']['lineup']
    for player_info in lineup:
        player_name = player_info['player']['name']
        position_name = player_info['position']['name']
        jersey_number = player_info.get('jersey_number', 'N/A')
        print(f"  #{jersey_number} {player_name} - {position_name}")
    print()

Team: Valencia, Formation: 433
  #13 Jaume Doménech Sánchez - Goalkeeper
  #19 Antonio Barragán Fernández - Right Back
  #4 Aderllan Leandro de Jesus Santos - Right Center Back
  #3 Rúben Miguel Nunes Vezo - Left Center Back
  #6 Guilherme Magdalena Siqueira - Left Back
  #12 Danilo Barbosa da Silva - Center Defensive Midfield
  #10 Daniel Parejo Muñoz - Right Center Midfield
  #21 André Filipe Tavares Gomes - Left Center Midfield
  #22 Santiago Mina Lorenzo - Right Wing
  #11 Pablo Daniel Piatti - Left Wing
  #9 Francisco Alcácer García - Center Forward

Team: Real Sociedad, Formation: 4231
  #1 Gerónimo Rulli - Goalkeeper
  #30 Aritz Elustondo Irribaria - Right Back
  #3 Mikel González de Martín Martínez - Right Center Back
  #23 Diego Antonio Reyes Rosales - Left Center Back
  #24 Alberto De La Bella Madureño - Left Back
  #4 Asier Illarramendi Andonegi - Right Defensive Midfield
  #14 Rubén Pardo Gutiérrez - Left Defensive Midfield
  #10 Xabier Prieto Argarate - Right Wing
  #28 Mi

need to narrow down to what postions i should go,  

examples: treat right back and left back as one? 
treat left back and left wing back as one?
treat defensive midfield, center midfield and attacking midfield as one?

Inititally only these positions:

fullback (LB, RB, LWB, RWB)

centreback 

Midfield (CDM CM CAM)

RM LM?

Wingers (LW RW)

Striker (ST CF




In [6]:
unique_positions = set()

for match_id, events in all_events.items():
    for event in events:
        if event['type']['name'] == 'Starting XI':
            lineup = event['tactics']['lineup']
            for player in lineup:
                position_name = player['position']['name']
                unique_positions.add(position_name)

print(f"Unique positions in the season: {len(unique_positions)}")
print("Position names:", unique_positions)

Unique positions in the season: 22
Position names: {'Center Back', 'Right Wing', 'Left Center Forward', 'Right Wing Back', 'Center Attacking Midfield', 'Right Defensive Midfield', 'Right Center Back', 'Center Midfield', 'Left Wing Back', 'Left Back', 'Right Center Forward', 'Left Center Midfield', 'Right Midfield', 'Goalkeeper', 'Right Back', 'Left Wing', 'Left Midfield', 'Center Defensive Midfield', 'Left Center Back', 'Right Center Midfield', 'Left Defensive Midfield', 'Center Forward'}


In [7]:
'''GET MINUTES PLAYED BY EACH PLAYER'''


from collections import defaultdict

player_seconds = defaultdict(float)
player_id_to_name = {}

for match_id, events in all_events.items():
    players_on_pitch = {}  # player_id -> start_time_in_seconds
    off_periods = defaultdict(list)  # player_id -> list of {'off': time, 'on': time}
    added_time_players = defaultdict(int)
    
    for event in events:
        event_time = event['minute'] * 60 + event['second']
        
        if event['type']['id'] == 35:
            for player in event['tactics']['lineup']:
                pid = player['player']['id']
                name = player['player']['name']
                players_on_pitch[pid] = 0
                player_id_to_name[pid] = name
        
        elif event['type']['id'] == 34:
            added_time = event_time - 2700  # 45 mins in secs
            if added_time > 0:
                for pid in players_on_pitch:
                    added_time_players[pid] = added_time
        
        elif event['type']['id'] == 19:
            player_out = event['player']['id']
            player_in = event['substitution']['replacement']['id']
            time_sub = event_time
            
            if player_out in players_on_pitch:
                start_time = players_on_pitch[player_out]
                added = added_time_players.get(player_out, 0)
                player_seconds[player_out] += time_sub - start_time + added
                del players_on_pitch[player_out]
            
            players_on_pitch[player_in] = time_sub
            name = event['substitution']['replacement']['name']
            player_id_to_name[player_in] = name
        
        elif event['type']['id'] == 27:
            pid = event['player']['id']
            off_periods[pid].append({'off': event_time})
        
        elif event['type']['id'] == 26:
            pid = event['player']['id']
            if off_periods[pid] and 'off' in off_periods[pid][-1] and 'on' not in off_periods[pid][-1]:
                off_periods[pid][-1]['on'] = event_time

    # End of match: finalize playing time for players still on pitch
    if events:
        last_event = events[-1]
        match_end_time = last_event['minute'] * 60 + last_event['second']
        for pid, start_time in players_on_pitch.items():
            play_time = match_end_time - start_time + added_time_players.get(pid, 0)
            
            for period in off_periods.get(pid, []):
                if 'on' in period and 'off' in period:
                    play_time -= (period['on'] - period['off'])
                elif 'off' in period:
                    play_time -= (match_end_time - period['off'])
            
            player_seconds[pid] += play_time
'''
# Print results
for pid, seconds in player_seconds.items():
    name = player_id_to_name.get(pid, "Unknown")
    minutes = round(seconds / 60, 2)
    print(pid, name, minutes)
print(len(player_seconds))
'''


'\n# Print results\nfor pid, seconds in player_seconds.items():\n    name = player_id_to_name.get(pid, "Unknown")\n    minutes = round(seconds / 60, 2)\n    print(pid, name, minutes)\nprint(len(player_seconds))\n'

In [8]:
player_data = []

for pid, seconds in player_seconds.items():
    name = player_id_to_name.get(pid, "Unknown")
    minutes = round(seconds / 60, 2)
    player_data.append({
        'player_id': pid,
        'name': name,
        'minutes': minutes
    })

In [9]:
'''ADD THEIR MAIN POSITION'''

from collections import defaultdict, Counter

player_position_counts = defaultdict(Counter)


for match_id, events in all_events.items():
    for event in events:
        if event['type']['id'] == 35:  # Lineup
            for player in event['tactics']['lineup']:
                pid = player['player']['id']
                pos = player.get('position', {}).get('name')
                if pos:
                    player_position_counts[pid][pos] += 1

        elif event['type']['id'] == 19:  # Sub
            replacement = event['substitution']['replacement']
            pid = replacement['id']
            pos = replacement.get('position', {}).get('name')
            if pos:
                player_position_counts[pid][pos] += 1

for player in player_data:
    pid = player['player_id']
    position_counter = player_position_counts.get(pid, {})
    most_played = position_counter.most_common(1)[0][0] if position_counter else "Unknown"
    player['most_played_position'] = most_played



In [10]:
# Mapping function
def map_general_position(pos):
    pos = pos.lower()
    
    if pos == "goalkeeper":
        return "goalkeeper"
    
    elif pos in ["right back", "left back", "right wing back", "left wing back"]:
        return "full back"
    
    elif pos in ["center back", "right center back", "left center back"]:
        return "center back"
    
    elif "midfield" in pos:
        return "midfielder"
    
    elif pos in ["right wing", "left wing"]:
        return "winger"
    
    elif pos in ["striker", "center forward", "secondary striker", "right center forward", "left center forward"]:
        return "striker"
    
    return "other"  # fallback for unexpected cases


In [11]:
for player in player_data:
    specific_pos = player.get('most_played_position', 'Unknown')
    player['position'] = map_general_position(specific_pos)


In [12]:
other_count = sum(1 for player in player_data if player.get('position') == 'other')
print(f"Number of players with position 'other': {other_count}")




Number of players with position 'other': 32


In [13]:
from collections import defaultdict

# Initialize stats containers per player
pass_stats = defaultdict(lambda: {
    'total_passes': 0,
    'completed_passes': 0,
    'progressive_passes': 0,
    'completed_progressive_passes': 0,
    'long_passes': 0,
    'completed_long_passes': 0,
    'crosses': 0,
    'completed_crosses': 0,
    'key_passes': 0,
    'assists': 0,
    'passes_received' : 0
})

def is_progressive_pass(start_x, end_x, pitch_length=100):
    # Determine progressive pass by x-coordinate movement closer to opponent goal
    if start_x < pitch_length / 2 and end_x < pitch_length / 2:
        return (end_x - start_x) >= 30
    elif (start_x < pitch_length / 2 and end_x >= pitch_length / 2) or (start_x >= pitch_length / 2 and end_x < pitch_length / 2):
        return (end_x - start_x) >= 15
    else:  # both in opponent half
        return (end_x - start_x) >= 10

def is_long_pass(length_yards, height_id):
    # Ground pass longer than 45m (~49.2 yards)
    # High pass longer than 25m (~27.3 yards)
    if height_id == 1:  # Ground Pass
        return length_yards >= 49.2
    elif height_id == 3:  # High Pass
        return length_yards >= 27.3
    return False

# Collect pass data
for match_id, events in all_events.items():
    for event in events:
        if event['type']['id'] == 30:  # Pass event
            pid = event['player']['id']
            # Completed pass if no outcome or outcome id is None
            completed = event.get('pass', {}).get('outcome', {}).get('id') is None
            length = event.get('pass', {}).get('length', 0)  # in yards
            height_id = event.get('pass', {}).get('height', {}).get('id', 1)  # default to ground pass if missing

            start_x = event.get('location', [0, 0])[0]
            end_x = event.get('pass', {}).get('end_location', [0, 0])[0]

            cross = event.get('pass', {}).get('cross', False)

            # Update total and completed passes
            pass_stats[pid]['total_passes'] += 1
            if completed:
                pass_stats[pid]['completed_passes'] += 1

            # Progressive passes
            if is_progressive_pass(start_x, end_x):
                pass_stats[pid]['progressive_passes'] += 1
                if completed:
                    pass_stats[pid]['completed_progressive_passes'] += 1

            # Long passes
            if is_long_pass(length, height_id):
                pass_stats[pid]['long_passes'] += 1
                if completed:
                    pass_stats[pid]['completed_long_passes'] += 1

            # Crosses
            if cross:
                pass_stats[pid]['crosses'] += 1
                if completed:
                    pass_stats[pid]['completed_crosses'] += 1

            # Key passes (assisted shot)
            if event.get('pass', {}).get('shot_assist', False):
                pass_stats[pid]['key_passes'] += 1

            # Assists (goal assists)
            if event.get('pass', {}).get('goal_assist', False):
                pass_stats[pid]['assists'] += 1
                
        elif event['type']['id'] == 42: #Ball receipt
            pid = event['player']['id']
            pass_stats[pid]['passes_received'] +=1

In [14]:
# Add passing stats per 90 minutes to player_data
for player in player_data:
    pid = player['player_id']
    minutes = player.get('minutes', 0)
    stats = pass_stats.get(pid, {})

    if minutes > 0:
        player['passes_per_90'] = stats.get('total_passes', 0) / (minutes / 90)
        player['pass_accuracy'] = (stats.get('completed_passes', 0) / max(stats.get('total_passes', 1), 1)) * 100

        player['progressive_passes_per_90'] = stats.get('progressive_passes', 0) / (minutes / 90)
        player['progressive_pass_accuracy'] = (stats.get('completed_progressive_passes', 0) / max(stats.get('progressive_passes', 1), 1)) * 100

        player['long_passes_per_90'] = stats.get('long_passes', 0) / (minutes / 90)
        player['long_pass_accuracy'] = (stats.get('completed_long_passes', 0) / max(stats.get('long_passes', 1), 1)) * 100

        player['crosses_per_90'] = stats.get('crosses', 0) / (minutes / 90)
        player['cross_accuracy'] = (stats.get('completed_crosses', 0) / max(stats.get('crosses', 1), 1)) * 100

        player['key_passes_per_90'] = stats.get('key_passes', 0) / (minutes / 90)
        player['assists_per_90'] = stats.get('assists', 0) / (minutes / 90)
        
        player['passes_received_per_90'] = stats.get('passes_received', 0) / (minutes/90)
    else:
        player['passes_per_90'] = 0
        player['pass_accuracy'] = 0
        player['progressive_passes_per_90'] = 0
        player['progressive_pass_accuracy'] = 0
        player['long_passes_per_90'] = 0
        player['long_pass_accuracy'] = 0
        player['crosses_per_90'] = 0
        player['cross_accuracy'] = 0
        player['key_passes_per_90'] = 0
        player['assists_per_90'] = 0
        player['passes_received_per_90'] = 0

In [15]:
# Initialize data containers
player_shots = defaultdict(lambda: {
    'total_shots': 0,
    'shots_on_target': 0,
    'goals': 0,
    'xg': 0.0,
    'first_time_shots': 0,
    'headers': 0,
    'shots_outside_box': 0
})

# Pitch box boundaries (StatsBomb pitch is 120x80)
# The penalty box on the attacking side is approximately x >= 84 for shots inside the box
penalty_box_x = 84

for match_id, events in all_events.items():
    for event in events:
        if event['type']['id'] == 16:  # Shot event
            pid = event['player']['id']
            location = event.get('location', [0, 0])
            x, y = location
            
            # Count total shots
            player_shots[pid]['total_shots'] += 1
            
            # Check shot outcome
            outcome_id = event.get('shot', {}).get('outcome', {}).get('id')
            # On target if goal (97), saved (100), or saved to post (116)
            if outcome_id in [97, 100, 116]:
                player_shots[pid]['shots_on_target'] += 1
            
            # Count goals
            if outcome_id == 97:
                player_shots[pid]['goals'] += 1
            
            # Accumulate xG
            xg = event.get('shot', {}).get('statsbomb_xg', 0)
            player_shots[pid]['xg'] += xg
            
            # First time shots
            if event.get('shot', {}).get('first_time', False):
                player_shots[pid]['first_time_shots'] += 1
            
            # Headers
            body_part_id = event.get('shot', {}).get('body_part', {}).get('id')
            if body_part_id == 37:  # 37 corresponds to 'Head'
                player_shots[pid]['headers'] += 1
            
            # Shots from outside the box (x < penalty_box_x)
            if x < penalty_box_x:
                player_shots[pid]['shots_outside_box'] += 1

In [16]:
# Now calculate per 90 stats and accuracy and add to player_data
for player in player_data:
    pid = player['player_id']
    seconds_played = player.get('minutes', 0) * 60
    minutes_played = player.get('minutes', 0)
    if seconds_played == 0:
        # Avoid division by zero if no minutes played
        player.update({
            'shots_per_90': 0,
            'shots_on_target_per_90': 0,
            'shot_accuracy': 0,
            'goals_per_90': 0,
            'xg_per_90': 0,
            'first_time_shots_per_90': 0,
            'headers_per_90': 0,
            'shots_outside_box_per_90': 0
        })
        continue
    
    stats = player_shots.get(pid, {
        'total_shots': 0,
        'shots_on_target': 0,
        'goals': 0,
        'xg': 0.0,
        'first_time_shots': 0,
        'headers': 0,
        'shots_outside_box': 0
    })
    
    shots = stats['total_shots']
    shots_on_target = stats['shots_on_target']
    goals = stats['goals']
    xg = stats['xg']
    first_time = stats['first_time_shots']
    headers = stats['headers']
    shots_outside = stats['shots_outside_box']
    
    # Calculate accuracy (on target / total shots)
    accuracy = shots_on_target / shots if shots > 0 else 0
    
    player['shots_per_90'] = shots / minutes_played * 90
    player['shots_on_target_per_90'] = shots_on_target / minutes_played * 90
    player['shot_accuracy'] = round(accuracy, 3)
    player['goals_per_90'] = goals / minutes_played * 90
    player['xg_per_90'] = xg / minutes_played * 90
    player['first_time_shots_per_90'] = first_time / minutes_played * 90
    player['headers_per_90'] = headers / minutes_played * 90
    player['shots_outside_box_per_90'] = shots_outside / minutes_played * 90

In [17]:
print(player_data)

[{'player_id': 24149, 'name': 'José Antonio García Rabasco', 'minutes': 3369.15, 'most_played_position': 'Right Defensive Midfield', 'position': 'midfielder', 'passes_per_90': 36.3830639775611, 'pass_accuracy': 77.01908957415566, 'progressive_passes_per_90': 11.913984239348203, 'progressive_pass_accuracy': 60.0896860986547, 'long_passes_per_90': 7.319353546146654, 'long_pass_accuracy': 43.79562043795621, 'crosses_per_90': 0.9883798584212634, 'cross_accuracy': 24.324324324324326, 'key_passes_per_90': 1.095231735007346, 'assists_per_90': 0.0801389074395619, 'passes_received_per_90': 27.674636035795377, 'shots_per_90': 0.6678242286630159, 'shots_on_target_per_90': 0.18699078402564442, 'shot_accuracy': 0.28, 'goals_per_90': 0.053425938293041264, 'xg_per_90': 0.05210306754908508, 'first_time_shots_per_90': 0.21370375317216506, 'headers_per_90': 0.0, 'shots_outside_box_per_90': 0.026712969146520632}, {'player_id': 26023, 'name': 'Juan Francisco García García', 'minutes': 2249.52, 'most_playe

In [18]:
fields_to_check = [
    'minutes',
    'most_played_position',
    # Passing stats
    'passes_per_90',
    'pass_accuracy',
    'progressive_passes_per_90',
    'progressive_pass_accuracy',
    'long_passes_per_90',
    'long_pass_accuracy',
    'key_passes_per_90',
    'assists_per_90',
    'crosses_per_90',
    'cross_accuracy',
    # Shot stats
    'shots_per_90',
    'shots_on_target_per_90',
    'shot_accuracy',
    'goals_per_90',
    'xg_per_90',
    'first_time_shots_per_90',
    'headers_per_90',
    'shots_outside_box_per_90' 
]


missing_values = []

for player in player_data:
    for field in fields_to_check:
        if field not in player or player[field] is None:
            missing_values.append((player.get('player_id'), player.get('name'), field))

if missing_values:
    print(f"Missing values found for {len(missing_values)} fields:")
    for pid, name, field in missing_values:
        print(f"Player ID {pid} ({name}): Missing {field}")
else:
    print("No missing values found in checked fields.")


No missing values found in checked fields.


In [19]:
# Dribble-related stats
dribble_stats = defaultdict(lambda: {
    'dribbles': 0,
    'successful_dribbles': 0,
    'progressive_runs': 0,
    'fouls_won': 0,
    'carry_distances' : []
})

def is_progressive_run(start_x, end_x, pitch_length=120):
    if start_x < pitch_length / 2 and end_x < pitch_length / 2:
        return (end_x - start_x) >= 30
    elif (start_x < pitch_length / 2 and end_x >= pitch_length / 2) or (start_x >= pitch_length / 2 and end_x < pitch_length / 2):
        return (end_x - start_x) >= 15
    else:
        return (end_x - start_x) >= 10

# Extract dribbling data
for match_id, events in all_events.items():
    for event in events:
        player = event.get('player')
        if not player:
            continue
        pid = player['id']
        etype = event['type']['id']

        if etype == 14:  # Dribble
            dribble_stats[pid]['dribbles'] += 1
            if event.get('dribble', {}).get('outcome', {}).get('id') == 8:
                dribble_stats[pid]['successful_dribbles'] += 1

        elif etype == 43:  # Carry
            start_x = event.get('location', [0, 0])[0]
            end_x = event.get('carry', {}).get('end_location', [0, 0])[0]
            if is_progressive_run(start_x, end_x):
                dribble_stats[pid]['progressive_runs'] += 1

        elif etype == 21:  # Foul Won
            dribble_stats[pid]['fouls_won'] += 1




In [20]:
for player in player_data:
    pid = player['player_id']
    minutes = player.get('minutes', 0)
    stats = dribble_stats.get(pid, {})

    if minutes > 0:
        factor = minutes / 90
        player['dribbles_per_90'] = stats.get('dribbles', 0) / factor
        player['successful_dribbles_per_90'] = stats.get('successful_dribbles', 0) / factor
        player['dribble_success_rate'] = (stats.get('successful_dribbles', 0) / max(stats.get('dribbles', 1), 1)) * 100
        player['progressive_runs_per_90'] = stats.get('progressive_runs', 0) / factor
        player['fouls_won_per_90'] = stats.get('fouls_won', 0) / factor
    else:
        player['dribbles_per_90'] = 0
        player['successful_dribbles_per_90'] = 0
        player['dribble_success_rate'] = 0
        player['progressive_runs_per_90'] = 0
        player['fouls_won_per_90'] = 0


In [21]:
defensive_stats = defaultdict(lambda: {
    'pressures': 0,
    'blocks': 0,
    'interceptions': 0,
    'dribbled_past': 0,
    'clearances': 0,
    'ball_recoveries': 0,
    'fouls_made': 0,
    'yellow_cards': 0,
    'red_cards': 0,
    'duels': 0,
    'duels_won': 0
})

for match_id, events in all_events.items():
    for event in events:
        if 'player' not in event:
            continue

        pid = event['player']['id']
        etype = event['type']['id']

        if etype == 17:  # Pressure
            defensive_stats[pid]['pressures'] += 1
        elif etype == 2:  # Ball Recovery
            defensive_stats[pid]['ball_recoveries'] += 1
        elif etype == 4:  # Duel
            defensive_stats[pid]['duels'] += 1
            if event.get('duel', {}).get('outcome', {}).get('name') == 'Won':
                defensive_stats[pid]['duels_won'] += 1
        elif etype == 10:  # Interception
            defensive_stats[pid]['interceptions'] += 1
        elif etype == 22:  # Foul Committed (plus cards)
            defensive_stats[pid]['fouls_made'] += 1
            card = event.get('foul_committed', {}).get('card', {}).get('id')
            if card == 5:
                defensive_stats[pid]['yellow_cards'] += 1
            elif card == 6:
                defensive_stats[pid]['yellow_cards'] += 1
                defensive_stats[pid]['red_cards'] += 1
            elif card == 7:
                defensive_stats[pid]['red_cards'] += 1
        elif etype == 24:  # Bad Behaviour
            card = event.get('bad_behaviour', {}).get('card', {}).get('id')
            if card == 65:
                defensive_stats[pid]['yellow_cards'] += 1
            elif card == 66:
                defensive_stats[pid]['yellow_cards'] += 1
                defensive_stats[pid]['red_cards'] += 1
            elif card == 67:
                defensive_stats[pid]['red_cards'] += 1
        elif etype == 9:  # Clearance
            defensive_stats[pid]['clearances'] += 1
        elif etype == 6:  # Block
            defensive_stats[pid]['blocks'] += 1
        elif etype == 39:  # Dribbled Past
            defensive_stats[pid]['dribbled_past'] += 1



In [22]:
for player in player_data:
    pid = player['player_id']
    minutes = player.get('minutes', 0)
    stats = defensive_stats.get(pid, {})

    if minutes > 0:
        factor = minutes / 90
        player['pressures_per_90'] = stats.get('pressures', 0) / factor
        player['blocks_per_90'] = stats.get('blocks', 0) / factor
        player['interceptions_per_90'] = stats.get('interceptions', 0) / factor
        player['dribbled_past_per_90'] = stats.get('dribbled_past', 0) / factor
        player['clearances_per_90'] = stats.get('clearances', 0) / factor
        player['ball_recoveries_per_90'] = stats.get('ball_recoveries', 0) / factor
        player['fouls_made_per_90'] = stats.get('fouls_made', 0) / factor
        player['yellow_cards_per_90'] = stats.get('yellow_cards', 0) / factor
        player['red_cards_per_90'] = stats.get('red_cards', 0) / factor
        player['duels_per_90'] = stats.get('duels', 0) / factor
        player['duels_won_per_90'] = stats.get('duels_won', 0) / factor
        player['duel_success_rate'] = (stats.get('duels_won', 0) / max(stats.get('duels', 1), 1)) * 100
    else:
        player['pressures_per_90'] = 0
        player['blocks_per_90'] = 0
        player['interceptions_per_90'] = 0
        player['dribbled_past_per_90'] = 0
        player['clearances_per_90'] = 0
        player['ball_recoveries_per_90'] = 0
        player['fouls_made_per_90'] = 0
        player['yellow_cards_per_90'] = 0
        player['red_cards_per_90'] = 0
        player['duels_per_90'] = 0
        player['duels_won_per_90'] = 0
        player['duel_success_rate'] = 0


In [23]:
import math

goalkeeping_stats = defaultdict(lambda: {
    'shots_faced': 0,
    'saves': 0,
    'close_range_saves': 0,
    'goals_conceded': 0,
    'smothers': 0,
    'collections': 0,
    'punches': 0,
    'sweeper_claims': 0,
    'sweeper_clears': 0,
    'successful_sweepers': 0,
    'successful_collections': 0,
    'successful_punches': 0,
    'successful_smothers': 0,
})

save_types = {29, 31, 33, 114, 110, 109}  # includes post saves, not 134 (off target)
goal_types = {26, 28}
shot_faced_types = save_types | goal_types | {32}  # include non-save faced shots
smother_type = 34
collection_types = {25, 49}  # includes "Collected" + "Collected Twice"
punch_types = {30, 117}
sweeper_type = 27

# Outcome categories
successful_outcomes = {15, 17, 47, 48, 49, 51, 53, 56, 59, 4, 16, 117}
fail_outcomes = {13, 14, 50, 52, 55, 58}

# Process events
for match_id, events in all_events.items():
    for event in events:
        if event['type']['id'] != 23:  # Not a goalkeeper event
            continue

        player = event.get('player')
        if not player:
            continue
        pid = player['id']
        gk_type = event['goalkeeper']['type']['id']
        outcome_id = event['goalkeeper'].get('outcome', {}).get('id')
        loc = event.get('location', [None, None])
        x = loc[0] if loc else None

        # Shots faced
        if gk_type in shot_faced_types:
            goalkeeping_stats[pid]['shots_faced'] += 1

        # Saves (including post saves, excluding off target saves)
        #if gk_type in save_types:
        #    goalkeeping_stats[pid]['saves'] += 1
          #  if x is not None and x <= 12:  # <12m assuming pitch 100 = 120m
           #     goalkeeping_stats[pid]['close_range_saves'] += 1
                
        # Goals conceded
        if gk_type in goal_types:
            goalkeeping_stats[pid]['goals_conceded'] += 1
            
        if gk_type in save_types:
            goalkeeping_stats[pid]['saves'] += 1

        # Smothers
        if gk_type == smother_type:
            goalkeeping_stats[pid]['smothers'] += 1
            if outcome_id in successful_outcomes:
                goalkeeping_stats[pid]['successful_smothers'] += 1

        # Collections
        if gk_type in collection_types:
            goalkeeping_stats[pid]['collections'] += 1
            if outcome_id in successful_outcomes:
                goalkeeping_stats[pid]['successful_collections'] += 1

        # Punches
        if gk_type in punch_types:
            goalkeeping_stats[pid]['punches'] += 1
            if outcome_id in successful_outcomes:
                goalkeeping_stats[pid]['successful_punches'] += 1

        # Sweeper actions
        if gk_type == sweeper_type:
            if outcome_id == 47:  # Claim
                goalkeeping_stats[pid]['sweeper_claims'] += 1
                if outcome_id in successful_outcomes:
                    goalkeeping_stats[pid]['successful_sweepers'] += 1
            elif outcome_id == 48:  # Clear
                goalkeeping_stats[pid]['sweeper_clears'] += 1
                if outcome_id in successful_outcomes:
                    goalkeeping_stats[pid]['successful_sweepers'] += 1
                    
                    


In [24]:
for player in player_data:
    pid = player['player_id']
    minutes = player.get('minutes', 0)

    if pid in goalkeeping_stats and minutes > 0:
        stats = goalkeeping_stats[pid]
        factor = minutes / 90

        player.update({

            # Per 90s
            'shots_faced_per_90': stats['shots_faced'] / factor,
            'saves_per_90': stats['saves'] / factor,
            'goals_conceded_per_90': stats['goals_conceded'] / factor,
            'smothers_per_90': stats['smothers'] / factor,
            'collections_per_90': stats['collections'] / factor,
            'punches_per_90': stats['punches'] / factor,
            'sweeper_claims_per_90': stats['sweeper_claims'] / factor,
            'sweeper_clears_per_90': stats['sweeper_clears'] / factor,

            # Success rates
            'save_percentage': (stats['saves'] / stats['shots_faced']) * 100 if stats['shots_faced'] else 0,
            'smother_success_rate': (stats['successful_smothers'] / stats['smothers']) * 100 if stats['smothers'] else 0,
            'collection_success_rate': (stats['successful_collections'] / stats['collections']) * 100 if stats['collections'] else 0,
            'punch_success_rate': (stats['successful_punches'] / stats['punches']) * 100 if stats['punches'] else 0,
            'sweeper_success_rate': (
                stats['successful_sweepers'] /
                (stats['sweeper_claims'] + stats['sweeper_clears'])
            ) * 100 if (stats['sweeper_claims'] + stats['sweeper_clears']) else 0
        })



In [25]:
for player in player_data:
    position = player.get('position', '').lower()
    if 'goalkeeper' in position or position == 'gk':
        print(player)


{'player_id': 6730, 'name': 'Rubén Iván Martínez Andrade', 'minutes': 2124.32, 'most_played_position': 'Goalkeeper', 'position': 'goalkeeper', 'passes_per_90': 17.49736386231829, 'pass_accuracy': 55.205811138014525, 'progressive_passes_per_90': 12.32865105068916, 'progressive_pass_accuracy': 37.80068728522337, 'long_passes_per_90': 12.159185056865255, 'long_pass_accuracy': 37.63066202090592, 'crosses_per_90': 0.0, 'cross_accuracy': 0.0, 'key_passes_per_90': 0.0, 'assists_per_90': 0.0, 'passes_received_per_90': 4.872147322437297, 'shots_per_90': 0.0, 'shots_on_target_per_90': 0.0, 'shot_accuracy': 0, 'goals_per_90': 0.0, 'xg_per_90': 0.0, 'first_time_shots_per_90': 0.0, 'headers_per_90': 0.0, 'shots_outside_box_per_90': 0.0, 'dribbles_per_90': 0.0, 'successful_dribbles_per_90': 0.0, 'dribble_success_rate': 0.0, 'progressive_runs_per_90': 0.0, 'fouls_won_per_90': 0.042366498455976495, 'pressures_per_90': 0.0, 'blocks_per_90': 0.0, 'interceptions_per_90': 0.0, 'dribbled_past_per_90': 0.0,

In [26]:
# Define event type IDs that count as touches
# (You should adjust this list based on your event types)
touch_event_types = {
    42, # Reception
    2, # Recovery
    3, # Dispossessed
    4, # Duel
    6, # Block
    9, # Clearance
    10,  # Interception
    14,  # Dribble 
    16, # Shot
    21, # Foul Won
    23, #Goalkeeper
    30, # Pass
    33, #50-50
    37, #Error
    38, # Miscontrol
    43, # Carry
    
    # Add other event IDs that mean ball touch here
}

other_stats = defaultdict(lambda: {
    'x_total': 0.0,
    'y_total': 0.0,
    'touches': 0
})

for match_id, events in all_events.items():
    for event in events:
        player = event.get('player')
        location = event.get('location')
        etype = event.get('type', {}).get('id')

        # Skip if no player, location, or type
        if not player or not location or len(location) < 2 or etype not in touch_event_types:
            continue

        pid = player['id']
        x, y = location[0], location[1]

        other_stats[pid]['x_total'] += x
        other_stats[pid]['y_total'] += y
        other_stats[pid]['touches'] += 1


In [27]:
# Add to player data
for player in player_data:
    pid = player['player_id']
    minutes = player.get('minutes', 0)
    stats = other_stats.get(pid, {})

    if minutes > 0 and stats.get('touches', 0) > 0:
        factor = minutes / 90
        player['touches_per_90'] = stats['touches'] / factor
        player['average_position_x'] = stats['x_total'] / stats['touches']
        player['average_position_y'] = stats['y_total'] / stats['touches']
    else:
        player['touches_per_90'] = 0
        player['average_position_x'] = None
        player['average_position_y'] = None

In [28]:
print(player_data)

[{'player_id': 24149, 'name': 'José Antonio García Rabasco', 'minutes': 3369.15, 'most_played_position': 'Right Defensive Midfield', 'position': 'midfielder', 'passes_per_90': 36.3830639775611, 'pass_accuracy': 77.01908957415566, 'progressive_passes_per_90': 11.913984239348203, 'progressive_pass_accuracy': 60.0896860986547, 'long_passes_per_90': 7.319353546146654, 'long_pass_accuracy': 43.79562043795621, 'crosses_per_90': 0.9883798584212634, 'cross_accuracy': 24.324324324324326, 'key_passes_per_90': 1.095231735007346, 'assists_per_90': 0.0801389074395619, 'passes_received_per_90': 27.674636035795377, 'shots_per_90': 0.6678242286630159, 'shots_on_target_per_90': 0.18699078402564442, 'shot_accuracy': 0.28, 'goals_per_90': 0.053425938293041264, 'xg_per_90': 0.05210306754908508, 'first_time_shots_per_90': 0.21370375317216506, 'headers_per_90': 0.0, 'shots_outside_box_per_90': 0.026712969146520632, 'dribbles_per_90': 0.5075464137838921, 'successful_dribbles_per_90': 0.34726859890476824, 'dr

In [29]:
import math

def check_nan_or_errors(player_data):
    problematic = []
    for player in player_data:
        for key, value in player.items():
            if value is None:
                problematic.append((player.get('player_id', 'unknown'), key, 'None'))
            elif isinstance(value, float) and math.isnan(value):
                problematic.append((player.get('player_id', 'unknown'), key, 'NaN'))
            # You can add more error checks here if needed

    if problematic:
        print("Found NaN or None in the following player stats:")
        for pid, col, val in problematic:
            print(f"Player {pid} - Column: {col} - Value: {val}")
    else:
        print("No NaN or None values found in player stats.")

check_nan_or_errors(player_data)


Found NaN or None in the following player stats:
Player 206604 - Column: average_position_x - Value: None
Player 206604 - Column: average_position_y - Value: None


In [30]:
for player in player_data:
    if player.get('player_id') == 206604:
        print(player)
        break


{'player_id': 206604, 'name': 'Imanol Corral Matellán', 'minutes': 48.7, 'most_played_position': 'Unknown', 'position': 'other', 'passes_per_90': 0.0, 'pass_accuracy': 0.0, 'progressive_passes_per_90': 0.0, 'progressive_pass_accuracy': 0.0, 'long_passes_per_90': 0.0, 'long_pass_accuracy': 0.0, 'crosses_per_90': 0.0, 'cross_accuracy': 0.0, 'key_passes_per_90': 0.0, 'assists_per_90': 0.0, 'passes_received_per_90': 0.0, 'shots_per_90': 0.0, 'shots_on_target_per_90': 0.0, 'shot_accuracy': 0, 'goals_per_90': 0.0, 'xg_per_90': 0.0, 'first_time_shots_per_90': 0.0, 'headers_per_90': 0.0, 'shots_outside_box_per_90': 0.0, 'dribbles_per_90': 0.0, 'successful_dribbles_per_90': 0.0, 'dribble_success_rate': 0.0, 'progressive_runs_per_90': 0.0, 'fouls_won_per_90': 0.0, 'pressures_per_90': 0.0, 'blocks_per_90': 0.0, 'interceptions_per_90': 0.0, 'dribbled_past_per_90': 0.0, 'clearances_per_90': 0.0, 'ball_recoveries_per_90': 0.0, 'fouls_made_per_90': 0.0, 'yellow_cards_per_90': 0.0, 'red_cards_per_90':

In [31]:
unknown_positions = sum(1 for player in player_data if player.get('most_played_position') == 'Unknown')
print(f"Number of players with 'Unknown' position: {unknown_positions}")

Number of players with 'Unknown' position: 32


In [32]:
for player in player_data:
    if player.get('most_played_position') == 'Unknown':
        print(player)


{'player_id': 25057, 'name': 'Alejandro Barrera García', 'minutes': 316.17, 'most_played_position': 'Unknown', 'position': 'other', 'passes_per_90': 9.962994591517221, 'pass_accuracy': 68.57142857142857, 'progressive_passes_per_90': 3.131226871619698, 'progressive_pass_accuracy': 63.63636363636363, 'long_passes_per_90': 0.5693139766581269, 'long_pass_accuracy': 0.0, 'crosses_per_90': 0.0, 'cross_accuracy': 0.0, 'key_passes_per_90': 0.0, 'assists_per_90': 0.0, 'passes_received_per_90': 13.094221463136918, 'shots_per_90': 0.569313976658127, 'shots_on_target_per_90': 0.2846569883290635, 'shot_accuracy': 0.5, 'goals_per_90': 0.0, 'xg_per_90': 0.03252313948192428, 'first_time_shots_per_90': 0.0, 'headers_per_90': 0.569313976658127, 'shots_outside_box_per_90': 0.0, 'dribbles_per_90': 0.28465698832906344, 'successful_dribbles_per_90': 0.28465698832906344, 'dribble_success_rate': 100.0, 'progressive_runs_per_90': 1.1386279533162538, 'fouls_won_per_90': 0.0, 'pressures_per_90': 8.25505266154284

In [33]:
import pandas as pd

df_players = pd.DataFrame(player_data)
print(df_players.head())  # To see the first few rows

   player_id                          name  minutes      most_played_position  \
0      24149   José Antonio García Rabasco  3369.15  Right Defensive Midfield   
1      26023  Juan Francisco García García  2249.52          Left Center Back   
2       6742        Sergio Gontán Gallardo  3311.02                Right Wing   
3       6566          Borja González Tomás  3721.57            Center Forward   
4       5678  Jefferson Andrés Lerma Solís  4122.73     Right Center Midfield   

      position  passes_per_90  pass_accuracy  progressive_passes_per_90  \
0   midfielder      36.383064      77.019090                  11.913984   
1  center back      30.606529      64.705882                  15.363277   
2       winger      25.007400      64.891304                   6.469306   
3      striker      15.646622      69.397218                   2.684351   
4   midfielder      26.436366      73.988439                   7.531417   

   progressive_pass_accuracy  long_passes_per_90  ...  smother

In [36]:
cols_to_fill = [
    'shots_faced_per_90',
    'saves_per_90',
    'goals_conceded_per_90',
    'smothers_per_90',
    'collections_per_90',
    'punches_per_90',
    'sweeper_claims_per_90',
    'sweeper_clears_per_90',
    'save_percentage',
    'smother_success_rate',
    'collection_success_rate',
    'punch_success_rate',
    'sweeper_success_rate'  # fixed missing quote here
]

df_players[cols_to_fill] = df_players[cols_to_fill].fillna(0.0)

In [37]:
print(df_players.columns)
print(df_players['name'].head(20))  # or try another likely column name, e.g. 'player_name'
keylor_row = df_players[df_players['name'].str.contains('José Antonio García Rabasco', case=False, na=False)]
print(keylor_row.to_dict(orient='records'))


Index(['player_id', 'name', 'minutes', 'most_played_position', 'position',
       'passes_per_90', 'pass_accuracy', 'progressive_passes_per_90',
       'progressive_pass_accuracy', 'long_passes_per_90', 'long_pass_accuracy',
       'crosses_per_90', 'cross_accuracy', 'key_passes_per_90',
       'assists_per_90', 'passes_received_per_90', 'shots_per_90',
       'shots_on_target_per_90', 'shot_accuracy', 'goals_per_90', 'xg_per_90',
       'first_time_shots_per_90', 'headers_per_90', 'shots_outside_box_per_90',
       'dribbles_per_90', 'successful_dribbles_per_90', 'dribble_success_rate',
       'progressive_runs_per_90', 'fouls_won_per_90', 'pressures_per_90',
       'blocks_per_90', 'interceptions_per_90', 'dribbled_past_per_90',
       'clearances_per_90', 'ball_recoveries_per_90', 'fouls_made_per_90',
       'yellow_cards_per_90', 'red_cards_per_90', 'duels_per_90',
       'duels_won_per_90', 'duel_success_rate', 'touches_per_90',
       'average_position_x', 'average_position_y', '

In [38]:
for col in df_players.columns:
    nan_count = df_players[col].isna().sum()
    if nan_count > 0:
        print(f"Column '{col}' has {nan_count} NaN(s).")
        players_with_nan = df_players[df_players[col].isna()]
        print("Players with NaN in this column:")
        for idx, row in players_with_nan.iterrows():
            pid = row.get('player_id', 'Unknown')
            name = row.get('name', 'Unknown')
            print(f" - Player ID: {pid}, Name: {name}")
        print()


Column 'average_position_x' has 1 NaN(s).
Players with NaN in this column:
 - Player ID: 206604, Name: Imanol Corral Matellán

Column 'average_position_y' has 1 NaN(s).
Players with NaN in this column:
 - Player ID: 206604, Name: Imanol Corral Matellán



In [39]:
# Drop all rows with any NaNs
df_players_clean = df_players.dropna()

print(f"Removed {len(df_players) - len(df_players_clean)} players with NaN values.")

Removed 1 players with NaN values.


In [40]:
df_players_clean.to_csv('cleaned_player_data.csv', index=False)
