In [1]:
import pandas as pd
import numpy as np
import ast
import os
import json
import bz2
from tqdm import tqdm
import matplotlib.pyplot as plt
from mplsoccer import Pitch
from plottable import Table, ColDef
from plottable.cmap import normed_cmap
import matplotlib.lines as mlines
import matplotlib.font_manager as font_manager

# Fonts: two FontProperties objects built from font files at absolute paths.
normal_font_path = '/Library/Fonts/USSF 90 Min Display-Medium.otf'
bold_font_path = '/Library/Fonts/USSF 90 Min Display-Bold.otf'
font_normal, font_bold = (
    font_manager.FontProperties(fname=font_file, size=point_size)
    for font_file, point_size in ((normal_font_path, 12), (bold_font_path, 25))
)

# Use the family name reported by the medium face for the sans-serif setting.
plt.rcParams['font.sans-serif'] = font_normal.get_name()


In [2]:
# Load Data

comps = pd.read_csv('./Data/competitions.csv')
players = pd.read_csv('./Data/players.csv')
rosters = pd.read_csv('./Data/rosters.csv')

# The team columns arrive as the string form of Python literals; parse them
# back into objects so they can be flattened later.
metadata = pd.read_csv('./Data/metadata.csv').assign(
    awayTeam=lambda frame: frame['awayTeam'].map(ast.literal_eval),
    homeTeam=lambda frame: frame['homeTeam'].map(ast.literal_eval),
)

events = pd.read_json('./Data/events.json')

In [32]:
# Parse Metadata

# Expand the nested team objects into flat columns, one frame per side.
away_flatten = pd.json_normalize(metadata['awayTeam'])
home_flatten = pd.json_normalize(metadata['homeTeam'])

md = (
    pd.concat(
        [
            metadata.drop(columns=['awayTeam', 'homeTeam']),
            away_flatten.rename(columns={'id': 'away_team_id', 'name': 'away_team_name', 'shortName': 'away_short_name'}),
            home_flatten.rename(columns={'id': 'home_team_id', 'name': 'home_team_name', 'shortName': 'home_short_name'}),
        ],
        axis=1,
    )
    # The remaining 'id' column is the one that came from the metadata file itself.
    .rename(columns={'id': 'gameId'})
    # Remove Final and Third Place From Analysis
    .loc[lambda frame: ~frame['gameId'].isin([10517, 10516])]
)

In [None]:
# Clean Up Events Data

# Game-event fields kept for the analysis.
ge_sel_columns = [
    'gameId',
    'game_event_id',
    'pressurePlayer',
    'gameEventType',
    'possessionEvents',
    'gameClock',
    'duration',
    'pressureType',
    'touches',
    'team',
    'player'
]

# Drop columns that are entirely empty, then narrow to the selected fields.
events = events.dropna(axis=1, how='all')[ge_sel_columns]

# Replace the nested 'team' object with flat columns ('team_id', 'team', ...).
team_flatten = pd.json_normalize(events['team'])
events = pd.concat(
    [
        events.drop(columns=['team']),
        team_flatten.rename(columns={'id': 'team_id', 'name': 'team'}),
    ],
    axis=1,
)

# Same treatment for the nested 'player' object ('player_id', 'player', ...).
player_flatten = pd.json_normalize(events['player'])
events = pd.concat(
    [
        events.drop(columns=['player']),
        player_flatten.rename(columns={'id': 'player_id', 'nickname': 'player'}),
    ],
    axis=1,
)

# Discard 'OUT' and 'VID' game events and the two excluded games in one pass.
events = events[
    ~events['gameEventType'].isin(['OUT', 'VID'])
    & ~events['gameId'].isin([10517, 10516])
]

def _opponent_lookup(game_teams):
    """Return, for one game's ``team`` column, the other team's name per row.

    The two team names are taken from the non-null values only, and are
    computed once per game rather than once per row. Rows whose team is
    missing get NaN. If the game does not contain exactly two distinct team
    names, every row gets NaN instead of raising or guessing.
    """
    names = game_teams.dropna().unique()
    if len(names) != 2:
        return pd.Series(np.nan, index=game_teams.index, dtype=object)
    first, second = names
    # Values absent from the mapping (i.e. missing teams) map to NaN.
    return game_teams.map({first: second, second: first})


events['opponent'] = events.groupby('gameId')['team'].transform(_opponent_lookup)



In [5]:
# Parse Possession Events

# One row per entry of the 'possessionEvents' list; the index is renumbered so
# the column-wise concat below lines up row for row.
pe = (
    events.loc[:, ['game_event_id', 'possessionEvents']]
    .explode('possessionEvents')
    .reset_index(drop=True)
)

# Flatten each possession-event object into dotted column names and attach
# them next to the owning game_event_id.
flattened = pd.json_normalize(pe['possessionEvents'])
pe = pd.concat([pe[['game_event_id']], flattened], axis=1)


In [6]:
# Clean Possession Events

# Columns retained from the flattened possession events: identifiers, the
# outcome-type fields, then the id/nickname pairs of the players involved.
pe_sel_columns = [
    'game_event_id',
    'id',
    'gameClock',
    'possessionEventType',
    'passingEvent.passOutcomeType',
    'passingEvent.shotOutcomeType',
    'challengeEvent.challengeOutcomeType',
    'shootingEvent.shotOutcomeType',
    'reboundEvent.reboundOutcomeType',
    'reboundEvent.shotOutcomeType',
    'ballCarryEvent.ballCarryOutcome',
    'ballCarryEvent.touchOutcomeType',
    'crossEvent.shotOutcomeType',
    'crossEvent.crossOutcomeType',
    'clearanceEvent.clearanceOutcomeType',
    'clearanceEvent.shotOutcomeType',
    'passingEvent.passerPlayer.id',
    'passingEvent.passerPlayer.nickname',
    'passingEvent.pressureType',
    'passingEvent.targetPlayer.id',
    'passingEvent.targetPlayer.nickname',
    'passingEvent.receiverPlayer.id',
    'passingEvent.receiverPlayer.nickname',
    'passingEvent.pressurePlayer.id',
    'passingEvent.pressurePlayer.nickname',
    'challengeEvent.ballCarrierPlayer.id',
    'challengeEvent.ballCarrierPlayer.nickname',
    'challengeEvent.challengeWinnerPlayer.id',
    'challengeEvent.challengeWinnerPlayer.nickname',
    'challengeEvent.challengerPlayer.id',
    'challengeEvent.challengerPlayer.nickname',
    'shootingEvent.pressurePlayer.id',
    'shootingEvent.pressurePlayer.nickname',
    'shootingEvent.shooterPlayer.id',
    'shootingEvent.shooterPlayer.nickname',
    'reboundEvent.rebounderPlayer.id',
    'reboundEvent.rebounderPlayer.nickname',
    'crossEvent.intendedTargetPlayer.id',
    'crossEvent.intendedTargetPlayer.nickname',
    'crossEvent.defenderPlayer.id',
    'crossEvent.defenderPlayer.nickname',
    'crossEvent.crosserPlayer.id',
    'crossEvent.crosserPlayer.nickname',
    'clearanceEvent.clearancePlayer.id',
    'clearanceEvent.clearancePlayer.nickname',
    'passingEvent.defenderPlayer.id',
    'passingEvent.defenderPlayer.nickname',
    'ballCarryEvent.ballCarrierPlayer.id',
    'ballCarryEvent.ballCarrierPlayer.nickname',
    'crossEvent.pressurePlayer.id',
    'crossEvent.pressurePlayer.nickname',
    'crossEvent.completeToPlayer.id',
    'crossEvent.completeToPlayer.nickname',
    'passingEvent.blockerPlayer.id',
    'passingEvent.blockerPlayer.nickname',
    'ballCarryEvent.defenderPlayer.id',
    'ballCarryEvent.defenderPlayer.nickname',
    'ballCarryEvent.pressurePlayer.id',
    'ballCarryEvent.pressurePlayer.nickname',
    'passingEvent.deflectorPlayer.id',
    'passingEvent.deflectorPlayer.nickname',
    'crossEvent.deflectorPlayer.id',
    'crossEvent.deflectorPlayer.nickname',
    'challengeEvent.pressurePlayer.id',
    'challengeEvent.pressurePlayer.nickname',
]

possession_events = (
    pe[pe_sel_columns]
    # Rows without a possession event type carry no possession event.
    .loc[lambda frame: frame['possessionEventType'].notna()]
    .rename(columns={'id': 'possession_event_id'})
    # The id is cast to int so it can be matched against the tracking frames.
    .astype({'possession_event_id': int})
)

In [7]:
# Join Game Events Data
# Both frames carry a 'gameClock' column, so the merge yields 'gameClock_x'
# (possession event) and 'gameClock_y' (game event).
full_data = possession_events.merge(events, on='game_event_id')

# Seconds after the losing event ends during which an opposition shot counts.
SHOT_WINDOW_SECONDS = 20

# The following row must belong to the same game and to the opposing team.
# This relies on full_data keeping the row order of possession_events.
next_event_by_opponent = (
    (full_data['gameId'] == full_data['gameId'].shift(-1)) &
    (full_data['opponent'] == full_data['team'].shift(-1))
)

possession_lost = (
    # Passing Outcome: Blocked, Defensive Interception, Out of Play
    full_data['passingEvent.passOutcomeType'].isin(['B', 'D', 'O']) |
    # Challenge Outcome: Distribution Disrupted, Forces Carrier Out of Play, Beats Man Loses Ball, Out of Play
    full_data['challengeEvent.challengeOutcomeType'].isin(['B', 'C', 'M', 'O']) |
    # Ball Carry Outcome: Beats Man Loses Ball, Forced Out of Play, Successful Tackle
    full_data['ballCarryEvent.ballCarryOutcome'].isin(['L', 'O', 'S'])
)

full_data['opposition_regain'] = possession_lost & next_event_by_opponent

# Filter for regains that resulted in shots for the opposition within 20 seconds

full_data['shot_after_regain'] = False

# Sorted shot clocks per (game, shooting team), built once so each regain is
# answered with a binary search instead of a scan over the whole frame.
shots = full_data[full_data['possessionEventType'] == 'SH']
shot_clocks = {
    key: np.sort(group['gameClock_y'].to_numpy(dtype=float))
    for key, group in shots.groupby(['gameId', 'team'])
}

regains = full_data[full_data['opposition_regain']]
for key, group in regains.groupby(['gameId', 'opponent']):
    clocks = shot_clocks.get(key)
    if clocks is None:
        # The regaining team took no shot in this game.
        continue

    window_start = group['gameClock_y'].to_numpy(dtype=float)
    # A missing duration is treated as 0 so the regain still gets its window;
    # previously a NaN duration made the upper bound NaN and the regain could
    # never be flagged.
    window_end = (
        window_start
        + group['duration'].fillna(0).to_numpy(dtype=float)
        + SHOT_WINDOW_SECONDS
    )

    # Index of the first shot strictly after the losing event's clock.
    next_shot = np.searchsorted(clocks, window_start, side='right')
    has_shot = next_shot < len(clocks)
    # That earliest later shot must also fall inside the window (inclusive).
    has_shot[has_shot] = clocks[next_shot[has_shot]] <= window_end[has_shot]

    full_data.loc[group.index[has_shot], 'shot_after_regain'] = True


In [8]:
# Define a function to process each .jsonl.bz2 file
def process_jsonl_bz2(file_path, opposition_df):
    """Return the last ball-bearing frame of every regain event in one file.

    Parameters
    ----------
    file_path : str
        Path to a ``<gameId>.jsonl.bz2`` file holding one JSON object per
        line. Each object must provide ``frameNum``, ``period``,
        ``ballsSmoothed`` and ``possession_event_id``.
    opposition_df : pandas.DataFrame
        Frame with at least ``possession_event_id`` and ``opposition_regain``
        columns. Only rows whose ``opposition_regain`` is True contribute.

    Returns
    -------
    pandas.DataFrame
        For each matching ``possession_event_id``, the row with the highest
        ``frameNum`` among frames whose ``ballsSmoothed`` is not null, with
        the columns of ``opposition_df`` merged in. The game id taken from
        the file name is stored as a string in ``gameId`` (it becomes
        ``gameId_x`` when ``opposition_df`` also has a ``gameId`` column).
    """
    # Extract gameId from the file name (remove directory and extension)
    game_id = os.path.basename(file_path).replace('.jsonl.bz2', '')

    # Read and parse the JSON lines, skipping blank lines
    with bz2.BZ2File(file_path, 'rb') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Convert to a Pandas DataFrame
    df = pd.DataFrame(data)
    df['gameId'] = game_id

    # Select only the required columns; copy so the assignment below writes to
    # an independent frame rather than a slice of the full one.
    df = df[['gameId', 'frameNum', 'period', 'ballsSmoothed', 'possession_event_id']].copy()
    # Frames without a usable possession event id get 0.
    df['possession_event_id'] = pd.to_numeric(df['possession_event_id'], errors='coerce').fillna(0).astype(int)

    # An inner merge keeps only the frames that belong to a listed possession
    # event, instead of carrying every tracking frame through a left merge
    # just to discard the unmatched ones.
    df = df.merge(opposition_df, on='possession_event_id', how='inner')

    # Filter rows where opposition_regain is True
    filtered_df = df[df['opposition_regain'] == True]

    # Extract the last frame (highest frameNum) of each possession_event_id
    last_frames = (
        filtered_df[filtered_df['ballsSmoothed'].notna()]
        .sort_values('frameNum')
        .groupby('possession_event_id')
        .tail(1)
    )

    return last_frames


In [9]:
# Define the directory containing the .jsonl.bz2 files
# NOTE(review): absolute path, unlike the './Data' paths used for the CSVs.
data_folder = '/PFF_FC/Data'

# Get all .jsonl.bz2 files in the directory. os.listdir returns entries in
# arbitrary order, so sort to make the combined row order reproducible.
file_paths = sorted(
    os.path.join(data_folder, f)
    for f in os.listdir(data_folder)
    if f.endswith('.jsonl.bz2')
)
if not file_paths:
    # Fail with a clear message instead of pd.concat's error on an empty list.
    raise FileNotFoundError(f"No .jsonl.bz2 tracking files found in {data_folder!r}")

opposition_df = full_data[full_data['opposition_regain'] == True]

# Process each file in the folder
all_filtered_frames = [
    process_jsonl_bz2(file_path, opposition_df)
    for file_path in tqdm(file_paths, desc="Processing files")
]

# Combine data from all files
combined_filtered_frames = pd.concat(all_filtered_frames, ignore_index=True)

# Save File to save time from having to run function above.
# index=False keeps the row index out of the file, so reading it back does
# not create a stray 'Unnamed: 0' column.
combined_filtered_frames.to_csv('filtered_frames_regains.csv', index=False)

In [None]:
combined_filtered_frames = pd.read_csv('filtered_frames_regains.csv')

# ballsSmoothed column stored as string in csv: parse string cells back into
# Python objects and leave every non-string cell untouched.
combined_filtered_frames['ballsSmoothed'] = [
    ast.literal_eval(cell) if isinstance(cell, str) else cell
    for cell in combined_filtered_frames['ballsSmoothed']
]

# Spread the ball object into its own columns, restore the 'gameId' name as
# an integer key, and attach the per-game metadata needed downstream.
ball_flatten = pd.json_normalize(combined_filtered_frames['ballsSmoothed'])
final_df = (
    pd.concat([combined_filtered_frames.drop(columns=['ballsSmoothed']), ball_flatten], axis=1)
    .rename(columns={'gameId_x': 'gameId'})
    .astype({'gameId': int})
    .merge(md[['gameId', 'homeTeamStartLeft', 'away_team_name', 'home_team_name']], on='gameId')
)

In [12]:
# Transform Coordinates
# Mirror x and y for rows selected by (home-start side, event team, period).
# The flip is applied in place, so re-running this cell mirrors the same rows
# back again.
# NOTE(review): only periods 1 and 2 are ever flipped; rows from any other
# period keep their raw coordinates. Confirm that is intended.
# NOTE(review): the flip is keyed on `team`, while the summary and plots group
# by `opponent`. Confirm that x > 0 is the intended half for the regaining side.
home_started_left = final_df['homeTeamStartLeft'] == True
home_started_right = final_df['homeTeamStartLeft'] == False
in_first_period = final_df['period'] == 1
in_second_period = final_df['period'] == 2
event_team_is_home = final_df['team'] == final_df['home_team_name']
event_team_is_away = final_df['team'] == final_df['away_team_name']

needs_flip = (
    (event_team_is_home & ((home_started_left & in_second_period) | (home_started_right & in_first_period))) |
    (event_team_is_away & ((home_started_left & in_first_period) | (home_started_right & in_second_period)))
)
final_df.loc[needs_flip, ['x', 'y']] *= -1

# Attacking Half Regains

# A regain counts as "attacking half" when its transformed x is positive.
final_df['attacking_half_regain'] = final_df['x'].gt(0)
# Attacking-half regains that were also flagged as followed by a shot.
final_df['att_regain_shot'] = (
    final_df['attacking_half_regain'].eq(True)
    & final_df['shot_after_regain'].eq(True)
)

In [None]:
# Count the number of games each opponent played
# (distinct gameIds per team name in the joined event data).
games_per_opponent = full_data.groupby('team')['gameId'].nunique()

# Per-opponent raw counts, then the games played, then the two rates
# (regains per game and share of regains followed by a shot).
summary = (
    final_df.groupby('opponent')
    .agg(
        attacking_half_regains=pd.NamedAgg(column='attacking_half_regain', aggfunc='sum'),
        att_regain_shots=pd.NamedAgg(column='att_regain_shot', aggfunc='sum'),
    )
    .reset_index()
    .assign(
        games_played=lambda frame: frame['opponent'].map(games_per_opponent),
        attacking_half_regains_per_game=lambda frame: (
            frame['attacking_half_regains'] / frame['games_played']
        ).round(2),
        regain_to_shot_percentage=lambda frame: (
            (frame['att_regain_shots'] / frame['attacking_half_regains']) * 100
        ).round(2),
    )
    .set_index('opponent')
)
summary

In [None]:
# Plot Pitch
pitch = Pitch(pitch_type='impect', axis=False)

# Attacking-half regains for the two teams being compared.
att_regains_df = final_df[(final_df['attacking_half_regain'] == True) & (final_df['opponent'].isin(['France', 'Argentina']))]

teams = sorted(att_regains_df['opponent'].dropna().unique())

fig, axs = pitch.grid(nrows=2, ncols=1, grid_height=0.715, grid_width=0.95,
                      axis=False,
                      title_height=0)

team_color_map = {
    'France': '#003366',
    'Argentina': '#6CACE4'
}

team_edge_color_map = {
    'France': 'white',
    'Argentina': 'black'
}

# One pitch per team; zip stops at whichever of teams / axes runs out first.
for team_name, ax in zip(teams, axs['pitch'].flat):

    team_colors = team_color_map.get(team_name)
    team_edge_color = team_edge_color_map.get(team_name)

    ax.text(0, 38, team_name,
            ha='center', va='center', fontsize=15, color=team_colors, fontproperties=font_normal)

    team_regains = att_regains_df.loc[att_regains_df.opponent == team_name]

    average_regain_height = team_regains['x'].mean()
    regains_to_shots = team_regains[team_regains['att_regain_shot'] == True]

    # `shade_lowest` is deprecated in seaborn's kdeplot in favour of `thresh`;
    # shade_lowest=False corresponds to the default threshold of 0.05.
    pitch.kdeplot(team_regains.x, team_regains.y, ax=ax, cmap='coolwarm', fill=True, levels=15, thresh=0.05, alpha=0.8)
    ax.axvline(average_regain_height, color='black', linestyle='dotted', linewidth=2)

    pitch.scatter(regains_to_shots.x, regains_to_shots.y, c=team_colors, edgecolors=team_edge_color, marker='o', s=100, ax=ax, zorder=3)


custom_legend_elements = [
    mlines.Line2D([], [], color='black', linestyle='dotted', linewidth=2, label='Average Regain Height'),
    mlines.Line2D([], [], color='white', marker='o', markeredgecolor='black', markersize=10, label='Regain Leading to Shot')
]
# `prop` fully determines the legend font; a separate `fontsize` is only used
# when `prop` is not given, so it is not passed here.
fig.legend(
    handles=custom_legend_elements,
    loc='lower center',
    bbox_to_anchor=(0.5, 0.1),
    ncol=2,
    prop=font_normal
)

fig.savefig('wc_regains.png',
            dpi=300,
            bbox_inches='tight')



In [None]:
# Plot Table
fig, ax = plt.subplots(figsize=(12, 14))

# Colour scales for the two rate columns, each normalised on its own values.
cmap = normed_cmap(summary['attacking_half_regains_per_game'], cmap="coolwarm", num_stds=1.5)
cmap2 = normed_cmap(summary['regain_to_shot_percentage'], cmap="coolwarm", num_stds=1.5)

# Column layout: team name on the left, raw counts, then the colour-mapped rates.
table_columns = [
    ColDef("opponent", width=.75, title="Team", textprops={"ha": "left", "weight": "bold"}),
    ColDef("attacking_half_regains", width=0.5, title="Regains"),
    ColDef("att_regain_shots", width=0.5, title="Shots After\nRegain"),
    ColDef("games_played", width=0.5, title="Num of Games"),
    ColDef("attacking_half_regains_per_game", width=0.5, title="Regains\nPer Game", cmap=cmap),
    ColDef("regain_to_shot_percentage", width=0.5, title="Regain to Shot (%)", cmap=cmap2),
]

tab = Table(
    summary,
    ax=ax,
    column_definitions=table_columns,
    col_label_divider=True,
    textprops={"fontsize": 12, "ha": "center", "fontproperties": font_normal},
    cell_kw={"facecolor": 'none'},
    col_label_cell_kw={"facecolor": 'none'},
)

fig.savefig('wc_regains_table.png', dpi=300, bbox_inches='tight')
