In [12]:
from mplsoccer.pitch import Pitch
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
import json
import math
import jsonlines
from skillcorner.client import SkillcornerClient
import pyarrow.parquet as pq
from utils import load_json_file, load_jsonl_file, euclidean_distance, check_target_area, get_receive_frame, explode_data
# Automatically reload edited modules (autoreload mode 2) before each cell runs,
# so changes to the local `utils` helpers are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation / chained-assignment warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Remove pandas' display truncation for rows, columns and cell width.
# NOTE(review): with max_rows=None, displaying a large frame prints every row;
# prefer .head() / .sample() when inspecting data.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Import tracking data and events data, and keep only the matches that exist in both

In [13]:
# Tracking data: one JSON-lines file per match, named "<match_id>.jsonl".
tracking_files = [f for f in os.listdir('data/FA/tracking') if f.endswith('.jsonl')]

# Events data: a single parquet file covering all matches.
events_path = 'data/wyscout_events.parquet'
events_df = pd.read_parquet(events_path)

# Match ids that appear in the events table (kept as a list, as before).
event_match_ids = list(set(events_df.sk_match_id))
event_match_id_lookup = set(event_match_ids)  # set => O(1) membership tests

# Keep only matches present in BOTH sources. os.listdir() returns entries in
# arbitrary order, so sort to make the processing order reproducible.
match_ids = sorted(
    match_id
    for match_id in (int(f.split('.')[0]) for f in tracking_files)
    if match_id in event_match_id_lookup
)

# Restrict the events to the matches that also have tracking data.
events_df = events_df[events_df['sk_match_id'].isin(match_ids)]

In [15]:
# Preview a single row to inspect which event columns are available.
events_df.head(1)

Unnamed: 0,sk_match_id,wy_match_id,frame,is_matched,frame_tracking_data_available,is_matched_applicable,aerialduel_firsttouch,aerialduel_height,aerialduel_opponent_height,aerialduel_opponent_id,aerialduel_opponent_name,aerialduel_opponent_position,aerialduel_relatedduelid,carry_endlocation_x,carry_endlocation_y,carry_progression,groundduel_dueltype,groundduel_keptpossession,groundduel_opponent_id,groundduel_opponent_name,groundduel_opponent_position,groundduel_progressedwithball,groundduel_recoveredpossession,groundduel_relatedduelid,groundduel_side,groundduel_stoppedprogress,groundduel_takeon,wy_event_id,infraction_opponent_id,infraction_opponent_name,infraction_opponent_position,infraction_redcard,infraction_type,infraction_yellowcard,location_x,location_y,matchperiod,matchtimestamp,minute,opponentteam_formation,opponentteam_id,opponentteam_name,pass_accurate,pass_angle,pass_endlocation_x,pass_endlocation_y,pass_height,pass_length,pass_recipient_id,pass_recipient_name,pass_recipient_position,wy_player_id,player_name,player_position,sk_player_id,possession_attack_flank,possession_attack_withgoal,possession_attack_withshot,possession_attack_withshotongoal,possession_attack_xg,possession_duration,possession_endlocation_x,possession_endlocation_y,possession_eventindex,possession_eventsnumber,possession_id,possession_startlocation_x,possession_startlocation_y,possession_team_formation,possession_team_id,possession_team_name,possession_types,relatedeventid,second,shot_bodypart,shot_goalzone,shot_goalkeeperactionid,shot_goalkeeper_id,shot_goalkeeper_name,shot_isgoal,shot_ontarget,shot_postshotxg,shot_xg,team_formation,wy_team_id,sk_team_id,team_name,type_primary,type_secondary,videotimestamp,possession_team_id_src,possession_src,possession_team_name_src,phase_of_play_name,controlled_phase_name
106932,1133203,5509644,107,True,True,True,,,,,,,,,,,,,,,,,,,,,,1783543568,,,,,,,51,50,1H,00:00:00.862,0,4-2-3-1,1954,Aston Villa,True,-153,38,40,,15,134338,K. Zelem,DMF,11164,H. Ladd,RCMF,64166,,,,,,20.4477385,69,100,0,8,1783543568,51,50,4-1-4-1,63195,Manchester United,,1783543569,0,,,,,,,,,,4-1-4-1,63195,2027,Manchester United,pass,"[back_pass, short_or_medium_pass]",10.862353,63195,1,Manchester United,Controlled,Phase 1


In [5]:
# Narrow the events table to the identifier, pass and possession columns
# listed here; every other column is dropped from `events_df`.
cols = [
    'sk_match_id',
    'frame',
    'minute',
    'player_name',
    'pass_recipient_name',
    'pass_recipient_id',
    'pass_recipient_position',
    'sk_player_id',
    'pass_angle',
    'type_primary',
    'pass_accurate',
    'team_name',
    'possession_team_name',
    'possession_team_name_src',
    'possession_types',
]
events_df = events_df.loc[:, cols]

#### Create a map between Wyscout short names and sk player ids

In [6]:
# Distinct, non-null pass recipient names found in the events data.
recipient_names = set(events_df['pass_recipient_name'].dropna())

In [7]:
# Build a {short_name: player_id} lookup from the `players` list of every
# match file. When a short name occurs more than once, the first id seen wins
# (same as before).
short_name_map = {}
# Short names seen with more than one distinct player id, mapped to the set of
# ids encountered. Previously such collisions were dropped silently; inspect
# this dict to spot names that cannot be resolved unambiguously.
short_name_conflicts = {}
for i in match_ids:
    match_path = f'data/FA/match/{i}.json'
    match_data = load_json_file(match_path)
    for p in match_data['players']:
        kept_id = short_name_map.setdefault(p['short_name'], p['id'])
        if kept_id != p['id']:
            short_name_conflicts.setdefault(p['short_name'], {kept_id}).add(p['id'])

In [8]:
# Number of distinct short names in the lookup.
len(short_name_map)

323

In [9]:
# Recipient names from the events data that have no entry in the lookup.
missing = [name for name in recipient_names if name not in short_name_map]
missing

['S. Fearne', 'A. James', 'J. Olme', 'Geyse Ferreira']

In [10]:
# Manually add ids for recipient names that are absent from the match files.
# NOTE(review): these ids are hard-coded here; their source is not shown in
# this notebook, so confirm them against the provider's player list.
short_name_map.update({
    'Geyse Ferreira': 22555,
    'A. James': 64167,
    'J. Olme': 22800,
    'S. Fearne': 809180,
})

In [11]:
# Persist the name -> id lookup as indented JSON so it can be reused
# without rebuilding it from the match files.
with open('data/short_name_map.json', "w") as map_file:
    json.dump(short_name_map, map_file, indent=4)

#### Extract passes whose recipient was in the wide area, final third

In [9]:
# Keep accurate passes that have a named recipient.
is_pass = events_df['type_primary'] == 'pass'
# `== True` (rather than using the column as a mask) so missing values count as False.
is_accurate = events_df['pass_accurate'] == True
has_recipient = events_df['pass_recipient_name'].notna()
events_df_pass = events_df[is_pass & is_accurate & has_recipient]

In [13]:
# Number of matches that have both tracking and events data.
len(match_ids)

102

In [15]:
%%time
# For every match: locate, for each accurate pass, the frame at which the
# recipient receives the ball (via utils.get_receive_frame), keep only the
# passes flagged as received in the target area, and write them to one CSV
# per match.

# --- Settings (constant across matches, so defined once) --------------------
box_width = 40.3             # not referenced in this cell; presumably penalty-box width in metres — TODO confirm
distance_threshold = 2       # forwarded to get_receive_frame; units not shown here — TODO confirm
max_receive_window = 100     # number of frames after the pass in which to look for the reception
output_dir = 'data/pass_data_2'

# The output folder may not exist on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Fail fast: an unmapped recipient would otherwise raise a bare KeyError
# part-way through a run that takes many hours.
passes_in_scope = events_df_pass[events_df_pass['sk_match_id'].isin(match_ids)]
unmapped_recipients = sorted(
    set(passes_in_scope['pass_recipient_name'].dropna()) - set(short_name_map),
    key=str,
)
if unmapped_recipients:
    raise KeyError(
        f'No player id in short_name_map for pass recipients: {unmapped_recipients}'
    )

for match_id in match_ids:
    print(match_id)
    match_path = f'data/FA/match/{match_id}.json'
    match_data = load_json_file(match_path)
    tracking_path = f'data/FA/tracking/{match_id}.jsonl'

    # Player ids per side, used to decide which team the recipient plays for.
    home_id = match_data['home_team']['id']
    away_id = match_data['away_team']['id']
    home_p_ids = {p['id'] for p in match_data['players'] if p['team_id'] == home_id}
    away_p_ids = {p['id'] for p in match_data['players'] if p['team_id'] == away_id}
    # First entry of `home_team_side`; presumably the home team's side in the
    # first period — TODO confirm against the match-file schema.
    home_start = match_data['home_team_side'][0]
    pitch_length = match_data['pitch_length']

    # reset_index so the rows line up positionally with `results` below.
    match_event_pass = events_df_pass[events_df_pass['sk_match_id'] == match_id].reset_index(drop=True)
    passing_frames = match_event_pass['frame'].to_numpy()
    recipients = match_event_pass['pass_recipient_name'].to_numpy()
    n_passes = len(passing_frames)

    results = []

    for i in range(n_passes):
        start_frame = passing_frames[i]
        # Search window: until the next pass, capped at `max_receive_window`
        # frames after this pass.
        window_end = max_receive_window + start_frame
        if i < n_passes - 1:
            end_frame = min(passing_frames[i + 1], window_end)
        else:
            end_frame = window_end

        target_player_id = short_name_map[recipients[i]]
        if target_player_id in home_p_ids:
            team = 'home'
        elif target_player_id in away_p_ids:
            team = 'away'
        else:
            # Recipient is not listed for either team in this match file;
            # None is passed through to get_receive_frame, as before.
            team = None

        # NOTE(review): the tracking file *path* is passed on every call, so the
        # helper presumably re-reads the file once per pass; that is the likely
        # cause of the ~15 h runtime. Consider loading the tracking data once
        # per match — confirm in utils.get_receive_frame.
        result = get_receive_frame(
            tracking_path, team, home_start, pitch_length, start_frame, end_frame, target_player_id, distance_threshold
        )
        results.append(result)

    # One result per pass, in the same order as `match_event_pass`; each result
    # is expected to unpack into the five columns named below.
    results_df = pd.DataFrame(
        results, columns=['period', 'receive_frame', 'receive_x', 'receive_y', 'target_area']
    )
    match_event_pass = pd.concat([match_event_pass, results_df], axis=1)
    # Keep only passes whose reception was flagged as inside the target area.
    match_event_pass = match_event_pass[match_event_pass['target_area'] == True]

    match_event_pass.to_csv(f'{output_dir}/{match_id}_pass.csv', index=False)

1133199
1133200
1133201
1133202
1133203
1137933
1145391
1150609
1150610
1152233
1152234
1152235
1165960
1168361
1169168
1169169
1169170
1171524
1185376
1185377
1188242
1188243
1188244
1189122
1224542
1227049
1227892
1227893
1227894
1227895
1245008
1245009
1245010
1245011
1245012
1245013
1258164
1260792
1260793
1260794
1260795
1260796
1275862
1275864
1275865
1275866
1277980
1301698
1303616
1303617
1303618
1303619
1304210
1304211
1313027
1315113
1315114
1315583
1315585
1338003
1363387
1364347
1365752
1365753
1366200
1369292
1374566
1376028
1376029
1376030
1376031
1376032
1384072
1385140
1385655
1385656
1385658
1386129
1401804
1402958
1404691
1404692
1410651
1423691
1424336
1424937
1424938
1424939
1424941
1442160
1444085
1445378
1445379
1445380
1445947
1453349
1454436
1454967
1454968
1454969
1454970
1618208
CPU times: total: 15h 27min 35s
Wall time: 15h 29min 17s
