In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from lib.glob_fix import glob
import json

In [2]:
# A function that loads wyscout data and converts it to a pandas dataframe
def load_wyscout_json(path):
    with open(path, encoding='latin-1') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    return df

# A function that converts the positions column to start and end coordinates. Not all functions contain end coordinates, so we need to handle that.
def fix_positions(df):
    df['x'] = df['positions'].apply(lambda x: x[0]['x']).clip(0, 100)
    df['y'] = df['positions'].apply(lambda x: 100 - x[0]['y']).clip(0, 100)
    df['end_x'] = df['positions'].apply(lambda x: x[1]['x'] if len(x) > 1 else np.NaN)
    df['end_y'] = df['positions'].apply(lambda x: 100 - x[1]['y'] if len(x) > 1 else np.NaN)
    df = df.drop(columns=['positions'])
    return df

# A function that converts the tags column to a list of tags
def fix_tags(df):
    tags_map = {
        101:    'goal',                 #   Goal
        102:    'own_goal',             #	Own goal
        301:    'assist',               #	Assist
        302:    'key_pass',             #	Key pass
        1901:   'counter_attack',       #	Counter attack
        401:    'left',                 #	Left foot
        402:    'right',                #	Right foot
        403:    'head',                 #	Head/body
        1101:   'direct',               #	Direct
        1102:   'indirect',             #	Indirect
        2001:   'dangerous_ball_lost',  #	Dangerous ball lost
        2101:   'blocked',              #	Blocked
        801:    'high',                 #	High
        802:    'low',                  #	Low
        1401:   'interception',         #	Interception
        1501:   'clearance',            #	Clearance
        201:    'opportunity',          #	Opportunity
        1301:   'feint',                #	Feint
        1302:   'missed ball',          #   Missed ball
        501:    'free_space_r',         #	Free space right
        502:    'free_space_l',         #	Free space left
        503:    'take_on_l',            #	Take on left
        504:    'take_on_r',            #	Take on right
        1601:   'sliding_tackle',       #	Sliding tackle
        601:    'anticipated',          #	Anticipated
        602:    'anticipation',         #	Anticipation
        1701:   'red',                  #	Red card
        1702:   'yellow',               #	Yellow card
        1703:   'second_yellow',        #	Second yellow card
        901:    'through',              #	Through
        1001:   'fairplay',             #	Fairplay
        701:    'lost',                 #	Lost
        702:    'neutral',              #	Neutral
        703:    'won',                  #	Won
        1801:   'accurate',             #	Accurate
        1802:   'not_accurate',         #   Not accurate
    }

    tags_goal_mouth_map = {
        1201:   'gb',                   #	Position: Goal low center
        1202:   'gbr',                  #	Position: Goal low right
        1203:   'gc',                   #	Position: Goal center
        1204:   'gl',                   #	Position: Goal center left
        1205:   'glb',                  #	Position: Goal low left
        1206:   'gr',                   #	Position: Goal center right
        1207:   'gt',                   #	Position: Goal high center
        1208:   'gtl',                  #	Position: Goal high left
        1209:   'gtr',                  #	Position: Goal high right
        1210:   'obr',                  #	Position: Out low right
        1211:   'ol',                   #	Position: Out center left
        1212:   'olb',                  #	Position: Out low left
        1213:   'or',                   #	Position: Out center right
        1214:   'ot',                   #	Position: Out high center
        1215:   'otl',                  #	Position: Out high left
        1216:   'otr',                  #	Position: Out high right
        1217:   'pbr',                  #	Position: Post low right
        1218:   'pl',                   #	Position: Post center left
        1219:   'plb',                  #	Position: Post low left
        1220:   'pr',                   #	Position: Post center right
        1221:   'pt',                   #	Position: Post high center
        1222:   'ptl',                  #	Position: Post high left
        1223:   'ptr',                  #	Position: Post high right
    }

    df['tags'] = df['tags'].apply(lambda x: [t['id'] for t in x])
    # create a column for each tag
    for tag_id in tags_map.keys():
        df[tags_map[tag_id]] = df['tags'].apply(lambda x: 1 if tag_id in x else np.NaN)

    # create a column, that contains the goal mouth placement name if the goal mouth placement tag is present
    df['goal_mouth_placement'] = ''
    for tag_id in tags_goal_mouth_map.keys():
        df['goal_mouth_placement'] += df['tags'].apply(lambda x: tags_goal_mouth_map[tag_id] * (tag_id in x))
    
    df = df.drop(columns=['tags'])

    return df

# Convert second variable to minutes and seconds
def fix_time_variables(df):
    df['absolute_sec'] = df['eventSec'].apply(lambda x: x).round(1)
    df['minute'] = df['eventSec'].apply(lambda x: int(x/60))
    df['second'] = df['eventSec'].apply(lambda x: int(x%60))
    df['period'] = df['matchPeriod'].map({'1H': 1, '2H': 2, 'E1': 3, 'E2': 4})
    df = df.drop(columns=['eventSec', 'matchPeriod'])
    return df 

# Fix type names
def fix_type_names(df):
    df['type_name'] = df['eventName'].str.replace(' ', '_').str.lower()
    df['subtype_name'] = df['subEventName'].str.replace(' ', '_').str.lower()
    df = df.drop(columns=['eventName', 'eventId', 'subEventName', 'subEventId'])
    return df

def fix_club_names(df):
    df_teams = load_wyscout_json('data/wyscout/json/teams.json')
    df['wyId'] = df['teamId']
    df = df.merge(df_teams[['wyId', 'name']], on='wyId', how='left').rename(columns={'wyId': 'team_id', 'name': 'team_name'})
    df = df.drop(columns=['teamId'])
    return df

def fix_player_names(df):
    df_players = load_wyscout_json('data/wyscout/json/players.json')
    df['wyId'] = df['playerId']
    df = df.merge(df_players[['wyId', 'shortName']], on='wyId', how='left').rename(columns={'wyId': 'player_id', 'shortName': 'player_name'})
    df = df.drop(columns=['playerId'])
    return df

def fix_match_names(df, competition):
    df_matches = load_wyscout_json(f'data/wyscout/json/matches/{competition}.json')
    home_team_id = []
    away_team_id = []
    for _, row in df_matches.iterrows():
        teams = list(row['teamsData'].keys())
        if row['teamsData'][teams[0]]['side'] == 'home':
            home_team_id.append(teams[0])
            away_team_id.append(teams[1])
        else:
            home_team_id.append(teams[1])
            away_team_id.append(teams[0])
    df_matches['label'] = df_matches['label'].str.split(',').str[0]
    df_matches['home_team_id'] = home_team_id
    df_matches['away_team_id'] = away_team_id
    df['wyId'] = df['matchId']
    df = df.merge(df_matches[['wyId', 'label', 'winner', 'home_team_id', 'away_team_id']], on='wyId', how='left').rename(columns={'wyId': 'match_id', 'label': 'match_name', 'winner': 'match_winner'})
    df = df.drop(columns=['matchId'])
    return df

def load_wyscout_events_json(competition):
    df = load_wyscout_json(f'data/wyscout/json/events/{competition}.json')
    df = fix_match_names(df, competition)
    df = fix_time_variables(df)
    df = fix_player_names(df)
    df = fix_club_names(df)
    df = fix_positions(df)
    df = fix_type_names(df)
    df = fix_tags(df)
    return df

def convert_wyscout_events(competition):
    df = load_wyscout_events_json(competition)
    df.to_csv(f'data/wyscout/csv/events/{competition}.csv', index=False)

In [3]:
df = load_wyscout_events_json('European_Championship')
df.head(12)

Unnamed: 0,id,match_id,match_name,match_winner,home_team_id,away_team_id,absolute_sec,minute,second,period,...,yellow,second_yellow,through,fairplay,lost,neutral,won,accurate,not_accurate,goal_mouth_placement
0,88178642,1694390,France - Romania,4418,4418,11944,1.3,0,1,1.0,...,,,,,,,,1.0,,
1,88178643,1694390,France - Romania,4418,4418,11944,2.4,0,2,1.0,...,,,,,,,,1.0,,
2,88178644,1694390,France - Romania,4418,4418,11944,3.2,0,3,1.0,...,,,,,,,,1.0,,
3,88178645,1694390,France - Romania,4418,4418,11944,6.0,0,6,1.0,...,,,,,,,,,1.0,
4,88178646,1694390,France - Romania,4418,4418,11944,13.1,0,13,1.0,...,,,,,,1.0,,1.0,,
5,88178663,1694390,France - Romania,4418,4418,11944,14.1,0,14,1.0,...,,,,,,1.0,,1.0,,
6,88178648,1694390,France - Romania,4418,4418,11944,27.1,0,27,1.0,...,,,,,,,,,1.0,
7,88178667,1694390,France - Romania,4418,4418,11944,29.0,0,28,1.0,...,,,,,,,,,1.0,
8,88178649,1694390,France - Romania,4418,4418,11944,31.2,0,31,1.0,...,,,,,,,,1.0,,gc
9,88178674,1694390,France - Romania,4418,4418,11944,32.7,0,32,1.0,...,,,,,,,,1.0,,gc


In [4]:
for i, competition in enumerate(glob('data/wyscout/json/events/*.json')):
    competition = competition.split('/')[-1].split('.')[0]
    print(f'{i}/{len(glob("data/wyscout/json/events/*.json"))} - Converting {competition}.............................................', end='\r')
    convert_wyscout_events(competition)

6/7 - Converting World_Cup.........................................................