In [61]:
import os;
import warnings
import pandas as pd
import tqdm
import json
from io import BytesIO

import socceraction.spadl as spadl
import socceraction.spadl.statsbomb as statsbomb
from socceraction.spadl.wyscout import convert_to_spadl

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [63]:
WYSCOUT_BASE_PATH = "../../wyscout"
TEAMS_PATH = f"{WYSCOUT_BASE_PATH}/teams.json"
PLAYERS_PATH = f"{WYSCOUT_BASE_PATH}/players.json"
MATCHES_PATH = f"{WYSCOUT_BASE_PATH}/matches_"
EVENTS_PATH = f"{WYSCOUT_BASE_PATH}/events_"

In [64]:
#os.listdir(WYSCOUT_BASE_PATH)

In [65]:
def read_json_file(filename):
    with open(filename, "rb") as json_file:
        return BytesIO(json_file.read()).getvalue().decode("unicode_escape")

## Teams to HDF

In [66]:
df_teams = pd.read_json(read_json_file(TEAMS_PATH))
df_teams["nation"] = df_teams["area"].apply(lambda x: x["name"])
#df_teams = df_teams[["wyId", "name", "nation"]]

In [67]:
df_teams.head(3)

Unnamed: 0,city,name,wyId,officialName,area,type,nation
0,Newcastle upon Tyne,Newcastle United,1613,Newcastle United FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club,England
1,Vigo,Celta de Vigo,692,Real Club Celta de Vigo,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club,Spain
2,Barcelona,Espanyol,691,Reial Club Deportiu Espanyol,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club,Spain


In [68]:
df_teams.to_hdf(path_or_buf = f"{WYSCOUT_BASE_PATH}/wyscout.h5",
                key = "teams", mode = "w")

## Players to HDF

In [69]:
df_players = pd.read_json(read_json_file(PLAYERS_PATH))
df_players["role"] = df_players["role"].apply(lambda x: x["name"])
#df_players = df_players[["wyId", "shortName", "role", "foot", "currentTeamId", "currentNationalTeamId"]]

In [70]:
df_players.head(2)

Unnamed: 0,passportArea,weight,firstName,middleName,lastName,currentTeamId,birthDate,height,role,birthArea,wyId,foot,shortName,currentNationalTeamId
0,"{'name': 'Turkey', 'id': '792', 'alpha3code': ...",78,Harun,,Tekin,4502,1989-06-17,187,Goalkeeper,"{'name': 'Turkey', 'id': '792', 'alpha3code': ...",32777,right,H. Tekin,4687
1,"{'name': 'Senegal', 'id': '686', 'alpha3code':...",73,Malang,,Sarr,3775,1999-01-23,182,Defender,"{'name': 'France', 'id': '250', 'alpha3code': ...",393228,left,M. Sarr,4423


In [71]:
df_players.to_hdf(path_or_buf = f"{WYSCOUT_BASE_PATH}/wyscout.h5",
                  key = "players", mode = "a")

## Matches to HDF

In [72]:
# all_competitions = ["England", "France", "Germany", "Italy", "Spain",
#                     "European Championship", "World Cup"]

# The goal is to train on England and test on Italy
competitions = ["Italy", "England"]

In [73]:
dfs_matches = list()

for comp in competitions:
    comp_name = comp.replace(" ", "_")
    filename = f"{MATCHES_PATH}{comp_name}.json"
    df_matches = pd.read_json(read_json_file(filename))
    dfs_matches.append(df_matches)

df_matches = pd.concat(dfs_matches)
#df_matches = df_matches[["wyId", "competitionId", "seasonId",
#                         "label", "winner", "teamsData"]]

In [74]:
df_matches.head(1)

Unnamed: 0,status,roundId,gameweek,teamsData,seasonId,dateutc,winner,venue,wyId,label,date,referees,duration,competitionId
0,Played,4406278,38,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,,2576335,"Lazio - Internazionale, 2 - 3","May 20, 2018 at 8:45:00 PM GMT+2","[{'refereeId': 377206, 'role': 'referee'}, {'r...",Regular,524


In [75]:
df_matches.to_hdf(path_or_buf = f"{WYSCOUT_BASE_PATH}/wyscout.h5",
                  key = "matches", mode = "a")

## Events to HDF

In [76]:
for comp in competitions:
    comp_name = comp.replace(" ", "_")
    filename = f"{EVENTS_PATH}{comp_name}.json"
    df_events = pd.read_json(read_json_file(filename))
    break

In [77]:
df_events.head(3)

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],8327,"[{'y': 52, 'x': 49}, {'y': 44, 'x': 43}]",2575959,Pass,3158,1H,2.530536,85,180423957
1,8,Simple pass,[{'id': 1801}],20438,"[{'y': 44, 'x': 43}, {'y': 17, 'x': 36}]",2575959,Pass,3158,1H,3.768418,85,180423958
2,7,Touch,[],8306,"[{'y': 17, 'x': 36}, {'y': 56, 'x': 78}]",2575959,Others on the ball,3158,1H,4.868265,72,180423959


In [78]:
df_events_matches = df_events.groupby("matchId", as_index = False)

In [79]:
df_events_matches.head(1).head(3)

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],8327,"[{'y': 52, 'x': 49}, {'y': 44, 'x': 43}]",2575959,Pass,3158,1H,2.530536,85,180423957
1613,8,Simple pass,[{'id': 1801}],259292,"[{'y': 51, 'x': 50}, {'y': 38, 'x': 30}]",2575960,Pass,3166,1H,2.912448,85,180460833
3157,8,Simple pass,[{'id': 1801}],351010,"[{'y': 48, 'x': 50}, {'y': 44, 'x': 35}]",2575961,Pass,3157,1H,2.361132,85,180454688


In [80]:
for comp in competitions:
    comp_name = comp.replace(" ", "_")
    filename = f"{EVENTS_PATH}{comp_name}.json"
    df_events = pd.read_json(read_json_file(filename))
    df_events_matches = df_events.groupby("matchId", as_index = False)
    for match_id, df_events_match in df_events_matches:
        df_events_match.to_hdf(path_or_buf = "wyscout.h5",
                               key = f"events/match_{match_id}", mode = "a")

## Spadl

In [81]:
convert_to_spadl(f"{WYSCOUT_BASE_PATH}/wyscout.h5",
                 f"{WYSCOUT_BASE_PATH}/spadl.h5")

...Inserting actiontypes
...Inserting bodyparts
...Inserting results
...Converting games
...Converting players
...Converting teams

  0%|          | 0/760 [00:00<?, ?game/s]


...Generating player_games





KeyError: 'No object named events/match_2576335 in the file'