In [26]:
import os;
import warnings
import pandas as pd
import tqdm
import json
from io import BytesIO
import pickle

from tqdm.notebook import tqdm
from xgboost import XGBClassifier

import socceraction.spadl as spadl
import socceraction.spadl.statsbomb as statsbomb

import pathlib

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [27]:
STATSBOMB_BASE_PATH = pathlib.Path("../../open-data/data")

STATSBOMB_COMPETITIONS_PATH = f"{STATSBOMB_BASE_PATH}/competitions.json"
STATSBOMB_MATCHES_PATH = f"{STATSBOMB_BASE_PATH}/matches"
STATSBOMB_EVENTS_PATH = f"{STATSBOMB_BASE_PATH}/events"

In [28]:
SBL = statsbomb.StatsBombLoader(root = STATSBOMB_BASE_PATH,
                                getter = "local")

In [29]:
set(SBL.competitions().competition_name)

{'Champions League',
 "FA Women's Super League",
 'FIFA World Cup',
 'La Liga',
 'NWSL',
 'Premier League',
 "Women's World Cup"}

In [30]:
competitions = SBL.competitions()
target_competitions = competitions[competitions.competition_name == "FIFA World Cup"]
target_competitions

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,season_name,match_updated,match_available
17,43,3,International,FIFA World Cup,male,2018,2020-10-25T14:03:50.263266,2020-10-25T14:03:50.263266


In [31]:
games = list(SBL.matches(row.competition_id, row.season_id)
             for row in target_competitions.itertuples())

# games is a list so...
games = pd.concat(games, sort = True).reset_index(drop = True)
games[["home_team_id", "away_team_id", "match_date", "home_score", "away_score"]].head()

Unnamed: 0,home_team_id,away_team_id,match_date,home_score,away_score
0,785,776,2018-07-01,1,1
1,775,793,2018-06-22,2,0
2,789,769,2018-06-24,0,3
3,785,775,2018-06-16,2,0
4,781,795,2018-06-22,2,0


In [37]:
matches_verbose = tqdm(list(games.iterrows()))

actions = dict()
teams, players = list(), list()

for __, match in matches_verbose:
    teams.append(SBL.teams(match.match_id))
    players.append(SBL.players(match.match_id))
    
    events = SBL.events(match.match_id)
    actions[match.id] = statsbomb.convert_to_actions(events = events,
                                                     home_team_id = match.home_team_id)
    
teams = pd.concat(teams).drop_duplicates("team_id").reset_index(drop = True)
players = pd.concat(players).reset_index(drop = True)

HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))

    player_id                     player_name     player_nickname  \
0        3027   Mathias Jattah-Njie Jørgensen               Zanka   
1        3043     Christian Dannemann Eriksen   Christian Eriksen   
2        3815               Kasper Schmeichel                None   
3        3959             Andreas Christensen                None   
4        4447  Martin Braithwaite Christensen  Martin Braithwaite   
5        4763                Henrik Dalsgaard                None   
6        5516      Pione Sisto Ifolo Emirmija         Pione Sisto   
7        5520                    Lasse Schöne                None   
8        5522               Nicolai Jørgensen                None   
9        5527                  Thomas Delaney                None   
10       5534               Simon Thorup Kjær          Simon Kjær   
11       5536           Yussuf Yurary Poulsen      Yussuf Poulsen   
12       5732         Andreas Evald Cornelius   Andreas Cornelius   
13       6355                   Jo

In [34]:
teams[:5]

Unnamed: 0,team_id,team_name
0,776,Denmark
1,785,Croatia
2,775,Nigeria
3,793,Iceland
4,789,Poland


In [35]:
players[:5]

Unnamed: 0,player_id,player_name,player_nickname,jersey_number,country_id,country_name,extra
0,3027,Mathias Jattah-Njie Jørgensen,Zanka,13,61,Denmark,{}
1,3043,Christian Dannemann Eriksen,Christian Eriksen,10,61,Denmark,{}
2,3815,Kasper Schmeichel,,1,61,Denmark,{}
3,3959,Andreas Christensen,,6,61,Denmark,{}
4,4447,Martin Braithwaite Christensen,Martin Braithwaite,11,61,Denmark,{}


In [36]:
spadl_path = os.path.join(STATSBOMB_BASE_PATH, "spadl-world-cup.h5")

players_main_cols = ["player_id", "player_name", "player_nickname"]
players_game_cols = ["player_id", "match_id", "team_id", "is_starter",
                     "starting_position_id", "starting_position_name", "minutes_played"]

with pd.HDFStore(spadl_path) as file:
    file["competitions"] = target_competitions
    file["games"] = games
    file["teams"] = teams
    file["players"] = players[players_main_cols].drop_duplicates(subset = "player_id")
    file["player_games"] = players[players_game_cols]
    
    # action's keys are the game ids
    for game_id in actions.keys():
        file[f"actions/game_{game_id}"] = actions[game_id]
        
    file["actiontypes"] = spadl.actiontypes_df()
    file["results"] = spadl.results_df()
    file["bodyparts"] = spadl.bodyparts_df()

KeyError: "['match_id', 'team_id', 'is_starter', 'starting_position_id', 'starting_position_name', 'minutes_played'] not in index"