In [6]:
import os
import warnings
import tqdm
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import numpy as np

In [7]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))

from socceractions.data.statsbomb import StatsBombLoader
import socceractions.spadl as spadl
import socceractions.vaep.formula as vaepformula

In [8]:
# Configure file and folder names.
# The project root is taken to be two directory levels above the current working directory.
main_folder = os.path.dirname(os.path.dirname(os.getcwd()))
# os.path.join is used for every path so separators stay consistent and are not
# doubled when main_folder already ends with one (e.g. the filesystem root).
datafolder = os.path.join(main_folder, "data-fifa")
# HDF5 store holding the games/competitions/teams/players tables and per-game actions.
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
# HDF5 store holding one predictions table per game.
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [9]:
# Read the lookup tables from the SPADL store once, then enrich the games table
# with competition details and the home/away team columns.
with pd.HDFStore(spadl_h5) as spadlstore:
    competitions = spadlstore["competitions"]
    teams = spadlstore["teams"]
    players = spadlstore["players"]

    # Prefixing the team columns lets the same table be joined once per side.
    home_teams = teams.add_prefix('home_')
    away_teams = teams.add_prefix('away_')

    # No explicit key is given: each left join uses the columns the two frames share.
    games = spadlstore["games"].merge(competitions, how='left')
    games = games.merge(home_teams, how='left')
    games = games.merge(away_teams, how='left')
print("nb of games:", len(games))

nb of games: 64


In [10]:
# Show the first five rows of the enriched games table.
games.head(n=5)

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee,competition_name,country_name,competition_gender,season_name,home_team_name,away_team_name
0,7585,3,43,Round of 16,4,2018-07-03 20:00:00,769,768,1,1,Otkritie Bank Arena,Mark Geiger,FIFA World Cup,International,male,2018,Colombia,England
1,7570,3,43,Group Stage,3,2018-06-28 20:00:00,768,782,0,1,Stadion Kaliningrad,Damir Skomina,FIFA World Cup,International,male,2018,England,Belgium
2,7586,3,43,Round of 16,4,2018-07-03 16:00:00,790,773,1,0,Saint-Petersburg Stadium,Damir Skomina,FIFA World Cup,International,male,2018,Sweden,Switzerland
3,7557,3,43,Group Stage,3,2018-06-25 20:00:00,797,780,1,1,Mordovia Arena,Enrique Cáceres,FIFA World Cup,International,male,2018,Iran,Portugal
4,7542,3,43,Group Stage,2,2018-06-20 14:00:00,780,788,1,0,Stadion Luzhniki,Mark Geiger,FIFA World Cup,International,male,2018,Portugal,Morocco


### 1. Data Loading

[1] time seconds : event 시작 시간 <br>
[2] period가 바뀐다고 Home, Away가 flip되진 않음 <br>
[3] 100 x 100 규격으로 맞춰두었음 <br>
[4] Away는 (from left to right) 포맷으로 바꿔야 함 <br>

In [11]:
# Per-game lookup of the home and away team ids, copied so it is independent of `games`.
home_away_columns = ['game_id', 'home_team_id', 'away_team_id']
home_away_id = games[home_away_columns].copy()

In [12]:
# Build one frame holding, for every game, the named actions, the model
# predictions and the VAEP values derived from them.
rated_games = []
for game in tqdm.tqdm(games.itertuples(), total=len(games), desc="Rating actions"):
    actions_key = f"actions/game_{game.game_id}"
    preds_key = f"game_{game.game_id}"

    # Actions: add readable names, attach player and team details, order by action id.
    actions = spadl.add_names(pd.read_hdf(spadl_h5, actions_key))
    actions = actions.merge(players, how="left").merge(teams, how="left")
    actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(drop=True)

    # Predictions expose `scores` and `concedes`, which feed the VAEP formula.
    preds = pd.read_hdf(predictions_h5, preds_key)
    values = vaepformula.value(actions, preds.scores, preds.concedes)

    # Column-wise concat aligns on the index.
    # NOTE(review): presumably preds/values share the 0..n-1 index of `actions` - confirm.
    rated_games.append(pd.concat([actions, preds, values], axis=1))

A = pd.concat(rated_games)
A = A.sort_values(["game_id", "period_id", "time_seconds"]).reset_index(drop=True)
# Attach the home/away team ids of each game to every action row.
A = A.merge(home_away_id, how='left', on='game_id')
A.columns

Rating actions: 100%|██████████| 64/64 [00:21<00:00,  2.94it/s]


Index(['game_id', 'original_event_id', 'period_id', 'time_seconds', 'team_id',
       'player_id', 'duration', 'start_x', 'start_y', 'end_x', 'end_y',
       'type_id', 'result_id', 'bodypart_id', 'action_id', 'type_name',
       'result_name', 'bodypart_name', 'player_name', 'nickname', 'team_name',
       'scores', 'concedes', 'offensive_value', 'defensive_value',
       'vaep_value', 'home_team_id', 'away_team_id'],
      dtype='object')

In [13]:
# Soccer actions: number of rows per action type.
action_types = A['type_name']
action_types.value_counts()

type_name
pass                56438
dribble             52946
throw_in             2178
take_on              2109
clearance            2074
foul                 1876
tackle               1830
interception         1681
shot                 1556
bad_touch            1547
cross                1305
goalkick              677
freekick_crossed      636
freekick_short        554
corner_crossed        446
keeper_save           326
keeper_claim          174
corner_short          112
keeper_punch           84
shot_freekick          82
shot_penalty           68
Name: count, dtype: int64

In [14]:
# Number of events per game and the minute of the last action in each half.
# Values are computed outside the f-strings so that no f-string nests the quote
# character it is delimited with (a SyntaxError before Python 3.12), and each
# game's rows are selected once instead of three times.
for game_id in A['game_id'].unique():
    game_actions = A[A['game_id'] == game_id]
    print(f'{game_id}번 경기의 이벤트 수 : {len(game_actions)}')

    # time_seconds of the final row of each period, floor-divided into minutes.
    first_half_minutes = game_actions[game_actions['period_id'] == 1].iloc[-1]['time_seconds'] // 60
    print(f'전반전 시간 : {first_half_minutes}')

    second_half_minutes = game_actions[game_actions['period_id'] == 2].iloc[-1]['time_seconds'] // 60
    print(f'후반전 시간 : {second_half_minutes}')

    # Preview only: stop once the first game id above 7530 has been printed.
    if game_id > 7530:
        break

7525번 경기의 이벤트 수 : 1893
전반전 시간 : 47.0
후반전 시간 : 49.0
7529번 경기의 이벤트 수 : 1871
전반전 시간 : 46.0
후반전 시간 : 49.0
7530번 경기의 이벤트 수 : 2035
전반전 시간 : 46.0
후반전 시간 : 49.0
7531번 경기의 이벤트 수 : 2129
전반전 시간 : 46.0
후반전 시간 : 49.0


In [15]:
# First rows of game 7525, restricted to the core event columns.
preview_columns = [
    'game_id', 'time_seconds', 'team_id', 'player_id',
    'start_x', 'start_y', 'end_x', 'end_y',
    'type_id', 'result_id', 'type_name', 'result_name',
]
A.loc[A['game_id'] == 7525, preview_columns].head()

Unnamed: 0,game_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,type_name,result_name
0,7525,0.612,799,5196.0,49.583333,49.375,71.25,48.125,0,1,pass,success
1,7525,1.732,799,5173.0,71.25,48.125,70.416667,49.375,21,1,dribble,success
2,7525,2.933,799,5173.0,70.416667,49.375,25.416667,19.375,0,0,pass,fail
3,7525,5.893,796,5175.0,31.25,15.625,37.083333,6.875,0,0,pass,fail
4,7525,7.772,799,5178.0,37.083333,6.875,24.583333,6.875,0,1,pass,success


In [110]:
# Show the first five rows of the full rated-actions frame.
A.head(n=5)

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,duration,start_x,start_y,end_x,...,player_name,nickname,team_name,scores,concedes,offensive_value,defensive_value,vaep_value,home_team_id,away_team_id
0,7525,e23943da-dc40-49f3-a7a0-e269288098c3,1,0.612,799,5196.0,1.12,49.583333,49.375,71.25,...,Mohammad Ibrahim Al Sahlawi,Mohammad Al Sahlawi,Saudi Arabia,0.000738,0.000799,0.0,-0.0,0.0,796,799
1,7525,fd89c08f-16ad-4102-a3ab-4dd8d58ae687,1,1.732,799,5173.0,1.201,71.25,48.125,70.416667,...,Abdullah Ibrahim Otayf,Abdullah Otayf,Saudi Arabia,0.002685,0.001047,0.001946,-0.000248,0.001699,796,799
2,7525,c4ba0e20-8090-4285-945e-e2f9d136ae21,1,2.933,799,5173.0,2.96,70.416667,49.375,25.416667,...,Abdullah Ibrahim Otayf,Abdullah Otayf,Saudi Arabia,0.006562,0.00124,0.003877,-0.000193,0.003685,796,799
3,7525,576d4649-3b45-4de2-b9f7-692c3d53a12d,1,5.893,796,5175.0,1.879,31.25,15.625,37.083333,...,Ilya Kutepov,,Russia,0.001677,0.004637,0.000438,0.001926,0.002363,796,799
4,7525,a7a37529-06c3-462f-bab5-5135acfa524a,1,7.772,799,5178.0,1.2,37.083333,6.875,24.583333,...,Salman Mohammed Al Faraj,Salman Al Faraj,Saudi Arabia,0.022594,0.001721,0.017957,-4.3e-05,0.017914,796,799


### 2. Checking the format of GIM

**GTR**: game time remaining [0, 100] <br>
**X**: x-coordinate of the ball at the start of the action [0, 100] <br>
**Y**: y-coordinate of the ball at the start of the action [0, 100] <br>
**MP**: manpower          [-5, 5] <br>
**GD**: goal difference   [-inf, inf] <br>
**Action**: type of action [one-hot representation] <br>
**OC**: outcome           [S or F] <br>
**Velocity**: velocity of the ball [-inf, inf] <br>
**ED**: event duration    [0, inf] <br>
**Angle**: angle between the ball and the goal [-pi, pi] <br>
**T**: team that performs the action [Home or Away] <br>
**Reward**: [1,0,0] / [0,1,0] indicate a scoring event by the home team and the away team, respectively

### 3. Convert former format into GIM

In [81]:
from preprocess_data import play_left_to_right, goal_sequence
from labels import game_remain, manpower_gd, onehot_action, get_team, get_angle_velocity, get_reward

In [82]:
# Work on a copy so the rated-actions frame `A` stays untouched.
dataset = A.copy()

# Play flow: left -> right, applied one game at a time.
game = []

for g_id in tqdm.tqdm(list(games['game_id'].unique()), desc="Rating actions"):
    gamestates = dataset[dataset['game_id'] == g_id].copy()
    # Home/away ids are constant within a game, so the first row is enough.
    home_team_id, away_team_id = gamestates[['home_team_id', 'away_team_id']].iloc[0].values
    gamestates = play_left_to_right(gamestates, home_team_id, away_team_id)
    game.append(gamestates)

# A single-column sort defaults to quicksort, which is not stable and may shuffle
# rows that share a game_id. A stable sort groups the games by id while keeping
# each game's rows in the order play_left_to_right returned them.
dataset = pd.concat(game).sort_values("game_id", kind="stable").reset_index(drop=True)

# Add goal rows.
# NOTE(review): goal_sequence presumably relies on within-game row order - confirm.
dataset = goal_sequence(dataset)

Rating actions: 100%|██████████| 64/64 [02:15<00:00,  2.11s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goal[['start_x', 'end_x']] = 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goal[['start_y', 'end_y']] = 50
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goal['type_name'] = 'goal'
A value is trying to be set on a copy of a slice 

In [83]:
# 1. GTR, ED - computed one game at a time via game_remain.
game = []
game_ids = list(games['game_id'].unique())

for g_id in tqdm.tqdm(game_ids, desc="Rating actions"):
    in_game = dataset['game_id'] == g_id
    gamestates = dataset[in_game].copy()
    gamestates = game_remain(gamestates, duration_drop=False)
    game.append(gamestates)

# Reassemble all games and restore the (game, period, time) ordering.
dataset = pd.concat(game)
dataset = dataset.sort_values(['game_id','period_id','time_seconds']).reset_index(drop=True)

Rating actions: 100%|██████████| 64/64 [00:01<00:00, 55.65it/s]


In [84]:
# 2. Action, T, Outcome
# One-hot encode the action, derive the acting team, and expose result_name as OC.
dataset = get_team(onehot_action(dataset))
dataset = dataset.rename(columns={'result_name': 'OC'})

encoded_columns = ['Action', 'T', 'OC']
dataset[encoded_columns].head()

one-hot encoding: 128882it [00:13, 9511.55it/s] 
Team discrete: 128882it [00:12, 10121.10it/s]


Unnamed: 0,Action,T,OC
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Away,success
1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Away,success
2,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Away,fail
3,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Home,fail
4,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Away,success


In [108]:
# 3. X, Y of ball (start) | Velocity of ball
# The start coordinates are renamed to X / Y after the helper has run.
# NOTE(review): Angle and Velocity are presumably added by get_angle_velocity - confirm.
dataset = get_angle_velocity(dataset)
dataset = dataset.rename(columns={'start_x': 'X', 'start_y': 'Y'})

state_columns = ['time_seconds', 'X', 'Y', 'Angle', 'Velocity']
dataset[state_columns].head()

Calculating angle, velocity: 39it [00:01, 28.72it/s]

첫 행 계산 과정
ball_start: [[50.41666667 50.625     ]]
ball_end: [[28.75  51.875]]
goal_location: [[100  50]]
end: [[-21.66666667   1.25      ]]
ball_start: [[71.25  -1.875]]
ball_start: [3.11027394]
0    50.416667
1    28.750000
dtype: float64
0    50.625
1    51.875
dtype: float64
-19.34523809523809
1.1160714285714157


Calculating angle, velocity: 128882it [02:58, 722.98it/s]


Unnamed: 0,X,Y,Angle,Velocity
0,50.416667,50.625,3.110274,"[-19.34523809523809, 1.1160714285714157]"
1,28.75,51.875,0.973918,"[0.6938662225922803, -1.0407993338884323]"
2,29.583333,50.625,1.466071,"[15.2027027027027, 10.13513513513513]"
3,31.25,15.625,1.583674,"[3.104488202944826, -4.656732304417243]"
4,62.916667,93.125,1.052691,"[10.416666666666679, 0.0]"


In [109]:
# 4. Reward: [home, away, neither]
dataset = get_reward(dataset)

# Preview the reward vector of the first five rows.
reward = dataset['Reward']
reward.head()

0    [0, 0, 0]
1    [0, 0, 0]
2    [0, 0, 0]
3    [0, 0, 0]
4    [0, 0, 0]
Name: Reward, dtype: object

In [None]:
# 5. MP: manpower | GD: goal difference


### 4. (S, A, R)