In [1]:
import os
import warnings
import tqdm
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import numpy as np
from sklearn.preprocessing import scale

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))

from socceractions.data.statsbomb import StatsBombLoader
import socceractions.spadl as spadl
import socceractions.vaep.formula as vaepformula

In [3]:
# Configure file and folder names
main_folder = os.path.dirname(os.path.dirname(os.getcwd()))
datafolder = main_folder + "/data-fifa"
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [4]:
with pd.HDFStore(spadl_h5) as spadlstore:
    games = (
        spadlstore["games"]
        .merge(spadlstore["competitions"], how='left')
        .merge(spadlstore["teams"].add_prefix('home_'), how='left')
        .merge(spadlstore["teams"].add_prefix('away_'), how='left'))
    players = spadlstore["players"]
    teams = spadlstore["teams"]
print("nb of games:", len(games))

nb of games: 64


In [5]:
games.head()

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee,competition_name,country_name,competition_gender,season_name,home_team_name,away_team_name
0,7585,3,43,Round of 16,4,2018-07-03 20:00:00,769,768,1,1,Otkritie Bank Arena,Mark Geiger,FIFA World Cup,International,male,2018,Colombia,England
1,7570,3,43,Group Stage,3,2018-06-28 20:00:00,768,782,0,1,Stadion Kaliningrad,Damir Skomina,FIFA World Cup,International,male,2018,England,Belgium
2,7586,3,43,Round of 16,4,2018-07-03 16:00:00,790,773,1,0,Saint-Petersburg Stadium,Damir Skomina,FIFA World Cup,International,male,2018,Sweden,Switzerland
3,7557,3,43,Group Stage,3,2018-06-25 20:00:00,797,780,1,1,Mordovia Arena,Enrique Cáceres,FIFA World Cup,International,male,2018,Iran,Portugal
4,7542,3,43,Group Stage,2,2018-06-20 14:00:00,780,788,1,0,Stadion Luzhniki,Mark Geiger,FIFA World Cup,International,male,2018,Portugal,Morocco


In [6]:
games[games['game_id']==7525]

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee,competition_name,country_name,competition_gender,season_name,home_team_name,away_team_name
52,7525,3,43,Group Stage,1,2018-06-14 17:00:00,796,799,5,0,Stadion Luzhniki,Néstor Fabián Pitana,FIFA World Cup,International,male,2018,Russia,Saudi Arabia


### 1. Data Loading

[1] time seconds : event 시작 시간 <br>
[2] period가 바뀐다고 Home, Away가 flip되진 않음 <br>
[3] 100 x 100 규격으로 맞춰두었음 <br>
[4] Away는 (from left to right) 포맷으로 바꿔야 함 <br>

In [7]:
home_away_id = games[['game_id', 'home_team_id', 'away_team_id']].copy()

In [8]:
A = []
for game in tqdm.tqdm(list(games.itertuples()), desc="Rating actions"):
    actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
    actions = (
        spadl.add_names(actions)
        .merge(players, how="left")
        .merge(teams, how="left")
        .sort_values(["game_id", "period_id", "action_id"])
        .reset_index(drop=True)
    )
    preds = pd.read_hdf(predictions_h5, f"game_{game.game_id}")
    values = vaepformula.value(actions, preds.scores, preds.concedes)
    A.append(pd.concat([actions, preds, values], axis=1))
A = pd.concat(A).sort_values(["game_id", "period_id", "time_seconds"]).reset_index(drop=True)
A = pd.merge(left=A, right=home_away_id, how='left', on='game_id')
A.columns

Rating actions: 100%|██████████| 64/64 [00:08<00:00,  7.91it/s]


Index(['game_id', 'original_event_id', 'period_id', 'time_seconds', 'team_id',
       'player_id', 'duration', 'extra', 'start_x', 'start_y', 'end_x',
       'end_y', 'type_id', 'result_id', 'bodypart_id', 'action_id',
       'type_name', 'result_name', 'bodypart_name', 'player_name', 'nickname',
       'team_name', 'scores', 'concedes', 'offensive_value', 'defensive_value',
       'vaep_value', 'home_team_id', 'away_team_id'],
      dtype='object')

In [9]:
A['result_name'].value_counts()

result_name
success        109568
fail            18810
yellow_card       167
offside           140
owngoal            12
red_card            2
Name: count, dtype: int64

In [10]:
# 축구 액션
A['type_name'].value_counts()

type_name
pass                56438
dribble             52946
throw_in             2178
take_on              2109
clearance            2074
foul                 1876
tackle               1830
interception         1681
shot                 1556
bad_touch            1547
cross                1305
goalkick              677
freekick_crossed      636
freekick_short        554
corner_crossed        446
keeper_save           326
keeper_claim          174
corner_short          112
keeper_punch           84
shot_freekick          82
shot_penalty           68
Name: count, dtype: int64

In [11]:
A[(A['type_name'].str.contains('shot'))&(A['result_name']=='success')][['type_name', 'result_name']]

Unnamed: 0,type_name,result_name
270,shot,success
838,shot,success
1532,shot,success
1851,shot,success
1891,shot_freekick,success
...,...,...
127491,shot,success
127572,shot_penalty,success
128036,shot,success
128156,shot,success


In [12]:
# score 확률이 0.8 넘으면 골이라고 볼 수 있음.
A[A['scores'] > 0.8][['type_name', 'result_name']]

Unnamed: 0,type_name,result_name
270,shot,success
838,shot,success
1532,shot,success
1851,shot,success
1891,shot_freekick,success
...,...,...
127491,shot,success
127572,shot_penalty,success
128036,shot,success
128156,shot,success


In [13]:
# 경기 이벤트 수 & 전/후반전 시간
for i in list(A['game_id'].unique()):
    print(f'{i}번 경기의 이벤트 수 : {len(A[A['game_id'] == i])}')
    print(f'전반전 시간 : {A[(A['game_id'] == i)&(A['period_id'] == 1)].iloc[-1]['time_seconds'] // 60}')
    print(f'후반전 시간 : {A[(A['game_id'] == i)&(A['period_id'] == 2)].iloc[-1]['time_seconds'] // 60}')
    if i > 7530 :
        break

7525번 경기의 이벤트 수 : 1893
전반전 시간 : 47.0
후반전 시간 : 49.0
7529번 경기의 이벤트 수 : 1871
전반전 시간 : 46.0
후반전 시간 : 49.0
7530번 경기의 이벤트 수 : 2035
전반전 시간 : 46.0
후반전 시간 : 49.0
7531번 경기의 이벤트 수 : 2129
전반전 시간 : 46.0
후반전 시간 : 49.0


In [14]:
A[A['game_id'] == 7525][['game_id', 'time_seconds', 'team_id', 
                         'player_id', 'start_x', 'start_y', 'end_x', 
                         'end_y', 'type_id', 'result_id', 'type_name', 'result_name']].head()

Unnamed: 0,game_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,type_name,result_name
0,7525,0.612,799,5196.0,49.583333,49.375,71.25,48.125,0,1,pass,success
1,7525,1.732,799,5173.0,71.25,48.125,70.416667,49.375,21,1,dribble,success
2,7525,2.933,799,5173.0,70.416667,49.375,25.416667,19.375,0,0,pass,fail
3,7525,5.893,796,5175.0,31.25,15.625,37.083333,6.875,0,0,pass,fail
4,7525,7.772,799,5178.0,37.083333,6.875,24.583333,6.875,0,1,pass,success


In [15]:
A.head()

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,duration,extra,start_x,start_y,...,player_name,nickname,team_name,scores,concedes,offensive_value,defensive_value,vaep_value,home_team_id,away_team_id
0,7525,e23943da-dc40-49f3-a7a0-e269288098c3,1,0.612,799,5196.0,1.12,"{'pass': {'recipient': {'id': 5173, 'name': 'A...",49.583333,49.375,...,Mohammad Ibrahim Al Sahlawi,Mohammad Al Sahlawi,Saudi Arabia,0.000738,0.000799,0.0,-0.0,0.0,796,799
1,7525,fd89c08f-16ad-4102-a3ab-4dd8d58ae687,1,1.732,799,5173.0,1.201,"{'carry': {'end_location': [36.0, 40.0]}}",71.25,48.125,...,Abdullah Ibrahim Otayf,Abdullah Otayf,Saudi Arabia,0.002685,0.001047,0.001946,-0.000248,0.001699,796,799
2,7525,c4ba0e20-8090-4285-945e-e2f9d136ae21,1,2.933,799,5173.0,2.96,"{'pass': {'recipient': {'id': 5183, 'name': 'Y...",70.416667,49.375,...,Abdullah Ibrahim Otayf,Abdullah Otayf,Saudi Arabia,0.006562,0.00124,0.003877,-0.000193,0.003685,796,799
3,7525,576d4649-3b45-4de2-b9f7-692c3d53a12d,1,5.893,796,5175.0,1.879,"{'pass': {'length': 9.899495, 'angle': 0.78539...",31.25,15.625,...,Ilya Kutepov,,Russia,0.001677,0.004637,0.000438,0.001926,0.002363,796,799
4,7525,a7a37529-06c3-462f-bab5-5135acfa524a,1,7.772,799,5178.0,1.2,"{'pass': {'recipient': {'id': 5187, 'name': 'S...",37.083333,6.875,...,Salman Mohammed Al Faraj,Salman Al Faraj,Saudi Arabia,0.022594,0.001721,0.017957,-4.3e-05,0.017914,796,799


In [16]:
A[A['game_id'] == 7525].iloc[1889, :]

game_id                                                    7525
original_event_id          93798d11-6c4a-4a09-b874-d8a14f0b51d6
period_id                                                     2
time_seconds                                             2867.8
team_id                                                     796
player_id                                                5177.0
duration                                                   0.52
extra                {'carry': {'end_location': [100.0, 54.0]}}
start_x                                               82.083333
start_y                                                  34.375
end_x                                                 82.916667
end_y                                                    33.125
type_id                                                      21
result_id                                                     1
bodypart_id                                                   0
action_id                               

### 2. Checking the format of GIM

**GTR**: game time remain [0, 100] </br>
**X**: coordinate of ball / start [0, 100] </br>
**Y**: coordinate of ball / start [0, 100] </br>
**MP**: manpower          [-5, 5] </br>
**GD**: goal difference   [-inf, inf] </br>
**Action**: type of action [one-hot representation] </br>
**OC**: outcome           [S or F] </br>
**Velocity**: velocity of ball [-inf, inf] </br>
**ED**: event duration    [0, inf] </br>
**Angle**: angle between ball and goal [-pi, pi] </br>
**T**: team who performs action[Home or Away] </br>
**Reward**: [1,0,0] / [0,1,0] indicate the scoring event of home team and away team respectively

### 3. Covert former format into GIM

In [17]:
from preprocess_data import play_left_to_right, goal_sequence
from labels import game_remain, goal_difference, onehot_action, get_team, get_angle_velocity, get_reward, get_manpower

In [18]:
dataset = A.copy()

# play flow: 왼쪽 -> 오른쪽
game = []

for g_id in tqdm.tqdm(list(games['game_id'].unique()), desc="Rating actions"):
    gamestates = dataset[dataset['game_id'] == g_id].copy()
    home_team_id, away_team_id = gamestates[['home_team_id', 'away_team_id']].iloc[0].values
    gamestates = play_left_to_right(gamestates, home_team_id, away_team_id)
    game.append(gamestates)

dataset = pd.concat(game).sort_values(['game_id','period_id','time_seconds']).reset_index(drop=True)

# goal 행 추가
dataset = goal_sequence(dataset)

Rating actions: 100%|██████████| 64/64 [01:04<00:00,  1.00s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goal[['start_x', 'end_x']] = 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goal[['start_y', 'end_y']] = 50
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goal['type_name'] = 'goal'
A value is trying to be set on a copy of a slice 

In [19]:
# 1. GTR, ED
game = []

for g_id in tqdm.tqdm(list(games['game_id'].unique()), desc="Rating actions"):
    gamestates = dataset[dataset['game_id'] == g_id].copy()
    gamestates = game_remain(gamestates, duration_drop=False)
    game.append(gamestates)

dataset = pd.concat(game).sort_values(['game_id','period_id','time_seconds']).reset_index(drop=True)

Rating actions: 100%|██████████| 64/64 [00:01<00:00, 48.93it/s]


In [20]:
# 2. Action, T, Outcome
dataset = onehot_action(dataset)
dataset = get_team(dataset)
dataset.rename(columns = {'result_name' : 'OC'}, inplace = True)
dataset.loc[dataset['OC'] != 'success', 'OC'] = 0
dataset.loc[dataset['OC'] == 'success', 'OC'] = 1

dataset[['Action', 'T', 'OC']].head()

one-hot encoding: 128882it [00:07, 17390.38it/s]
Team discrete: 128882it [00:06, 19118.18it/s]


Unnamed: 0,Action,T,OC
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,1
1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,1
2,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,0
3,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0
4,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,1


In [21]:
# 3. X, Y of ball (start) | Velocity of ball
dataset = get_angle_velocity(dataset)
dataset.rename(columns = {'start_x' : 'X', 'start_y' : 'Y'}, inplace = True)

dataset[['time_seconds', 'X', 'Y', 'Angle', 'VX', 'VY']].head()

Calculating angle, velocity: 128882it [03:13, 665.51it/s]


Unnamed: 0,time_seconds,X,Y,Angle,VX,VY
0,0.612,50.416667,50.625,3.110274,-19.345238,1.116071
1,1.732,28.75,51.875,0.973918,0.693866,-1.040799
2,2.933,29.583333,50.625,1.466071,15.202703,10.135135
3,5.893,31.25,15.625,1.583674,3.104488,-4.656732
4,7.772,62.916667,93.125,1.052691,10.416667,0.0


In [22]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128882 entries, 0 to 128881
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   game_id            128882 non-null  int64  
 1   original_event_id  126457 non-null  object 
 2   period_id          128882 non-null  int64  
 3   time_seconds       128882 non-null  float64
 4   team_id            128882 non-null  int64  
 5   player_id          128882 non-null  float64
 6   duration           116564 non-null  float64
 7   extra              126457 non-null  object 
 8   X                  128882 non-null  float64
 9   Y                  128882 non-null  float64
 10  end_x              128882 non-null  float64
 11  end_y              128882 non-null  float64
 12  type_id            128882 non-null  int64  
 13  result_id          128882 non-null  int64  
 14  bodypart_id        128882 non-null  int64  
 15  action_id          128882 non-null  int64  
 16  ty

In [23]:
dataset = dataset.sort_values(['game_id','period_id','time_seconds']).reset_index(drop=True)
dataset.tail(2)

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,duration,extra,X,Y,...,vaep_value,home_team_id,away_team_id,ED,GTR,Action,T,Angle,VX,VY
128880,8658,ff3f4c36-1efd-4e25-9002-d7bbbb89707a,2,2985.427,771,3099.0,0.0,"{'goalkeeper': {'type': {'id': 25, 'name': 'Co...",3.75,59.375,...,0.110919,771,785,0.0,0.28,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1.570796,0.0,0.0
128881,8658,fc0e0785-1841-440e-9bfc-c55b62a80edb,2,2985.427,771,3099.0,16.56,"{'carry': {'end_location': [18.0, 31.0]}}",3.75,59.375,...,-0.035633,771,785,16.56,0.28,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0.364938,0.654187,0.150966


In [24]:
# 4. Reward: [home, away, neither]
dataset = get_reward(dataset, games)

# [1,0,0] or [0,1,0]
display(dataset[dataset['type_name']=="goal"]['Reward'])
# [0,0,1]
display(dataset[dataset['period_id']==1].iloc[-1]['Reward'])

271       [1, 0, 0]
840       [1, 0, 0]
1535      [1, 0, 0]
1855      [1, 0, 0]
1896      [1, 0, 0]
            ...    
127670    [0, 1, 0]
127752    [1, 0, 0]
128217    [1, 0, 0]
128338    [1, 0, 0]
128403    [0, 1, 0]
Name: Reward, Length: 183, dtype: object

[0, 0, 1]

In [25]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128882 entries, 0 to 128881
Data columns (total 37 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   game_id            128882 non-null  int64  
 1   original_event_id  126457 non-null  object 
 2   period_id          128882 non-null  int64  
 3   time_seconds       128882 non-null  float64
 4   team_id            128882 non-null  int64  
 5   player_id          128882 non-null  float64
 6   duration           116564 non-null  float64
 7   extra              126457 non-null  object 
 8   X                  128882 non-null  float64
 9   Y                  128882 non-null  float64
 10  end_x              128882 non-null  float64
 11  end_y              128882 non-null  float64
 12  type_id            128882 non-null  int64  
 13  result_id          128882 non-null  int64  
 14  bodypart_id        128882 non-null  int64  
 15  action_id          128882 non-null  int64  
 16  ty

In [26]:
# 5. MP: manpower (X) | GD: goal difference (O)
game = []

for g_id in tqdm.tqdm(list(games['game_id'].unique()), desc="gd calculated.."):
    gamestates = dataset[dataset['game_id'] == g_id].copy()
    gamestates = goal_difference(gamestates)
    game.append(gamestates)

dataset = pd.concat(game).sort_values(['game_id','period_id','time_seconds']).reset_index(drop=True)

dataset = get_manpower(dataset, games)

dataset[dataset['type_name'] == 'goal'][['game_id', 'type_name', 'GD', 'MP']]

gd calculated..: 100%|██████████| 64/64 [00:07<00:00,  8.87it/s]
Calculating manpower...: 100%|██████████| 64/64 [00:06<00:00,  9.15it/s]


Unnamed: 0,game_id,type_name,GD,MP
275,7525,goal,1,0
835,7525,goal,0,0
1147,7525,goal,2,0
1290,7525,goal,3,0
1423,7525,goal,4,0
...,...,...,...,...
127202,8658,goal,1,0
127216,8658,goal,2,0
127297,8658,goal,0,0
128679,8658,goal,0,0


In [27]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128882 entries, 0 to 128881
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   game_id            128882 non-null  int64  
 1   original_event_id  126457 non-null  object 
 2   period_id          128882 non-null  int64  
 3   time_seconds       128882 non-null  float64
 4   team_id            128882 non-null  int64  
 5   player_id          128882 non-null  float64
 6   duration           116564 non-null  float64
 7   extra              126457 non-null  object 
 8   X                  128882 non-null  float64
 9   Y                  128882 non-null  float64
 10  end_x              128882 non-null  float64
 11  end_y              128882 non-null  float64
 12  type_id            128882 non-null  int64  
 13  result_id          128882 non-null  int64  
 14  bodypart_id        128882 non-null  int64  
 15  action_id          128882 non-null  int64  
 16  ty

In [28]:
dataset[dataset['type_name'] == 'goal'][['game_id', 'type_name', 'GD', 'MP']]

Unnamed: 0,game_id,type_name,GD,MP
275,7525,goal,1,0
835,7525,goal,0,0
1147,7525,goal,2,0
1290,7525,goal,3,0
1423,7525,goal,4,0
...,...,...,...,...
127202,8658,goal,1,0
127216,8658,goal,2,0
127297,8658,goal,0,0
128679,8658,goal,0,0


In [29]:
dataset.sort_values(['game_id','period_id','time_seconds'], inplace=True)

In [30]:
dataset.columns

Index(['game_id', 'original_event_id', 'period_id', 'time_seconds', 'team_id',
       'player_id', 'duration', 'extra', 'X', 'Y', 'end_x', 'end_y', 'type_id',
       'result_id', 'bodypart_id', 'action_id', 'type_name', 'OC',
       'bodypart_name', 'player_name', 'nickname', 'team_name', 'scores',
       'concedes', 'offensive_value', 'defensive_value', 'vaep_value',
       'home_team_id', 'away_team_id', 'ED', 'GTR', 'Action', 'T', 'Angle',
       'VX', 'VY', 'Reward', 'GD', 'red', 'MP'],
      dtype='object')

In [31]:
dataset[['GTR', 'X', 'Y', 'MP', 'GD', 'Action', 'OC', 'VX', 'VY', 'ED', 'Angle', 'T', 'Reward', 'game_id']].head()

Unnamed: 0,GTR,X,Y,MP,GD,Action,OC,VX,VY,ED,Angle,T,Reward,game_id
622,99.99,50.416667,50.625,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,-19.345238,1.116071,1.12,3.110274,2,"[0, 0, 0]",7525
623,99.97,28.75,51.875,0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0.693866,-1.040799,1.201,0.973918,2,"[0, 0, 0]",7525
624,99.95,29.583333,50.625,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,15.202703,10.135135,2.96,1.466071,2,"[0, 0, 0]",7525
625,99.9,31.25,15.625,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,3.104488,-4.656732,1.879,1.583674,1,"[0, 0, 0]",7525
626,99.87,62.916667,93.125,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,10.416667,0.0,1.2,1.052691,2,"[0, 0, 0]",7525


In [32]:
dataset[['GTR', 'X', 'Y', 'MP', 'GD', 'Action', 'OC', 'VX', 'VY', 'ED', 'Angle', 'T', 'Reward', 'game_id']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 128882 entries, 622 to 128823
Data columns (total 14 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   GTR      128882 non-null  float64
 1   X        128882 non-null  float64
 2   Y        128882 non-null  float64
 3   MP       128882 non-null  int64  
 4   GD       128882 non-null  int64  
 5   Action   128882 non-null  object 
 6   OC       128882 non-null  object 
 7   VX       128882 non-null  float64
 8   VY       128882 non-null  float64
 9   ED       128882 non-null  float64
 10  Angle    128882 non-null  float64
 11  T        128882 non-null  int64  
 12  Reward   128882 non-null  object 
 13  game_id  128882 non-null  int64  
dtypes: float64(7), int64(4), object(3)
memory usage: 14.7+ MB


### 4. (S, A, R)

4-1. Episode 부여 "divide a soccer game into goal-scoring episodes"

In [33]:
game = []
for g_id in tqdm.tqdm(list(games['game_id'].unique()), desc="episode calculated.."):
    gamestates = dataset[dataset['game_id'] == g_id].copy()
    epi = []
    episode = 1
    for idx, action in gamestates.iterrows():
        if sum(action['Reward']) == 0:
            epi.append(episode)
        else:
            epi.append(episode)
            episode += 1
    gamestates['episode'] = epi
    game.append(gamestates)

dataset = pd.concat(game).sort_values(['game_id','period_id','time_seconds']).reset_index(drop=True)
dataset.tail(1)

episode calculated..: 100%|██████████| 64/64 [00:05<00:00, 11.17it/s]


Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,duration,extra,X,Y,...,Action,T,Angle,VX,VY,Reward,GD,red,MP,episode
128881,8658,fc0e0785-1841-440e-9bfc-c55b62a80edb,2,2985.427,771,3099.0,16.56,"{'carry': {'end_location': [18.0, 31.0]}}",3.75,59.375,...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0.364938,0.654187,0.150966,"[0, 0, 1]",1,0,0,7


4-2. Standard Scaler

In [35]:
from sklearn.preprocessing import StandardScaler

# standard scaler 선언 및 학습
standardScaler = StandardScaler().fit(dataset[['GTR', 'X', 'Y', 'MP', 'GD', 'OC', 'VX', 'VY', 'ED', 'Angle', 'T']])
state_action = standardScaler.transform(dataset[['GTR', 'X', 'Y', 'MP', 'GD', 'OC', 'VX', 'VY', 'ED', 'Angle', 'T']])
state_action = pd.DataFrame(state_action, columns=['GTR', 'X', 'Y', 'MP', 'GD', 'OC', 'VX', 'VY', 'ED', 'Angle', 'T'])
state_action['Action'] = dataset['Action']
state_action['Reward'] = dataset['Reward']
state_action['game_id'] = dataset['game_id']
state_action['episode'] = dataset['episode']
state_action['team'] = dataset['T']

data = state_action.copy()

In [36]:
data.head()

Unnamed: 0,GTR,X,Y,MP,GD,OC,VX,VY,ED,Angle,T,Action,Reward,game_id,episode,team
0,1.671319,0.02955,0.01617,0.05379,-0.047282,0.417508,-0.170825,0.006406,-0.021134,2.26381,0.990133,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0]",7525,1,2
1,1.670621,-0.898677,0.058367,0.05379,-0.047282,0.417508,-0.014231,-0.012304,-0.018675,-0.655923,0.990133,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0]",7525,1,2
2,1.669923,-0.862976,0.01617,0.05379,-0.047282,-2.395165,0.099147,0.08464,0.034715,0.016697,0.990133,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0]",7525,1,2
3,1.668179,-0.791574,-1.165345,0.05379,-0.047282,-2.395165,0.004606,-0.04367,0.001904,0.177423,-1.009966,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0]",7525,1,1
4,1.667132,0.565066,1.450868,0.05379,-0.047282,0.417508,0.061747,-0.003276,-0.018706,-0.548265,0.990133,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0]",7525,1,2


In [37]:
sum(games['home_score']) + sum(games['away_score']) + len(games)*2

297

In [38]:
data[data['Reward'].apply(lambda x: sum(x) > 0)]

Unnamed: 0,GTR,X,Y,MP,GD,OC,VX,VY,ED,Angle,T,Action,Reward,game_id,episode,team
271,1.255387,2.153762,-0.004928,0.05379,-0.047282,0.417508,-0.019654,-0.003276,-0.055129,0.159823,-1.009966,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0]",7525,1,1
840,0.149606,2.153762,-0.004928,0.05379,0.944869,0.417508,-0.019654,-0.003276,-0.055129,0.159823,-1.009966,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0]",7525,2,1
943,-0.025211,-1.005780,-0.152617,0.05379,1.937020,0.417508,-0.063485,0.111410,-0.023562,0.868437,-1.009966,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1]",7525,3,1
1535,-0.957571,2.153762,-0.004928,0.05379,1.937020,0.417508,-0.019654,-0.003276,-0.055129,0.159823,-1.009966,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0]",7525,4,1
1854,-1.676729,2.153762,-0.004928,0.05379,2.929171,0.417508,-0.019654,-0.003276,-0.055129,0.159823,-1.009966,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0]",7525,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127910,-0.033586,1.528994,1.493065,0.05379,-0.047282,-2.395165,-0.019654,-0.003276,-0.055129,0.159823,0.990133,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1]",8658,3,2
128216,-0.517212,2.153762,-0.004928,0.05379,-0.047282,0.417508,-0.019654,-0.003276,-0.055129,0.159823,-1.009966,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0]",8658,4,1
128338,-0.732855,2.153762,-0.004928,0.05379,0.944869,0.417508,-0.019654,-0.003276,-0.055129,0.159823,-1.009966,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0]",8658,5,1
128402,-0.858473,2.153762,-0.004928,0.05379,1.937020,0.417508,-0.019654,-0.003276,-0.055129,0.159823,0.990133,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0]",8658,6,2


4-3. datastore/game/episode/ + state_input | reward | action

In [39]:
gim_folder = os.path.dirname(os.getcwd())
data_dir = str(gim_folder) + "/datastore"

for g_id in tqdm.tqdm(list(games['game_id'].unique()), desc="Data packing ... "):
    gamestates = data[data['game_id'] == g_id].copy()
    for epi in list(gamestates['episode'].unique()):
        epi_dir = data_dir + f"/{g_id}/{epi}/"
        if not os.path.exists(epi_dir): 
            os.makedirs(epi_dir) 
        g = gamestates[gamestates['episode']==epi].copy()
        s = g[['GTR', 'X', 'Y', 'MP', 'GD', 'OC', 'VX', 'VY', 'ED', 'Angle', 'T']].to_numpy()
        a = g['Action'].to_numpy()
        r = g['Reward'].to_numpy()
        np.save(epi_dir + 'state', s)
        np.save(epi_dir + 'reward', r)
        np.save(epi_dir + 'action', a)

Data packing ... :   0%|          | 0/64 [00:00<?, ?it/s]

Data packing ... : 100%|██████████| 64/64 [06:47<00:00,  6.37s/it]
