In [2]:
import os
# The relative 'data/...' paths and the `src` package used by the cells below
# are resolved against the current working directory, so move one level up
# from the directory the kernel was started in (presumably the notebook folder).
# The starting directory is remembered the first time this cell runs so that
# re-running the cell does not climb one more level on every execution.
if '_notebook_start_dir' not in globals():
    _notebook_start_dir = os.getcwd()
wd = os.path.normpath(_notebook_start_dir + '/..')
os.chdir(wd)
os.getcwd()

'/home/hyunsung/Workspace/soccercpd'

In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta
from joblib import Parallel, delayed
from src.myconstants import *

# Widen pandas' console rendering so the record tables shown below stay readable.
for option_name, option_value in (
    ('display.width', 250),
    ('display.max_rows', 100),
    ('display.max_columns', 20),
):
    pd.set_option(option_name, option_value)

### Merging formation period records

In [20]:
# Stack every pickled formation-period table found in the `form` directory of
# the chosen formcpd_type into one DataFrame, in sorted file-name order, and
# save the merged table.
# NOTE(review): the merged table is written to data/{formcpd_type}/form_periods.pkl,
# whereas later cells read 'data/form_periods.pkl' -- confirm which location is intended.
formcpd_type = 'rank'
form_periods_files = sorted(os.listdir(f'data/{formcpd_type}/form'))
form_periods_list = [
    pd.read_pickle(f'data/{formcpd_type}/form/{file}')
    for file in form_periods_files
]

form_periods = pd.concat(form_periods_list, ignore_index=True)
form_periods.to_pickle(f'data/{formcpd_type}/form_periods.pkl')
form_periods

  0%|          | 0/457 [17:07<?, ?it/s]


Unnamed: 0,activity_id,session,form_period,start_dt,end_dt,duration,coords,edge_mat
0,12864,1,1,2020-01-01 14:00:00,2020-01-01 14:47:00,2820.0,"[[-706.0, 1597.0], [-1362.0, 686.0], [910.0, 1...","[[0.0, 0.981, 0.953, 0.205, 0.027, 0.814, 0.27..."
1,12868,1,1,2020-01-01 14:00:00,2020-01-01 14:47:00,2820.0,"[[-807.0, -1612.0], [44.0, 199.0], [-373.0, -2...","[[0.0, 0.31, 0.721, 0.997, 0.207, 0.049, 0.115..."
2,12868,2,2,2020-01-01 15:02:00,2020-01-01 15:50:00,2880.0,"[[-928.0, -1910.0], [-111.0, 658.0], [-206.0, ...","[[0.0, 0.168, 0.853, 0.989, 0.126, 0.042, 0.24..."
3,12870,1,1,2020-01-01 13:30:00,2020-01-01 14:19:00,2940.0,"[[-1400.0, 57.0], [439.0, -1071.0], [-283.0, 3...","[[0.0, 0.219, 0.884, 0.097, 0.395, 0.959, 0.34..."
4,12870,2,2,2020-01-01 14:33:00,2020-01-01 15:23:00,3000.0,"[[-1266.0, -146.0], [292.0, -877.0], [-205.0, ...","[[0.0, 0.393, 0.871, 0.067, 0.217, 0.998, 0.28..."
...,...,...,...,...,...,...,...,...
859,9064,2,2,2020-01-01 16:02:00,2020-01-01 16:51:00,2940.0,"[[-1421.0, 563.0], [-1550.0, -608.0], [61.0, -...","[[0.0, 0.966, 0.411, 0.062, 0.026, 0.04, 0.866..."
860,9281,1,1,2020-01-01 15:00:00,2020-01-01 15:47:00,2820.0,"[[-1437.0, 113.0], [-1109.0, -959.0], [1523.0,...","[[0.0, 0.969, 0.053, 0.032, 0.028, 0.201, 0.98..."
861,9281,2,2,2020-01-01 16:03:00,2020-01-01 16:51:00,2880.0,"[[-1253.0, -90.0], [-1090.0, -1033.0], [1471.0...","[[0.0, 0.933, 0.042, 0.034, 0.061, 0.195, 0.97..."
862,9285,1,1,2020-01-01 14:01:00,2020-01-01 14:48:00,2820.0,"[[-638.0, 1741.0], [-1425.0, 599.0], [-1561.0,...","[[0.0, 0.986, 0.243, 0.315, 0.159, 0.738, 0.23..."


### Generating role period records

In [4]:
# Columns (and their order) of the table returned by
# generate_role_period_records().
role_period_cols = [
    LABEL_ACTIVITY_ID,
    LABEL_PLAYER_PERIOD,
    LABEL_FORM_PERIOD,
    LABEL_ROLE_PERIOD,
    LABEL_SESSION,
    LABEL_START_TIME,
    LABEL_END_TIME,
]

# Input tables shared by the cells of this section.
# NOTE(review): the merging cell above saves its result under
# data/{formcpd_type}/form_periods.pkl, not 'data/form_periods.pkl' -- confirm
# this is the file that should be read here.
activity_records = pd.read_csv('data/activity_records.csv', encoding='utf-8-sig', header=0)
form_periods = pd.read_pickle('data/form_periods.pkl')

In [5]:
def generate_role_period_records(fgp_file, formcpd_type='gseg_avg'):
    """Summarise one FGP file as one row per role period.

    Parameters
    ----------
    fgp_file : str
        File name inside ``data/{formcpd_type}/fgp``; the text before the
        first '.' is parsed as the integer activity id.
    formcpd_type : str
        Sub-directory of ``data`` that holds the ``fgp`` folder.

    Returns
    -------
    pandas.DataFrame
        The columns listed in the module-level ``role_period_cols``. The
        player period, formation period and session come from the first
        non-null value of each role-period group; start/end time are the
        integers parsed from the first two characters of the group's first
        and last gametime value.
    """
    def leading_two_digits(gametime):
        # Only the first two characters of the gametime string are used.
        return int(gametime[:2])

    activity_id = int(fgp_file.split('.')[0])
    fgp_df = pd.read_csv(
        f'data/{formcpd_type}/fgp/{fgp_file}', header=0, encoding='utf-8-sig'
    )

    by_role_period = fgp_df.groupby(LABEL_ROLE_PERIOD)
    first_rows = by_role_period.first()[
        [LABEL_PLAYER_PERIOD, LABEL_FORM_PERIOD, LABEL_SESSION, LABEL_GAMETIME]
    ]
    last_gametimes = by_role_period[LABEL_GAMETIME].last()

    # Both Series are indexed by role period, so they align row by row.
    records = first_rows.assign(**{
        LABEL_ACTIVITY_ID: activity_id,
        LABEL_START_TIME: first_rows[LABEL_GAMETIME].apply(leading_two_digits),
        LABEL_END_TIME: last_gametimes.apply(leading_two_digits),
    })

    # Turn the role-period index back into a regular column before selecting.
    return records.reset_index()[role_period_cols]

In [6]:
# Build the role-period rows of every FGP file in parallel, then merge them
# into one integer-typed table ordered by activity and role period.
formcpd_type = 'gseg_avg'
fgp_files = os.listdir(f'data/{formcpd_type}/fgp')

role_period_list = Parallel(n_jobs=50)(
    delayed(generate_role_period_records)(f, formcpd_type)
    for f in tqdm(fgp_files)
)

role_periods = (
    pd.concat(role_period_list, axis=0, ignore_index=True)
    .astype(int)
    .sort_values([LABEL_ACTIVITY_ID, LABEL_ROLE_PERIOD])
)
role_periods

100%|██████████| 457/457 [00:03<00:00, 144.82it/s]


Unnamed: 0,activity_id,player_period,form_period,role_period,session,start_time,end_time
763,1879,1,1,1,1,0,35
764,1879,2,1,2,1,35,47
765,1879,3,2,3,2,0,31
766,1879,4,2,4,2,31,42
767,1879,4,2,5,2,42,49
...,...,...,...,...,...,...,...
1780,22875,2,1,2,1,29,40
1781,22875,2,1,3,1,40,47
1782,22875,3,2,4,2,0,18
1783,22875,4,2,5,2,18,49


In [7]:
# Save the merged role-period table under the directory of the current
# formcpd_type; the DataFrame index is not written.
role_periods.to_csv(f'data/{formcpd_type}/role_periods.csv', index=False)

### Generating and merging role-assignment records

In [None]:
# Columns whose first value per (player, role period) group is kept by
# generate_role_records(); the two grouping keys are added by the groupby itself.
role_record_cols = [
    LABEL_PLAYER_PERIOD,
    LABEL_FORM_PERIOD,
    LABEL_SESSION,
    LABEL_DATETIME,
    LABEL_SQUAD_NUM,
    LABEL_PLAYER_NAME,
    LABEL_BASE_ROLE,
]

# Input tables used by the role-record cells below.
activity_records = pd.read_csv('data/activity_records.csv', encoding='utf-8-sig', header=0)
form_periods = pd.read_pickle('data/form_periods.pkl')

In [19]:
def generate_role_records(fgp_file, formcpd_type='gseg_avg', freq=1):
    """Build and save the role-assignment records of one activity.

    The FGP file is collapsed to one row per (player, role period), the
    coordinates of the player's base role are looked up in the module-level
    ``form_periods`` table, and the result is written to
    ``data/{formcpd_type}/role/{activity_id}.csv``.

    Parameters
    ----------
    fgp_file : str or int
        Either an FGP file name inside ``data/{formcpd_type}/fgp`` (the text
        before the first '.' is parsed as the activity id) or a bare activity
        id. A bare id is mapped to ``'{activity_id}.csv'``.
    formcpd_type : str
        Sub-directory of ``data`` holding the ``fgp`` input and ``role`` output.
    freq : int or float
        Number of seconds each FGP row stands for: it is subtracted from the
        first timestamp of a group to get its start and multiplied by the row
        count to get its duration.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    # Accept both '1879.csv' and 1879 (including numpy integers); the visible
    # caller passes plain activity ids, on which str.split would fail.
    fgp_name = str(fgp_file)
    activity_id = int(fgp_name.split('.')[0])
    if '.' not in fgp_name:
        # NOTE(review): assumes FGP files are named '<activity_id>.csv', like
        # the role files written below -- confirm against the fgp directory.
        fgp_name = f'{activity_id}.csv'

    fgp_path = f'data/{formcpd_type}/fgp/{fgp_name}'
    fgp_df = pd.read_csv(fgp_path, header=0, encoding='utf-8-sig')
    fgp_df[LABEL_DATETIME] = pd.to_datetime(fgp_df[LABEL_DATETIME], format='%Y-%m-%d %H:%M:%S')

    # as_index=False keeps the grouping keys as columns and gives every
    # aggregate below the same default row index, so the assignments align.
    grouped = fgp_df.groupby([LABEL_PLAYER_ID, LABEL_ROLE_PERIOD], as_index=False)
    role_records = grouped[role_record_cols].first()
    role_records[LABEL_ACTIVITY_ID] = activity_id
    role_records[LABEL_START_DT] = grouped[LABEL_DATETIME].first()[LABEL_DATETIME] - timedelta(seconds=freq)
    role_records[LABEL_END_DT] = grouped[LABEL_DATETIME].last()[LABEL_DATETIME]
    role_records[LABEL_DURATION] = grouped[LABEL_GAMETIME].count()[LABEL_GAMETIME] * freq

    # Inner join on the explicit keys: rows without a matching formation
    # period are dropped, exactly as with the implicit common-column merge.
    role_records = pd.merge(
        role_records,
        form_periods[[LABEL_ACTIVITY_ID, LABEL_FORM_PERIOD, LABEL_COORDS]],
        on=[LABEL_ACTIVITY_ID, LABEL_FORM_PERIOD],
    )

    # base_role is 1-based, the rows of the coords array are 0-based.
    role_coords = [
        coords[base_role - 1]
        for coords, base_role in zip(role_records[LABEL_COORDS], role_records[LABEL_BASE_ROLE])
    ]
    role_records[LABEL_X] = [xy[0] for xy in role_coords]
    role_records[LABEL_Y] = [xy[1] for xy in role_coords]

    role_records = role_records[HEADER_ROLE_RECORDS].sort_values(
        [LABEL_SQUAD_NUM, LABEL_ROLE_PERIOD], ignore_index=True)

    target_path = f'data/{formcpd_type}/role/{activity_id}.csv'
    # exist_ok=True keeps this safe when several parallel workers get here at once.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    role_records.to_csv(target_path, index=False, encoding='utf-8-sig')

    return target_path

In [10]:
# Generate the role-record file of every activity flagged as saved in
# activity_records, in parallel, then preview the first file that was written.
activity_records = pd.read_csv('data/activity_records.csv', header=0, encoding='utf-8-sig')
form_periods = pd.read_pickle('data/form_periods.pkl')
activity_ids = activity_records[activity_records[LABEL_STATS_SAVED] == 1][LABEL_ACTIVITY_ID]
role_records_list = []

# generate_role_records() parses the activity id with fgp_file.split('.'), so
# it has to be given a file name rather than the bare integer id.
# NOTE(review): assumes the FGP files are named '<activity_id>.csv' -- confirm.
target_paths = Parallel(n_jobs=50)(
    delayed(generate_role_records)(f'{activity_id}.csv')
    for activity_id in tqdm(activity_ids)
)
pd.read_csv(target_paths[0], header=0)

100%|██████████| 457/457 [00:09<00:00, 48.60it/s]


Unnamed: 0,activity_id,session,player_period,form_period,role_period,start_dt,end_dt,duration,player_id,squad_num,player_name,base_role,x,y
0,1879,1,2,1,2,2020-01-01 16:35:00,2020-01-01 16:47:00,720,1248,7,P07,3,1042.0,1812.0
1,1879,2,3,2,3,2020-01-01 17:02:00,2020-01-01 17:33:00,1860,1248,7,P07,3,674.0,1848.0
2,1879,2,4,2,4,2020-01-01 17:33:00,2020-01-01 17:44:00,660,1248,7,P07,3,674.0,1848.0
3,1879,2,4,2,5,2020-01-01 17:44:00,2020-01-01 17:51:00,420,1248,7,P07,3,674.0,1848.0
4,1879,2,5,2,6,2020-01-01 17:51:00,2020-01-01 17:53:00,120,1248,7,P07,9,-1318.0,546.0
5,1879,1,1,1,1,2020-01-01 16:00:00,2020-01-01 16:35:00,2100,1252,9,P09,1,1710.0,-60.0
6,1879,1,2,1,2,2020-01-01 16:35:00,2020-01-01 16:47:00,720,1252,9,P09,1,1710.0,-60.0
7,1879,2,3,2,3,2020-01-01 17:02:00,2020-01-01 17:33:00,1860,1252,9,P09,1,1701.0,24.0
8,1879,2,4,2,4,2020-01-01 17:33:00,2020-01-01 17:44:00,660,1252,9,P09,1,1701.0,24.0
9,1879,2,4,2,5,2020-01-01 17:44:00,2020-01-01 17:51:00,420,1252,9,P09,1,1701.0,24.0


In [11]:
# Merge the per-activity role-record files into one table.
# The files are read from `target_paths`, i.e. exactly the paths returned by
# generate_role_records() in the cell above (data/{formcpd_type}/role/...),
# instead of the separate 'data/role_avg' directory that the generator does not
# write to. Parallel returns results in input order, so the rows keep the
# order of `activity_ids`.
role_records_files = sorted(os.path.basename(path) for path in target_paths)
role_records_list = [
    pd.read_csv(path, header=0, encoding='utf-8-sig')
    for path in tqdm(target_paths)
]

role_records = pd.concat(role_records_list, ignore_index=True)
role_records

100%|██████████| 457/457 [00:00<00:00, 471.13it/s]


Unnamed: 0,activity_id,session,player_period,form_period,role_period,start_dt,end_dt,duration,player_id,squad_num,player_name,base_role,x,y
0,1879,1,2,1,2,2020-01-01 16:35:00,2020-01-01 16:47:00,720,1248,7,P07,3,1042.0,1812.0
1,1879,2,3,2,3,2020-01-01 17:02:00,2020-01-01 17:33:00,1860,1248,7,P07,3,674.0,1848.0
2,1879,2,4,2,4,2020-01-01 17:33:00,2020-01-01 17:44:00,660,1248,7,P07,3,674.0,1848.0
3,1879,2,4,2,5,2020-01-01 17:44:00,2020-01-01 17:51:00,420,1248,7,P07,3,674.0,1848.0
4,1879,2,5,2,6,2020-01-01 17:51:00,2020-01-01 17:53:00,120,1248,7,P07,9,-1318.0,546.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21182,22875,1,2,1,2,2020-01-01 15:30:00,2020-01-01 15:41:00,660,11817,62,P62,10,-778.0,-1710.0
21183,22875,1,2,1,3,2020-01-01 15:41:00,2020-01-01 15:48:00,420,11817,62,P62,10,-778.0,-1710.0
21184,22875,2,3,2,4,2020-01-01 16:02:00,2020-01-01 16:20:00,1080,11817,62,P62,10,-1052.0,-1476.0
21185,22875,2,4,2,5,2020-01-01 16:20:00,2020-01-01 16:51:20,1880,11817,62,P62,10,-1052.0,-1476.0


In [12]:
# Save the merged role-assignment records of all activities as one CSV
# (no index column; 'utf-8-sig' writes a UTF-8 byte-order mark).
role_records.to_csv('data/role_records.csv', index=False, encoding='utf-8-sig')