In [67]:
import os
import numpy as np
import collections
from tempfile import mkdtemp
from sklearn.preprocessing import normalize

In [56]:
### READING DATASET ###
# Builds `games_data`: one OrderedDict per game directory found directly under
# `dataset_path`. Each dict maps a file's base name (extension stripped) to
# the array read from that file, plus a 'name' entry with the directory name.

# Paths
dataset_path = 'Dataset'
# Sorted so the game order does not depend on the file-system listing order.
games_dir = sorted(next(os.walk(dataset_path))[1])

# File base names that are never loaded.
tag_to_skip = [
    'highlights', 'highlights2', '.DS_Store', '.git',
    'em_mcs_energy_diff_ascending', 'em_mcs_energy_diff', 'em_mcs_energy']

# Iterating through games
games_data = []
games_path = [os.path.join(dataset_path, g) for g in games_dir]
for path in games_path:
    game_name = os.path.basename(path)
    # Sorted so every game stores its entries in the same key order; the
    # feature-building cell stacks features following the dict order.
    game_info = sorted(os.listdir(path))
    data = collections.OrderedDict()
    for info in game_info:
        tag_name = os.path.splitext(info)[0]
        if tag_name in tag_to_skip:
            continue
        csv_file = os.path.join(path, info)
        if tag_name == 'highlights_boundaries':
            # Comma-separated rows; dtype=None lets numpy infer a type per column.
            data_value = np.genfromtxt(csv_file, delimiter=',', dtype=None)
        else:
            # Every other file is read as space-separated numbers.
            data_value = np.fromfile(csv_file, sep=' ')
            if 'hue' in tag_name:
                # Sanity print: first value of each hue-related file.
                print(game_name, data_value[0])
        data[tag_name] = data_value
    data['name'] = game_name
    games_data.append(data)

# Copy each dict (not just the outer list) so that keys later deleted from or
# reassigned in `games_data` do not also change the backup. The arrays
# themselves are shared, not duplicated.
games_backup = [collections.OrderedDict(game) for game in games_data]

amg_vas 43.706
arg_ger 90.0801
arg_mex 22.9615
arg_nig 139.9809
arg_sko 73.4417
bar_int 91.5123
bay_int 60.7835
bra_chi 91.3901
bra_ita 70.8342
bra_ned 95.3558
chi_swi 22.4776
cor_flu 40.2301
cru_spo 229.0032
den_jap 85.3481
fra_mex 78.5984
ger_eng 205.7013
ger_spa 22.7033
ger_uru 86.8161
ita_svk 20.1644
ned_jap 25.3723
ned_svk 22.1358
por_nko 78.2097
san_vas 83.0715
spa_ned 35.1096
spa_por 16.0773
spa_swi 242.6725
spa_usa 211.5297
spo_cru 38.9187
uru_ned 76.7098
uru_sko 91.7161


In [60]:
### BUILDING THE GROUND-TRUTH (GT) VECTOR ###
# For every game, converts its 'highlights_boundaries' rows into a 0/1 vector
# stored under 'gt' (1 inside the ranges whose label is in `wanted_cases`),
# then removes 'highlights_boundaries' from the game.

wanted_cases = ['Gol', 'Perigo']
to_skip = ['name', 'highlights_boundaries']

for game in games_data:
    # Already converted by a previous run of this cell: skip, so that
    # re-running the cell does not fail on the deleted key.
    if 'gt' in game and 'highlights_boundaries' not in game:
        continue

    h = game['highlights_boundaries']
    # NOTE(review): the GT length is the first column of the last row, not its
    # second column -- confirm this is the intended end of the game.
    end = h[-1][0]
    gt = np.zeros(end)
    for line in h:
        label = line[2]
        # genfromtxt may yield bytes or str depending on the numpy version /
        # encoding setting; accept both.
        if isinstance(label, bytes):
            label = label.decode('UTF-8')
        # The case name is the last non-empty space-separated token.
        tokens = [t for t in label.split(' ') if t != '']
        if tokens and tokens[-1] in wanted_cases:
            # Columns 0 and 1 are used as a 1-based inclusive frame range.
            gt[line[0]-1:line[1]] = 1

    game['gt'] = gt
    del game['highlights_boundaries']

In [62]:
### BEFORE TRIMMING THE VECTORS ###
# Per game: print its name, then the shortest and longest entry length
# (every key except 'name') and the gap between them.

for game in games_data:
    print(game['name'])
    lengths = [len(entry) for key, entry in game.items() if key != 'name']
    min_l, max_l = min(lengths), max(lengths)
    print('\tMin: {}  Max: {}  Diff: {}'.format(min_l, max_l, max_l - min_l))

amg_vas
	Min: 209234  Max: 254254  Diff: 45020
arg_ger
	Min: 171534  Max: 171627  Diff: 93
arg_mex
	Min: 202368  Max: 206768  Diff: 4400
arg_nig
	Min: 205865  Max: 233849  Diff: 27984
arg_sko
	Min: 200918  Max: 206651  Diff: 5733
bar_int
	Min: 164261  Max: 166956  Diff: 2695
bay_int
	Min: 210620  Max: 251822  Diff: 41202
bra_chi
	Min: 197484  Max: 206882  Diff: 9398
bra_ita
	Min: 214644  Max: 224843  Diff: 10199
bra_ned
	Min: 199709  Max: 206878  Diff: 7169
chi_swi
	Min: 204272  Max: 206681  Diff: 2409
cor_flu
	Min: 219759  Max: 251833  Diff: 32074
cru_spo
	Min: 219949  Max: 254262  Diff: 34313
den_jap
	Min: 200063  Max: 203249  Diff: 3186
fra_mex
	Min: 201324  Max: 206753  Diff: 5429
ger_eng
	Min: 196687  Max: 206882  Diff: 10195
ger_spa
	Min: 198728  Max: 206779  Diff: 8051
ger_uru
	Min: 199785  Max: 206887  Diff: 7102
ita_svk
	Min: 206667  Max: 206764  Diff: 97
ned_jap
	Min: 200203  Max: 206866  Diff: 6663
ned_svk
	Min: 56285  Max: 206791  Diff: 150506
por_nko
	Min: 204334  Max: 206

In [63]:
### TRIMMING EXCESS VALUES SO THAT ALL DATA HAVE THE SAME LENGTH ###
# Within each game, cut every vector down to the length of that game's
# shortest vector.

for game in games_data:
    min_l = min(len(v) for k, v in game.items() if k != 'name')
    for k, v in game.items():
        # 'name' is a label, not a per-frame vector: it must never be trimmed.
        # (Reassigning an existing key while iterating is safe; no key is
        # added or removed.)
        if k != 'name' and len(v) > min_l:
            game[k] = v[:min_l]


### ZERO PADDING ###
# Across games, right-pad every vector with zeros up to the longest vector
# found in any game.
max_length = 0
for game in games_data:
    for k, v in game.items():
        if k != 'name':
            max_length = max(max_length, len(v))

for game in games_data:
    for k, v in game.items():
        # Only allocate a padded buffer for vectors that are actually short.
        if k != 'name' and len(v) < max_length:
            zero_v = np.zeros(max_length)
            zero_v[:len(v)] = v
            game[k] = zero_v

In [64]:
### CHECKING THE LENGTH OF THE DATA ###
# Same per-game report as before trimming: name, then shortest / longest
# entry length and their difference ('name' and 'highlights_boundaries' are
# left out of the measurement).

ignored_keys = ['name', 'highlights_boundaries']
for game in games_data:
    print(game['name'])
    lengths = [len(entry) for key, entry in game.items() if key not in ignored_keys]
    min_l, max_l = min(lengths), max(lengths)
    print('\tMin: {}  Max: {}  Diff: {}'.format(min_l, max_l, max_l - min_l))

amg_vas
	Min: 219949  Max: 219949  Diff: 0
arg_ger
	Min: 219949  Max: 219949  Diff: 0
arg_mex
	Min: 219949  Max: 219949  Diff: 0
arg_nig
	Min: 219949  Max: 219949  Diff: 0
arg_sko
	Min: 219949  Max: 219949  Diff: 0
bar_int
	Min: 219949  Max: 219949  Diff: 0
bay_int
	Min: 219949  Max: 219949  Diff: 0
bra_chi
	Min: 219949  Max: 219949  Diff: 0
bra_ita
	Min: 219949  Max: 219949  Diff: 0
bra_ned
	Min: 219949  Max: 219949  Diff: 0
chi_swi
	Min: 219949  Max: 219949  Diff: 0
cor_flu
	Min: 219949  Max: 219949  Diff: 0
cru_spo
	Min: 219949  Max: 219949  Diff: 0
den_jap
	Min: 219949  Max: 219949  Diff: 0
fra_mex
	Min: 219949  Max: 219949  Diff: 0
ger_eng
	Min: 219949  Max: 219949  Diff: 0
ger_spa
	Min: 219949  Max: 219949  Diff: 0
ger_uru
	Min: 219949  Max: 219949  Diff: 0
ita_svk
	Min: 219949  Max: 219949  Diff: 0
ned_jap
	Min: 219949  Max: 219949  Diff: 0
ned_svk
	Min: 219949  Max: 219949  Diff: 0
por_nko
	Min: 219949  Max: 219949  Diff: 0
san_vas
	Min: 219949  Max: 219949  Diff: 0
spa_ned
	Mi

In [65]:
def get_frames_ids(gt):
    """Pick a class-balanced, sorted set of frame indices from a 0/1 vector.

    All indices where ``gt == 1`` are kept. From the indices where
    ``gt == 0``, as many entries as there are positives are taken at evenly
    spaced positions (so when positives outnumber negatives, some negative
    indices repeat).

    Parameters
    ----------
    gt : sequence or array of numbers
        Per-frame labels; only values equal to 0 and 1 are considered.

    Returns
    -------
    list of int
        Selected indices in ascending order. Empty when there are no
        positives; only the positive indices when there are no negatives.
    """
    labels = np.asarray(gt)
    neg_ids = np.flatnonzero(labels == 0.0)
    pos_ids = np.flatnonzero(labels == 1.0)

    # With one class missing there is nothing to balance against; this also
    # avoids indexing into an empty array below.
    if len(neg_ids) == 0 or len(pos_ids) == 0:
        return pos_ids.tolist()

    # Evenly spaced positions inside the list of negative indices.
    picks = np.linspace(0, len(neg_ids) - 1, len(pos_ids), dtype=int)
    return np.sort(np.concatenate((neg_ids[picks], pos_ids))).tolist()

In [88]:
# Builds the sample matrix `data` and label vector `gt`.
#
# For every game, a class-balanced set of frames is chosen with
# get_frames_ids(). For each chosen frame and each feature vector, the
# L2-normalised feature is read at `f_size` positions spaced `skip` frames
# apart around the frame (wrapping around at the ends, as np.roll does).
# Windows of all features are placed side by side (one row per chosen frame),
# and the rows of all games are stacked.

skip = 8
f_size = 61
half_window = f_size // 2
skip_values = np.arange(-half_window*skip, (half_window + 1)*skip, skip)

# Entries of a game dict that are not training features.
non_feature_keys = ('name', 'gt')

data_parts = []
gt_parts = []
for game in games_data:
    # Getting GT (looked up by key instead of by position in the dict).
    gt_game = np.asarray(game['gt'])
    frames_ids = np.asarray(get_frames_ids(gt_game), dtype=int)
    gt_parts.append(gt_game[frames_ids])

    game_features = []
    for tag_name, feature in game.items():
        # Removing name and GT to train
        if tag_name in non_feature_keys:
            continue

        # Normalizing
        norm_feature = normalize([feature], axis=1)[0]

        # Equivalent to rolling the feature by each offset and then picking
        # the chosen frames: np.roll(a, v)[j] == a[(j - v) % len(a)].
        # Indexing directly avoids building an f_size x n_frames matrix.
        window_ids = (frames_ids[:, None] - skip_values[None, :]) % len(norm_feature)
        game_features.append(norm_feature[window_ids])

    data_parts.append(np.concatenate(game_features, axis=1))

# Concatenate once at the end instead of re-copying the growing array per game.
data = np.concatenate(data_parts, axis=0) if data_parts else np.empty((0, 0))
gt = np.concatenate(gt_parts) if gt_parts else np.empty(0)

print(data.shape)
print(gt.shape)

KeyboardInterrupt: 

In [19]:
# Prints the shapes of `games_matrix` and `M`.
# NOTE(review): neither name is assigned in any cell visible in this notebook;
# this cell presumably relies on state from cells that were removed or
# renamed -- confirm, and define or drop them so a fresh-kernel run works.
print(games_matrix.shape)
print(M.shape)

(219949, 976)
(219949, 960)


In [10]:
# Prints how many games are currently held in `games_data`.
# NOTE(review): the recorded output is 0 while the loading cell's output lists
# 30 games, so this output looks stale (older execution count) -- re-run
# after loading to confirm.
print(len(games_data))

0
