In [1]:
import os
import time
import numpy as np
import matplotlib.pyplot as plt

In [22]:
### READING DATASET ###

# Paths
dataset_path = 'Dataset'
games_dir = next(os.walk(dataset_path))[1]

tag_to_skip = ['highlights', 'highlights2', '.DS_Store']
dt = np.dtype([('a', float), ('b', float), ('name', np.unicode_, 16)])

# Iterating through games
games_data = []
games_path = [os.path.join(dataset_path, g) for g in games_dir]
for path in games_path:
    game_name = os.path.basename(path)
    if game_name != 'ned_svk':
        game_info = os.listdir(path)
        game_data = dict()
        for info in game_info:
            tag_name = os.path.splitext(info)[0]
            if tag_name not in tag_to_skip:
                csv_file = os.path.join(path, info)
                if tag_name == 'highlights_boundaries':
                    data_value = np.genfromtxt(csv_file, delimiter=',', dtype=None)
                else:
                    data_value = np.fromfile(csv_file, dtype=np.float64)
                game_data[tag_name] = data_value
        game_data['name'] = game_name
        games_data.append(game_data)

games_backup = games_data[:]

In [3]:
games_data = games_backup[:]

In [23]:
games_data[0]

{'dc_hue_mean': array([  3.70980754e-245,   4.91035163e-062,   4.91127896e-062, ...,
          3.70444411e-245,   3.70444411e-245,   3.70444411e-245]),
 'dc_percent': array([  4.39811949e-245,   3.85342763e-057,   4.66299277e-033, ...,
          3.22511851e-086,   3.22511851e-086,   3.22511851e-086]),
 'em_cs_energy': array([  1.57692220e-52,   9.73449964e-72,   1.74451393e-76, ...,
          3.35174813e-57,   4.27708668e-33,   4.46469229e-86]),
 'em_cs_energy_diff': array([  1.57610813e-52,   9.95279112e-43,   4.42131876e-62, ...,
          9.16225109e-72,   1.20788625e-67,   7.41018373e-38]),
 'em_cs_energy_diff_ascending': array([  2.96620797e-260,   2.96620797e-260,   2.96620797e-260, ...,
          2.96620797e-260,   2.96620797e-260,   2.96620797e-260]),
 'em_mcs_energy': array([  1.57692220e-52,   9.73449964e-72,   1.74451393e-76, ...,
          3.35174813e-57,   4.27708668e-33,   4.46469229e-86]),
 'em_mcs_energy_diff': array([  1.57610813e-52,   9.95279112e-43,   4.42131876e-62

In [24]:
### CRIANDO VETOR DE GT ###

normal_cases = ['Normal', 'Inicio', 'Fim']

for game in games_data:
    h = game['highlights_boundaries']
    end = h[-1][0]
    gt = np.zeros(end)
    for line in h[:-1]:
        tag = line[2].decode('UTF-8').split(' ')
        name = [t for t in tag if t != ''][0]
        if name not in normal_cases:
            gt[line[0]-1:line[1]] = 1
    game['gt'] = gt
    del game['highlights_boundaries']

In [25]:
### REMOVENDO VALORES CORRELACIONADOS ###

to_remove = ['em_mcs_energy', 'em_mcs_energy_diff', 'em_mcs_energy_diff_ascending']
for game in games_data:
    for tag in to_remove:
        del game[tag]

In [26]:
### CORTANDO VALORES EXCEDENTES PARA QUE TODOS OS DADOS TENHAM O MESMO TAMANHO ###

for game in games_data:
    values = [len(v) for k, v in game.items() if k not in ['name']]
    min_l = min(values)
    for k, v in game.items():
        if len(v) > min_l:
            game[k] = v[:min_l]
            

### ZERO PADDING ###
max_length = 0
for game in games_data:
    values = [len(v) for k, v in game.items() if k not in ['name']]
    max_length = max(values) if max(values)>max_length else max_length

for game in games_data:
    for k, v in game.items():
        if k not in ['name']:
            zero_v = np.zeros(max_length)
            if len(v) < max_length:
                zero_v[:len(v)] = v
                game[k] = zero_v

In [27]:
### DIVIDING SPLITS FROM TRAIN, EVAL, TEST ###

from sklearn.model_selection import train_test_split

games_name = [g['name'] for g in games_data]
games_train1, games_test = train_test_split(games_name, test_size=0.15)
games_train2, games_val = train_test_split(games_train1, test_size=0.05)

print('Total train: {}'.format(len(games_train2)))
print('Total test: {}'.format(len(games_test)))
print('Total val: {}'.format(len(games_val)))

Total train: 22
Total test: 5
Total val: 2


In [41]:
games_train1

['san_vas',
 'spa_usa',
 'por_nko',
 'arg_nig',
 'ger_uru',
 'spa_ned',
 'arg_ger',
 'spa_swi',
 'fra_mex',
 'chi_swi',
 'bra_chi',
 'bay_int',
 'bra_ned',
 'bar_int',
 'amg_vas',
 'ita_svk',
 'uru_sko',
 'bra_ita',
 'uru_ned',
 'cru_spo',
 'den_jap',
 'arg_sko',
 'ned_jap',
 'spa_por']

In [None]:
name_id = 6
gt_id = 13

X_train = []
y_train = []

for game in games_data:
    if game['name'] in games_train1:
        values_array = np.array(list(game.values()))
        # Getting GT
        y_train += list(values_array[13])
        
        # Removing name and GT to train
        values_array = np.delete(values_array, (6), axis=0)
        values_array = np.delete(values_array, (13), axis=0)
        
        values_array = np.array(values_array[:].tolist())
        if len(X_train) == 0:
            X_train = np.transpose(values_array)
        else:
            X_train = np.concatenate((X_train, np.transpose(values_array)))

y_train = np.array(y_train)    


X_test = []
y_test = []

for game in games_data:
    if game['name'] in games_test:
        values_array = np.array(list(game.values()))
        # Getting GT
        y_test += list(values_array[13])
        
        # Removing name and GT to train
        values_array = np.delete(values_array, (6), axis=0)
        values_array = np.delete(values_array, (13), axis=0)
        
        values_array = np.array(values_array[:].tolist())
        if len(X_test) == 0:
            X_test = np.transpose(values_array)
        else:
            X_test = np.concatenate((X_test, np.transpose(values_array)))
y_test = np.array(y_test)
            

print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

(16, 219949)
(219949,)


ValueError: Number of labels=219949 does not match number of samples=16

In [4]:
games_data[0].keys()

dict_keys(['pc_var_delta', 'em_cs_energy_diff', 'highlights_boundaries', 'dc_percent', 'em_cs_energy_diff_ascending', 'em_st_energy_diff', 'pc_var_theta', 'em_st_energy_diff_ascending', 'em_cs_energy', 'dc_hue_mean', 'gt', 'pm_pitch', 'pm_pitch_diff_ascending', 'pc_rho', 'em_st_energy', 'pc_theta', 'pm_pitch_diff', 'pc_delta', 'name'])