In [None]:
import os
import time
import collections
import numpy as np
from tempfile import mkdtemp
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

from sklearn.metrics import average_precision_score, precision_recall_curve, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

from sknn.mlp import Classifier, Layer
import pickle
import sys
import logging

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Use the maximum number of threads for this script.
from sknn.platform import cpu32, threading

Theano was already imported and cannot be reconfigured.


In [None]:
### READING DATASET ###

# Paths
dataset_path = 'Dataset'
games_dir = next(os.walk(dataset_path))[1]

tag_to_skip = [
    'highlights', 'highlights2', '.DS_Store', '.git', 
    'em_mcs_energy_diff_ascending', 'em_mcs_energy_diff', 'em_mcs_energy']

# Iterating through games
games_data = []
games_path = [os.path.join(dataset_path, g) for g in games_dir]
for path in games_path:
    game_name = os.path.basename(path)
    game_info = os.listdir(path)
    data = collections.OrderedDict()
    for info in game_info:
        tag_name = os.path.splitext(info)[0]
        if tag_name not in tag_to_skip:
            csv_file = os.path.join(path, info)
            if tag_name == 'highlights_boundaries':
                data_value = np.genfromtxt(csv_file, delimiter=',', dtype=None)
            else:
                data_value = np.fromfile(csv_file, sep=' ')
            data[tag_name] = data_value
    data['name'] = game_name
    games_data.append(data)

games_backup = games_data[:]

In [33]:
### CRIANDO VETOR DE GT ###

wanted_cases = ['Gol', 'Perigo']
to_skip = ['name', 'highlights_boundaries']

for game in games_data:
    h = game['highlights_boundaries']
    end = h[-1][0]
    gt = np.zeros(end)
    for line in h:
        tag = line[2].decode('UTF-8').split(' ')
        name = [t for t in tag if t != ''][-1]
        if name in wanted_cases:
            gt[line[0]-1:line[1]] = 1                   
            
    game['gt'] = gt
    del game['highlights_boundaries']

In [19]:
### CORTANDO VALORES EXCEDENTES PARA QUE TODOS OS DADOS TENHAM O MESMO TAMANHO ###

for game in games_data:
    values = [len(v) for k, v in game.items() if k not in ['name']]
    min_l = min(values)
    for k, v in game.items():
        if len(v) > min_l:
            game[k] = v[:min_l]
            

### ZERO PADDING ###
max_length = 0
for game in games_data:
    values = [len(v) for k, v in game.items() if k not in ['name']]
    max_length = max(values) if max(values)>max_length else max_length

for game in games_data:
    for k, v in game.items():
        if k not in ['name']:
            zero_v = np.zeros(max_length)
            if len(v) < max_length:
                zero_v[:len(v)] = v
                game[k] = zero_v

In [20]:
### DIVIDING SPLITS FROM TRAIN, EVAL, TEST ###

games_name = [g['name'] for g in games_data]
games_train1, games_test = train_test_split(games_name, test_size=0.15)
games_train2, games_val = train_test_split(games_train1, test_size=0.05)

print('Total train: {}'.format(len(games_train2)))
print('Total test: {}'.format(len(games_test)))
print('Total val: {}'.format(len(games_val)))

Total train: 22
Total test: 5
Total val: 2


In [21]:
def get_frames_ids(gt):
    u, counts = np.unique(gt, return_counts=True)
    ind_0 = [i for i, v in enumerate(gt) if v == 0.0]
    ind_1 = [i for i, v in enumerate(gt) if v == 1.0]
    
    ind_0_ids = np.linspace(0, counts[0]-1, counts[1], dtype=int)
    new_ind_0 = [ind_0[j] for j in ind_0_ids]
    
    return sorted(new_ind_0 + ind_1)

0 - dc_hue_mean
1 - name
2 - dc_percent
3 - em_cs_energy
4 - em_cs_energy_diff
5 - em_cs_energy_diff_ascending
6 - em_st_energy
7 - em_st_energy_diff
8 - em_st_energy_diff_ascending
9 - pc_delta
10 - pc_rho
11 - pc_theta
12 - pc_var_delta
13 - pc_var_theta
14 - pm_pitch
15 - pm_pitch_diff
16 - pm_pitch_diff_ascending
17 - gt

nome_id: 1, gt_id: 17


In [None]:
def get_matrix(restriction, skip):
    data = []
    gt = []
    for game in games_data:
        if game['name'] in restriction:
            values_array = np.array(list(game.values()))

            # Getting GT
            gt_game = list(values_array[gt_id])

            # Removing name and GT to train
            values_array = np.delete(values_array, (max(name_id, gt_id)), axis=0)
            values_array = np.delete(values_array, (min(name_id, gt_id)), axis=0)    

            frames_ids = get_frames_ids(gt_game)
            gt += [gt_game[i] for i in frames_ids]

            game_features = []
            for feature in values_array:
                # Normalizing
                norm_feature = np.squeeze(normalize([feature], axis=1))

                M_feature = np.repeat([norm_feature], f_size, axis=0)
                for i, v in enumerate(skip_values):
                    M_feature[i] = np.roll(M_feature[i], v, axis=0)

                M_feature = M_feature[:, frames_ids]

                M_feature = np.transpose(M_feature)
                if len(game_features) == 0:
                    game_features = M_feature
                else:
                    game_features = np.concatenate((game_features, M_feature), axis=1)

            if len(data) == 0:
                data = game_features
            else:
                data = np.concatenate((data, game_features), axis=0)


    gt = np.array(gt)
    
    return data, gt


In [None]:
for i, tag_name in enumerate(games_data[10].keys()):
    if tag_name == 'name':
        name_id = i
    if tag_name == 'gt':
        gt_id = i

skip = 8
f_size = 61
skip_values = np.arange(-30*skip, 31*skip, skip)

X_train, y_train = get_matrix(games_train2, skip)

X_test, y_test = get_matrix(games_test, skip)

X_val, y_val = get_matrix(games_val, skip)


print('Train', X_train.shape, y_train.shape)
print('Val', X_val.shape, y_val.shape)
print('Test', X_test.shape, y_test.shape)

In [23]:
X_train = []
y_train = []

for game in games_data:
    if game['name'] in games_train2:
        values_array = list(game.values())
        # Getting GT
        y_train += list(values_array[gt_id])
        
        # Removing name and GT to train
        del values_array[max(name_id, gt_id)]
        del values_array[min(name_id, gt_id)]
        
        values_array = np.array(values_array)
        if len(X_train) == 0:
            X_train = np.transpose(values_array)
        else:
            X_train = np.concatenate((X_train, np.transpose(values_array)))
y_train = np.array(y_train)    


X_val = []
y_val = []

for game in games_data:
    if game['name'] in games_val:
        values_array = list(game.values())
        # Getting GT
        y_val += list(values_array[gt_id])
        
        # Removing name and GT to train
        del values_array[max(name_id, gt_id)]
        del values_array[min(name_id, gt_id)]
        
        values_array = np.array(values_array)
        if len(X_val) == 0:
            X_val = np.transpose(values_array)
        else:
            X_val = np.concatenate((X_val, np.transpose(values_array)))
y_val = np.array(y_val)


X_test = []
y_test = []

for game in games_data:
    if game['name'] in games_test:
        values_array = list(game.values())
        # Getting GT
        y_test += list(values_array[gt_id])
        
        # Removing name and GT to train
        del values_array[max(name_id, gt_id)]
        del values_array[min(name_id, gt_id)]
        
        values_array = np.array(values_array)
        if len(X_test) == 0:
            X_test = np.transpose(values_array)
        else:
            X_test = np.concatenate((X_test, np.transpose(values_array)))
y_test = np.array(y_test)

normalize(X_train, axis=1, copy=False)
normalize(X_val, axis=1, copy=False)
normalize(X_test, axis=1, copy=False)

print('Train', X_train.shape, y_train.shape)
print('Val', X_val.shape, y_val.shape)
print('Test', X_test.shape, y_test.shape)
































ValueError: Found array with 0 sample(s) (shape=(0, 16)) while a minimum of 1 is required by the normalize function.

In [30]:

logging.basicConfig(
            format="%(message)s",
            level=logging.DEBUG,
            stream=sys.stdout)

nn = Classifier(
    layers=[
        Layer("Tanh", units=100),
        Layer("Softmax")],
    learning_rate=0.001,
    n_iter=25,
    verbose=True,
    valid_set=(X_val, y_val))

w_train = np.zeros(X_train.shape[0])
w_train[y_train == 0] = 1.02
w_train[y_train == 1] = 50

nn.fit(X_train, y_train, w_train)
nn.fit(X_train, y_train)

pickle.dump(nn, open('nn.pkl', 'wb'))


Initializing neural network with 2 layers, 16 inputs and 2 outputs.
  - Dense: [1;97mTanh      [0m Units:  [1;97m100 [0m
  - Dense: [1;97mSoftmax   [0m Units:  [1;97m2   [0m

Training on dataset of 1,834,140 samples with 33,014,520 total size.
  - Train: 1,834,140  Valid: 166,740
  - Terminating loop after 25 total iterations.
  - Early termination after 10 stable iterations.

Epoch       Training Error       Validation Error       Time
------------------------------------------------------------
    1         [0;94m 1.045e+00[0m            [0;32m 4.516e-01[0m        238.3s
    2         [0;94m 1.041e+00[0m            [0;32m 3.861e-01[0m        257.2s
    3         [0;94m 1.039e+00[0m             4.738e-01        251.2s
    4         [0;94m 1.040e+00[0m             4.258e-01        256.6s
    5         [0;94m 1.040e+00[0m            [0;32m 3.364e-01[0m        245.6s
    6         [0;94m 1.039e+00[0m             4.672e-01        478.2s
    7         [0;94m 1.0

TypeError: Bad input argument to theano function with name "/Users/admin/py3env/lib/python3.5/site-packages/sknn/backend/lasagne/mlp.py:98" at index 2 (0-based).  
Backtrace when that variable is created:

  File "/Users/admin/py3env/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/admin/py3env/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-30-a8345c708a1f>", line 20, in <module>
    nn.fit(X_train, y_train, w_train)
  File "/Users/admin/py3env/lib/python3.5/site-packages/sknn/mlp.py", line 397, in fit
    return super(Classifier, self)._fit(X, yp, w)
  File "/Users/admin/py3env/lib/python3.5/site-packages/sknn/mlp.py", line 213, in _fit
    X, y = self._initialize(X, y, w)
  File "/Users/admin/py3env/lib/python3.5/site-packages/sknn/mlp.py", line 42, in _initialize
    return self._backend._initialize_impl(X, y, w)
  File "/Users/admin/py3env/lib/python3.5/site-packages/sknn/backend/lasagne/mlp.py", line 241, in _initialize_impl
    self._create_mlp(X, w)
  File "/Users/admin/py3env/lib/python3.5/site-packages/sknn/backend/lasagne/mlp.py", line 177, in _create_mlp
    self.data_mask = T.vector('m') if w is not None else T.scalar('m')
Wrong number of dimensions: expected 1, got 0 with shape ().

In [None]:
y_scores = nn.predict(X_test_norm)
y_prob = nn.predict_proba(X_test_norm)