In [1]:
import os
import time
import collections
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import average_precision_score, precision_recall_curve, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
### READING DATASET ###

# Paths
dataset_path = 'Dataset'
games_dir = next(os.walk(dataset_path))[1]

tag_to_skip = ['highlights', 'highlights2', '.DS_Store', '.git']

# Iterating through games
games_data = []
games_path = [os.path.join(dataset_path, g) for g in games_dir]
for path in games_path:
    game_name = os.path.basename(path)
    if game_name != 'ned_svk':
        game_info = os.listdir(path)
        data = collections.OrderedDict()
        for info in game_info:
            tag_name = os.path.splitext(info)[0]
            if tag_name not in tag_to_skip:
                csv_file = os.path.join(path, info)
                if tag_name == 'highlights_boundaries':
                    data_value = np.genfromtxt(csv_file, delimiter=',', dtype=None)
                else:
                    data_value = np.fromfile(csv_file, dtype=np.float64)
                data[tag_name] = data_value
            data['name'] = game_name
        games_data.append(data)

games_backup = games_data[:]

In [3]:
### REMOVENDO VALORES CORRELACIONADOS ###

to_remove = ['em_mcs_energy', 'em_mcs_energy_diff', 'em_mcs_energy_diff_ascending']
for game in games_data:
    for tag in to_remove:
        del game[tag]

In [4]:
### CRIANDO VETOR DE GT ###

normal_cases = ['Normal', 'Inicio', 'Fim']
to_skip = ['name', 'highlights_boundaries']

for game in games_data:
    h = game['highlights_boundaries']
    end = h[-1][0]
    gt = np.zeros(end)
    slices = []
    for line in h:
        tag = line[2].decode('UTF-8').split(' ')
        name = [t for t in tag if t != ''][0]
        if name not in normal_cases:
            gt[line[0]-1:line[1]] = 1
        else:
            slices.append([line[0], line[1], name])

    slices = slices[::-1]
    ## Removing data not to be considered ###
    for k, values in game.items():
        if k not in to_skip:
            for sl in slices:
                if 'Fim' in sl[2]:
                    values = np.delete(values, range(sl[0], len(values)))
                if 'Normal' in sl[2]:
                    values = np.delete(values, range(sl[0], sl[1]))
                if 'Inicio' in sl[2]:
                    values = np.delete(values, range(0, sl[0]))
        game[k] = values
                    
            
    game['gt'] = gt
    del game['highlights_boundaries']



In [5]:
### CORTANDO VALORES EXCEDENTES PARA QUE TODOS OS DADOS TENHAM O MESMO TAMANHO ###

for game in games_data:
    values = [len(v) for k, v in game.items() if k not in ['name']]
    min_l = min(values)
    for k, v in game.items():
        if len(v) > min_l:
            game[k] = v[:min_l]
            

### ZERO PADDING ###
max_length = 0
for game in games_data:
    values = [len(v) for k, v in game.items() if k not in ['name']]
    max_length = max(values) if max(values)>max_length else max_length

for game in games_data:
    for k, v in game.items():
        if k not in ['name']:
            zero_v = np.zeros(max_length)
            if len(v) < max_length:
                zero_v[:len(v)] = v
                game[k] = zero_v

In [7]:
### DIVIDING SPLITS FROM TRAIN, EVAL, TEST ###

games_name = [g['name'] for g in games_data]
games_train1, games_test = train_test_split(games_name, test_size=0.15)
games_train2, games_val = train_test_split(games_train1, test_size=0.05)

print('Total train: {}'.format(len(games_train2)))
print('Total test: {}'.format(len(games_test)))
print('Total val: {}'.format(len(games_val)))

Total train: 22
Total test: 5
Total val: 2


In [8]:
for i, tag_name in enumerate(games_data[10].keys()):
    print('{} - {}'.format(i, tag_name))
    if tag_name == 'name':
        name_id = i
    if tag_name == 'gt':
        gt_id = i
        
print('\nnome_id: {}, gt_id: {}'.format(name_id, gt_id))

0 - dc_hue_mean
1 - name
2 - dc_percent
3 - em_cs_energy
4 - em_cs_energy_diff
5 - em_cs_energy_diff_ascending
6 - em_st_energy
7 - em_st_energy_diff
8 - em_st_energy_diff_ascending
9 - pc_delta
10 - pc_rho
11 - pc_theta
12 - pc_var_delta
13 - pc_var_theta
14 - pm_pitch
15 - pm_pitch_diff
16 - pm_pitch_diff_ascending
17 - gt

nome_id: 1, gt_id: 17


In [9]:
X_train = []
y_train = []

for game in games_data:
    if game['name'] in games_train1:
        values_array = list(game.values())
        # Getting GT
        y_train += list(values_array[gt_id])
        
        # Removing name and GT to train
        del values_array[max(name_id, gt_id)]
        del values_array[min(name_id, gt_id)]
        
        values_array = np.array(values_array)
        if len(X_train) == 0:
            X_train = np.transpose(values_array)
        else:
            X_train = np.concatenate((X_train, np.transpose(values_array)))
y_train = np.array(y_train)    


X_test = []
y_test = []

for game in games_data:
    if game['name'] in games_test:
        values_array = list(game.values())
        # Getting GT
        y_test += list(values_array[gt_id])
        
        # Removing name and GT to train
        del values_array[max(name_id, gt_id)]
        del values_array[min(name_id, gt_id)]
        
        values_array = np.array(values_array)
        if len(X_test) == 0:
            X_test = np.transpose(values_array)
        else:
            X_test = np.concatenate((X_test, np.transpose(values_array)))
y_test = np.array(y_test)
            

print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (2000880, 16) (2000880,)
Test (416850, 16) (416850,)


In [None]:
from sknn.mlp import Classifier, Layer
import pickle
import sys
import logging

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

logging.basicConfig(
            format="%(message)s",
            level=logging.DEBUG,
            stream=sys.stdout)

pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Classifier(layers=[Layer("Softmax")], n_iter=25))])
pipeline.fit(X_train, y_train)

X_train_norm = pipeline.predict(X_train)
X_test_norm = pipeline.predict(X_test)

nn = Classifier(
    layers=[
        Layer("Maxout", units=100, pieces=2),
        Layer("Softmax")],
    learning_rate=0.001,
    n_iter=25,
    verbose=True)

w_train = numpy.array((X_train_norm.shape[0],))
w_train[y_train == 0] = 1.02
w_train[y_train == 1] = 50

nn.fit(X_train_norm, y_train, w_train)
nn.fit(X_train_norm, y_train)

pickle.dump(nn, open('nn.pkl', 'wb'))



Initializing neural network with 1 layers, 16 inputs and 2 outputs.
  - Dense: [1;97mSoftmax   [0m Units:  [1;97m2   [0m

Training on dataset of 2,000,880 samples with 36,015,840 total size.
  - Terminating loop after 25 total iterations.
  - Early termination after 10 stable iterations.


In [None]:
y_scores = nn.predict(X_test_norm)
y_prob = nn.predict_proba(X_test_norm)