In [1]:
import sys
import os
import numpy as np
import scipy
import scipy.io
import tensorflow as tf
import keras as k

Using TensorFlow backend.


## Data Prep
### Data ReadMe

Kaneshiro_etAl_objectCategoryEEG_README.txt

Data name: EEG data analyzed in "A Representational Similarity Analysis of the Dynamics of Object Processing Using Single-Trial EEG Classification"
Creator: Blair Kaneshiro, blairbo@ccrma.stanford.edu
PURL: http://purl.stanford.edu/bq914sc3730

Preferred citation: Kaneshiro, Blair and Perreau Guimaraes, Marcos and Kim, Hyung-Suk and Norcia, Anthony M. and Suppes, Patrick (2015). EEG data analyzed in "A Representational Similarity Analysis of the Dynamics of Object Processing Using Single-Trial EEG Classification". Stanford Digital Repository. Available at: http://purl.stanford.edu/bq914sc3730

The data package contains 10 anonymized datasets of scalp-recorded EEG in MATLAB (.mat) format. Each .mat file contains EEG data from one experimental subject. Data matrices have been preprocessed and are in the form used as input for classification. Dimensionality reduction/PCA has not been performed.

Variables contained in each dataset
- sub: Experimental subject identifier (e.g., 'S1', 'S2')
- N: Number of time samples per trial (always 32)
- Fs: Sampling frequency of the data (always 62.5Hz)
- T: Number of experimental trials (around 5,184 per dataset)
- exemplarLabels: A vector of length T containing the exemplar label of each trial.
- categoryLabels: A vector of length T containing the category label of each trial. (1=Human Body; 2=Human Face; 3=Animal Body; 4=Animal Face; 5=Fruit Vegetable; 6=Inanimate Object)
- X: The data matrix. Size of X is T rows by 124*N columns. Each row of X represents one experimental trial. Trial labels are corresponding elements in the exemplarLabels and categoryLabels vectors. Columns of X contain N time samples of EEG, concatenated from 124 electrodes (i.e., N time samples from electrode 1 followed by N time samples from electrode 2, etc.). Electrode numbers correspond to channels 1-124 of EGI's HydroCel Geodesic Sensor Net, 128 channels (ftp://ftp.egi.com/pub/support/Documents/net_layouts/hcgsn_128.pdf).

### Get the data
Load the MATLAB (.mat) files, one per subject, and extract the data and labels.

In [2]:
# One .mat file per subject, numbered S1 through S10.
data_files = ['data/S%d.mat' % subject_number for subject_number in range(1, 11)]
# Each loaded entry is a dict mapping MATLAB variable names to arrays.
data = list(map(scipy.io.loadmat, data_files))

In [3]:
# Dump every variable stored in the first subject's file.
first_subject = data[0]
for key in first_subject:
    print(key, first_subject[key])

# Per-subject trial counts, from the data matrix and from the label vector;
# the two lists should agree.
print([len(subject['X']) for subject in data])
print([len(subject['categoryLabels'][0]) for subject in data])

('Fs', array([[ 62.5]]))
('sub', array([u'S1'], 
      dtype='<U2'))
('__globals__', [])
('__header__', 'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Fri Feb  6 14:21:30 2015')
('N', array([[32]], dtype=uint8))
('T', array([[5188]], dtype=uint16))
('X', array([[-0.26424164, -0.47586018, -0.47061757, ...,  0.05198565,
         0.08307296, -0.04543913],
       [ 0.0028675 , -0.04133774,  0.05108967, ...,  0.09525956,
        -0.00872008,  0.12494965],
       [-0.22358645,  0.02478356,  0.08543183, ...,  0.0938575 ,
         0.06906053,  0.09690079],
       ..., 
       [ 0.17795506, -0.06740315,  0.09080031, ..., -0.11675177,
        -0.19592201, -0.16641006],
       [ 0.18887348,  0.17863007,  0.32982534, ...,  0.10712342,
         0.10700955,  0.0652873 ],
       [ 0.02932549, -0.05199817, -0.15791252, ..., -0.09192452,
        -0.04275392,  0.08084101]]))
('__version__', '1.0')
('exemplarLabels', array([[40, 64, 29, ..., 59,  3, 12]], dtype=uint8))
('categoryLabels', array([[4, 

### Organize the data
Pool the trials from the first nine subjects before shuffling. All data from the last subject (S10) is set aside as a separate test of whether the model generalizes to people it has never seen.

In [36]:
def _to_time_major(trials):
    """Reorder each flattened trial from electrode-major to time-major layout.

    Each input row stores 32 time samples for electrode 1, then 32 for
    electrode 2, and so on for 124 electrodes. Each returned row stores the
    124 electrode values of time step 1, then those of time step 2, etc., so
    that a later reshape to (32, 124) yields one row per time step.

    Args:
        trials: 2-D array with 124*32 columns, one trial per row.

    Returns:
        Array of the same shape with the columns of every row reordered.
    """
    return trials.reshape(-1, 124, 32).transpose((0, 2, 1)).reshape(-1, 124*32)


# The last subject is held out entirely; every other subject is pooled.
pooled_subjects = data[:-1]
holdout_subject = data[-1]

# Concatenate once per array instead of re-copying a growing array for each subject.
all_data = _to_time_major(np.concatenate([subject['X'] for subject in pooled_subjects]))
# categoryLabels is stored as a (1, T) row vector, so subjects are joined along axis 1.
all_labels = np.concatenate([subject['categoryLabels'] for subject in pooled_subjects], axis=1)

test_new_person_data = _to_time_major(holdout_subject['X'])
# Column vector of zero-indexed category labels for the held-out subject.
test_new_person_labels = holdout_subject['categoryLabels'].reshape((-1, 1)) - 1


print('all test data shape:   ', all_data.shape)
print('all test labels shape: ', all_labels.shape)
# print('') emits a blank line on both Python 2 and 3; a bare print() shows "()" on Python 2.
print('')
print('new person holdout data shape:   ', test_new_person_data.shape)
print('new person holdout labels shape: ', test_new_person_labels.shape)

('all test data shape:   ', (46673, 3968))
('all test labels shape: ', (1, 46673))
()
('new person holdout data shape:   ', (5184, 3968))
('new person holdout labels shape: ', (5184, 1))


Shuffle the data and label arrays in unison, so that every trial stays paired with its label.

In [37]:
def unison_shuffled_copies(a, b, seed=None):
    """Return copies of ``a`` and ``b`` shuffled by one shared permutation.

    Row ``i`` of the first result and row ``i`` of the second result always
    come from the same original index, so paired data (e.g. trials and their
    labels) stay aligned.

    Args:
        a: Array-like, shuffled along its first axis.
        b: Array-like with the same first-axis length as ``a``.
        seed: Optional seed for a private ``RandomState``. When ``None``
            (the default) the permutation is drawn from NumPy's global
            random state, exactly as before this parameter existed.

    Returns:
        Tuple ``(a_shuffled, b_shuffled)`` of new arrays.

    Raises:
        AssertionError: If ``a`` and ``b`` differ in length.
    """
    # Fancy indexing below needs arrays; this lets plain lists be passed too.
    a = np.asanyarray(a)
    b = np.asanyarray(b)
    assert len(a) == len(b), (
        'inputs must have the same length, got %d and %d' % (len(a), len(b)))
    if seed is None:
        p = np.random.permutation(len(a))
    else:
        # A private generator keeps a seeded call from disturbing global state.
        p = np.random.RandomState(seed).permutation(len(a))
    return a[p], b[p]

In [38]:
# Flatten the label array so it has one entry per trial, matching all_data's first axis.
flat_labels = all_labels.ravel()
shuffled_data, shuffled_labels = unison_shuffled_copies(all_data, flat_labels)

# Back to a column vector, shifted so categories start at 0 instead of 1 (3 -> 2).
# Labels are kept as integer class ids rather than one-hot vectors
# (2 -> [0, 0, 1, 0, 0, 0]); the model is compiled with a sparse categorical loss.
shuffled_labels = shuffled_labels.reshape((-1, 1)) - 1

In [39]:
# Sanity check: both arrays must report the same number of rows.
for shuffled in (shuffled_data, shuffled_labels):
    print(shuffled.shape)

(46673, 3968)
(46673, 1)


Split the shuffled data into test and training portions.

In [40]:
# Fraction of the shuffled trials reserved for final testing (10%).
test_portion = .1
test_size = int(len(shuffled_data) * test_portion)

# The first test_size rows form the test set; everything after is training data.
test_data, training_data = shuffled_data[:test_size], shuffled_data[test_size:]
test_labels, training_labels = shuffled_labels[:test_size], shuffled_labels[test_size:]

In [41]:
# Shapes of the test split followed by the training split (data, then labels).
for split in (test_data, test_labels, training_data, training_labels):
    print(split.shape)

(4667, 3968)
(4667, 1)
(42006, 3968)
(42006, 1)


## Model
### Set up

In [42]:
# Every layer is referenced through the top-level `k.layers` namespace, the
# same way Flatten/Dense/Activation already were in this cell, instead of the
# `core`/`recurrent`/`convolutional`/`normalization` submodule paths.
model = k.models.Sequential()

# Unflatten each 3968-value trial into 32 rows of 124 values.
model.add(k.layers.Reshape((32, 124), input_shape=(3968,)))

# Recurrent stage: 32-unit GRU that emits an output at every step of the sequence.
model.add(k.layers.GRU(32, dropout=.2, recurrent_dropout=.2, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True))
model.add(k.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001))

# First causal convolution: 64 filters with a kernel spanning 16 steps.
model.add(k.layers.Conv1D(64, 16, padding='causal', activation='relu'))
model.add(k.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001))
model.add(k.layers.Dropout(.2))

# Second causal convolution: 32 filters with a kernel spanning 4 steps.
model.add(k.layers.Conv1D(32, 4, padding='causal', activation='relu'))
model.add(k.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001))
model.add(k.layers.Dropout(.2))

# Collapse the sequence into one feature vector for the dense classifier head.
model.add(k.layers.Flatten())

model.add(k.layers.Dense(24))
model.add(k.layers.Activation('relu'))
model.add(k.layers.Dropout(.2))

# One output unit per object category, turned into probabilities by softmax.
model.add(k.layers.Dense(6))

model.add(k.layers.Activation('softmax'))

# Compile model
# The sparse loss takes integer class ids directly, so labels are not one-hot encoded.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# print(model.layers[-1].output_shape)
# print(model.to_yaml())

### Train

In [43]:
# Train for 20 epochs in mini-batches of 100, with a tenth of the training
# arrays reserved for validation; verbose=2 logs one line per epoch.
model.fit(
    training_data,
    training_labels,
    batch_size=100,
    epochs=20,
    validation_split=.1,
    shuffle=True,
    verbose=2,
)

Train on 37805 samples, validate on 4201 samples
Epoch 1/20
32s - loss: 1.7826 - acc: 0.2192 - val_loss: 1.7205 - val_acc: 0.2533
Epoch 2/20
31s - loss: 1.7047 - acc: 0.2668 - val_loss: 1.6381 - val_acc: 0.3164
Epoch 3/20
33s - loss: 1.6515 - acc: 0.3050 - val_loss: 1.5835 - val_acc: 0.3492
Epoch 4/20
32s - loss: 1.6216 - acc: 0.3211 - val_loss: 1.5582 - val_acc: 0.3571
Epoch 5/20
32s - loss: 1.5991 - acc: 0.3358 - val_loss: 1.5475 - val_acc: 0.3716
Epoch 6/20
31s - loss: 1.5840 - acc: 0.3459 - val_loss: 1.5210 - val_acc: 0.3797
Epoch 7/20
31s - loss: 1.5682 - acc: 0.3534 - val_loss: 1.5217 - val_acc: 0.3823
Epoch 8/20
31s - loss: 1.5564 - acc: 0.3624 - val_loss: 1.5037 - val_acc: 0.3963
Epoch 9/20
31s - loss: 1.5442 - acc: 0.3691 - val_loss: 1.4813 - val_acc: 0.3999
Epoch 10/20
31s - loss: 1.5313 - acc: 0.3725 - val_loss: 1.4736 - val_acc: 0.4073
Epoch 11/20
31s - loss: 1.5189 - acc: 0.3775 - val_loss: 1.4688 - val_acc: 0.4090
Epoch 12/20
32s - loss: 1.5085 - acc: 0.3889 - val_loss: 1

<keras.callbacks.History at 0x17eae1ed0>

### Evaluate

In [44]:
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy] and index 1 is the accuracy.
scores = model.evaluate(test_data, test_labels)
# print('') emits a blank line on both Python 2 and Python 3; a bare `print`
# does so only on Python 2 and is a silent no-op expression on Python 3.
print('')
print("model accuracy on holdout set: %.2f%%" % (scores[1]*100))

print('')

# Same evaluation on the subject whose trials were never pooled into the shuffled data.
new_person_results = model.evaluate(test_new_person_data, test_new_person_labels)
print('')
print('model accuracy on entirely new person: %.2f%%' % (new_person_results[1] * 100))

model accuracy on holdout set: 43.65%

model accuracy on entirely new person: 32.18%
