In [58]:
import glob
import random
import h5py
import numpy as np
import math

from tensorflow import keras
from tensorflow.keras import layers

Find data files recursively from root folder.

In [59]:
data_path = "D:/Coding/Thesis/Data/STFT Output/**/*.h5"
data_files = glob.glob(data_path, recursive=True)

In [60]:
config = {}

config['EEG_window_length_in_ms'] = 30000

Next, we configure the output layer of the LSTM:


`delta_time_k` predicts the delta time to the next $k^{th}$ tap.

`tap_count_times_p` predicts the *number of taps* within the next $p$ seconds.

In [61]:
config['delta_time_k'] = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
config['tap_count_times_p'] = np.array([0.5, 1, 5, 10, 50, 100, 500])

config['EEG_sampling_rate'] = 1000
config['stft_stride'] = 32
config['sampling_rate_after_stft'] = config['EEG_sampling_rate'] / config['stft_stride']
config['sample_length_after_stft'] = 1000 / config['sampling_rate_after_stft']

config['tap_count_times_in_samples'] = np.multiply(config['tap_count_times_p'], config['EEG_sampling_rate'])

# The ParticipantData class
The ParticipantData class contains and acts on participant data. It knows about the structure of the h5 files and can return random windows of EEG activity and taps, i.e., training data.

The stft data in the h5 files is 3-dimensional:
1. EEG channel
2. EEG timeseries
3. STFT frequency bins

In [5]:
class ParticipantData:
    def __init__(self, data_files):
        self.data_file_paths = data_files
        
        self.open_h5_files()
        
        self._generate_group_idx()
    
    
    def _generate_group_idx(self):
        self.sessions = [list(participant.keys()) for participant in self.data_files_open]

        #windows = [list(participant[session].keys()) for participant in data_files_open for session in list(participant.keys())]
        self.windows = []
        for participant in self.data_files_open:
            for session in list(participant.keys()):
                self.windows.append(list(participant[session].keys()))
    
    
    def open_h5_files(self):
        self.data_files_open = [h5py.File(f, 'r') for f in self.data_file_paths]
    
    
    def close_h5_files(self):
        for f in self.data_files_open:
            f.close()
        
        self.data_files_open = []
    
    
    def get_random_EEG_window(self, window_length):
        ppt = random.choice(self.data_files_open)        
        session = random.choice(list(ppt.keys()))
        activity_window = random.choice(list(ppt[session].keys()))
        
        window_length_in_samples = np.ceil(window_length / config['stft_stride']).astype(np.int)
        
        window_end_idx = np.random.randint(window_length_in_samples, ppt[session][activity_window]['stft'].shape[1])       
        window_idx_stft = np.arange(window_end_idx - window_length_in_samples, window_end_idx, dtype=np.int)        
        
        input_data = np.array(ppt[session][activity_window]['stft'][:, window_idx_stft, :])
        
        window_end_idx_tap_adjusted = np.ceil(np.array(window_end_idx) * config['sampling_rate_after_stft']).astype(np.int)
        
        output_data = self.get_taps_in_window(np.array(ppt[session][activity_window]['taps'], dtype=np.int), window_end_idx_tap_adjusted)
                
        return(input_data, output_data)
    
    
    def get_taps_in_window(self, taps, window_end):      
        tap_deltas = self.get_delta_taps(taps, window_end)
        
        future_tap_n = self.get_n_future_taps(taps, window_end)
        
        result = np.concatenate((tap_deltas, future_tap_n))

        return(result)
    
    
    def get_delta_taps(self, taps, window_idx):
        n_k = len(config['delta_time_k'])
        
        next_kth_taps = taps[taps > window_idx][:n_k]
        
        # Ensure that if not enough taps were found the array is padded with 0s.
        # This only occurs 
        if len(next_kth_taps) < n_k:
            next_kth_taps = np.concatenate((next_kth_taps, np.zeros(n_k - len(next_kth_taps))))
        
        tap_deltas = next_kth_taps - window_idx
        
        return(tap_deltas)
            
    
    def get_n_future_taps(self, taps, window_idx):
        n_future_taps = np.zeros(len(config['tap_count_times_in_samples']))
        
        for p_idx, p in enumerate(config['tap_count_times_in_samples']):
            n_future_taps[p_idx] = len(
                taps[
                    (taps > window_idx) &
                    (taps <= (window_idx + p))
                ]
            )
        
        return(n_future_taps)

In [6]:
#ppt_data = ParticipantData(data_files)

# Load data into memory

Instead of loading data from the h5 file when we need it, we load all the data of a single participant into memory (because it fits and it's easier)

In [20]:
def load_data(h5_files):
    EEG, taps = [], []
    
    # Makse sure that if a single file is passed in, it is put into a list
    if type(h5_files) != 'list':
        h5_files = [h5_files]
    
    for f in h5_files:
        with h5py.File(f, 'r') as f_open:        
            for session in list(f_open.keys()):
                for activity_window in list(f_open[session].keys()):
                    EEG.append(np.array(f_open[session][activity_window]['stft']))
                    taps.append(np.array(f_open[session][activity_window]['stft']))
        
    return (EEG, taps)

In [21]:
%time stft, taps = load_data(data_files[0])

Wall time: 1min 14s


### Replacing ParticipantData class with class inherited from `tf.keras.utils.Sequence`

In [38]:
class DataLoader(keras.utils.Sequence):
    def __init__(self, x, y, window_length=30000, batch_size=32, n_samples=2000, shuffle=True):
        self.x = x
        self.y = y
        self.window_length = window_length
        self.batch_size = batch_size
        self.n_samples = n_samples
        self.shuffle = shuffle
        
        self.subset_probabilities = self._get_subset_probabilites(x)
    
    
    def __len__(self):
        [win.shape[1] for win in x]
    
    
    def __getitem__(self, index=0):
        X = []
        Y = []
        
        for batch in range(self.batch_size):
            if self.shuffle:
                # Pick random activity window
                # Since subsets are of different length and contain different amounts of data,
                # we need to adjust the probability of each subset being picked to its length.
                subset_idx = np.random.choice(len(self.x), p = self.subset_probabilities)

                # Pick random EEG window
                window_length_in_samples = np.ceil(self.window_length / config['stft_stride']).astype(np.int)

                window_end_idx = np.random.randint(window_length_in_samples, self.x[subset_idx].shape[1])       
                window_idx_stft = np.arange(window_end_idx - window_length_in_samples, window_end_idx, dtype=np.int)        

                input_data = np.array(self.x[subset_idx][:, window_idx_stft, :])
                
                # Get appropriate tap info        
                window_end_idx_tap_adjusted = np.ceil(np.array(window_end_idx) * config['sampling_rate_after_stft']).astype(np.int)

                output_data = self.get_taps_in_window(self.y[subset_idx], window_end_idx_tap_adjusted)
            else:
                subset_idx = 0 # TODO Change this as soon as I know how __getitem__ is called
                
                # EEG data
                window_length_in_samples = np.ceil(self.window_length / config['stft_stride']).astype(np.int)
                window_idx_stft = np.arange(index - window_length_in_samples, index, dtype=np.int)
                input_data = np.array(self.x[subset_idx][:, window_idx_stft, :])
                
                # Tap data
                window_end_idx_tap_adjusted = np.ceil(np.array(index) * config['sampling_rate_after_stft']).astype(np.int)
                output_data = self.get_taps_in_window(self.y[subset_idx], window_end_idx_tap_adjusted)
            
            X.append(input_data)
            Y.append(output_data)
                
        return X, Y
    
    def on_epoch_end(self):
        pass
    
    def _get_subset_probabilites(self, x):
        lengths = np.array([win.shape[1] for win in x])
        
        return lengths / sum(lengths)
        
    
    def get_taps_in_window(self, taps, window_end):      
        tap_deltas = self.get_delta_taps(taps, window_end)
        
        future_tap_n = self.get_n_future_taps(taps, window_end)
        
        result = np.concatenate((tap_deltas, future_tap_n))

        return result 
    
    
    def get_delta_taps(self, taps, window_idx):
        n_k = len(config['delta_time_k'])
        
        next_kth_taps = taps[taps > window_idx][:n_k]
        
        # Ensure that if not enough taps were found the array is padded with 0s.
        # This only occurs 
        if len(next_kth_taps) < n_k:
            next_kth_taps = np.concatenate((next_kth_taps, np.zeros(n_k - len(next_kth_taps))))
        
        tap_deltas = next_kth_taps - window_idx
        
        return tap_deltas
            
    
    def get_n_future_taps(self, taps, window_idx):
        n_future_taps = np.zeros(len(config['tap_count_times_in_samples']))
        
        for p_idx, p in enumerate(config['tap_count_times_in_samples']):
            n_future_taps[p_idx] = len(
                taps[
                    (taps > window_idx) &
                    (taps <= (window_idx + p))
                ]
            )
        
        return n_future_taps

# Determining the in- and outputs of the LSTM
Next, we check the sizes that are returned by our data generation function.

In [54]:
ppt1 = DataLoader(stft, taps, batch_size=8)

In [57]:
%prun my_batch_x, my_batch_y = ppt1[0] # get a random batch

 

         324 function calls in 19.331 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        8   17.552    2.194   17.552    2.194 <ipython-input-38-d7219537c5ad>:91(get_n_future_taps)
        8    1.661    0.208    1.661    0.208 <ipython-input-38-d7219537c5ad>:76(get_delta_taps)
        1    0.070    0.070   19.321   19.321 <ipython-input-38-d7219537c5ad>:17(__getitem__)
       16    0.036    0.002    0.036    0.002 {built-in method numpy.array}
        1    0.010    0.010   19.331   19.331 <string>:1(<module>)
        8    0.001    0.000    0.001    0.000 {method 'choice' of 'numpy.random.mtrand.RandomState' objects}
       16    0.000    0.000    0.000    0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
        8    0.000    0.000    0.000    0.000 {method 'randint' of 'numpy.random.mtrand.RandomState' objects}
        8    0.000    0.000   19.214    2.402 <ipython-input-38-d7219537c5ad>:66

And do a sanity check on the content of the generated target data.

In [53]:
np.set_printoptions(suppress=True)

print(f'Input layer dimensions: {my_batch_x[0].shape}')
print(f'Output layer dimensions: {my_batch_y[0].shape}')

Input layer dimensions: (64, 938, 32)
Output layer dimensions: (17,)


## Input layer
The ParticipantData class generates an input information that is $64 \times 938 \times 32$ in size. Since the second dimension is the temporal information, the LSTM's input layer has the a size of $64 \times 32 = 2048$ and takes a window of 938 samples at a time.

This conversion from window length in ms to window length in samples can be calculated by the following formula:

$$
\bigl\lceil
\frac{\frac{\text{T}}{1000} \text{Fs}}
{\text{R}}
\bigr\rceil
$$

, where $T$ is the window length in ms, $Fs$ is the original sampling rate, and $R$ is the hopsize of the STFT.

## Output layer
The ParticipantData class also generates training output. The output layer has a length of 17. It is composed of the $\Delta t$ to the next $k$ steps in ms, as well as the number of taps within the next $p$ seconds. Which and how many $k$ and $p$ are predicted is defined in the config dictionary at the top of the file. The size of the output layer is defined as $|K| + |P|$

# LSTM

In [None]:
model = keras.Sequential()

# Add input layer
model.add(layers.LSTM(
    128,
    input_shape = ()
))
model.add

Close h5 files again.

In [56]:
ppt_data.close_h5_files()