In [9]:
import glob
import random
import h5py
import numpy as np
import math

Find data files recursively from root folder.

In [10]:
# Recursive glob pattern matching every STFT output file below the data root.
data_path = "D:/Coding/Thesis/Data/STFT Output/**/*.h5"
# glob returns matches in arbitrary, OS-dependent order; sort them so that
# positional access such as data_files[0] is reproducible between runs.
data_files = sorted(glob.glob(data_path, recursive=True))

In [11]:
# Central settings dictionary; later cells add further entries to it.
config = {
    # Length of one EEG training window, in milliseconds.
    'EEG_window_length_in_ms': 30000,
}

Next, we configure the output layer of the LSTM:


`delta_time_k` predicts the delta time to the next $k^{th}$ tap.

`tap_count_times_p` predicts the *number of taps* within the next $p$ seconds.

In [12]:
# Output-layer targets: which k-th next taps to time, and over which horizons
# (in seconds) to count taps.
config['delta_time_k'] = np.arange(1, 11)
config['tap_count_times_p'] = np.array([0.5, 1, 5, 10, 50, 100, 500])

# Raw EEG sampling rate and the STFT hop size, from which the effective rate
# and the duration of one STFT step are derived.
config.update(EEG_sampling_rate=1000, stft_stride=32)
config['sampling_rate_after_stft'] = config['EEG_sampling_rate'] / config['stft_stride']
config['sample_length_after_stft'] = 1000 / config['sampling_rate_after_stft']

# Tap-count horizons converted from seconds to raw EEG samples.
config['tap_count_times_in_samples'] = config['tap_count_times_p'] * config['EEG_sampling_rate']

# The ParticipantData class
The ParticipantData class contains and acts on participant data. It knows about the structure of the h5 files and can return random windows of EEG activity and taps, i.e., training data.

The stft data in the h5 files is 3-dimensional:
1. EEG channel
2. EEG timeseries
3. STFT frequency bins

In [22]:
class ParticipantData:
    """Access to participant recordings stored in h5 files.

    Every file is indexed as ``file[session][window]``; each window holds an
    ``'stft'`` dataset (channel x time x frequency bin) and a ``'taps'``
    dataset. Tap values are assumed to be positions in raw EEG samples --
    TODO confirm against the code that writes the files.

    The class reads its settings from the module-level ``config`` dictionary
    (``EEG_sampling_rate``, ``stft_stride``, ``delta_time_k`` and
    ``tap_count_times_in_samples``).

    Instances can be used as context managers so the h5 handles are closed
    even when an error occurs::

        with ParticipantData(data_files) as ppt_data:
            x, y = ppt_data.get_random_EEG_window(30000)
    """

    def __init__(self, data_files):
        """Open all ``data_files`` read-only and index their sessions/windows.

        Args:
            data_files: iterable of paths to the participants' h5 files.
        """
        self.data_file_paths = data_files

        self.open_h5_files()

        self._generate_group_idx()


    def __enter__(self):
        """Return the instance itself; the files are already open."""
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        """Close all h5 files on leaving the ``with`` block."""
        self.close_h5_files()


    def _generate_group_idx(self):
        """Cache the session names per file and the window names per session."""
        self.sessions = [list(participant.keys()) for participant in self.data_files_open]

        self.windows = [
            list(participant[session].keys())
            for participant in self.data_files_open
            for session in participant.keys()
        ]


    def open_h5_files(self):
        """Open every path in ``self.data_file_paths`` in read-only mode.

        If one file cannot be opened, the handles opened so far are closed
        again before the error is re-raised, so no handle is leaked.
        """
        opened = []
        try:
            for path in self.data_file_paths:
                opened.append(h5py.File(path, 'r'))
        except Exception:
            for handle in opened:
                handle.close()
            raise

        self.data_files_open = opened


    def close_h5_files(self):
        """Close all open h5 files and forget their handles."""
        for f in self.data_files_open:
            f.close()

        self.data_files_open = []


    def get_random_EEG_window(self, window_size):
        """Draw one random training example.

        A participant file, a session and an activity window are chosen
        uniformly at random, then a random stretch of STFT frames of the
        requested duration is cut out of that window.

        Args:
            window_size: window length in milliseconds.

        Returns:
            Tuple ``(input_data, output_data)``: the STFT slice
            (channel x frames x frequency bins) and the target vector built
            by ``get_taps_in_window`` for the end of that slice.

        Raises:
            ValueError: if the chosen recording has fewer STFT frames than
                the requested window.
        """
        ppt = random.choice(self.data_files_open)
        session = random.choice(list(ppt.keys()))
        activity_window = random.choice(list(ppt[session].keys()))
        recording = ppt[session][activity_window]
        stft = recording['stft']

        # ceil((T / 1000 * Fs) / R): milliseconds -> raw samples -> STFT frames.
        window_size_in_samples = int(np.ceil(
            window_size * config['EEG_sampling_rate'] / (1000 * config['stft_stride'])
        ))

        n_frames = stft.shape[1]
        if n_frames < window_size_in_samples:
            raise ValueError(
                f"Recording has {n_frames} STFT frames but the window needs "
                f"{window_size_in_samples}."
            )

        # window_end_idx is an exclusive end, so n_frames itself is a valid
        # value; randint's upper bound is exclusive, hence the + 1.
        window_end_idx = int(np.random.randint(window_size_in_samples, n_frames + 1))
        window_start_idx = window_end_idx - window_size_in_samples

        # A contiguous slice selects the same frames as an explicit index array.
        input_data = np.array(stft[:, window_start_idx:window_end_idx, :])

        # One STFT frame advances by `stft_stride` raw samples, so the hop
        # size (not the post-STFT sampling rate) maps frames back to samples.
        window_end_idx_tap_adjusted = window_end_idx * config['stft_stride']

        # np.int was removed from NumPy; the builtin int is the same dtype.
        taps = np.array(recording['taps'], dtype=int)
        output_data = self.get_taps_in_window(taps, window_end_idx_tap_adjusted)

        return input_data, output_data


    def get_taps_in_window(self, taps, window_end):
        """Build the full target vector for a window ending at ``window_end``.

        Args:
            taps: array of tap positions.
            window_end: end of the input window, in the same unit as ``taps``.

        Returns:
            1-D array: the deltas to the next taps followed by the tap counts
            for every configured horizon.
        """
        tap_deltas = self.get_delta_taps(taps, window_end)

        future_tap_n = self.get_n_future_taps(taps, window_end)

        return np.concatenate((tap_deltas, future_tap_n))


    def get_delta_taps(self, taps, window_idx):
        """Return the distances from ``window_idx`` to the next taps.

        Args:
            taps: array of tap positions.
            window_idx: reference position, in the same unit as ``taps``.

        Returns:
            Float array with one entry per element of
            ``config['delta_time_k']``. When fewer taps than that follow
            ``window_idx``, the remaining entries are 0.
        """
        n_k = len(config['delta_time_k'])

        # Boolean masking flattens `taps`, so a 2-D dataset works as well.
        next_kth_taps = taps[taps > window_idx][:n_k]

        # Subtract first and pad afterwards, so that missing taps are encoded
        # as a delta of exactly 0 rather than as -window_idx.
        tap_deltas = np.zeros(n_k)
        tap_deltas[:len(next_kth_taps)] = next_kth_taps - window_idx

        return tap_deltas


    def get_n_future_taps(self, taps, window_idx):
        """Count the taps that fall within each horizon after ``window_idx``.

        Args:
            taps: array of tap positions.
            window_idx: reference position, in the same unit as ``taps``.

        Returns:
            Float array with, for every horizon ``p`` in
            ``config['tap_count_times_in_samples']``, the number of taps in
            the half-open interval ``(window_idx, window_idx + p]``.
        """
        horizons = config['tap_count_times_in_samples']
        n_future_taps = np.zeros(len(horizons))

        is_future = taps > window_idx
        for p_idx, p in enumerate(horizons):
            n_future_taps[p_idx] = np.count_nonzero(is_future & (taps <= (window_idx + p)))

        return n_future_taps

In [23]:
# Opens every file in data_files read-only; the handles stay open until
# ppt_data.close_h5_files() is called.
ppt_data = ParticipantData(data_files)

# Determining the in- and outputs of the LSTM
Next, we check the sizes that are returned by our data generation function.

In [24]:
# Draw one random training example to inspect the array shapes it yields.
input_layer, output_layer = ppt_data.get_random_EEG_window(config['EEG_window_length_in_ms'])

print(f"Input layer shape: {input_layer.shape}")
print(f"Output layer shape: {output_layer.shape}")

Input layer shape: (64, 938, 32)
Output layer shape: (17,)


## Input layer
The ParticipantData class generates an input array of size $64 \times 938 \times 32$ (channels $\times$ time steps $\times$ frequency bins). Since the second dimension carries the temporal information, the LSTM's input layer has a size of $64 \times 32 = 2048$ and takes a window of 938 samples at a time.

This conversion from window length in ms to window length in samples can be calculated by the following formula:

$$
\bigl\lceil
\frac{\frac{\text{T}}{1000} \text{Fs}}
{\text{R}}
\bigr\rceil
$$

where $T$ is the window length in ms, $Fs$ is the original sampling rate in Hz, and $R$ is the hop size of the STFT.

## Output layer
The ParticipantData class also generates training output. The output layer has a length of 17. It is composed of the delta times to each of the next $k^{th}$ taps and the number of taps within the next $p$ seconds. Which and how many $k$ and $p$ are predicted is defined in the config dictionary at the top of the file. The size of the output layer is $|K| + |P|$ (here $10 + 7 = 17$).

In [8]:
# Peek into one recording to sanity-check the data layout. The file is opened
# with a context manager so the handle is released when the cell finishes.
# (The dataset names of a window can be listed with list(window.keys()); the
# original bare expression in the middle of the cell displayed nothing.)
with h5py.File(data_files[0], 'r') as f:
    window = f['12_02_11_04_19']['window_1']

    # Last column of the taps dataset -- presumably the position of the final
    # tap in raw EEG samples -- for comparison with the STFT length below.
    print(window['taps'][:, -1][0])
    print(window['stft'].shape)

3112552
(64, 98203, 32)


Close h5 files again.

In [56]:
# Release all h5 file handles held by ppt_data; windows can no longer be
# drawn from it until open_h5_files() is called again.
ppt_data.close_h5_files()