In [1]:
import glob
import random
import h5py
import numpy as np
import math

Find data files recursively from root folder.

In [2]:
# Recursive glob pattern: every .h5 file anywhere below the STFT output folder.
data_path = "D:/Coding/Thesis/Data/STFT Output/**/*.h5"
# glob returns matches in a file-system-dependent order; sort them so that
# positional access such as data_files[0] refers to the same file on every run.
data_files = sorted(glob.glob(data_path, recursive=True))

In [12]:
# Notebook-wide settings dictionary; later cells add further entries to it.
# The single entry here is the EEG window length (key name says milliseconds)
# that is passed to ParticipantData.get_random_EEG_window further down.
config = dict(EEG_window_length_in_ms=30000)

Next, we configure the output layer of the LSTM:


`delta_time_k` lists the values of $k$; for each one, the network predicts the time difference between the end of the EEG window and the $k^{th}$ upcoming tap.

`tap_count_times_p` lists the horizons $p$; for each one, the network predicts the *number of taps* within the next $p$ seconds.

In [21]:
# Output targets and raw signal parameters, added to the existing settings dict.
config.update({
    'delta_time_k': list(range(1, 11)),
    'tap_count_times_p': [0.5, 1, 5, 10, 50, 100, 500],
    'EEG_sampling_rate': 1000,
    'stft_stride': 32,
})

# STFT frames per second (EEG sampling rate divided by the STFT stride) ...
config.update({
    'sampling_rate_after_stft': config['EEG_sampling_rate'] / config['stft_stride'],
})
# ... and 1000 divided by that rate (presumably milliseconds per STFT frame).
config.update({
    'sample_length_after_stft': 1000 / config['sampling_rate_after_stft'],
})

# Tap-counting horizons p scaled by the EEG sampling rate, i.e. expressed in samples.
config['tap_count_times_in_samples'] = (
    np.asarray(config['tap_count_times_p']) * config['EEG_sampling_rate']
)

The ParticipantData class contains and acts on participant data. It knows about the structure of the h5 files and can return random windows of EEG activity and taps, i.e., training data.

The stft data in the h5 files is 3-dimensional:
1. Frequency bins
2. EEG timeseries
3. EEG channel

In [33]:
class ParticipantData:
    """Access to participants' STFT-transformed EEG recordings stored in h5 files.

    The files are read as ``file[session][activity_window]`` groups, each holding
    a ``'stft'`` dataset (time frames on axis 1) and a ``'taps'`` dataset.
    The class draws random windows of EEG activity together with the tap-based
    targets that follow each window, i.e. training examples.

    Parameters
    ----------
    data_files : list of str
        Paths of the h5 files; they are opened read-only on construction.
    settings : dict, optional
        Mapping providing ``'stft_stride'`` and ``'tap_count_times_in_samples'``.
        When omitted, the notebook-level ``config`` dict is looked up at call
        time, exactly as before this parameter existed.

    The instance can be used as a context manager; leaving the ``with`` block
    closes all file handles.
    """

    def __init__(self, data_files, settings=None):
        self.data_file_paths = data_files
        self._settings = settings

        self.open_h5_files()

        self._generate_group_idx()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the handles; exceptions are not suppressed.
        self.close_h5_files()


    def _get_settings(self):
        """Return the explicit settings dict, or the notebook-level ``config``."""
        explicit = getattr(self, '_settings', None)
        return config if explicit is None else explicit


    def _generate_group_idx(self):
        """Cache the session names per file and the window names per session."""
        self.sessions = [list(participant.keys()) for participant in self.data_files_open]

        self.windows = [
            list(participant[session].keys())
            for participant in self.data_files_open
            for session in participant.keys()
        ]


    def open_h5_files(self):
        """Open every path in ``self.data_file_paths`` read-only.

        If one file cannot be opened, the handles opened so far are closed
        before the error is re-raised, so no handle is leaked.
        """
        opened = []
        try:
            for path in self.data_file_paths:
                opened.append(h5py.File(path, 'r'))
        except Exception:
            for handle in opened:
                handle.close()
            raise

        self.data_files_open = opened


    def close_h5_files(self):
        """Close all open h5 files and forget their handles."""
        for f in self.data_files_open:
            f.close()

        self.data_files_open = []


    def get_random_EEG_window(self, window_size):
        """Draw one random training example.

        A file, a session and an activity window are chosen uniformly at random,
        then a random end position inside that recording.

        Parameters
        ----------
        window_size : number
            Window length; it is divided by ``'stft_stride'`` to obtain the
            number of STFT frames (rounded up), so it is effectively expressed
            in EEG samples (equal to milliseconds at the notebook's 1000 Hz).

        Returns
        -------
        input_data : numpy.ndarray
            STFT values at index 0 of the first axis for the chosen frames,
            shape ``(n_frames, stft.shape[2])``.
            NOTE(review): only index 0 of the first axis (frequency bins, per
            the notebook text) is returned -- confirm this is intended.
        output_data : numpy.ndarray
            Result of :meth:`get_taps_in_window` for the end of the window.

        Raises
        ------
        ValueError
            If the chosen recording has no more frames than the window needs.
        """
        settings = self._get_settings()
        stride = settings['stft_stride']

        ppt = random.choice(self.data_files_open)
        session = random.choice(list(ppt.keys()))
        activity_window = random.choice(list(ppt[session].keys()))
        recording = ppt[session][activity_window]

        # Whole number of frames; a fractional length (30000 / 32 = 937.5) is
        # not a valid bound for randint/arange.
        window_size_in_frames = int(np.ceil(window_size / stride))
        n_frames = recording['stft'].shape[1]
        if n_frames <= window_size_in_frames:
            raise ValueError(
                "recording has %d STFT frames, need more than %d for the requested window"
                % (n_frames, window_size_in_frames))

        window_end_idx = int(np.random.randint(window_size_in_frames, n_frames))
        window_start_idx = window_end_idx - window_size_in_frames

        # A contiguous slice reads only what is needed; it yields the same array
        # as reading every entry of the first axis and then keeping index 0.
        input_data = np.array(recording['stft'][0, window_start_idx:window_end_idx, :])

        # Convert the window end from STFT frames to EEG samples: one frame
        # advances by `stride` samples. (Multiplying by the frames-per-second
        # rate, as done previously, is not a frame-to-sample conversion.)
        # Assumes tap positions are stored as EEG sample indices -- TODO confirm.
        window_end_in_samples = int(np.ceil(window_end_idx * stride))

        taps = np.array(recording['taps'], dtype=int)
        output_data = self.get_taps_in_window(taps, window_end_in_samples)

        return input_data, output_data


    def get_taps_in_window(self, taps, window_end):
        """Build the target vector for a window ending at ``window_end``.

        Returns the tap deltas from :meth:`get_delta_taps` followed by the tap
        counts from :meth:`get_n_future_taps`, concatenated into one 1-D array.
        """
        tap_deltas = self.get_delta_taps(taps, window_end)

        future_tap_n = self.get_n_future_taps(taps, window_end)

        return np.concatenate((tap_deltas, future_tap_n))


    def get_delta_taps(self, taps, window_idx, max_taps=10):
        """Return the distances from ``window_idx`` to the taps that follow it.

        Parameters
        ----------
        taps : numpy.ndarray
            Tap positions; values are taken in array order, so they are expected
            to be ascending for the result to be the *next* taps.
        window_idx : number
            Reference position, in the same unit as ``taps``.
        max_taps : int, optional
            Maximum number of following taps to report (default 10).

        Returns
        -------
        numpy.ndarray
            ``tap - window_idx`` for the first ``max_taps`` taps greater than
            ``window_idx``. Shorter than ``max_taps`` when fewer taps follow.
        """
        # Boolean masking also flattens a 2-D taps array to 1-D.
        upcoming_taps = taps[taps > window_idx][:max_taps]

        return upcoming_taps - window_idx


    def get_n_future_taps(self, taps, window_idx):
        """Count the taps within each configured horizon after ``window_idx``.

        For every horizon ``p`` in ``'tap_count_times_in_samples'`` the number
        of taps in the interval ``(window_idx, window_idx + p]`` is counted.

        Returns
        -------
        numpy.ndarray
            Float array with one count per horizon, in configuration order.
        """
        horizons = self._get_settings()['tap_count_times_in_samples']
        upcoming_taps = taps[taps > window_idx]

        counts = [np.count_nonzero(upcoming_taps <= (window_idx + p)) for p in horizons]

        return np.array(counts, dtype=float)

In [36]:
# Open every discovered h5 file, then draw one random (input, target) example
# using the configured EEG window length.
ppt_data = ParticipantData(data_files)

input_layer, output_layer = ppt_data.get_random_EEG_window(
    window_size=config['EEG_window_length_in_ms']
)

# One shape per line: network input first, target vector second.
print(input_layer.shape, output_layer.shape, sep="\n")

(10,)
(7,)
(17,)
(938, 32)
(17,)


In [37]:
# Measure the wall time of drawing a single random window (IPython line magic).
%time ppt_data.get_random_EEG_window(config['EEG_window_length_in_ms'])

(10,)
(7,)
(17,)
Wall time: 1.14 s


(array([[ 10.06532955,  10.61457825,  11.88332844, ..., -12.61561871,
         -13.55403137, -14.27822208],
        [ 10.2015419 ,  10.16027832,  11.54461479, ..., -11.87997818,
         -12.54148769, -12.89192772],
        [ 10.91590595,   9.6346302 ,  11.46984577, ..., -11.72691727,
         -11.89600754, -11.95673466],
        ...,
        [  8.68409538,   9.79296494,  11.8128233 , ..., -11.53281307,
         -11.80125618, -11.90323925],
        [  9.79301739,   9.89943504,  11.96513462, ..., -12.30611515,
         -12.91157818, -13.19223213],
        [  8.65023327,   9.7555151 ,  11.95156956, ..., -13.12178612,
         -14.16552734, -15.17259979]]),
 array([8.9200e+02, 2.3960e+03, 3.1460e+03, 3.8580e+03, 7.0200e+03,
        8.0400e+03, 8.8300e+03, 9.5010e+03, 1.0846e+04, 1.2030e+04,
        0.0000e+00, 1.0000e+00, 4.0000e+00, 8.0000e+00, 4.6000e+01,
        7.4000e+01, 3.1900e+02]))

In [11]:
# Manual look at one recording: the last tap position and the number of STFT
# time frames. The context manager closes the file handle afterwards, which the
# bare h5py.File(...) call never did.
# (list(f['12_02_11_04_19']['window_1'].keys()) lists the datasets of a window;
# as a mid-cell expression it displayed nothing, so it is not executed here.)
with h5py.File(data_files[0], 'r') as f:
    print(f['12_02_11_04_19']['window_1']['taps'][:, -1][0])
    print(f['12_02_11_04_19']['window_1']['stft'].shape[1])

3112552
98203


Close h5 files again.

In [107]:
# Close every h5 file handle held by ppt_data and clear its list of open files.
ppt_data.close_h5_files()