In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
import h5py
import sys

# Make sure these folders are importable by Python (e.g. on sys.path); they are provided in another part of the codebase.
import loading
import nedc

In [None]:
# Root folder that is walked recursively to find the .edf recordings.
base_directory = 'PATH_OF_DATASET'
# Destination HDF5 file; the saving cell opens it in 'w' mode.
save_path = 'PATH_OF_NEW_FILE.h5'

includes_tse = False # Boolean flag: set to True to also load labels from the '.tse_bi' file next to each recording

In [None]:
def wrangle_tse(tse_path, length, fs=200):
    """Build a per-sample binary label vector from a ``.tse_bi`` annotation file.

    The file is read as space-separated rows of
    ``start stop label confidence`` after skipping its first two lines.
    Each row writes its class from its start sample to the end of the
    vector, so every later row overwrites the tail left by the previous one.
    Only the ``start`` and ``label`` columns are used.

    Parameters
    ----------
    tse_path : str or file-like
        Path (or buffer) of the annotation file, passed to ``pandas.read_csv``.
    length : int
        Number of samples in the returned vector.
    fs : int or float, default 200
        Sampling frequency used to convert start times (seconds) to sample
        indices.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(length,)`` holding 0 for ``'bckg'`` and 1 for
        ``'seiz'``. Samples before the first annotated start stay 0.

    Raises
    ------
    KeyError
        If a row carries a label other than ``'bckg'`` or ``'seiz'``.
    """
    label_dct = {'bckg': 0, 'seiz': 1}

    df_label = pd.read_csv(
        filepath_or_buffer=tse_path,
        header=None,
        sep=' ',
        names=['start', 'stop', 'label', 'confidence'],
        skiprows=2,
        na_filter=False
    )

    # Tolerance added before truncating: a product such as 0.57 * 100 evaluates
    # to 56.99999999999999 in floating point, and a bare int() would start the
    # segment one sample too early.
    eps = 1e-6

    label = np.zeros(shape=(length,))
    for time_s, label_str in zip(df_label['start'], df_label['label']):
        start_idx = int(time_s * fs + eps)
        label[start_idx:] = label_dct[label_str]

    return label

In [None]:
# Gather the full path of every .edf recording below base_directory.
edf_files = []
for root, _dirs, files in os.walk(base_directory):
    edf_files.extend(
        os.path.join(root, name) for name in files if name.endswith(".edf")
    )

In [None]:
# Display how many .edf recordings were found under base_directory.
len(edf_files)

# Process_file definition
The *includes_tse* flag determines whether labels are loaded as well. Set *wiener* to True in the `loadRecording` call inside the definition to apply the Wiener filter as a preprocessing step.

In [None]:
def process_file(file_path):
    """Load one .edf recording and, if ``includes_tse`` is set, its labels.

    Parameters
    ----------
    file_path : str
        Path of the .edf recording. When labels are requested they are read
        from the file with the same path but a ``.tse_bi`` extension.

    Returns
    -------
    tuple
        ``(file_name, signal)`` when ``includes_tse`` is False, otherwise
        ``(file_name, signal, label)``. ``signal`` is a float32 array and
        ``label`` a uint8 array. If loading raises ``TypeError`` the signal
        (and the label, when present) is replaced by the scalar ``0`` so the
        file can be filtered out later.
    """
    # Fixed-width slice of the path: the 18 characters that end 11 characters
    # before the end of the string (the last 11 cover a 7-character suffix
    # plus '.edf').
    file_name = file_path[-22-7:-4-7]

    signal = 0
    label = 0
    try:
        (_fs, data, _mount) = loading.loadRecording(file_path, wiener=False)
        signal = np.asarray(data, dtype=np.float32)

        if includes_tse:
            # NOTE(review): wrangle_tse runs with its default fs=200 rather
            # than the sampling rate returned by loadRecording; confirm the
            # loaded signal is at 200 Hz.
            label = wrangle_tse(file_path[:-4]+'.tse_bi', length=signal.shape[1])
            label = np.asarray(label, dtype=np.uint8)
    except TypeError:
        # Presumably raised when loadRecording rejects the file and returns
        # something that cannot be unpacked; mark the whole file as rejected.
        signal = 0
        label = 0

    if includes_tse:
        return file_name, signal, label
    return file_name, signal

In [None]:
# Worker pool used to run process_file over the recordings in parallel;
# with no arguments the number of workers is multiprocessing's default.
# NOTE(review): the pool is not closed or joined in the cells shown here.
pool = Pool()

In [None]:
# Process every recording in the worker pool, then transpose the list of
# per-file tuples into one tuple per field.
results = pool.map(process_file, edf_files)
if includes_tse:
    file_names, signals, labels = zip(*results)
else:
    file_names, signals = zip(*results)

# Saving data

Make sure that files rejected by ICLabel don't get included in the final dataset

In [None]:
# process_file stores the scalar 0 as the signal of a file it could not load,
# so a 0-dimensional signal marks a rejected recording.
keep_mask = [np.ndim(signal) != 0 for signal in signals]

file_names = tuple(name for name, keep in zip(file_names, keep_mask) if keep)
if includes_tse:
    # Apply the same mask to the labels so labels[i] still belongs to signals[i].
    labels = tuple(lbl for lbl, keep in zip(labels, keep_mask) if keep)
# signals is filtered last because the mask was derived from it; building new
# tuples (instead of unpacking zip(*...)) also copes with every file rejected.
signals = tuple(sig for sig, keep in zip(signals, keep_mask) if keep)

In [None]:
# Second dimension of the 'signals' dataset: one variable-length float32 row
# per channel. Assumes every recording has 18 channels (TODO confirm).
n_channels = 18

# Refuse to write labels that no longer line up with the signals; checked
# before the file is opened so an existing file is not truncated on failure.
if includes_tse and len(labels) != len(signals):
    raise ValueError(
        'labels and signals differ in length (%d vs %d); '
        'filter the labels together with the signals before saving'
        % (len(labels), len(signals))
    )

dt_fl = h5py.vlen_dtype(np.dtype('float32'))
dt_str = h5py.special_dtype(vlen=str)
if includes_tse:
    dt_int = h5py.vlen_dtype(np.dtype('uint8'))

with h5py.File(save_path, 'w') as f:
    dset_signals = f.create_dataset('signals', (len(signals), n_channels), dtype=dt_fl)
    if includes_tse:
        dset_labels = f.create_dataset('labels', (len(labels),), dtype=dt_int)
    dset_file_names = f.create_dataset('filenames', (len(file_names),), dtype=dt_str)

    # Variable-length datasets are filled one recording at a time.
    for i in range(len(signals)):
        dset_signals[i] = signals[i]
        if includes_tse:
            dset_labels[i] = labels[i]
        dset_file_names[i] = file_names[i]