# Pre-process Sleep Data 

This script loads and pre-processes the sleep polysomnography biosignals from the Cleveland Family Study (https://sleepdata.org/datasets/cfs). For each recording we select the EEG, EOG, EMG, and ECG channels, re-reference the EEG data to the linked mastoids, convert the two EMG channels to a single bipolar channel, and filter the signals. We then extract the epochs as an MNE object; the epochs of interest are 30 seconds in length and do not overlap. Finally, we downsample everything to 128 Hz and save the epochs and the continuous recording as .fif.gz files and the hypnograms as .npy files.

In [1]:
%matplotlib inline

## Import packages 
import numpy as np
import os
import mne
from tqdm import tqdm
import random
import neurokit2 as nk
mne.set_log_level('WARNING')

In [2]:
## NSRR Cleveland family sleep study dataset specific functions

def process_raw_EDF_cfs(file):
    """Load one CFS recording, pre-process it, and build epochs and hypnogram.

    Parameters
    ----------
    file : str
        Path to the recording without its extension. ``file + '.edf'`` is
        read as the signal file and ``file + '-nsrr.xml'`` as the annotation
        file.

    Returns
    -------
    raw_train
        The filtered continuous recording, resampled to 128 Hz.
    epochs : mne.Epochs
        Fixed-length 30 s epochs, resampled to 128 Hz.
    hypnogram : numpy.ndarray
        Per-epoch stage labels as returned by ``unravel_hypnogram``.
    """

    ## Import raw edf file
    raw_train = mne.io.read_raw_edf(file + '.edf', eog = ['LOC','ROC'],
                                    preload = True, verbose = False)

    ## Create dictionary of channels we are interested in
    mapping = {'C3': 'eeg',
               'C4': 'eeg',
               'M1': 'eeg',
               'M2': 'eeg',
               'LOC': 'eog',
               'ROC': 'eog',
               'EMG2': 'emg',
               'EMG3': 'emg',
               'ECG1': 'ecg'}

    ## Select channels in object and give labels for channel type
    raw_train.pick_channels(ch_names=list(mapping))
    raw_train.set_channel_types(mapping)

    ## Rereference eeg data to average of mastoids
    raw_train.set_eeg_reference(ref_channels=['M1','M2']) # type: ignore

    ## Bipolarize emg data (EMG2 - EMG3)
    try:
        raw_train = mne.set_bipolar_reference(raw_train, 'EMG2', 'EMG3')
    except Exception:
        # An mne.io.Raw instance has no fallback; surface the error instead
        # of silently continuing with un-bipolarized EMG channels.
        if isinstance(raw_train, mne.io.Raw):
            raise
        # Retry on an in-memory copy of the data. The retry must use the
        # copy, otherwise it just repeats the call that already failed.
        ref_inst = mne.io.RawArray(raw_train.get_data(), raw_train.info)
        raw_train = mne.set_bipolar_reference(ref_inst, 'EMG2', 'EMG3')

    ## Filter data
    raw_train.filter(picks=['eeg','eog'], l_freq=0.5, h_freq=40)
    raw_train.filter(picks='emg', l_freq=10, h_freq=100)
    # Clean the ECG data with neurokit2
    raw_train.apply_function(fun=nk.ecg_clean, picks='ecg', n_jobs=-1,
                             channel_wise=True,
                             sampling_rate=raw_train.info['sfreq'],
                             method='neurokit', powerline=60)
    # Notch filter at the 60 Hz line frequency and its first harmonic
    raw_train.notch_filter(freqs=[60, 120], method='spectrum_fit') # type: ignore

    ## Create fixed-length 30 second epochs
    # NOTE(review): tmax=29.99 keeps each epoch just under 30 s; depending on
    # the sampling rate this may drop the last sample(s) of each 30 s window
    # - confirm this is intended.
    events = mne.make_fixed_length_events(raw_train, duration=30)
    epochs = mne.Epochs(raw_train, events, tmin=0, tmax=29.99,
                        baseline=None, detrend=None, preload=True, reject=None)

    ## Downsample data
    epochs.resample(128)

    ## import and unravel hypnogram
    stages, stagelens = read_xml(file + '-nsrr.xml')
    hypnogram = unravel_hypnogram(stages, stagelens)

    # The continuous recording is downsampled last, after the epochs have
    # been extracted (and preloaded) from the full-rate data.
    return raw_train.resample(128), epochs, hypnogram

def read_xml(file):
    """Extract sleep-stage codes and their durations from an XML annotation file.

    Every ``ScoredEvent`` element in the document is inspected by position:
    child 0 holds the event type, child 1 the event description, and child 3
    the duration in seconds. Only events whose type text contains ``'Stages'``
    are kept.

    Parameters
    ----------
    file : str
        Path to the XML annotation file.

    Returns
    -------
    stages : numpy.ndarray of int
        Stage code of each scored stage event, taken from the last character
        of the event description.
    stagelens : numpy.ndarray of int
        Length of each stage event in 30 s epochs (duration / 30, truncated).
    """
    import xml.etree.ElementTree as ET

    root = ET.parse(file).getroot()

    stages = []
    stagelens = []
    for event in root.iter('ScoredEvent'):
        event_type = event[0].text
        # Events with an empty type element have no text; they and all
        # non-stage events are skipped.
        if event_type is None or 'Stages' not in event_type:
            continue
        # The stage code is the final character of the description text.
        stages.append(int(event[1].text[-1]))
        duration = int(float(event[3].text)) # type: ignore
        # Convert the duration from seconds to a number of 30 s epochs.
        stagelens.append(int(duration / 30))

    # return numpy arrays with stages and corresponding length info
    return np.array(stages).astype(int), np.array(stagelens).astype(int)

def unravel_hypnogram(stages, stagelens):
    """Expand run-length encoded sleep stages into one label per epoch.

    Stage codes 0, 1, and 2 are kept as-is, codes 3 and 4 are both mapped to
    label 3, and code 5 is mapped to label 4. Entries with any other code
    contribute no labels, which trips the length check below unless their
    length is zero.

    Parameters
    ----------
    stages : array-like
        Stage code of each run.
    stagelens : array-like
        Number of epochs in each run, aligned with ``stages``.

    Returns
    -------
    numpy.ndarray
        Float array with one label per epoch; empty when there are no runs.

    Raises
    ------
    ValueError
        If the number of labels produced differs from ``stagelens.sum()``.
    """
    stages = np.asarray(stages)
    stagelens = np.asarray(stagelens)

    # Stage code -> hypnogram label; stages 3 and 4 are collapsed into 3.
    label_for_stage = {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 4}

    segments = [
        np.full(length, label_for_stage[stage], dtype=float)
        for stage, length in zip(stages.tolist(), stagelens.tolist())
        if stage in label_for_stage
    ]

    # np.concatenate rejects an empty list, so handle "no runs" explicitly.
    hypnogram = np.concatenate(segments) if segments else np.zeros(0)

    # sanity check - does the hypnogram cover every scored epoch?
    if len(hypnogram) != stagelens.sum():
        raise ValueError('The length of the scaled hypnogram does not match the amount of total epochs')

    return hypnogram
 

In [3]:
## 1. Load data
path = '/media/administrator/data/cfs/polysomnography/'
save_path = '/mnt/server/data03/2023_NENA_Aperiodic_Workshop/data/processed/'
# Collect the base names (extension stripped) of every .edf file in `path`
files = [os.path.splitext(f)[0] for f in os.listdir(path) if f.endswith('.edf')]
# Randomly select up to 20 files. Capping at len(files) avoids the ValueError
# that random.sample raises when fewer than 20 recordings are available.
# The draw is unseeded, so each run may pick a different subset.
files = random.sample(files, min(20, len(files)))

In [4]:
## 2. Process and save data
# Pre-process each selected recording and write its outputs to save_path
for file in tqdm(files):
    print(f'Preprocessing file : {file}')
    raw, epochs, hypnogram = process_raw_EDF_cfs(os.path.join(path, file))
    # All three outputs share the recording's base name inside save_path
    out_base = os.path.join(save_path, file)
    # save the data; overwrite=False means existing files are not replaced
    epochs.save(fname=out_base + '-epo.fif.gz', overwrite=False)
    raw.save(fname=out_base + '-raw.fif.gz', overwrite=False)  # type: ignore
    np.save(out_base + '-hypnogram.npy', hypnogram)
    # delete the data from memory before loading the next recording
    del raw, epochs, hypnogram

  0%|          | 0/20 [00:00<?, ?it/s]

Preprocessing file : cfs-visit5-800551


  raw_train.pick_channels(ch_names=list(mapping))
  5%|▌         | 1/20 [01:16<24:09, 76.31s/it]

Preprocessing file : cfs-visit5-801058


  raw_train.pick_channels(ch_names=list(mapping))
 10%|█         | 2/20 [02:30<22:34, 75.28s/it]

Preprocessing file : cfs-visit5-801126


  raw_train.pick_channels(ch_names=list(mapping))
 15%|█▌        | 3/20 [03:39<20:26, 72.16s/it]

Preprocessing file : cfs-visit5-802522


  raw_train.pick_channels(ch_names=list(mapping))
 20%|██        | 4/20 [04:44<18:27, 69.24s/it]

Preprocessing file : cfs-visit5-800667


  raw_train.pick_channels(ch_names=list(mapping))
 25%|██▌       | 5/20 [06:23<19:59, 80.00s/it]

Preprocessing file : cfs-visit5-801602


  raw_train.pick_channels(ch_names=list(mapping))
 30%|███       | 6/20 [08:05<20:25, 87.52s/it]

Preprocessing file : cfs-visit5-800092


  raw_train.pick_channels(ch_names=list(mapping))
 35%|███▌      | 7/20 [09:47<19:58, 92.17s/it]

Preprocessing file : cfs-visit5-801907


  raw_train.pick_channels(ch_names=list(mapping))
 40%|████      | 8/20 [10:59<17:10, 85.90s/it]

Preprocessing file : cfs-visit5-800659


  raw_train.pick_channels(ch_names=list(mapping))
 45%|████▌     | 9/20 [12:23<15:38, 85.35s/it]

Preprocessing file : cfs-visit5-800630


  raw_train.pick_channels(ch_names=list(mapping))
 50%|█████     | 10/20 [14:08<15:14, 91.46s/it]

Preprocessing file : cfs-visit5-802298


  raw_train.pick_channels(ch_names=list(mapping))
 55%|█████▌    | 11/20 [15:24<13:00, 86.71s/it]

Preprocessing file : cfs-visit5-801747


  raw_train.pick_channels(ch_names=list(mapping))
 60%|██████    | 12/20 [16:37<11:00, 82.57s/it]

Preprocessing file : cfs-visit5-802132


  raw_train.pick_channels(ch_names=list(mapping))
 65%|██████▌   | 13/20 [18:11<10:01, 85.99s/it]

Preprocessing file : cfs-visit5-800407


  raw_train.pick_channels(ch_names=list(mapping))
 70%|███████   | 14/20 [19:45<08:50, 88.44s/it]

Preprocessing file : cfs-visit5-800697


  raw_train.pick_channels(ch_names=list(mapping))
 75%|███████▌  | 15/20 [21:00<07:02, 84.44s/it]

Preprocessing file : cfs-visit5-801152


  raw_train.pick_channels(ch_names=list(mapping))
 80%|████████  | 16/20 [22:12<05:22, 80.51s/it]

Preprocessing file : cfs-visit5-801044


  raw_train.pick_channels(ch_names=list(mapping))
 85%|████████▌ | 17/20 [23:46<04:13, 84.58s/it]

Preprocessing file : cfs-visit5-801393


  raw_train.pick_channels(ch_names=list(mapping))
 90%|█████████ | 18/20 [25:20<02:55, 87.54s/it]

Preprocessing file : cfs-visit5-802643


  raw_train.pick_channels(ch_names=list(mapping))
 95%|█████████▌| 19/20 [26:51<01:28, 88.35s/it]

Preprocessing file : cfs-visit5-801416


  raw_train.pick_channels(ch_names=list(mapping))
100%|██████████| 20/20 [28:24<00:00, 85.22s/it]
