# Pre-process Sleep Data

This script loads and pre-processes the sleep polysomnography biosignals from the Cleveland Family Study (https://sleepdata.org/datasets/cfs). We then filter the EEG, EMG, EOG, and ECG signals, re-reference the EEG data to the linked mastoids, and then extract our epochs as an MNE object. The epochs of interest are 4 seconds in length and do not overlap. We then optionally downsample everything to 128 Hz and save the epochs as ".fif.gz" files and the hypnograms as ".npy" files.

In [1]:
%matplotlib inline

## Import packages 
import numpy as np
import yasa
import os
import mne
from tqdm import tqdm
import random
import neurokit2 as nk
import pandas as pd
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from scipy.stats import mode
mne.set_log_level('WARNING')

In [2]:
## Additional useful functions

def process_raw_EDF_cfs(file):
    """
    Process a raw EDF file, apply various preprocessing steps, and return the processed raw data, epochs, and hypnogram.

    This function:
    - Imports the raw EDF file.
    - Picks the channels of interest and sets their channel types.
    - Rereferences EEG data to the M1/M2 channels.
    - Bipolarizes the EMG data (EMG2 - EMG3).
    - Applies band-pass filters, ECG cleaning, and a notch filter.
    - Epochs the data into fixed-length 4-second segments.
    - (Downsampling to 128 Hz is present but commented out.)
    - Imports and unravels the hypnogram from the matching '-nsrr.xml' file.

    Parameters:
    - file (str): Path to the EDF file (without the '.edf' extension). The annotation
      file is expected at the same path with the '-nsrr.xml' suffix.

    Returns:
    - tuple: A tuple containing:
        - raw_train: Processed continuous raw data (MNE raw object).
        - epochs (mne.Epochs): Epochs created from the raw data.
        - hypnogram (numpy array): An array representing the hypnogram.

    Raises:
    - ValueError: Propagated from unravel_hypnogram if the hypnogram cannot be constructed consistently.
    """

    ## Import raw edf file
    raw_train = mne.io.read_raw_edf(file + '.edf', eog = ['LOC','ROC'],
                                    preload = True, verbose = False)

    ## Create dictionary of channels we are interested in  
    mapping = {'C3': 'eeg',
               'C4': 'eeg',
               'M1': 'eeg',
               'M2': 'eeg',
               'LOC': 'eog',
               'ROC': 'eog',
               'EMG2': 'emg',
               'EMG3': 'emg',
               'ECG1': 'ecg'}

    ## Select channels in object and give labels for channel type
    raw_train.pick_channels(ch_names=list(mapping))
    raw_train.set_channel_types(mapping) 

    ## Rereference eeg data to average of mastoids
    raw_train.set_eeg_reference(ref_channels=['M1','M2']) # type: ignore
        
    ## Bipolarize emg data (EMG2 - EMG3)
    try:
        raw_train = mne.set_bipolar_reference(raw_train, 'EMG2', 'EMG3')
    except Exception:
        # Fallback: rebuild the recording as an in-memory RawArray and retry on
        # that copy. The exact exception type raised by MNE for the first attempt
        # is not known here, hence the broad catch; a failure of the retry
        # propagates instead of being silently ignored.
        ref_inst = mne.io.RawArray(raw_train.get_data(), raw_train.info)
        raw_train = mne.set_bipolar_reference(ref_inst, 'EMG2', 'EMG3')

    ## Filter data
    raw_train.filter(picks=['eeg','eog'], l_freq=0.5, h_freq=40)
    raw_train.filter(picks='emg', l_freq=10, h_freq=100)
    # Clean the ECG data with neurokit2
    raw_train.apply_function(fun=nk.ecg_clean, picks='ecg', n_jobs=-1, 
                            channel_wise=True, **dict(sampling_rate=raw_train.info['sfreq'], method='neurokit', powerline=60))
    # Notch filter
    raw_train.notch_filter(freqs=[60, 120], method='spectrum_fit') # type: ignore

    ## Create fixed-length 4 second epochs
    events = mne.make_fixed_length_events(raw_train, duration=4)
    # NOTE(review): tmax=3.99 approximates "4 s minus one sample"; whether each epoch
    # ends up with exactly 4 * sfreq samples depends on the sampling rate - confirm.
    epochs = mne.Epochs(raw_train, events, tmin=0, tmax=3.99,
                        baseline=None, detrend=None, preload=True, reject=None)

    ## Downsample data (optional; disabled by default)
    #epochs.resample(128)
    #raw_train.resample(128)

    ## import and unravel hypnogram
    stages, stagelens = read_xml(file + '-nsrr.xml')
    hypnogram = unravel_hypnogram(stages, stagelens)
  
    return raw_train, epochs, hypnogram

def read_xml(file):
    """
    Extract sleep-stage annotations from an XML annotation file.

    Only 'ScoredEvent' elements whose first child has text containing 'Stages'
    are used. For each of them, the stage code is the last character of the
    second child's text, and the length is the fourth child's text (a duration)
    divided by 30 and truncated to an integer.

    Args:
    - file (str): Path to the XML file.

    Returns:
    - tuple: Two integer numpy arrays, the stage codes and their lengths in 30-unit blocks.
    """

    root = ET.parse(file).getroot()

    def _stage_events():
        # Lazily yield the scored events that describe sleep stages
        for event in root.iter('ScoredEvent'):
            event_type = event[0].text
            if event_type and 'Stages' in event_type:
                yield event

    # Stage code: trailing digit of the event concept text
    stage_codes = [int(event[1].text[-1]) for event in _stage_events()]
    # Stage length: duration expressed in whole 30-unit blocks
    stage_blocks = [int(float(event[3].text) / 30) for event in _stage_events()]

    stages = np.array(stage_codes, dtype=int)
    stagelens = np.array(stage_blocks, dtype=int)
    return stages, stagelens

def unravel_hypnogram(stages, stagelens):
    """
    Construct a hypnogram based on provided sleep stages and their durations.

    Each stage code is mapped to its output value (codes 3 and 4 are collapsed
    into 3, code 5 becomes 4) and repeated as many times as its length, giving
    one value per 30 s increment.

    Parameters:
    - stages (list or array-like): Sleep stage codes, each an integer in 0-5.
    - stagelens (list or array-like): Durations (in 30s increments) corresponding to each sleep stage.

    Returns:
    - numpy array: A continuous float array representing the hypnogram (empty if no stages are given).

    Raises:
    - ValueError: If stages and stagelens do not have the same number of entries.
    - KeyError: If a stage code is not one of 0-5.
    """

    # Map stages to their respective values
    stage_map = {
        0: 0,
        1: 1,
        2: 2,
        3: 3,
        4: 3,  # collapse stage 3 and 4
        5: 4
    }

    # Accept plain lists as well as numpy arrays
    lengths = np.asarray(stagelens, dtype=int)
    stage_codes = list(stages)

    # zip() would silently drop the unmatched tail, so check explicitly
    if len(stage_codes) != len(lengths):
        raise ValueError('The number of stages does not match the number of stage lengths')

    # Float output keeps the dtype produced by the previous np.ones-based construction
    mapped = np.array([stage_map[stage] for stage in stage_codes], dtype=float)

    # Repeat every mapped stage value for its number of 30 s increments
    return np.repeat(mapped, lengths)

def downsample_hypnogram(hypno, samples_per_epoch):
    """
    Downsample a hypnogram based on the most common value in each epoch.

    Trailing samples that do not fill a complete epoch are discarded.

    Parameters:
    - hypno (array-like): The hypnogram with higher sampling rate (one-dimensional).
    - samples_per_epoch (int): The number of samples in each epoch.

    Returns:
    - array: The downsampled hypnogram, one value per complete epoch
      (empty if the input holds no complete epoch).
    """
    # Accept lists and other array-likes, not only numpy arrays
    hypno = np.asarray(hypno)

    # Only complete epochs can be summarised
    num_epochs = len(hypno) // samples_per_epoch
    if num_epochs == 0:
        # Avoid calling mode() on an empty array
        return hypno[:0].copy()

    # One row per epoch
    reshaped_hypno = hypno[:num_epochs * samples_per_epoch].reshape(num_epochs, samples_per_epoch)

    # Determine the mode for each epoch and explicitly set keepdims to False
    downsampled_hypno = mode(reshaped_hypno, axis=1, keepdims=False)[0]

    return downsampled_hypno


## 1. Load data

In [3]:
# Directory holding the raw EDF recordings and their annotation files
path = '/media/administrator/data/cfs/polysomnography/'
# Output directories for processed data and per-subject figures
save_path = '/mnt/server/data03/2023_NENA_Aperiodic_Workshop/data/processed/'
fig_path = '/mnt/server/data03/2023_NENA_Aperiodic_Workshop/figures/subject/'
# Collect the base names (extension stripped) of all files ending in .edf
files = [os.path.splitext(f)[0] for f in os.listdir(path) if f.endswith('.edf')]
# Randomly select up to 50 files; capping the sample size avoids the ValueError
# that random.sample raises when fewer than 50 recordings are available
files = random.sample(files, min(50, len(files)))

## 2. Process and save data

In [4]:
# Iterate over all files and process them
# Preprocess every selected recording and write its outputs to save_path
for idx, file in enumerate(tqdm(files)):
    print(f'Preprocessing file : {file}')
    raw, epochs, hypnogram = process_raw_EDF_cfs(path + file)

    # Epochs, continuous data and hypnogram all share the recording name as file stem.
    # overwrite=False: an already existing output file is not replaced.
    epochs_fname = save_path + file + '-epo.fif.gz'
    epochs.save(fname=epochs_fname, overwrite=False)

    raw_fname = save_path + file + '-raw.fif.gz'
    raw.save(fname=raw_fname, overwrite=False)  # type: ignore

    hypnogram_fname = save_path + file + '-hypnogram.npy'
    np.save(hypnogram_fname, hypnogram)

    # Release the large objects before the next recording is loaded
    del raw, epochs, hypnogram

  0%|          | 0/50 [00:00<?, ?it/s]

Preprocessing file : cfs-visit5-800347


  raw_train.pick_channels(ch_names=list(mapping))
  2%|▏         | 1/50 [01:28<1:12:22, 88.62s/it]

Preprocessing file : cfs-visit5-800212


  raw_train.pick_channels(ch_names=list(mapping))
  4%|▍         | 2/50 [03:05<1:14:51, 93.57s/it]

Preprocessing file : cfs-visit5-800551


  raw_train.pick_channels(ch_names=list(mapping))
  6%|▌         | 3/50 [04:45<1:15:26, 96.30s/it]

Preprocessing file : cfs-visit5-802177


  raw_train.pick_channels(ch_names=list(mapping))
  8%|▊         | 4/50 [06:20<1:13:36, 96.01s/it]

Preprocessing file : cfs-visit5-801662


  raw_train.pick_channels(ch_names=list(mapping))
 10%|█         | 5/50 [07:44<1:08:45, 91.69s/it]

Preprocessing file : cfs-visit5-802125


  raw_train.pick_channels(ch_names=list(mapping))
 12%|█▏        | 6/50 [09:24<1:09:09, 94.30s/it]

Preprocessing file : cfs-visit5-802739


  raw_train.pick_channels(ch_names=list(mapping))
 14%|█▍        | 7/50 [10:59<1:07:52, 94.72s/it]

Preprocessing file : cfs-visit5-800705


  raw_train.pick_channels(ch_names=list(mapping))
 16%|█▌        | 8/50 [12:37<1:06:52, 95.53s/it]

Preprocessing file : cfs-visit5-801873


  raw_train.pick_channels(ch_names=list(mapping))
 18%|█▊        | 9/50 [15:40<1:24:03, 123.01s/it]

Preprocessing file : cfs-visit5-801126


  raw_train.pick_channels(ch_names=list(mapping))
 20%|██        | 10/50 [17:07<1:14:34, 111.87s/it]

Preprocessing file : cfs-visit5-802643


  raw_train.pick_channels(ch_names=list(mapping))
 22%|██▏       | 11/50 [19:49<1:22:37, 127.11s/it]

Preprocessing file : cfs-visit5-801323


  raw_train.pick_channels(ch_names=list(mapping))
 24%|██▍       | 12/50 [21:11<1:11:55, 113.57s/it]

Preprocessing file : cfs-visit5-800243


  raw_train.pick_channels(ch_names=list(mapping))
 26%|██▌       | 13/50 [24:07<1:21:45, 132.57s/it]

Preprocessing file : cfs-visit5-802380


  raw_train.pick_channels(ch_names=list(mapping))
 28%|██▊       | 14/50 [27:03<1:27:22, 145.64s/it]

Preprocessing file : cfs-visit5-802487


  raw_train.pick_channels(ch_names=list(mapping))
 30%|███       | 15/50 [29:45<1:27:44, 150.42s/it]

Preprocessing file : cfs-visit5-801497


  raw_train.pick_channels(ch_names=list(mapping))
 32%|███▏      | 16/50 [31:17<1:15:22, 133.01s/it]

Preprocessing file : cfs-visit5-800625


  raw_train.pick_channels(ch_names=list(mapping))
 34%|███▍      | 17/50 [33:51<1:16:35, 139.26s/it]

Preprocessing file : cfs-visit5-800092


  raw_train.pick_channels(ch_names=list(mapping))
 36%|███▌      | 18/50 [36:42<1:19:19, 148.74s/it]

Preprocessing file : cfs-visit5-800184


  raw_train.pick_channels(ch_names=list(mapping))
 38%|███▊      | 19/50 [39:13<1:17:12, 149.43s/it]

Preprocessing file : cfs-visit5-800010


  raw_train.pick_channels(ch_names=list(mapping))
 40%|████      | 20/50 [40:58<1:07:59, 135.99s/it]

Preprocessing file : cfs-visit5-800535


  raw_train.pick_channels(ch_names=list(mapping))
 42%|████▏     | 21/50 [43:52<1:11:19, 147.58s/it]

Preprocessing file : cfs-visit5-802709


  raw_train.pick_channels(ch_names=list(mapping))
 44%|████▍     | 22/50 [45:35<1:02:37, 134.21s/it]

Preprocessing file : cfs-visit5-800407


  raw_train.pick_channels(ch_names=list(mapping))
 46%|████▌     | 23/50 [48:17<1:04:09, 142.58s/it]

Preprocessing file : cfs-visit5-800494


  raw_train.pick_channels(ch_names=list(mapping))
 48%|████▊     | 24/50 [51:16<1:06:26, 153.32s/it]

Preprocessing file : cfs-visit5-801825


  raw_train.pick_channels(ch_names=list(mapping))
 50%|█████     | 25/50 [52:44<55:47, 133.91s/it]  

Preprocessing file : cfs-visit5-801747


  raw_train.pick_channels(ch_names=list(mapping))
 52%|█████▏    | 26/50 [54:18<48:45, 121.88s/it]

Preprocessing file : cfs-visit5-801064


  raw_train.pick_channels(ch_names=list(mapping))
 54%|█████▍    | 27/50 [57:14<52:53, 137.99s/it]

Preprocessing file : cfs-visit5-801291


  raw_train.pick_channels(ch_names=list(mapping))
 56%|█████▌    | 28/50 [1:00:16<55:24, 151.12s/it]

Preprocessing file : cfs-visit5-800349


  raw_train.pick_channels(ch_names=list(mapping))
 58%|█████▊    | 29/50 [1:02:47<52:54, 151.16s/it]

Preprocessing file : cfs-visit5-800151


  raw_train.pick_channels(ch_names=list(mapping))
 60%|██████    | 30/50 [1:04:16<44:13, 132.69s/it]

Preprocessing file : cfs-visit5-800861


  raw_train.pick_channels(ch_names=list(mapping))
 62%|██████▏   | 31/50 [1:05:35<36:53, 116.52s/it]

Preprocessing file : cfs-visit5-801196


  raw_train.pick_channels(ch_names=list(mapping))
 64%|██████▍   | 32/50 [1:08:26<39:49, 132.76s/it]

Preprocessing file : cfs-visit5-801044


  raw_train.pick_channels(ch_names=list(mapping))
 66%|██████▌   | 33/50 [1:11:05<39:52, 140.72s/it]

Preprocessing file : cfs-visit5-800697


  raw_train.pick_channels(ch_names=list(mapping))
 68%|██████▊   | 34/50 [1:12:43<34:07, 127.98s/it]

Preprocessing file : cfs-visit5-800659


  raw_train.pick_channels(ch_names=list(mapping))
 70%|███████   | 35/50 [1:14:29<30:19, 121.31s/it]

Preprocessing file : cfs-visit5-800249


  raw_train.pick_channels(ch_names=list(mapping))
 72%|███████▏  | 36/50 [1:16:02<26:18, 112.75s/it]

Preprocessing file : cfs-visit5-802073


  raw_train.pick_channels(ch_names=list(mapping))
 74%|███████▍  | 37/50 [1:17:37<23:18, 107.54s/it]

Preprocessing file : cfs-visit5-802635


  raw_train.pick_channels(ch_names=list(mapping))
 76%|███████▌  | 38/50 [1:20:23<24:58, 124.87s/it]

Preprocessing file : cfs-visit5-801019


  raw_train.pick_channels(ch_names=list(mapping))
 78%|███████▊  | 39/50 [1:23:09<25:09, 137.26s/it]

Preprocessing file : cfs-visit5-801001


  raw_train.pick_channels(ch_names=list(mapping))
 80%|████████  | 40/50 [1:24:38<20:29, 122.91s/it]

Preprocessing file : cfs-visit5-802491


  raw_train.pick_channels(ch_names=list(mapping))
 82%|████████▏ | 41/50 [1:27:34<20:48, 138.70s/it]

Preprocessing file : cfs-visit5-801380


  raw_train.pick_channels(ch_names=list(mapping))
 84%|████████▍ | 42/50 [1:29:04<16:32, 124.06s/it]

Preprocessing file : cfs-visit5-801058


  raw_train.pick_channels(ch_names=list(mapping))
 86%|████████▌ | 43/50 [1:30:37<13:24, 114.92s/it]

Preprocessing file : cfs-visit5-802005


  raw_train.pick_channels(ch_names=list(mapping))
 88%|████████▊ | 44/50 [1:32:18<11:03, 110.62s/it]

Preprocessing file : cfs-visit5-801785


  raw_train.pick_channels(ch_names=list(mapping))
 90%|█████████ | 45/50 [1:34:04<09:06, 109.20s/it]

Preprocessing file : cfs-visit5-802522


  raw_train.pick_channels(ch_names=list(mapping))
 92%|█████████▏| 46/50 [1:35:27<06:45, 101.28s/it]

Preprocessing file : cfs-visit5-801393


  raw_train.pick_channels(ch_names=list(mapping))
 94%|█████████▍| 47/50 [1:38:10<05:59, 119.81s/it]

Preprocessing file : cfs-visit5-800667


  raw_train.pick_channels(ch_names=list(mapping))
 96%|█████████▌| 48/50 [1:40:49<04:23, 131.70s/it]

Preprocessing file : cfs-visit5-801638


  raw_train.pick_channels(ch_names=list(mapping))
 98%|█████████▊| 49/50 [1:43:43<02:24, 144.38s/it]

Preprocessing file : cfs-visit5-801225


  raw_train.pick_channels(ch_names=list(mapping))
100%|██████████| 50/50 [1:45:26<00:00, 126.52s/it]


# Artifact detection and labeling of Sleep Data 

We then use the pre-processed polysomnography data from above and apply a Riemannian-geometry-based algorithm to detect and label the artifacts it contains. The algorithm is based on the following papers: https://hal.archives-ouvertes.fr/hal-00781701 & https://hal.science/hal-02015909 and is implemented in the yasa toolbox. This so-called "Riemannian Potato" is a clustering method that iteratively estimates the centroid of the clean signal by rejecting every trial that is too far from it, thus giving a label for each 4-second epoch.

## 3. Process & save updated epochs and hypnograms

In [16]:
# Obtain list of unique recordings
# Obtain the list of unique recordings. Only files carrying the raw-data suffix are
# considered so that unrelated files in save_path do not produce bogus recording ids;
# sorting makes the processing order deterministic.
raw_suffix = '-raw.fif.gz'
processed_files = sorted({f[:-len(raw_suffix)] for f in os.listdir(save_path)
                          if f.endswith(raw_suffix)})

# Iterate over all files and process them
for idx, file in enumerate(tqdm(processed_files)):
    print(f'Detecting and labeling artifacts in file : {file}')
    # Load the data and hypnogram files
    raw = mne.io.read_raw(save_path + file + '-raw.fif.gz', preload=True) # type: ignore
    epochs = mne.read_epochs(save_path + file + '-epo.fif.gz', preload=True) # type: ignore
    hypnogram = np.load(save_path + file + '-hypnogram.npy')
    # Get sampling frequency
    sf = raw.info['sfreq']
    # Get data (scaled by 1e6, i.e. V -> uV assuming the data are stored in volts)
    data = raw.get_data() * 1e6
    # Unravel hypnogram (one value per 30 s) to match data length
    hypnogram_unravel = yasa.hypno_upsample_to_data(hypno=hypnogram, sf_hypno=1/30, 
                                                    data=data, sf_data=sf)
    # Label artifacts based on Riemannian Potato clustering algorithm
    window = 4
    art, zscores = yasa.art_detect(data, sf=sf, window=window, hypno=hypnogram_unravel, 
                                   include=(1, 2, 3, 4), method='covar', threshold=3)
    sf_art = 1 / window
    
    # Upsample art to match data length
    art_up = yasa.hypno_upsample_to_data(art, sf_art, data, sf)

    # Add -1 to hypnogram where artifacts were detected. The mask is cast to bool so
    # that a 0/1 integer array can never be interpreted as sample indices.
    hypno_with_art = hypnogram_unravel.copy()
    hypno_with_art[np.asarray(art_up).astype(bool)] = -1

    # Plot and save the spectrogram with the updated hypnogram
    # NOTE(review): data[1, :] is the second channel of the saved raw file - confirm
    # that this is the intended EEG channel.
    fig = yasa.plot_spectrogram(data[1, :], sf, hypno_with_art)
    fig.savefig(fig_path + file + '-hypno-artifacts.png', dpi=300)
    plt.close(fig)

    # Save the updated hypnogram
    np.save(save_path + file + '-hypnogram_with_art.npy', hypno_with_art)

    # Downsample the hypnogram to one label per epoch
    samples_per_epoch = int(window * sf)  # 4 seconds * sf
    downsampled_hypno = hypno_with_art[::samples_per_epoch]

    # Strided slicing also yields a label for a trailing incomplete window, which has
    # no corresponding epoch; trim to the number of epochs so the metadata fits.
    n_epochs = len(epochs)
    if len(downsampled_hypno) < n_epochs:
        raise ValueError(f'Fewer hypnogram labels ({len(downsampled_hypno)}) than '
                         f'epochs ({n_epochs}) in file : {file}')
    downsampled_hypno = downsampled_hypno[:n_epochs]

    # Create a DataFrame from the downsampled hypnogram
    metadata = pd.DataFrame({'SleepStage': downsampled_hypno})
    
    # Add the metadata to the epochs
    epochs.metadata = metadata

    # Save the epochs (replaces the epochs file written during preprocessing)
    epochs.save(save_path + file + '-epo.fif.gz', overwrite=True) # type: ignore

    # Delete some objects to free up memory
    del raw, epochs, data, hypnogram, hypnogram_unravel, art, zscores, art_up
    del hypno_with_art, downsampled_hypno, metadata

  0%|          | 0/50 [00:00<?, ?it/s]

Detecting and labeling artifacts in file : cfs-visit5-801662


In [None]:
## Miscellaneous
# import pandas as pd
# pd.Series(hypno_with_art).value_counts(normalize=True)
# yasa.sleep_statistics(hypno_with_art, sf_hyp=sf)