In [1]:
import os
import os.path as op

import mne
import wfdb
import numpy as np
import pandas as pd

from mne.datasets.utils import _get_path
from mne.datasets.sleep_physionet._utils import _fetch_one
from braindecode.datasets import BaseDataset, BaseConcatDataset
from braindecode.preprocessing.preprocess import _preprocess, preprocess, Preprocessor
from braindecode.preprocessing.windowers  import _create_windows_from_events
from functools import partial

from joblib import Parallel, delayed

In [2]:
# Folder holding the PC18 metadata files, relative to this notebook.
PC18_DIR = op.join('..', '..', '..', '03 Dynamic-Spatial-Filtering', 'data', 'pc18')

# Metadata tables: per-file record index and per-record age/sex.
PC18_RECORDS = op.join(PC18_DIR, 'sleep_records.csv')
PC18_INFO = op.join(PC18_DIR, 'age-sex.csv')

# SHA1 checksum listings for the two dataset splits.
PC18_SHA1_TRAINING = op.join(PC18_DIR, 'training_SHA1SUMS')
PC18_SHA1_TEST = op.join(PC18_DIR, 'test_SHA1SUMS')

# Root URL the dataset files are downloaded from.
PC18_URL = 'https://physionet.org/files/challenge-2018/1.0.0/'

In [3]:
# Load the record index (one row per file) and the per-record age/sex table.
df_records = pd.read_csv(PC18_RECORDS)
df_info    = pd.read_csv(PC18_INFO)

In [4]:
# Preview the first rows of the record index.
df_records.head()

Unnamed: 0,Subject,Record,Record type,Split,Age,Sex,sha,fname
0,0,te03-0024,PSG,test,31,male,fe9a52b00a81a8c2c29ec30b4f8e0d21e0d1d0b8,test/te03-0024/te03-0024.mat
1,0,te03-0024,Header,test,31,male,52b31029da8454f01610745c27eb07d0bda7c301,test/te03-0024/te03-0024.hea
2,1,te03-0031,Header,test,55,male,573dc478c0219adb9c66229222dcd3b3a5febafb,test/te03-0031/te03-0031.hea
3,1,te03-0031,PSG,test,55,male,de56e1df7cb0d8856eeb900f182e9a3b7f96a4f7,test/te03-0031/te03-0031.mat
4,2,te03-0032,Header,test,50,male,1fe5c7bd48e9346da4e1082cea42483a2b3f5469,test/te03-0032/te03-0032.hea


In [5]:
# Column dtypes and non-null counts of the record index.
df_records.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Subject      4960 non-null   int64 
 1   Record       4960 non-null   object
 2   Record type  4960 non-null   object
 3   Split        4960 non-null   object
 4   Age          4960 non-null   int64 
 5   Sex          4960 non-null   object
 6   sha          4960 non-null   object
 7   fname        4960 non-null   object
dtypes: int64(2), object(6)
memory usage: 310.1+ KB


In [6]:
# Distinct file kinds listed in the record index.
df_records['Record type'].unique()

array(['PSG', 'Header', 'Arousal'], dtype=object)

In [7]:
# Preview the last rows of the age/sex table.
df_info.tail()

Unnamed: 0,Record,Sex,Age
1978,tr14-0268,M,49
1979,tr14-0272,F,62
1980,tr14-0276,M,32
1981,tr14-0278,F,73
1982,tr14-0291,M,80


In [8]:
# Column dtypes and non-null counts of the age/sex table.
df_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1983 entries, 0 to 1982
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Record  1983 non-null   object
 1   Sex     1983 non-null   object
 2   Age     1983 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 46.6+ KB


## load data

En `dataset.py` hay una clase que, al parecer, sirve para cargar el set de datos. Esta clase necesita otras funciones auxiliares, definidas a continuación:

In [9]:
# Helper that resolves the local directory holding the PC18 dataset.

def _data_path(path=None, force_update=False, update_path=None, verbose=None):
    """Get path to local copy of PC18 dataset.

    Parameters
    ----------
    path : None | str
        Candidate dataset location. It is forwarded to MNE's ``_get_path``
        together with the ``PC18_DATASET_PATH`` config key.
    force_update : bool
        Accepted for signature compatibility; not used by this function.
    update_path : bool | None
        Accepted for signature compatibility; not used by this function.
    verbose : bool | str | None
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    path : str
        The resolved location when it already contains a ``training`` or
        ``test`` sub-folder, otherwise the hard-coded fallback directory.
    """
    key = 'PC18_DATASET_PATH'
    name = 'PC18_DATASET_SLEEP'
    path = _get_path(path, key, name)

    # A location that does not exist yet has no sub-folders; treat it as empty
    # instead of letting os.listdir raise.
    subdirs = os.listdir(path) if op.isdir(path) else []

    if 'training' in subdirs or 'test' in subdirs:
        # The specified path is already at the training/test folders level.
        return path

    # FIXME: machine-specific absolute path; move it to a config constant or an
    # environment variable so the notebook runs on other machines.
    return op.join('/media/martin/Disco2', 'Dsf_Data')

In [10]:
def fetch_pc18_data(subjects, path=None, force_update=False, update_path=None, base_url=PC18_URL, verbose=None):
    """Get paths to local copies of PhysioNet Challenge 2018 dataset files.

    This will fetch data from the publicly available PhysioNet Computing in
    Cardiology Challenge 2018 dataset on sleep arousal detection [1]_ [2]_.
    This corresponds to 1983 recordings from individual subjects with
    (suspected) sleep apnea. The dataset is separated into a training set with
    994 recordings for which arousal annotation are available and a test set
    with 989 recordings for which the labels have not been revealed. Across the
    entire dataset, mean age is 55 years old and 65% of recordings are from
    male subjects.

    More information can be found on the
    `physionet website <https://physionet.org/content/challenge-2018/1.0.0/>`_.

    Parameters
    ----------
    subjects : list of int
        The subjects to use. Can be in the range of 0-1982 (inclusive). Test
        recordings are 0-988, while training recordings are 989-1982.
    path : None | str
        Location of where to look for the PC18 data storing location. If None,
        the environment variable or config parameter ``PC18_DATASET_PATH``
        is used. If it doesn't exist, the "~/mne_data" directory is used. If
        the dataset is not found under the given path, the data will be
        automatically downloaded to the specified folder.
    force_update : bool
        Force update of the dataset even if a local copy exists.
    update_path : bool | None
        If True, set the PC18_DATASET_PATH in mne-python config to the given
        path. If None, the user is prompted.
    base_url : str
        The URL root.
    verbose : bool | str | None
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    paths : list
        One ``[psg_fname, hea_fname, arousal_fname]`` entry per PSG record of
        the requested subjects. ``arousal_fname`` is None for records that are
        not in the training split.

    Raises
    ------
    ValueError
        If the record index has no header row for a requested PSG record, or
        no arousal row for a training subject.

    References
    ----------
    .. [1] Mohammad M Ghassemi, Benjamin E Moody, Li-wei H Lehman, Christopher
      Song, Qiao Li, Haoqi Sun, Roger G Mark, M Brandon Westover, Gari D
      Clifford. You Snooze, You Win: the PhysioNet/Computing in Cardiology
      Challenge 2018.
    .. [2] Goldberger, A., Amaral, L., Glass, L., Hausdorff, J., Ivanov, P. C.,
      Mark, R., ... & Stanley, H. E. (2000). PhysioBank, PhysioToolkit, and
      PhysioNet: Components of a new research resource for complex physiologic
      signals. Circulation [Online]. 101 (23), pp. e215–e220.)
    """
    records = pd.read_csv(PC18_RECORDS)
    psg_records = records[records['Record type'] == 'PSG']
    hea_records = records[records['Record type'] == 'Header']
    arousal_records = records[records['Record type'] == 'Arousal']

    path = _data_path(path=path, update_path=update_path)
    params = [path, force_update, base_url]

    fnames = []
    for subject in subjects:
        subject_psg = psg_records[psg_records['Subject'] == subject]
        for _, psg_row in subject_psg.iterrows():
            # Pair the header with its PSG file through the record name rather
            # than through a positional index computed on the PSG subset: the
            # two subsets are not guaranteed to be ordered identically.
            record_name = psg_row['Record']
            hea_matches = hea_records[hea_records['Record'] == record_name]
            if hea_matches.empty:
                raise ValueError(
                    'No header entry found for record %r (subject %r).'
                    % (record_name, subject))
            hea_row = hea_matches.iloc[0]

            psg_fname = _fetch_one(psg_row['fname'], psg_row['sha'], *params)
            hea_fname = _fetch_one(hea_row['fname'], hea_row['sha'], *params)

            if psg_row['Split'] == 'training':
                # Arousal annotations are only fetched for training records.
                arousal_matches = arousal_records[
                    arousal_records['Subject'] == subject]
                if arousal_matches.empty:
                    raise ValueError(
                        'No arousal entry found for training subject %r.'
                        % (subject,))
                arousal_row = arousal_matches.iloc[0]
                arousal_fname = _fetch_one(
                    arousal_row['fname'], arousal_row['sha'], *params)
            else:
                arousal_fname = None
            fnames.append([psg_fname, hea_fname, arousal_fname])

    return fnames

In [20]:
def convert_wfdb_anns_to_mne_annotations(annots):
    """Build MNE annotations from a wfdb annotation object.

    Annotations are processed one annotation channel at a time. A channel
    whose notes all start with '(' or end with ')' (events delimited by
    '(event' / 'event)') is skipped. Any other channel is treated as sleep
    stage-like: each annotation lasts until the next onset in that channel,
    and the last one is given a duration of 30.

    Parameters
    ----------
    annots : wfdb.io.Annotation
        Annotation object obtained by e.g. loading an annotation file with
        wfdb.rdann().

    Returns
    -------
    mne.Annotations :
        MNE Annotations object.
    """
    # Annotation onsets, converted from sample indices using the sampling rate.
    onsets_s = annots.sample / annots.fs
    notes = np.array(annots.aux_note)

    kept_onsets, kept_durations, kept_descriptions = [], [], []
    for chan in set(annots.chan):
        in_chan = annots.chan == chan
        chan_onsets = onsets_s[in_chan]
        chan_notes = notes[in_chan]

        # Events with beginning and end, defined by '(event' and 'event)'.
        bracketed = [(note.startswith('(') or note.endswith(')'))
                     for note in chan_notes]
        if all(bracketed):
            continue

        # Sleep stage-like annotations.
        chan_durations = np.concatenate([np.diff(chan_onsets), [30]])
        assert all(chan_durations > 0), 'Negative duration'
        kept_onsets.extend(chan_onsets)
        kept_durations.extend(chan_durations)
        kept_descriptions.extend(chan_notes)

    return mne.Annotations(
        kept_onsets, kept_durations, kept_descriptions, orig_time=None)

In [38]:
class PC18(BaseConcatDataset):
    """Physionet Challenge 2018 polysomnography dataset.

    Sleep dataset from https://physionet.org/content/challenge-2018/1.0.0/.
    Contains overnight recordings from 1983 subjects with (suspected) sleep
    apnea.

    See `fetch_pc18_data` for a more complete description.

    Parameters
    ----------
    subject_ids: list(int) | int | str | None
        (list of) int of subject(s) to be loaded. If None, load all available
        subjects. If 'training', load all training recordings. If 'test', load
        all test recordings.
    path : None | str
        Location of where to look for the PC18 data storing location. If None,
        the environment variable or config parameter ``PC18_DATASET_PATH``
        is used. If it doesn't exist, the "~/mne_data" directory is used. If
        the dataset is not found under the given path, the data will be
        automatically downloaded to the specified folder.
    load_eeg_only: bool
        If True, only load the EEG channels and discard the others (EOG, EMG,
        temperature, respiration) to avoid resampling the other signals.
    preproc : list(Preprocessor) | None
        List of preprocessors to apply to each file individually. This way the
        data can e.g., be downsampled (temporally and spatially) to limit the
        memory usage of the entire Dataset object. This also enables applying
        preprocessing in parallel over the recordings.
    windower : callable | None
        Function to split the raw data into windows. If provided, windowing is
        integrated into the loading process (after preprocessing) such that
        memory usage is minimized while allowing parallelization.
    n_jobs : int
        Number of parallel processes.

    Raises
    ------
    ValueError
        If ``subject_ids`` is a string other than 'training' or 'test', or if
        a loaded record is missing from the age/sex table.
    """

    def __init__(self, subject_ids=None, path=None, load_eeg_only=True, preproc=None, windower=None, n_jobs=1):
        subject_ids = self._resolve_subject_ids(subject_ids)
        paths = fetch_pc18_data(subject_ids, path=path)
        self.info_df = pd.read_csv(PC18_INFO)

        if n_jobs == 1:
            all_base_ds = [
                self._load_raw(
                    subject_id, p[0], p[2], load_eeg_only=load_eeg_only,
                    preproc=preproc, windower=windower)
                for subject_id, p in zip(subject_ids, paths)]
        else:
            all_base_ds = Parallel(n_jobs=n_jobs)(
                delayed(self._load_raw)(
                    subject_id, p[0], p[2], load_eeg_only=load_eeg_only,
                    preproc=preproc, windower=windower)
                for subject_id, p in zip(subject_ids, paths))
        super().__init__(all_base_ds)

    @staticmethod
    def _resolve_subject_ids(subject_ids):
        """Turn the documented ``subject_ids`` shortcuts into a list of ints.

        Test recordings are subjects 0-988 and training recordings are
        subjects 989-1982.
        """
        if subject_ids is None:
            return list(range(1983))
        if isinstance(subject_ids, str):
            if subject_ids == 'training':
                return list(range(989, 1983))
            if subject_ids == 'test':
                return list(range(989))
            raise ValueError(
                "subject_ids must be None, 'training', 'test' or a list of "
                "int, got %r." % (subject_ids,))
        if isinstance(subject_ids, (int, np.integer)):
            return [subject_ids]
        return list(subject_ids)

    def _load_raw(self, subj_nb, raw_fname, arousal_fname, load_eeg_only, preproc, windower):
        """Load one recording and wrap it in a (possibly windowed) dataset.

        Parameters
        ----------
        subj_nb : int
            Subject number stored in the dataset description.
        raw_fname : str | tuple
            Path of the signal file, or a tuple whose first item is that path.
        arousal_fname : str | tuple | None
            Path of the arousal annotation file. None means that no
            annotations are attached to the recording.
        load_eeg_only : bool
            If True, only the first 7 channels are read.
        preproc : list(Preprocessor) | None
            Preprocessors applied to the recording before windowing.
        windower : callable | None
            Callable turning the dataset into a windowed dataset.

        Returns
        -------
        BaseDataset or the output of ``windower``.
        """
        # The fetch helper may return tuples; only the path itself is needed.
        raw_fname = raw_fname[0] if isinstance(raw_fname, tuple) else raw_fname
        arousal_fname = arousal_fname[0] if isinstance(arousal_fname, tuple) else arousal_fname

        channel_types = ['eeg'] * 7
        if load_eeg_only:
            channels = list(range(7))
        else:
            channel_types += ['emg', 'misc', 'misc', 'misc', 'misc', 'ecg']
            channels = None

        # Load raw signals and header (wfdb expects the path without extension).
        record_path = op.splitext(raw_fname)[0]
        record = wfdb.io.rdrecord(record_path, channels=channels)

        # Convert to right units for MNE (EEG should be in V).
        data = record.p_signal.T
        units = np.array(record.units)
        data[units == 'uV'] /= 1e6
        data[units == 'mV'] /= 1e3
        info = mne.create_info(record.sig_name, record.fs, channel_types)
        out = mne.io.RawArray(data, info)

        # Extract annotations.
        if arousal_fname is not None:
            # NOTE(review): the notebook output shows this step failing with
            # "OverflowError: Python integer 256 out of bounds for uint8".
            # Presumably this is an incompatibility between the installed wfdb
            # release and NumPy 2 - confirm the versions (upgrading wfdb or
            # pinning numpy<2 is the likely fix).
            annots = wfdb.rdann(
                record_path, 'arousal', sampfrom=0, sampto=None,
                shift_samps=False, return_label_elements=['symbol'],
                summarize_labels=False)
            mne_annots = convert_wfdb_anns_to_mne_annotations(annots)
            out.set_annotations(mne_annots)

        # Look up age/sex for this record.
        record_name = op.splitext(op.basename(raw_fname))[0]
        matches = self.info_df[self.info_df['Record'] == record_name]
        if matches.empty:
            raise ValueError(
                'Record %r not found in the age/sex table.' % (record_name,))
        record_info = matches.iloc[0]

        # The split is encoded in the record name prefix.
        if record_info['Record'].startswith('tr'):
            split = 'training'
        elif record_info['Record'].startswith('te'):
            split = 'test'
        else:
            split = 'unknown'

        desc = pd.Series({
            'subject': subj_nb,
            'record': record_info['Record'],
            'split': split,
            'age': record_info['Age'],
            'sex': record_info['Sex']
        }, name='')
        out = BaseDataset(out, desc)

        if preproc is not None:
            _preprocess(out, None, preproc)

        if windower is not None:
            out = windower(out)
            out.windows.load_data()

        return out

In [39]:
def scale(x, k):
    """Return ``x`` multiplied by the factor ``k``."""
    scaled = k * x
    return scaled

def cast(x, dtype):
    """Return ``x`` converted to ``dtype`` through its ``astype`` method."""
    converted = x.astype(dtype)
    return converted

def load_data(dataset, window_size_s, n_jobs, subject_ids=(989, 990, 991), sfreq=100.):
    """Load, preprocess and window PC18 recordings.

    Parameters
    ----------
    dataset : str
        Dataset identifier. Kept for backward compatibility: it is not used to
        select anything, PC18 is always loaded.
    window_size_s : float
        Window length in seconds. Windows do not overlap.
    n_jobs : int
        Number of parallel processes used to load the recordings.
    subject_ids : sequence of int
        Subjects to load. Defaults to the three recordings 989, 990 and 991.
    sfreq : float
        Sampling frequency (Hz) the signals are resampled to. It is also used
        to convert ``window_size_s`` into a number of samples, so both always
        agree.

    Returns
    -------
    PC18
        The windowed dataset.
    """
    ch_names = ['F3-M2', 'F4-M1', 'O1-M2', 'O2-M1']
    preproc = [
        Preprocessor('pick_channels', ch_names=ch_names, ordered=True),
        Preprocessor('filter', l_freq=None, h_freq=30, n_jobs=1),
        Preprocessor('resample', sfreq=sfreq, n_jobs=1),
        Preprocessor(scale, k=1e6),
        Preprocessor(cast, dtype=np.float32),
    ]

    # The window length in samples must use the post-resampling rate.
    window_size_samples = int(window_size_s * sfreq)
    mapping = {'W': 0, 'N1': 1, 'N2': 2, 'N3': 3, 'R': 4}
    windower = partial(
        _create_windows_from_events, infer_mapping=False,
        infer_window_size_stride=False, trial_start_offset_samples=0,
        trial_stop_offset_samples=0,
        window_size_samples=window_size_samples,
        window_stride_samples=window_size_samples, mapping=mapping)

    # Use a separate name so the ``dataset`` argument is not overwritten.
    windows_ds = PC18(
        subject_ids=list(subject_ids), preproc=preproc, windower=windower,
        n_jobs=n_jobs)
    return windows_ds

# Build the windowed dataset with window_size_s=30 and n_jobs=4.
windows_dataset = load_data('pc18_debug', 30, 4)

Using default location ~/mne_data for PC18_DATASET_SLEEP...


Creating RawArray with float64 data, n_channels=7, n_times=4770000
    Range : 0 ... 4769999 =      0.000 ... 23849.995 secs
Ready.
Error loading annotations: Python integer 256 out of bounds for uint8
Creating RawArray with float64 data, n_channels=7, n_times=5147000
    Range : 0 ... 5146999 =      0.000 ... 25734.995 secs
Ready.
Error loading annotations: Python integer 256 out of bounds for uint8


OverflowError: Python integer 256 out of bounds for uint8

Estoy teniendo un problema en `_load_raw`: la carga de las anotaciones falla con `OverflowError: Python integer 256 out of bounds for uint8`.