In [1]:
# To be able to make edits to repo without having to restart notebook
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
data_path = Path("/Users/dtyoung/Documents/childmind/signalstore-eeg-datasets/HealthyBrainNetworkDataExample/ds004186")
import scipy.io as sio
import numpy as np
import xarray as xr
import os
from signalstore import UnitOfWorkProvider
from mongomock import MongoClient
#from pymongo import MongoClient
from fsspec.implementations.local import LocalFileSystem
from fsspec import get_mapper
from fsspec.implementations.dirfs import DirFileSystem
import fsspec
import mne
import pandas as pd
import json

# Healthy Brain Network Data - Resting state

This is an excerpt of the Healthy Brain Network data ([data paper](https://www.nature.com/articles/sdata2017181)). The resting state portion as been formatted with BIDS ([Brain Imaging Data Structure](https://bids-specification.readthedocs.io/en/stable/)) format, and made publicly available on [Openneuro](https://openneuro.org/datasets/ds004186/versions/2.0.0) (it's a large dataset so browsing latency might be lagging)

## EEG Files
Data is organized by subject. Each subject (`sub-*`) directory has an `eeg` directory storing the eeg data and its associated metadata.
`*_eeg.fdt` and `*_eeg.set`: EEG data in EEGLAB format

## Experiment Information
Subjects 1-17 were instructed to attend to 'Twenty Thousand Leagues Under the Sea' (20000), played in the left ear
Subjects 18-33 were instructed to attend to 'Journey to the Centre of the Earth' (Journey), played in the right ear

## Behavioral Data
score: Comprehension question scores for attended and unattended stories.
Format: Subjects x Run x Story (1=Attended, 2=Unattended)

## Stimuli Data Files

wordVec = List of all the content words for a given trial
onset_time = Onset time of the word in the corresponding cell of 'wordVec' (given in seconds)
offset_time = Offset time of the word in the corresponding cell of 'wordVec' (given in seconds)
sentence_boundaries = Time of sentence close (in seconds)

In [20]:
def load_eeg_data(bids_data_path):
    subjects = [bids_dir for bids_dir in os.listdir(bids_data_path) if bids_dir.startswith('sub-')]
    print('subjects', subjects)
    for sub_n, subject_dir in enumerate(subjects):
        subject = subject_dir.split('-')[1]
        subject_dir_path = bids_data_path / subject_dir
        eeg_dir = subject_dir_path / "eeg"
        sub_n += 1

        tasks = ['EC', 'EO']
        runs  = [list(range(1, 6)), list(range(1, 6))]
        # list run mat files
        # runs = os.listdir(subject_dir_path)
        for t, task in enumerate(tasks):
            for run in runs[t]:
                # get file by name pattern subject_dir*task*run_eeg.set
                raw_file = eeg_dir / f"{subject_dir}_task-{task}_run-{run}_eeg.set"
                print('raw file', raw_file)
                if not os.path.exists(raw_file):
                    continue

                EEG = mne.io.read_raw_eeglab(os.path.join(raw_file), preload=True)
                eeg_data = EEG.get_data()

                print('data shape:', eeg_data.shape)
                
                eeg_json_file = eeg_dir / f"{subject_dir}_task-{task}_run-{run}_eeg.json"
                eeg_json = json.load(eeg_json_file.open())
                fs = int(eeg_json['SamplingFrequency'])
                max_time = eeg_data.shape[0] / fs
                time_steps = np.linspace(0, max_time, eeg_data.shape[0]).squeeze() # in seconds

                channel_coords_file = eeg_dir / f"{subject_dir}_task-{task}_run-{run}_channels.tsv"
                channel_coords = pd.read_csv(channel_coords_file, sep='\t') 
                # get channel names from channel_coords
                channel_names = channel_coords['name'].values
                # print('channel coords names', channel_names)
                # print(len(channel_names))
                eeg_xarray = xr.DataArray(
                    data=eeg_data,
                    dims=['time', 'channel'],
                    # coords={
                    #     'time': time_steps,
                    #     'channel': channel_names
                    # },
                    attrs={
                        'schema_ref': 'eeg_signal',
                        'data_name': f"{subject_dir}_task-{task}_run-{run}",
                        'subject': f'{subject}',
                        'version_timestamp': 0,
                        'task': task,
                        'session_run': run,
                        'sampling_frequency': fs,
                    }
                )
                yield eeg_xarray

In [31]:
filesystem = LocalFileSystem()
# tmp_dir = TemporaryDirectory()
# print(tmp_dir.name)

# Create data storage location
dataset_name = "healthy_brain_network"
store_path = Path("/Users/dtyoung/Documents/childmind/signalstore-eeg-datasets/HealthyBrainNetworkDataExample/signalstore")

# Create a directory for the dataset
if not os.path.exists(store_path):
    os.makedirs(store_path)

tmp_dir_fs = DirFileSystem(
    store_path,
    filesystem=filesystem
)
client = MongoClient()
memory_store = {}
uow_provider = UnitOfWorkProvider(
    mongo_client=client,
    filesystem=tmp_dir_fs,
    memory_store=memory_store
)
import json
cwd = Path.cwd()
domain_models_path = cwd.parent / f"DomainModels/{dataset_name}/data_models.json"
metamodel_path = cwd.parent / f"DomainModels/{dataset_name}/metamodels.json"
property_path = cwd.parent / f"DomainModels/{dataset_name}/property_models.json"

with open(metamodel_path) as f:
    metamodels = json.load(f)

with open(property_path) as f:
    property_models = json.load(f)
    
# for metamodel in metamodels:
# with uow_provider('cocktail-party') as uow:
#     print(f"Adding model {metamodel['schema_name']} to domain_models store.")
#     uow.domain_models.add(metamodel)
#     model = uow.domain_models.get(metamodel['schema_name'])
#     print(model['schema_name'])
#     uow.commit()

# load domain models json file
with open(domain_models_path) as f:
    domain_models = json.load(f)
    
with uow_provider(dataset_name) as uow:
    for property_model in property_models:
        uow.domain_models.add(property_model)
        model = uow.domain_models.get(property_model['schema_name'])
        print('property model: ', model['schema_name'])
    for metamodel in metamodels:
        uow.domain_models.add(metamodel)
        model = uow.domain_models.get(metamodel['schema_name'])
        print('meta model: ', model['schema_name'])
    for domain_model in domain_models:
        uow.domain_models.add(domain_model)
        model = uow.domain_models.get(domain_model['schema_name'])
        print('domain model: ', model['schema_name'])
        uow.commit()

property model:  version_timestamp
property model:  schema_ref
property model:  schema_type
property model:  schema_name
property model:  schema_title
property model:  schema_description
property model:  data_name
property model:  time_of_save
property model:  time_of_removal
property model:  record_type
property model:  modality
property model:  json_schema
property model:  has_file
property model:  unit_of_measure
property model:  dimension_of_measure
property model:  acquisition
property model:  acquisition_date
property model:  import_date
property model:  acquisition_notes
property model:  data_dimensions
property model:  shape
property model:  dtype
property model:  session_description
property model:  session_date
property model:  session_time
property model:  session_duration
property model:  session_notes
property model:  session_run
property model:  data_ref
property model:  start_time
property model:  duration
property model:  duration_unit
property model:  animal_species
pr

Subject 30 Run 2 has data shape (2, 7681), and eeg data shape (128, 7681) for some reason, added the code in the load_eeg_data function to transpose the mastoid data if the second dimension is not 2.

In [33]:
for eeg_xarray in load_eeg_data(data_path):
    print('adding data')
    with uow_provider(dataset_name) as uow:
        uow.data.add(eeg_xarray)
        
        uow.commit()

subjects ['sub-NDARZZ993CEV']
raw file /Users/dtyoung/Documents/childmind/signalstore-eeg-datasets/HealthyBrainNetworkDataExample/ds004186/sub-NDARZZ993CEV/eeg/sub-NDARZZ993CEV_task-EC_run-1_eeg.set
Reading /Users/dtyoung/Documents/childmind/signalstore-eeg-datasets/HealthyBrainNetworkDataExample/ds004186/sub-NDARZZ993CEV/eeg/sub-NDARZZ993CEV_task-EC_run-1_eeg.fdt
Reading 0 ... 19997  =      0.000 ...    39.994 secs...
data shape: (129, 19998)
adding data


MongoDAOTypeError: Invalid type <class 'xarray.core.dataarray.DataArray'> for argument time_threshold. Must be one of (<class 'datetime.datetime'>, <class 'NoneType'>).

In [29]:
with uow_provider(dataset_name) as uow:
    query = {
        # "schema_ref": "eeg_signal",
        "subject": "NDARZZ993CEV",
    }
    sessions = uow.data.find(query)
    print(len(sessions))
    for i in range(len(sessions)):
        print(sessions[i])

10
{'schema_ref': 'eeg_signal', 'data_name': 'sub-NDARZZ993CEV_task-EC_run-1', 'subject': 'NDARZZ993CEV', 'version_timestamp': 0, 'task': 'EC', 'session_run': 1, 'sampling_frequency': 500, 'has_file': True, 'time_of_save': datetime.datetime(2024, 7, 23, 19, 31, 51, 691132, tzinfo=datetime.timezone.utc), 'time_of_removal': None}
{'schema_ref': 'eeg_signal', 'data_name': 'sub-NDARZZ993CEV_task-EC_run-2', 'subject': 'NDARZZ993CEV', 'version_timestamp': 0, 'task': 'EC', 'session_run': 2, 'sampling_frequency': 500, 'has_file': True, 'time_of_save': datetime.datetime(2024, 7, 23, 19, 31, 51, 779608, tzinfo=datetime.timezone.utc), 'time_of_removal': None}
{'schema_ref': 'eeg_signal', 'data_name': 'sub-NDARZZ993CEV_task-EC_run-3', 'subject': 'NDARZZ993CEV', 'version_timestamp': 0, 'task': 'EC', 'session_run': 3, 'sampling_frequency': 500, 'has_file': True, 'time_of_save': datetime.datetime(2024, 7, 23, 19, 31, 51, 854820, tzinfo=datetime.timezone.utc), 'time_of_removal': None}
{'schema_ref': '