In [1]:
# To be able to make edits to repo without having to restart notebook
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
data_path = Path("D:\BlcRepo\OtherCode\Generative_Neuroscience\Dataset\Broderick\Cocktail Party")
import scipy.io as sio
import numpy as np
import xarray as xr
import os
from signalstore import UnitOfWorkProvider
from mongomock import MongoClient
#from pymongo import MongoClient
from fsspec.implementations.local import LocalFileSystem
from fsspec import get_mapper
from fsspec.implementations.dirfs import DirFileSystem
from tempfile import TemporaryDirectory
import fsspec

# Cocktail Party Experiment

## EEG Files
eegData: EEG Data, Time Locked to the onset of the speech stimulus.   
Format: Channels (128) x Time Points

mastoids: Mastoid Channels, Time Locked to the onset of the speech stimulus. 
Format: Channels (Left=1 Right=2) x Time Points

fs: Sampling Rate 

EEG data is unfiltered, unreferenced and sampled at 128Hz

## Experiment Information
Subjects 1-17 were instructed to attend to 'Twenty Thousand Leagues Under the Sea' (20000), played in the left ear
Subjects 18-33 were instructed to attend to 'Journey to the Centre of the Earth' (Journey), played in the right ear

## Behavioral Data
score: Comprehension question scores for attended and unattended stories.
Format: Subjects x Run x Story (1=Attended, 2=Unattended)

## Stimuli Data Files

wordVec = List of all the content words for a given trial
onset_time = Onset time of the word in the corresponding cell of 'wordVec' (given in seconds)
offset_time = Offset time of the word in the corresponding cell of 'wordVec' (given in seconds)
sentence_boundaries = Time of sentence close (in seconds)

In [3]:
def load_eeg_data(data_path):
    eeg_dir = data_path / "EEG"
    subjects = [sdir for sdir in os.listdir(eeg_dir) if os.path.isdir(eeg_dir / sdir)]
    behavior_path = data_path / "Behavioural Data" / "Comprehension Scores.mat"
    behavior_data = sio.loadmat(behavior_path)
    behavior_scores = behavior_data['score']
    for sub_n, subject in enumerate(subjects):
        sub_n += 1
        subject_dir = eeg_dir / subject
        # list run mat files
        runs = os.listdir(subject_dir)
        for run_n, run in enumerate(runs):
            run_n += 1
            mfile = sio.loadmat(subject_dir / run)
            eeg_data = np.array(mfile['eegData'])
            # print(eeg_data.shape)
            mastoid_data = mfile['mastoids']
            fs = int(mfile['fs'][0][0])
            max_time = eeg_data.shape[0] / fs
            time_steps = np.linspace(0, max_time, eeg_data.shape[0]).squeeze()
            if sub_n <= 17:
                attending_direction = 'left'
                attending_story = 'Twenty Thousand Leagues Under the Sea'
            else:
                attending_direction = 'right'
                attending_story = 'Journey to the Center of the Earth'
            behavior_score = behavior_scores[sub_n-1, run_n-1]
            eeg_xarray = xr.DataArray(
                data=eeg_data,
                dims=['time', 'channel'],
                coords={
                    'time': time_steps,
                },
                attrs={
                    'schema_ref': 'eeg_signal',
                    'data_name': f'subject_{sub_n}_run_{run_n}_channels',
                    'subject': f'subject_{sub_n}',
                    'session_data_ref': {'schema_ref': 'session', 'data_name': f'session_{run_n}'},
                    'sampling_frequency': fs,
                    'attending_direction': attending_direction,
                    'attending_story': attending_story,
                    'attend_score': float(behavior_score[0]),
                    'nonattend_score': float(behavior_score[1])
                }
            )
            mastoid_xarray = xr.DataArray(
                data=mastoid_data,
                dims=['time', 'channel'],
                coords={
                    'time': time_steps,
                    'channel': ['left', 'right']
                },
                attrs={
                    'schema_ref': 'eeg_signal',
                    'data_name': f'subject_{sub_n}_run_{run_n}_mastoid',
                    'subject': f'subject_{sub_n}',
                    'session_data_ref': {'schema_ref': 'session', 'data_name': f'session_{run_n}'},
                    'sampling_frequency': fs,
                    'attending_direction': attending_direction,
                    'attending_story': attending_story,
                    'attend_score': float(behavior_score[0]),
                    'nonattend_score': float(behavior_score[1])
                }
            )
            yield eeg_xarray, mastoid_xarray

In [4]:
def load_stimuli(data_path):
    stimuli_dir = data_path / "Stimuli"
    envelopes_20000_dir = stimuli_dir / "Envelopes" / "20000"
    envelopes_20000_files = os.listdir(envelopes_20000_dir)
    envelopes_journey_dir = stimuli_dir / "Envelopes" / "Journey"
    envelopes_journey_files = os.listdir(envelopes_journey_dir)
    text_20000_dir = stimuli_dir / "Text" / "20000"
    text_20000_files = os.listdir(text_20000_dir)
    text_journey_dir = stimuli_dir / "Text" / "Journey"
    text_journey_files = os.listdir(text_journey_dir)
    assert len(envelopes_20000_files) == len(text_20000_files)
    assert len(envelopes_journey_files) == len(text_journey_files)
    assert len(envelopes_20000_files) == len(envelopes_journey_files)
    n_runs = len(envelopes_20000_files)
    for run in range(n_runs):
        stimuli_record = {
            'schema_ref': 'stimuli_record',
            'data_name': f'session_{run}',
            'left_wordvec_data_ref': {'schema_ref': 'wordvec', 'data_name': f'20000_run_{run}'},
            'left_offset_time_data_ref': {'schema_ref': 'offset_times', 'data_name': f'20000_run_{run}'},
            'left_onset_time_data_ref': {'schema_ref': 'onset_times', 'data_name': f'20000_run_{run}'},
            'left_sentence_boundaries_data_ref': {'schema_ref': 'sentence_boundaries', 'data_name': f'20000_run_{run}'},
            'left_envelope_data_ref': {'schema_ref': 'envelope', 'data_name': f'20000_run_{run}'},
            'right_wordvec_data_ref': {'schema_ref': 'wordvec', 'data_name': f'journey_run_{run}'},
            'right_offet_time_data_ref': {'schema_ref': 'offset_times', 'data_name': f'journey_run_{run}'},
            'right_onset_time_data_ref': {'schema_ref': 'onset_times', 'data_name': f'journey_run_{run}'},
            'right_sentence_boundaries_data_ref': {'schema_ref': 'sentence_boundaries', 'data_name': f'journey_run_{run}'},
            'right_envelope_data_ref': {'schema_ref': 'envelope', 'data_name': f'journey_run_{run}'}
        }
        # text contains keys: 'offset_time', 'onset_time', 'sentence_boundaries', 'wordVec'
        text_20000 = sio.loadmat(text_20000_dir / text_20000_files[run])
        wordvec_20000 = xr.Dataset(
            {
                'wordVec': (['time'], text_20000['wordVec']),
                'onset_time': (['time'], text_20000['onset_time'].flatten()),
                'offset_time': (['time'], text_20000['offset_time'].flatten())
            },

        )
        text_journey = sio.loadmat(text_journey_dir / text_journey_files[run])
        # envelopes contains keys: 'envelope', 'fsEnv', 'origLength'
        envelopes_20000 = sio.loadmat(envelopes_20000_dir / envelopes_20000_files[run])
        envelopes_journey = sio.loadmat(envelopes_journey_dir / envelopes_journey_files[run])
        left_wordvec = xr.DataArray(
            wordvec_20000['wordVec'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'wordvec',
                'data_name': f'2000_run_{run}',
                'story': 'Twenty Thousand Leagues Under the Sea',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        right_wordvec = xr.DataArray(
            text_journey['wordVec'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'wordvec',
                'data_name': f'journey_run_{run}',
                'story': 'Journey to the Center of the Earth',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        left_onset_time = xr.DataArray(
            text_20000['onset_time'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'onset_times',
                'data_name': f'2000_run_{run}',
                'story': 'Twenty Thousand Leagues Under the Sea',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        right_onset_time = xr.DataArray(
            text_journey['onset_time'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'onset_times',
                'data_name': f'journey_run_{run}',
                'story': 'Journey to the Center of the Earth',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        left_offset_time = xr.DataArray(
            text_20000['offset_time'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'offset_times',
                'data_name': f'2000_run_{run}',
                'story': 'Twenty Thousand Leagues Under the Sea',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        right_offset_time = xr.DataArray(
            text_journey['offset_time'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'offset_times',
                'data_name': f'journey_run_{run}',
                'story': 'Journey to the Center of the Earth',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        left_sentence_boundaries = xr.DataArray(
            text_20000['sentence_boundaries'],
            dims = ['sentence_number'],
            attrs={
                'schema_ref': 'sentence_boundaries',
                'data_name': f'2000_run_{run}',
                'story': 'Twenty Thousand Leagues Under the Sea',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        right_sentence_boundaries = xr.DataArray(
            text_journey['sentence_boundaries'],
            dims = ['sentence_number'],
            attrs={
                'schema_ref': 'sentence_boundaries',
                'data_name': f'journey_run_{run}',
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        left_envelope = xr.DataArray(
            envelopes_20000['envelope'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'envelope',
                'data_name': f'2000_run_{run}',
                'sampling_frequency': envelopes_20000['fsEnv'],
                'original_length': envelopes_20000['origLength'],
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        right_envelope = xr.DataArray(
            envelopes_journey['envelope'],
            dims = ['word_number'],
            attrs={
                'schema_ref': 'envelope',
                'data_name': f'journey_run_{run}',
                'sampling_frequency': envelopes_journey['fsEnv'],
                'original_length': envelopes_journey['origLength'],
                'stimuli_record_data_ref': {'schema_ref': 'stimuli_record', 'data_name': f'session_{run}'}
            }
        )
        yield left_wordvec, right_wordvec, left_onset_time, right_onset_time, left_offset_time, right_offset_time, left_sentence_boundaries, right_sentence_boundaries, left_envelope, right_envelope, stimuli_record

In [5]:
filesystem = LocalFileSystem()
tmp_dir = TemporaryDirectory()
print(tmp_dir.name)
tmp_dir_fs = DirFileSystem(
    tmp_dir.name,
    filesystem=filesystem
)
client = MongoClient()
memory_store = {}
uow_provider = UnitOfWorkProvider(
    mongo_client=client,
    filesystem=filesystem,
    memory_store=memory_store
)
import json
cwd = Path.cwd()
domain_models_path = cwd.parent / "DomainModels\\cocktail_party\\data_models.json"
metamodel_path = cwd.parent / "DomainModels\\cocktail_party\\metamodels.json"
property_path = cwd.parent / "DomainModels\\cocktail_party\\property_models.json"

with open(metamodel_path) as f:
    metamodels = json.load(f)

with open(property_path) as f:
    property_models = json.load(f)
    
# for metamodel in metamodels:
# with uow_provider('cocktail-party') as uow:
#     print(f"Adding model {metamodel['schema_name']} to domain_models store.")
#     uow.domain_models.add(metamodel)
#     model = uow.domain_models.get(metamodel['schema_name'])
#     print(model['schema_name'])
#     uow.commit()

# load domain models json file
with open(domain_models_path) as f:
    domain_models = json.load(f)
    
with uow_provider('cocktail-party') as uow:
    for metamodel in metamodels:
        uow.domain_models.add(metamodel)
        model = uow.domain_models.get(metamodel['schema_name'])
        print(model['schema_name'])
    for property_model in property_models:
        uow.domain_models.add(property_model)
        model = uow.domain_models.get(property_model['schema_name'])
        print(model['schema_name'])
    for domain_model in domain_models:
        uow.domain_models.add(domain_model)
        model = uow.domain_models.get(domain_model['schema_name'])
        print(model['schema_name'])
        uow.commit()

C:\Users\arthu\AppData\Local\Temp\tmpf4ec5ldg
record_metamodel
xarray_dataarray_metamodel
version_timestamp
schema_ref
schema_type
schema_name
schema_title
schema_description
data_name
time_of_save
time_of_removal
record_type
json_schema
has_file
unit_of_measure
dimension_of_measure
acquisition
acquisition_date
import_date
acquisition_notes
data_dimensions
shape
dtype
session_description
session_date
session_time
session_duration
session_notes
data_ref
start_time
duration
duration_unit
animal_species
age
age_unit
age_lower_bound
age_upper_bound
animal_id
tetrode_id
tetrode_depth
genotype
animal_strain
stimulus_type
stimulus_id
stimulus_description
recording_length
sample_rate
arena_shape
arena_description
study_description
arena_height
arena_width
diameter
arena_side_length
arena_radius
spike_count
subject
sampling_frequency
attending_direction
attending_story
attend_score
nonattend_score
original_length
eeg_signal
session
stimuli_record
wordvec
offset_times
onset_times
sentence_bounda

In [8]:
for eeg_xarray, mastoid_xarray in load_eeg_data(data_path):
    with uow_provider('cocktail-party') as uow:
        uow.data.add(eeg_xarray)
        uow.data.add(mastoid_xarray)
        
        uow.commit()

FileSystemDAOFileAlreadyExistsError: Cannot add object with path "cocktail-party/eeg_signal__subject_14_run_23_mastoid.nc" because it already exists in repository.

In [None]:
for result in load_stimuli(data_path):
    for data in result:
        with uow_provider('cocktail-party') as uow:
            uow.data.add(data)

In [None]:
# envelopes_path = "D:\BlcRepo\OtherCode\Generative_Neuroscience\Dataset\Broderick\Cocktail Party\Stimuli\Envelopes\\20000\\20000_1_env.mat"
envelopes_path = os.path.join(data_path, "Stimuli", "Envelopes", "20000", "20000_1_env.mat")
matf = sio.loadmat(envelopes_path)
print(matf.keys())
print(matf['envelope'].shape)
print(matf['fsEnv'])
print(matf['origLength'])

In [None]:
# text_path = "K:\ke\sta\data\SpeechEEG\Cocktail Party\Stimuli\Text\\20000\Run4.mat"
text_path = os.path.join(data_path, "Stimuli", "Text", "20000", "Run4.mat")
matf = sio.loadmat(text_path)
print(matf.keys())
def utf_array_to_str(arr):
    string = ""
    for word in arr:
        string += word[0][0] + " "
    return string
print(utf_array_to_str(matf['wordVec']))
print(matf['wordVec'].shape)
print(matf['onset_time'].shape)
print(matf['offset_time'].shape)
print(matf['sentence_boundaries'])

In [None]:
import re

def compute_unique_vocabulary(text_dir):
    full_paths = [text_dir / "20000" / f for f in os.listdir(text_dir / "20000")] \
                + [text_dir / "Journey" / f for f in os.listdir(text_dir / "Journey")]
    word_counts = {}
    word_lookup = {}
    for text_file in full_paths:
        matf = sio.loadmat(text_file)
        run = text_file.stem.replace("Run", "")
        source = text_file.parent.stem
        for word in matf['wordVec']:
            for word in word:
                word = str(word[0])
                if word in word_counts:
                    word_counts[word] += 1
                    word_lookup[word] += [(source, f'session_{run}')]
                else:
                    word_counts[word] = 1
                    word_lookup[word] = [(source, f'session_{run}')]
    return word_counts, word_lookup
vocab, lookup = compute_unique_vocabulary(data_path / "Stimuli" / "Text")
print(len(vocab))
print(lookup)