[MRG] update make_dataset_description (#996)

mne-tools · Apr 6, 2022 · 5bc4216 · 5bc4216
1 parent cc5db23
commit 5bc4216
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 40 deletions.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -47,7 +47,7 @@ Detailed list of changes
 🧐 API and behavior changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- ...
+- :func:`mne_bids.make_dataset_description` now accepts keyword arguments only, and can now also write the following metadata: ``HEDVersion``, ``EthicsApprovals``, ``GeneratedBy``, and ``SourceDatasets``, by `Stefan Appelhoff`_ (:gh:`406`)
 
 🛠 Requirements
 ^^^^^^^^^^^^^^^

diff --git a/mne_bids/tests/test_write.py b/mne_bids/tests/test_write.py
@@ -294,10 +294,6 @@ def test_write_correct_inputs():
 
 def test_make_dataset_description(tmp_path, monkeypatch):
     """Test making a dataset_description.json."""
-    with pytest.raises(ValueError, match='`dataset_type` must be either "raw" '
-                                         'or "derivative."'):
-        make_dataset_description(path=tmp_path, name='tst', dataset_type='src')
-
     make_dataset_description(path=tmp_path, name='tst')
 
     with open(op.join(tmp_path, 'dataset_description.json'), 'r',
@@ -329,6 +325,39 @@ def test_make_dataset_description(tmp_path, monkeypatch):
         dataset_description_json = json.load(fid)
         assert dataset_description_json["Authors"] == ['MNE B.', 'MNE P.']
 
+    # Check we raise warnings and errors where appropriate
+    with pytest.raises(ValueError, match='`dataset_type` must be either "raw" '
+                                         'or "derivative."'):
+        make_dataset_description(path=tmp_path, name='tst', dataset_type='src')
+
+    with pytest.warns(RuntimeWarning, match='The `doi` field in.*'):
+        make_dataset_description(path=tmp_path, name='tst',
+                                 doi='10.5281/zenodo.3686061')
+
+    for gen_by in [[1, 2], 12]:
+        with pytest.raises(ValueError, match='generated_by must be a list.*'):
+            make_dataset_description(path=tmp_path, name='tst',
+                                     generated_by=gen_by)
+
+    with pytest.raises(ValueError, match='"Name" is a required field.*'):
+        make_dataset_description(path=tmp_path, name='tst',
+                                 generated_by=[{"Version": 2}])
+
+    gen_by = [{"Name": "bla", "x": 3, "y": 1}]
+    with pytest.raises(ValueError, match=".*in dict: {'.', '.'}"):
+        make_dataset_description(path=tmp_path, name='tst',
+                                 generated_by=gen_by)
+
+    for s_ds in [[1, 2], 12]:
+        with pytest.raises(ValueError, match='source_datasets must be a.*'):
+            make_dataset_description(path=tmp_path, name='tst',
+                                     source_datasets=s_ds)
+
+    s_ds = [{"URL": "bla", "x": 3, "y": 1}]
+    with pytest.raises(ValueError, match=".*in dict: {'.', '.'}"):
+        make_dataset_description(path=tmp_path, name='tst',
+                                 source_datasets=s_ds)
+
     monkeypatch.setattr(write, 'BIDS_VERSION', 'old')
     with pytest.raises(ValueError, match='Previous BIDS version used'):
         make_dataset_description(path=tmp_path, name='tst')
@@ -1032,7 +1061,11 @@ def test_bti(_bids_validate, tmp_path):
     bids_path = _bids_path.copy().update(root=tmp_path, datatype='meg')
 
     # write the BIDS dataset description, then write BIDS files
-    make_dataset_description(tmp_path, name="BTi data")
+    s_ds = [{"URL": "https://mne.testing.data"}]
+    gen_by = [{"Name": "mne_bids"}]
+    make_dataset_description(
+        path=tmp_path, name="BTi data", source_datasets=s_ds,
+        generated_by=gen_by, authors="a,b,c")
     write_raw_bids(raw, bids_path, verbose=True)
 
     assert op.exists(tmp_path / 'participants.tsv')
@@ -1253,8 +1286,11 @@ def test_eegieeg(dir_name, fname, reader, _bids_validate, tmp_path):
                           match='Encountered data in "double" format'):
             write_raw_bids(**kwargs)
 
-    make_dataset_description(bids_root, name="test",
-                             authors=["test1", "test2"], overwrite=True)
+    make_dataset_description(path=bids_root, name="test",
+                             authors=["test1", "test2"], overwrite=True,
+                             dataset_type="raw",
+                             ethics_approvals=["approved by S."],
+                             hed_version="No HED used (just testing)")
     dataset_description_fpath = op.join(bids_root, "dataset_description.json")
     with open(dataset_description_fpath, 'r', encoding='utf-8') as f:
         dataset_description_json = json.load(f)

diff --git a/mne_bids/write.py b/mne_bids/write.py
@@ -1042,48 +1042,65 @@ def _write_raw_edf(raw, bids_fname, overwrite):
 
 
 @verbose
-def make_dataset_description(path, name, data_license=None,
+def make_dataset_description(*, path, name, hed_version=None,
+                             dataset_type='raw', data_license=None,
                              authors=None, acknowledgements=None,
                              how_to_acknowledge=None, funding=None,
-                             references_and_links=None, doi=None,
-                             dataset_type='raw',
+                             ethics_approvals=None, references_and_links=None,
+                             doi=None, generated_by=None, source_datasets=None,
                              overwrite=False, verbose=None):
-    """Create json for a dataset description.
+    """Create a dataset_description.json file for a BIDS dataset.
 
-    BIDS datasets may have one or more fields, this function allows you to
-    specify which you wish to include in the description. See the BIDS
-    documentation for information about what each field means.
+    The dataset_description.json file is required in BIDS and describes
+    several general aspects of the dataset. You can use this function
+    to freely add metadata fields to this file. See the BIDS specification
+    for information about what each metadata field means.
 
     Parameters
     ----------
     path : str
         A path to a folder where the description will be created.
     name : str
         The name of this BIDS dataset.
+    hed_version : str
+        If HED tags are used: The version of the HED schema used to validate
+        HED tags for study.
+    dataset_type : str
+        Must be either "raw" or "derivative". Defaults to "raw".
     data_license : str | None
         The license under which this dataset is published.
     authors : list | str | None
         List of individuals who contributed to the creation/curation of the
-        dataset. Must be a list of str or a single comma separated str
-        like ['a', 'b', 'c'].
-    acknowledgements : list | str | None
-        Either a str acknowledging individuals who contributed to the
-        creation/curation of this dataset OR a list of the individuals'
-        names as str.
-    how_to_acknowledge : list | str | None
-        Either a str describing how to acknowledge this dataset OR a list of
-        publications that should be cited.
+        dataset. Must be a list of str (e.g., ['a', 'b', 'c']) or a single
+        comma-separated str (e.g., 'a, b, c').
+    acknowledgements : str | None
+        A str acknowledging individuals who contributed to the
+        creation/curation of this dataset.
+    how_to_acknowledge : str | None
+        A str describing how to acknowledge this dataset.
     funding : list | str | None
         List of sources of funding (e.g., grant numbers). Must be a list of
-        str or a single comma separated str like ['a', 'b', 'c'].
+        str (e.g., ['a', 'b', 'c']) or a single comma-separated str
+        (e.g., 'a, b, c').
+    ethics_approvals : list | str | None
+        List of ethics committee approvals of the research protocols
+        and/or protocol identifiers. Must be a list of str (e.g.,
+        ['a', 'b', 'c']) or a single comma-separated str (e.g., 'a, b, c').
     references_and_links : list | str | None
         List of references to publication that contain information on the
-        dataset, or links.  Must be a list of str or a single comma
-        separated str like ['a', 'b', 'c'].
+        dataset, or links.  Must be a list of str (e.g., ['a', 'b', 'c'])
+        or a single comma-separated str (e.g., 'a, b, c').
     doi : str | None
-        The DOI for the dataset.
-    dataset_type : str
-        Must be either "raw" or "derivative". Defaults to "raw".
+        The Digital Object Identifier of the dataset (not the corresponding
+        paper). Must be of the form ``doi:<insert_doi>`` (e.g.,
+        doi:10.5281/zenodo.3686061).
+    generated_by : list of dict | None
+        Used to specify provenance of the dataset. See BIDS specification
+        for details.
+    source_datasets : list of dict | None
+        Used to specify the locations and relevant attributes of all source
+        datasets. Each dict in the list represents one source dataset and
+        may contain the following keys: ``URL``, ``DOI``, ``Version``.
     overwrite : bool
         Whether to overwrite existing files or data in files.
         Defaults to False.
@@ -1094,32 +1111,76 @@ def make_dataset_description(path, name, data_license=None,
 
     Notes
     -----
-    The required field BIDSVersion will be automatically filled by mne_bids.
+    The required metadata field ``BIDSVersion`` will be automatically filled in
+    by mne_bids.
 
     """
-    # Put potential string input into list of strings
-    if isinstance(authors, str):
-        authors = authors.split(', ')
-    if isinstance(funding, str):
-        funding = funding.split(', ')
-    if isinstance(references_and_links, str):
-        references_and_links = references_and_links.split(', ')
+    # Convert potential string input into list of strings
+    convert_vars = [authors, funding, references_and_links, ethics_approvals]
+    convert_vars = [[i.strip() for i in var.split(',')]
+                    if isinstance(var, str) else var
+                    for var in convert_vars]
+    authors, funding, references_and_links, ethics_approvals = convert_vars
+
+    # Perform input checks
     if dataset_type not in ['raw', 'derivative']:
         raise ValueError('`dataset_type` must be either "raw" or '
                          '"derivative."')
+    if isinstance(doi, str):
+        if not doi.startswith('doi:'):
+            warn('The `doi` field in dataset_description should be of the '
+                 'form `doi:<insert_doi>`')
+
+    # check generated_by and source_datasets
+    msg_type = '{} must be a list of dicts or None.'
+    msg_key = 'found unexpected key(s) in dict: {}'
+
+    generated_by_keys = set([
+        "Name", "Version", "Description", "CodeURL", "Container"])
+    if isinstance(generated_by, list):
+        if not all([isinstance(i, dict) for i in generated_by]):
+            raise ValueError(msg_type.format('generated_by'))
+        for i in generated_by:
+            if 'Name' not in i:
+                raise ValueError(
+                    '"Name" is a required field for each dict in '
+                    'generated_by')
+            if not set(i.keys()).issubset(generated_by_keys):
+                raise ValueError(msg_key.format(i.keys() - generated_by_keys))
+    else:
+        if generated_by is not None:
+            raise ValueError(msg_type.format('generated_by'))
+
+    source_ds_keys = set(["URL", "DOI", "Version"])
+    if isinstance(source_datasets, list):
+        if not all([isinstance(i, dict) for i in source_datasets]):
+            raise ValueError(msg_type.format('source_datasets'))
+        for i in source_datasets:
+            if not set(i.keys()).issubset(source_ds_keys):
+                raise ValueError(msg_key.format(i.keys() - source_ds_keys))
+    else:
+        if source_datasets is not None:
+            raise ValueError(msg_type.format('source_datasets'))
 
+    # Prepare dataset_description.json
     fname = op.join(path, 'dataset_description.json')
     description = OrderedDict([
         ('Name', name),
         ('BIDSVersion', BIDS_VERSION),
+        ('HEDVersion', hed_version),
         ('DatasetType', dataset_type),
         ('License', data_license),
         ('Authors', authors),
         ('Acknowledgements', acknowledgements),
         ('HowToAcknowledge', how_to_acknowledge),
         ('Funding', funding),
+        ('EthicsApprovals', ethics_approvals),
         ('ReferencesAndLinks', references_and_links),
-        ('DatasetDOI', doi)])
+        ('DatasetDOI', doi),
+        ('GeneratedBy', generated_by),
+        ('SourceDatasets', source_datasets)])
+
+    # Handle potentially existing file contents
     if op.isfile(fname):
         with open(fname, 'r', encoding='utf-8-sig') as fin:
             orig_cols = json.load(fin)
@@ -1131,12 +1192,14 @@ def make_dataset_description(path, name, data_license=None,
         for key in description:
             if description[key] is None or not overwrite:
                 description[key] = orig_cols.get(key, None)
+
     # default author to make dataset description BIDS compliant
     # if the user passed an author don't overwrite,
     # if there was an author there, only overwrite if `overwrite=True`
     if authors is None and (description['Authors'] is None or overwrite):
         description['Authors'] = ["[Unspecified]"]
 
+    # Only write data that is not None
     pop_keys = [key for key, val in description.items() if val is None]
     for key in pop_keys:
         description.pop(key)
@@ -1660,7 +1723,7 @@ def write_raw_bids(raw, bids_path, events_data=None, event_id=None,
     # already exist. Always set overwrite to False here. If users
     # want to edit their dataset_description, they can directly call
     # this function.
-    make_dataset_description(bids_path.root, name=" ", overwrite=False)
+    make_dataset_description(path=bids_path.root, name=" ", overwrite=False)
 
     _sidecar_json(raw, task=bids_path.task, manufacturer=manufacturer,
                   fname=sidecar_path.fpath, datatype=bids_path.datatype,