diff --git a/pipeline/src/base.py b/pipeline/src/base.py index ada9101b..58d7aa95 100644 --- a/pipeline/src/base.py +++ b/pipeline/src/base.py @@ -240,7 +240,9 @@ def __init__(self, id=None, **properties): def save(self, file_path, indent=2): """ - Save this object to a file in JSON-LD format + Save this object to a file in JSON-LD format. + + It is recommended to use the extension ".jsonld". """ with open(file_path, "w") as output_file: json.dump(self.to_jsonld(), output_file, indent=indent) diff --git a/pipeline/src/collection.py b/pipeline/src/collection.py index 3a67d6d5..b3ea7eb9 100644 --- a/pipeline/src/collection.py +++ b/pipeline/src/collection.py @@ -5,6 +5,7 @@ The collection can be saved to and loaded from disk, in JSON-LD format. """ +from glob import glob import json import os from .registry import lookup_type @@ -69,7 +70,7 @@ def _sort_nodes_by_id(self): sorted_nodes = dict(sorted(self.nodes.items())) self.nodes = sorted_nodes - def save(self, path, individual_files=False, include_empty_properties=False): + def save(self, path, individual_files=False, include_empty_properties=False, group_by_schema=False): """ Save the node collection to disk in JSON-LD format. @@ -78,10 +79,18 @@ def save(self, path, individual_files=False, include_empty_properties=False): path (str): either a file or a directory into which the metadata will be written. + It is recommended to use the extension ".jsonld". individual_files (bool): if False (default), save the entire collection into a single file. if True, `path` must be a directory, and each node is saved into a separate file within that directory. + include_empty_properties (bool): + if False (default), do not include properties with value None. + if True, include all properties. + group_by_schema (bool): + Only applies if `individual_files` is True. + If False (default), save all files in a single directory. + If True, save into subdirectories according to the schema name. 
Returns ------- @@ -137,7 +146,12 @@ def save(self, path, individual_files=False, include_empty_properties=False): else: assert node.id.startswith("_:") file_identifier = node.id[2:] - file_path = os.path.join(path, f"{file_identifier}.jsonld") + if group_by_schema: + dir_path = os.path.join(path, node.__class__.__name__) + os.makedirs(dir_path, exist_ok=True) + file_path = os.path.join(dir_path, f"{file_identifier}.jsonld") + else: + file_path = os.path.join(path, f"{file_identifier}.jsonld") with open(file_path, "w") as fp: data = node.to_jsonld(embed_linked_nodes=False, include_empty_properties=include_empty_properties) json.dump(data, fp, indent=2) @@ -150,9 +164,9 @@ def load(self, *paths, version=DEFAULT_VERSION): `*paths` may contain either: - 1) a single directory, in which case - all JSON-LD files all the top level of this directory will be loaded - (but without descending into subdirectories) + 1) a single directory, in which case all JSON-LD files in this directory + and any non-hidden subdirectories will be loaded + (where hidden subdirectories are those whose name starts with "."). 2) one or more JSON-LD files, which will all be loaded. 
@@ -161,11 +175,10 @@ def load(self, *paths, version=DEFAULT_VERSION):
         """
         if len(paths) == 1 and os.path.isdir(paths[0]):
             data_dir = paths[0]
-            json_paths = [
-                os.path.join(data_dir, item)
-                for item in os.listdir(data_dir)
-                if os.path.splitext(item)[1] in (".json", ".jsonld")
-            ]
+            json_paths = (
+                glob(f"{data_dir}/**/*.jsonld", recursive=True)
+                + glob(f"{data_dir}/**/*.json", recursive=True)
+            )
         else:
             json_paths = paths
 
diff --git a/pipeline/tests/test_collections.py b/pipeline/tests/test_collections.py
index add5ddd7..d1accfc8 100644
--- a/pipeline/tests/test_collections.py
+++ b/pipeline/tests/test_collections.py
@@ -77,6 +77,27 @@ def test_round_trip_multi_file():
     assert p == np
 
 
+def test_round_trip_multi_file_group_by_schema():
+    """Save one node per file, grouped into per-schema subdirectories, then reload and compare."""
+    shutil.rmtree(test_output_dir, ignore_errors=True)
+    person = build_fake_node(omcore.Person)
+    collection = Collection(person)
+    collection.save(test_output_dir, individual_files=True, include_empty_properties=False, group_by_schema=True)
+    new_collection = Collection()
+    new_collection.load(test_output_dir)
+
+    assert len(collection) == len(new_collection)
+
+    for node in new_collection:
+        if node.id == person.id:
+            new_person = node  # the reloaded node, not the original, so the round trip is really checked
+            break
+
+    p = person.to_jsonld(include_empty_properties=False, embed_linked_nodes=True)
+    np = new_person.to_jsonld(include_empty_properties=False, embed_linked_nodes=True)
+    assert p == np
+
+
 def test_collection_sort_by_id():
     person = omcore.Person(given_name="A", family_name="Professor", id="_:004")
     uni1 = omcore.Organization(full_name="University of This Place", id="_:002")