openMetadataInitiative · Raphael-Gazzotti · Nov 18, 2025 · Nov 14, 2025 · Nov 18, 2025
diff --git a/pipeline/src/base.py b/pipeline/src/base.py
@@ -240,7 +240,9 @@ def __init__(self, id=None, **properties):
 
     def save(self, file_path, indent=2):
         """
-        Save this object to a file in JSON-LD format
+        Save this object to a file in JSON-LD format.
+
+        It is recommended to use the extension ".jsonld".
         """
         with open(file_path, "w") as output_file:
             json.dump(self.to_jsonld(), output_file, indent=indent)

diff --git a/pipeline/src/collection.py b/pipeline/src/collection.py
@@ -5,6 +5,7 @@
 The collection can be saved to and loaded from disk, in JSON-LD format.
 """
 
+from glob import glob
 import json
 import os
 from .registry import lookup_type
@@ -69,7 +70,7 @@ def _sort_nodes_by_id(self):
         sorted_nodes = dict(sorted(self.nodes.items()))
         self.nodes = sorted_nodes
 
-    def save(self, path, individual_files=False, include_empty_properties=False):
+    def save(self, path, individual_files=False, include_empty_properties=False, group_by_schema=False):
         """
         Save the node collection to disk in JSON-LD format.
 
@@ -78,10 +79,18 @@ def save(self, path, individual_files=False, include_empty_properties=False):
 
         path (str):
             either a file or a directory into which the metadata will be written.
+            It is recommended to use the extension ".jsonld".
         individual_files (bool):
             if False (default), save the entire collection into a single file.
             if True, `path` must be a directory, and each node is saved into a
             separate file within that directory.
+        include_empty_properties (bool):
+            if False (default), do not include properties with value None.
+            if True, include all properties.
+        group_by_schema (bool):
+            Only applies if `individual_files` is True.
+            If False (default), save all files in a single directory.
+            If True, save into subdirectories according to the schema name.
 
         Returns
         -------
@@ -137,7 +146,12 @@ def save(self, path, individual_files=False, include_empty_properties=False):
                 else:
                     assert node.id.startswith("_:")
                     file_identifier = node.id[2:]
-                file_path = os.path.join(path, f"{file_identifier}.jsonld")
+                if group_by_schema:
+                    dir_path = os.path.join(path, node.__class__.__name__)
+                    os.makedirs(dir_path, exist_ok=True)
+                    file_path = os.path.join(dir_path, f"{file_identifier}.jsonld")
+                else:
+                    file_path = os.path.join(path, f"{file_identifier}.jsonld")
                 with open(file_path, "w") as fp:
                     data = node.to_jsonld(embed_linked_nodes=False, include_empty_properties=include_empty_properties)
                     json.dump(data, fp, indent=2)
@@ -150,9 +164,9 @@ def load(self, *paths, version=DEFAULT_VERSION):
 
         `*paths` may contain either:
 
-        1) a single directory, in which case
-        all JSON-LD files all the top level of this directory will be loaded
-        (but without descending into subdirectories)
+        1) a single directory, in which case all JSON-LD files in this directory
+        and any non-hidden subdirectories will be loaded
+        (where hidden subdirectories are those whose name starts with ".").
 
         2) one or more JSON-LD files, which will all be loaded.
 
@@ -161,11 +175,10 @@ def load(self, *paths, version=DEFAULT_VERSION):
         """
         if len(paths) == 1 and os.path.isdir(paths[0]):
             data_dir = paths[0]
-            json_paths = [
-                os.path.join(data_dir, item)
-                for item in os.listdir(data_dir)
-                if os.path.splitext(item)[1] in (".json", ".jsonld")
-            ]
+            json_paths = (
+                glob(f"{data_dir}/**/*.jsonld", recursive=True)
+                + glob(f"{data_dir}/**/*.json", recursive=True)
+            )
         else:
             json_paths = paths
 

diff --git a/pipeline/tests/test_collections.py b/pipeline/tests/test_collections.py
@@ -77,6 +77,26 @@ def test_round_trip_multi_file():
     assert p == np
 
 
+def test_round_trip_multi_file_group_by_schema():
+    """Save one file per node in per-schema subdirectories, reload, and compare."""
+    shutil.rmtree(test_output_dir, ignore_errors=True)
+    person = build_fake_node(omcore.Person)
+    collection = Collection(person)
+    collection.save(test_output_dir, individual_files=True, include_empty_properties=False, group_by_schema=True)
+    new_collection = Collection()
+    new_collection.load(test_output_dir)
+
+    assert len(collection) == len(new_collection)
+
+    # Compare against the node reloaded from disk; binding the original `person`
+    # here would make the final assertion compare the object with itself.
+    new_person = next(node for node in new_collection if node.id == person.id)
+
+    p = person.to_jsonld(include_empty_properties=False, embed_linked_nodes=True)
+    np = new_person.to_jsonld(include_empty_properties=False, embed_linked_nodes=True)
+    assert p == np
+
+
 def test_collection_sort_by_id():
     person = omcore.Person(given_name="A", family_name="Professor", id="_:004")
     uni1 = omcore.Organization(full_name="University of This Place", id="_:002")