Added list_iter() method to iterable corpus

markgw · Sep 29, 2020 · 3ba555e
1 parent b34f328
commit 3ba555e
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 0 deletions.
diff --git a/src/python/pimlico/datatypes/corpora/base.py b/src/python/pimlico/datatypes/corpora/base.py
@@ -213,6 +213,18 @@ def __iter__(self):
             """
             raise NotImplementedError
 
+        def list_iter(self):
+            """
+            Yield the name of each document in the corpus, discarding the doc itself.
+
+            This default implementation just runs the normal document iteration and
+            drops each doc, so it is no faster than iterating over the corpus. Subclasses
+            of IterableCorpus may override it to list names without loading the docs.
+
+            """
+            for name, _ in self:
+                yield name
+
         def data_to_document(self, data, metadata=None):
             """
             Applies the corpus' datatype's processing to the raw data, given as a

diff --git a/src/python/pimlico/datatypes/corpora/grouped.py b/src/python/pimlico/datatypes/corpora/grouped.py
@@ -276,6 +276,16 @@ def list_archive_iter(self):
                             doc_name = doc_name[:-3]
                         yield archive_name, doc_name
 
+        def list_iter(self):
+            """
+            Yield document names only, taken from list_archive_iter() with archive names dropped.
+
+            In some cases, this could be considerably faster than iterating over all the docs.
+
+            """
+            for _, name in self.list_archive_iter():
+                yield name
+
     class Writer(object):
         """
         Writes a large corpus of documents out to disk, grouping them together in Pimarc archives.