Skip to content

Commit

Permalink
Added list_iter() method to iterable corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
markgw committed Sep 29, 2020
1 parent b34f328 commit 3ba555e
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/python/pimlico/datatypes/corpora/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,18 @@ def __iter__(self):
"""
raise NotImplementedError

def list_iter(self):
    """
    Yield the name of every document in the corpus, leaving out the
    document contents.

    This default implementation offers no speed advantage: it runs the
    ordinary document iterator (``iter(self)``), unpacks each
    ``(doc_name, doc)`` pair and discards the document. Subclasses of
    IterableCorpus that can enumerate names without loading documents
    should override this method to make it cheaper.
    """
    for name, _doc in self:
        # Only the name is wanted; the loaded document is dropped
        yield name

def data_to_document(self, data, metadata=None):
"""
Applies the corpus' datatype's processing to the raw data, given as a
Expand Down
10 changes: 10 additions & 0 deletions src/python/pimlico/datatypes/corpora/grouped.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,16 @@ def list_archive_iter(self):
doc_name = doc_name[:-3]
yield archive_name, doc_name

def list_iter(self):
    """
    Yield the name of every document in the corpus, without processing
    the document contents.

    The names are taken from ``list_archive_iter()``, which produces
    ``(archive_name, doc_name)`` pairs; the archive name is discarded.
    In some cases this can be considerably faster than iterating over
    all of the documents.
    """
    for _archive, name in self.list_archive_iter():
        yield name

class Writer(object):
"""
Writes a large corpus of documents out to disk, grouping them together in Pimarc archives.
Expand Down

0 comments on commit 3ba555e

Please sign in to comment.