Skip to content

Commit

Permalink
Stats module only iterates once over corpus
Browse files Browse the repository at this point in the history
Restructured to avoid unnecessarily iterating three times over the whole corpus; all counts are now collected in a single pass.
  • Loading branch information
markgw committed Aug 5, 2019
1 parent c8b0107 commit a14540e
Showing 1 changed file with 23 additions and 24 deletions.
47 changes: 23 additions & 24 deletions src/python/pimlico/modules/corpora/corpus_stats/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,36 +14,35 @@ class ModuleExecutor(BaseModuleExecutor):
def execute(self):
corpus = self.info.get_input("corpus")

self.log.info("Counting characters")
pbar = get_progress_bar(len(corpus), title="Counting")
characters = sum(sum(len(token) for token in sent) + len(sent)
for doc_name, doc in pbar(corpus) if not is_invalid_doc(doc) for sent in doc.sentences)
self.log.info("{:,} characters".format(characters))

self.log.info("Counting tokens")
pbar = get_progress_bar(len(corpus), title="Counting")
token_count = Counter(token for doc_name, doc in pbar(corpus) if not is_invalid_doc(doc)
for sent in doc.sentences for token in sent)
self.log.info("Collecting stats")
character_count = 0
sent_count = 0
token_counter = Counter()

types = len(token_count)
tokens = sum(token_count.values())

self.log.info("{:,} types".format(types))
self.log.info("{:,} tokens".format(tokens))

self.log.info("Counting sentences")
pbar = get_progress_bar(len(corpus), title="Counting")
sent_count = sum(len(doc.sentences) for doc_name, doc in pbar(corpus) if not is_invalid_doc(doc))

for __, doc in pbar(corpus):
if not is_invalid_doc(doc):
sent_count += len(doc.sentences)
for sent in doc.sentences:
# Add counts of each word
token_counter.update(sent)
# Count the characters in the tokens, plus spaces between them
character_count += sum(len(token) for token in sent) + len(sent) - 1

self.log.info("{:,} characters".format(character_count))
type_count = len(token_counter)
token_count = sum(token_counter.values())
self.log.info("{:,} types".format(type_count))
self.log.info("{:,} tokens".format(token_count))
self.log.info("{:,} sentences".format(sent_count))
self.log.info("{:.2f} characters per sentence".format(float(characters) / sent_count))
self.log.info("{:.2f} tokens per sentence".format(float(tokens) / sent_count))
self.log.info("{:.2f} characters per sentence".format(float(character_count) / sent_count))
self.log.info("{:.2f} tokens per sentence".format(float(token_count) / sent_count))

data = {
"types": types,
"tokens": tokens,
"types": type_count,
"tokens": token_count,
"sentences": sent_count,
"characters": characters,
"characters": character_count,
}

with self.info.get_output_writer("stats") as writer:
Expand Down

0 comments on commit a14540e

Please sign in to comment.