Skip to content

Commit

Permalink
Stats module only iterates once over corpus
Browse files Browse the repository at this point in the history
Restructured to avoid unnecessarily iterating three times over the whole corpus; all counts are now collected in a single pass.
  • Loading branch information
markgw committed Aug 5, 2019
1 parent c8b0107 commit a14540e
Showing 1 changed file with 23 additions and 24 deletions.
47 changes: 23 additions & 24 deletions src/python/pimlico/modules/corpora/corpus_stats/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,36 +14,35 @@ class ModuleExecutor(BaseModuleExecutor):
def execute(self):
corpus = self.info.get_input("corpus")

self.log.info("Counting characters")
pbar = get_progress_bar(len(corpus), title="Counting")
characters = sum(sum(len(token) for token in sent) + len(sent)
for doc_name, doc in pbar(corpus) if not is_invalid_doc(doc) for sent in doc.sentences)
self.log.info("{:,} characters".format(characters))

self.log.info("Counting tokens")
pbar = get_progress_bar(len(corpus), title="Counting")
token_count = Counter(token for doc_name, doc in pbar(corpus) if not is_invalid_doc(doc)
for sent in doc.sentences for token in sent)
self.log.info("Collecting stats")
character_count = 0
sent_count = 0
token_counter = Counter()

types = len(token_count)
tokens = sum(token_count.values())

self.log.info("{:,} types".format(types))
self.log.info("{:,} tokens".format(tokens))

self.log.info("Counting sentences")
pbar = get_progress_bar(len(corpus), title="Counting")
sent_count = sum(len(doc.sentences) for doc_name, doc in pbar(corpus) if not is_invalid_doc(doc))

for __, doc in pbar(corpus):
if not is_invalid_doc(doc):
sent_count += len(doc.sentences)
for sent in doc.sentences:
# Add counts of each word
token_counter.update(sent)
# Count the characters in the tokens, plus spaces between them
character_count += sum(len(token) for token in sent) + len(sent) - 1

self.log.info("{:,} characters".format(character_count))
type_count = len(token_counter)
token_count = sum(token_counter.values())
self.log.info("{:,} types".format(type_count))
self.log.info("{:,} tokens".format(token_count))
self.log.info("{:,} sentences".format(sent_count))
self.log.info("{:.2f} characters per sentence".format(float(characters) / sent_count))
self.log.info("{:.2f} tokens per sentence".format(float(tokens) / sent_count))
self.log.info("{:.2f} characters per sentence".format(float(character_count) / sent_count))
self.log.info("{:.2f} tokens per sentence".format(float(token_count) / sent_count))

data = {
"types": types,
"tokens": tokens,
"types": type_count,
"tokens": token_count,
"sentences": sent_count,
"characters": characters,
"characters": character_count,
}

with self.info.get_output_writer("stats") as writer:
Expand Down

0 comments on commit a14540e

Please sign in to comment.