Skip to content

Commit

Permalink
Fixed doc count bugs in vocab builder
Browse files Browse the repository at this point in the history
The OOV count was incorrectly computed. Computing it correctly unfortunately requires another pass over the data, but this seems to be unavoidable.

Other document counts were also wrong: we were adding documents to the vocab line by line, so each line was counted as a separate document.
  • Loading branch information
markgw committed Mar 11, 2020
1 parent 1212174 commit 9a63638
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 4 deletions.
11 changes: 11 additions & 0 deletions src/python/pimlico/datatypes/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,17 @@ def filter(self, threshold=None, no_above=None, limit=None):
no_above = 1.
return self.data.filter_extremes(no_below=threshold, no_above=no_above, keep_n=limit)

def run_browser(self, reader, opts):
    """
    Browse the vocab simply by printing out all the words.

    Words are printed in descending order of document frequency; words with
    no recorded document frequency are listed first (shown with a "--" count).
    """
    data = reader.get_data()
    print("Showing all words in vocabulary")
    # Rank words missing from dfs ahead of everything else by giving them a
    # frequency one higher than the observed maximum
    missing_rank = max(data.dfs.values()) + 1

    def freq_key(item):
        # item is a (word, word_id) pair; negate so sorted() gives descending order
        return -data.dfs.get(item[1], missing_rank)

    for word, word_id in sorted(data.token2id.items(), key=freq_key):
        print(u"{}: {}".format(word, data.dfs.get(word_id, "--")))


class DictionaryData(object):
"""
Expand Down
17 changes: 13 additions & 4 deletions src/python/pimlico/modules/corpora/vocab_builder/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def execute(self):
# Input is given for every document in a corpus
# Update the term vocab with all terms in each doc
vocab_writer.add_documents(
(line for doc_name, doc in pbar(input_docs) if not is_invalid_doc(doc) for line in doc.sentences),
(sum(doc.sentences, []) for doc_name, doc in pbar(input_docs) if not is_invalid_doc(doc)),
prune_at=prune_at
)

Expand Down Expand Up @@ -56,9 +56,18 @@ def execute(self):
if oov_token:
# Add the OOV token to the vocabulary
oov_id = vocab_writer.data.add_term(oov_token)
# Set the count to the total count of everything that was filtered out
# If we didn't apply filters, or they didn't have an effect, this will be 0, but we include it anyway
oov_count = sum([count for (t, i, count) in removed], 0)
# To get a correct document count for the OOV token, we need to go over the
# data again and check where the filtered-out terms appear
self.log.info("Counting OOVs in the input corpus")
pbar = get_progress_bar(len(input_docs), title="Counting OOVs")
vocab_terms = set(vocab_writer.data.token2id.keys())
oov_count = sum(
(1 if any(
word not in vocab_terms
for line in doc.sentences
for word in line)
else 0 for dn, doc in pbar(input_docs)), 0
)
vocab_writer.data.dfs[oov_id] = oov_count
self.log.info("Added OOV token '%s' with count of %d" % (oov_token, oov_count))

Expand Down

0 comments on commit 9a63638

Please sign in to comment.