Skip to content

Commit

Permalink
Fixed doc count bugs in vocab builder
Browse files Browse the repository at this point in the history
The OOV count was incorrectly computed. Computing it correctly unfortunately requires another pass over the data, but this seems to be unavoidable.

Other document counts were also wrong: we were adding documents to the vocab line by line, so each line was counted as a separate document.
  • Loading branch information
markgw committed Mar 11, 2020
1 parent 1212174 commit 9a63638
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 4 deletions.
11 changes: 11 additions & 0 deletions src/python/pimlico/datatypes/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,17 @@ def filter(self, threshold=None, no_above=None, limit=None):
no_above = 1.
return self.data.filter_extremes(no_below=threshold, no_above=no_above, keep_n=limit)

def run_browser(self, reader, opts):
    """
    Browse the vocab simply by printing out all the words.

    Words are printed in descending order of document frequency; words with
    no recorded document frequency are listed first (shown with a "--" count).
    """
    data = reader.get_data()
    print("Showing all words in vocabulary")
    # Rank words missing from dfs ahead of everything else by giving them a
    # frequency one higher than the observed maximum
    missing_rank = max(data.dfs.values()) + 1

    def freq_key(item):
        # item is a (word, word_id) pair; negate so sorted() gives descending order
        return -data.dfs.get(item[1], missing_rank)

    for word, word_id in sorted(data.token2id.items(), key=freq_key):
        print(u"{}: {}".format(word, data.dfs.get(word_id, "--")))


class DictionaryData(object):
"""
Expand Down
17 changes: 13 additions & 4 deletions src/python/pimlico/modules/corpora/vocab_builder/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def execute(self):
# Input is given for every document in a corpus
# Update the term vocab with all terms in each doc
vocab_writer.add_documents(
(line for doc_name, doc in pbar(input_docs) if not is_invalid_doc(doc) for line in doc.sentences),
(sum(doc.sentences, []) for doc_name, doc in pbar(input_docs) if not is_invalid_doc(doc)),
prune_at=prune_at
)

Expand Down Expand Up @@ -56,9 +56,18 @@ def execute(self):
if oov_token:
# Add the OOV token to the vocabulary
oov_id = vocab_writer.data.add_term(oov_token)
# Set the count to the total count of everything that was filtered out
# If we didn't apply filters, or they didn't have an effect, this will be 0, but we include it anyway
oov_count = sum([count for (t, i, count) in removed], 0)
# To get a correct document count for the OOV token, we need to go over the
# data again and check where the filtered-out terms appear
self.log.info("Counting OOVs in the input corpus")
pbar = get_progress_bar(len(input_docs), title="Counting OOVs")
vocab_terms = set(vocab_writer.data.token2id.keys())
oov_count = sum(
(1 if any(
word not in vocab_terms
for line in doc.sentences
for word in line)
else 0 for dn, doc in pbar(input_docs)), 0
)
vocab_writer.data.dfs[oov_id] = oov_count
self.log.info("Added OOV token '%s' with count of %d" % (oov_token, oov_count))

Expand Down

0 comments on commit 9a63638

Please sign in to comment.