Added stopwords to vocab datatype
The Dictionary datatype now stores a list of stopwords.

The list is augmented when frequent words are filtered out.

Stopwords can also be specified manually via the vocab builder.

The vocab builder stores the stopwords as a separate output list, so that they
can easily be accessed later.
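Roughly, the new usage looks like the sketch below, pieced together from the changes in this commit. It is only an illustrative sketch: the module-executor context (`self.info`) and the toy corpus are assumed, and the filter values are arbitrary.

    # Illustrative sketch (assumed executor context); not part of the commit
    tokenized_docs = [["the", "cat", "sat"], ["a", "dog", "ran"]]  # assumed toy corpus
    initial_stopwords = ["the", "a", "of"]                         # hypothetical starting list
    with self.info.get_output_writer("vocab", stopwords=initial_stopwords) as vocab_writer:
        # Terms in the stopword list are skipped while the vocab is built
        vocab_writer.add_documents(tokenized_docs)
        # Frequent and rare removals are now reported separately; frequent terms
        # are also appended to the stopword list
        removed_freq, removed_rare = vocab_writer.filter_high_low(
            threshold=5, no_above=0.5, limit=100000)
        final_stopwords = list(vocab_writer.data.stopwords)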
markgw committed Oct 2, 2020
1 parent 81ccd08 commit 84bc733
Showing 3 changed files with 126 additions and 22 deletions.
98 changes: 87 additions & 11 deletions src/python/pimlico/datatypes/dictionary.py
@@ -18,7 +18,7 @@

from future import standard_library
standard_library.install_aliases()
from builtins import zip
from builtins import zip, sorted
from builtins import str
from builtins import range
from past.builtins import basestring
@@ -83,18 +83,28 @@ def get_detailed_status(self):
u"Vocab size: %d" % len(data)
]

class Writer(object):
class Writer:
"""
When the context manager is created, a new, empty :class:`DictionaryData` instance
is created. You can build your dictionary by calling `add_documents()` on the
writer, or accessing the dictionary data structure directly (via the `data`
attribute), or simply replace it with a fully formed :class:`DictionaryData`
instance of your own, assigned to the `data` attribute.
You can specify a list/set of stopwords when instantiating the writer. These
will be excluded from the dictionary if seen in the corpus.
"""
def __init__(self, *args, **kwargs):
self.stopwords = kwargs.pop("stopwords", None)
super(Dictionary.Writer, self).__init__(*args, **kwargs)

def __enter__(self):
super(Dictionary.Writer, self).__enter__()
self.data = DictionaryData()
# Allow an initial set of stopwords to be given
if self.stopwords is not None:
self.data.stopwords = set(self.stopwords)
return self

def __exit__(self, exc_type, exc_val, exc_tb):
@@ -106,11 +116,15 @@ def add_documents(self, documents, prune_at=2000000):
self.data.add_documents(documents, prune_at=prune_at)

def filter(self, threshold=None, no_above=None, limit=None):
rem_high, rem_low = self.filter_high_low(threshold=threshold, no_above=no_above, limit=limit)
return rem_high, rem_low

def filter_high_low(self, threshold=None, no_above=None, limit=None):
if threshold is None:
threshold = 0
if no_above is None:
no_above = 1.
return self.data.filter_extremes(no_below=threshold, no_above=no_above, keep_n=limit)
return self.data.filter_high_low_extremes(no_below=threshold, no_above=no_above, keep_n=limit)

def run_browser(self, reader, opts):
"""
@@ -129,11 +143,17 @@ class DictionaryData(object):
Dictionary encapsulates the mapping between normalized words and their integer ids.
This is taken almost directly from Gensim.
We also store a set of stopwords. These can be set explicitly (see `add_stopwords()`),
and will also include any words that are removed as a result of filters on the basis
that they're too common. This means that we can tell which words are OOV because we've
never seen them (or not seen them often) and which are common but filtered.
"""
def __init__(self):
self.token2id = {} # token -> tokenId
self._id2token = {} # reverse mapping for token2id; only formed on request, to save memory
self.dfs = {} # document frequencies: tokenId -> in how many documents this token appeared
self.stopwords = set() # Set of words to always skip

self.num_docs = 0 # number of documents processed
self.num_pos = 0 # total number of corpus positions
@@ -171,12 +191,36 @@ def __str__(self):
def refresh_id2token(self):
self._id2token = dict((id, token) for (token, id) in self.token2id.items())

def add_stopwords(self, new_stopwords):
"""
Add some stopwords to the list.
Raises an error if a stopword is in the dictionary. We don't remove the
term here, because that would end up changing IDs of other words unexpectedly.
Instead, we leave it to the user to ensure a stopword is removed before
being added to the list.
Terms already in the stopword list will not be added to the dictionary
later.
"""
self.stopwords.update(new_stopwords)
# Make sure the stopword isn't already in the vocab
for stopword in new_stopwords:
# We could remove the stopword here, but that would end up changing IDs of
# other words. Instead, we leave it to the user to ensure a stopword is removed
# before being added to the list
if stopword in self.token2id:
raise ValueError("tried to add a stopword '{}' that's already in the vocab".format(stopword))

def add_term(self, term):
"""
Add a term to the dictionary, without any occurrence count. Note that if you run threshold-based
filters after adding a term like this, it will get removed. If the term is in
the stopword list, it is not added and -1 is returned.
"""
if term in self.stopwords:
return -1
if term not in self.token2id:
new_id = len(self.token2id)
self.token2id[term] = new_id
@@ -236,9 +280,11 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
missing = dict((w, freq) for w, freq in counter.items() if w not in token2id)
if allow_update:
for w in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
# Don't add a new term if it's in the stopword list
if w not in self.stopwords:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)

result = dict((token2id[w], freq) for w, freq in counter.items() if w in token2id)

@@ -271,21 +317,51 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
**Note**: Due to the gap shrinking, the same word may have a different word id before and after the call
to this function!
"""
removed_freq, removed_rare = self.filter_high_low_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
return removed_freq + removed_rare

def filter_high_low_extremes(self, no_below=5, no_above=0.5, keep_n=100000, add_stopwords=True):
"""
Filter out tokens that appear in
1. fewer than `no_below` documents (absolute number) or
2. more than `no_above` documents (fraction of total corpus size, *not* absolute number).
3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`).
This is the same as filter_extremes(), but returns two separate lists: terms
removed because they're too frequent, and terms removed because they're not
frequent enough.
If `add_stopwords=True` (default), any frequent words filtered out will be added to the
stopwords list.
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold

# determine which tokens to keep
# Keep a record of what's removed
# Terms that appear too often
removed_freq = [(term, v, self.dfs[v]) for (term, v) in self.token2id.items() if self.dfs.get(v, 0) > no_above_abs]
# And those that don't meet the min threshold
removed_rare = [(term, v, self.dfs.get(v, 0)) for (term, v) in self.token2id.items() if self.dfs.get(v, 0) < no_below]
# Get the other IDs, which we're keeping
good_ids = (v for v in self.token2id.values() if no_below <= self.dfs.get(v, 0) <= no_above_abs)
good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)

# Keep only the top n
if keep_n is not None:
removed_rare.extend((self.id2token[v], v, self.dfs[v]) for v in good_ids[keep_n:])
good_ids = good_ids[:keep_n]

# Convert to set for (much) faster inclusion check
good_ids = set(good_ids)
# Keep a record of what items we remove, along with their counts
removed = [(token, id, self.dfs[id]) for (token, id) in self.token2id.items() if id not in good_ids]
# do the actual filtering, then rebuild dictionary to remove gaps in ids
# Do the actual filtering, then rebuild dictionary to remove gaps in ids
self.filter_tokens(good_ids=good_ids)
return removed

# Add frequent terms to the stopword list
if add_stopwords:
self.stopwords.update((token for token, id, freq in removed_freq))
return removed_freq, removed_rare

def filter_tokens(self, bad_ids=None, good_ids=None):
"""
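To see the new behaviour of `DictionaryData` on its own, here is a small, hedged example (toy data; the values in the comments are approximate and the exact term IDs depend on insertion order):

    from pimlico.datatypes.dictionary import DictionaryData

    data = DictionaryData()
    data.add_documents([
        ["the", "cat", "sat"],
        ["the", "dog", "sat"],
        ["the", "cat", "ran"],
    ])
    # "the" appears in all 3 docs; with no_above=0.7 (i.e. in more than 2 docs)
    # it is removed as too frequent, while "dog" and "ran" (1 doc each) fall
    # below no_below=2 and are removed as rare
    removed_freq, removed_rare = data.filter_high_low_extremes(no_below=2, no_above=0.7)
    # removed_freq ~ [("the", <id>, 3)]; removed_rare contains "dog" and "ran"
    print(data.stopwords)        # {"the"}: frequent removals are recorded as stopwords
    print(data.add_term("the"))  # -1: stopwords are never (re-)added to the vocab
    data.doc2bow(["the", "cat"], allow_update=True)  # "the" is ignored, only "cat" counted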
38 changes: 28 additions & 10 deletions src/python/pimlico/modules/corpora/vocab_builder/execute.py
@@ -13,6 +13,12 @@ def execute(self):
input_docs = self.info.get_input("text")
oov_token = self.info.options["oov"]

# Read an optional list of stopwords
stopwords = self.info.get_input("stopwords")
if stopwords is not None:
stopwords = stopwords.get_list()
self.log.info("Initial list of {:,} stopwords".format(len(stopwords)))

prune_at = self.info.options["prune_at"] or None
if prune_at is not None:
self.log.info("Pruning if dictionary size reaches {}".format(prune_at))
@@ -21,7 +27,9 @@
pbar = get_progress_bar(len(input_docs), title="Counting")

# Prepare dictionary writers for the term and feature vocabs
with self.info.get_output_writer("vocab") as vocab_writer:
# Set the list of stopwords initially, so that these terms will be
# ignored while building the vocab
with self.info.get_output_writer("vocab", stopwords=stopwords) as vocab_writer:
# Input is given for every document in a corpus
# Update the term vocab with all terms in each doc
vocab_writer.add_documents(
@@ -30,21 +38,25 @@
)

# Filter the vocab according to the options set
self.log.info("Built dictionary of %d terms, applying filters" % len(vocab_writer.data))
self.log.info("Built dictionary of {:,} terms, applying filters".format(len(vocab_writer.data)))

self.log.info("Feature vocab filters: %s" % ", ".join("%s=%s" % (k, v) for (k, v) in [
self.log.info("Feature vocab filters: {}".format(", ".join("{}={}".format(k, v) for (k, v) in [
("threshold", self.info.options["threshold"]),
("max proportion", self.info.options["max_prop"]),
("limit", self.info.options["limit"]),
] if v is not None))
removed = vocab_writer.filter(
] if v is not None)))
removed_freq, removed_rare = vocab_writer.filter_high_low(
self.info.options["threshold"],
self.info.options["max_prop"],
self.info.options["limit"]
)
show_removed = removed[:30] + [("...", None, None)] if len(removed) > 30 else removed
self.log.info("Filters removed %d items from vocabulary: %s" % (
len(removed), ", ".join(char for (char, __, __) in show_removed)
show_removed_freq = removed_freq[:30] + [("...", None, None)] if len(removed_freq) > 30 else removed_freq
show_removed_rare = removed_rare[:30] + [("...", None, None)] if len(removed_rare) > 30 else removed_rare
self.log.info("Filters removed {:,} frequent items from vocabulary: {}".format(
len(removed_freq), ", ".join(char for (char, __, __) in show_removed_freq)
))
self.log.info("Filters removed {:,} rare items from vocabulary: {}".format(
len(removed_rare), ", ".join(char for (char, __, __) in show_removed_rare)
))

if self.info.options["include"] is not None:
@@ -69,6 +81,12 @@ def execute(self):
else 0 for dn, doc in pbar(input_docs) if not is_invalid_doc(doc)), 0
)
vocab_writer.data.dfs[oov_id] = oov_count
self.log.info("Added OOV token '%s' with count of %d" % (oov_token, oov_count))
self.log.info("Added OOV token '{}' with count of {:,}".format(oov_token, oov_count))

self.log.info("Outputting vocab ({} terms)".format(len(vocab_writer.data)))

stopwords = list(vocab_writer.data.stopwords)

self.log.info("Outputting vocab (%d terms)" % len(vocab_writer.data))
self.log.info("Final list of {:,} stopwords".format(len(stopwords)))
with self.info.get_output_writer("stopwords") as stopwords_writer:
stopwords_writer.write_list(stopwords)
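Downstream modules can then pick up the augmented stopword list from the new `stopwords` output. A minimal hedged sketch (the receiving module's input wiring is assumed), mirroring how the executor above reads its own optional stopword input:

    # Hedged sketch: inside another module's executor that takes the vocab
    # builder's "stopwords" output as an input
    stopwords = set(self.info.get_input("stopwords").get_list())
    self.log.info("Using {:,} stopwords from the vocab builder".format(len(stopwords)))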
12 changes: 11 additions & 1 deletion src/python/pimlico/modules/corpora/vocab_builder/info.py
@@ -9,7 +9,13 @@
Similar to :mod:`pimlico.modules.features.vocab_builder`, which builds two vocabs, one for terms and one for
features.
An optional list of stopwords may be given as input; these will be ignored, even if they're found in the corpus.
The filter to remove frequent words (controlled by `max_prop`) will potentially add further
stopwords, so the resulting list is output as `stopwords`.
"""
from pimlico.datatypes import StringList

from pimlico.core.modules.base import BaseModuleInfo
from pimlico.core.modules.options import comma_separated_strings
from pimlico.datatypes.corpora import GroupedCorpus
@@ -21,7 +27,11 @@ class ModuleInfo(BaseModuleInfo):
module_type_name = "vocab_builder"
module_readable_name = "Corpus vocab builder"
module_inputs = [("text", GroupedCorpus(TokenizedDocumentType()))]
module_outputs = [("vocab", Dictionary())]
module_optional_inputs = [("stopwords", StringList())]
module_outputs = [
("vocab", Dictionary()),
("stopwords", StringList()),
]
module_options = {
"threshold": {
"help": "Minimum number of occurrences required of a term to be included",
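For context, a hypothetical pipeline-config sketch of how the new optional input and output might be wired up. This assumes Pimlico's INI-style pipeline config; the module names, option values and connection syntax shown here are illustrative, not taken from this commit:

    # Hypothetical wiring (all names illustrative)
    [vocab]
    type=pimlico.modules.corpora.vocab_builder
    input_text=tokenize
    input_stopwords=initial_stopwords
    threshold=5
    max_prop=0.5
    # Later modules can consume the augmented list via this module's "stopwords" output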
