Added stopwords to vocab datatype
The Dictionary datatype now stores a list of stopwords.

The list is augmented when frequent words are filtered out.

Stopwords can also be specified manually via the vocab builder.

The vocab builder stores the stopwords as a separate output list, so that they
can easily be accessed later.
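Roughly, the new usage looks like the sketch below, pieced together from the changes in this commit. It is only an illustrative sketch: the module-executor context (`self.info`) and the toy corpus are assumed, and the filter values are arbitrary.

    # Illustrative sketch (assumed executor context); not part of the commit
    tokenized_docs = [["the", "cat", "sat"], ["a", "dog", "ran"]]  # assumed toy corpus
    initial_stopwords = ["the", "a", "of"]                         # hypothetical starting list
    with self.info.get_output_writer("vocab", stopwords=initial_stopwords) as vocab_writer:
        # Terms in the stopword list are skipped while the vocab is built
        vocab_writer.add_documents(tokenized_docs)
        # Frequent and rare removals are now reported separately; frequent terms
        # are also appended to the stopword list
        removed_freq, removed_rare = vocab_writer.filter_high_low(
            threshold=5, no_above=0.5, limit=100000)
        final_stopwords = list(vocab_writer.data.stopwords)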
markgw committed Oct 2, 2020
1 parent 81ccd08 commit 84bc733
Showing 3 changed files with 126 additions and 22 deletions.
98 changes: 87 additions & 11 deletions src/python/pimlico/datatypes/dictionary.py
@@ -18,7 +18,7 @@

from future import standard_library
standard_library.install_aliases()
from builtins import zip
from builtins import zip, sorted
from builtins import str
from builtins import range
from past.builtins import basestring
@@ -83,18 +83,28 @@ def get_detailed_status(self):
u"Vocab size: %d" % len(data)
]

class Writer(object):
class Writer:
"""
When the context manager is created, a new, empty :class:`DictionaryData` instance
is created. You can build your dictionary by calling `add_documents()` on the
writer, or accessing the dictionary data structure directly (via the `data`
attribute), or simply replace it with a fully formed :class:`DictionaryData`
instance of your own, assigned to the `data` attribute.
You can specify a list/set of stopwords when instantiating the writer. These
will be excluded from the dictionary if seen in the corpus.
"""
def __init__(self, *args, **kwargs):
self.stopwords = kwargs.pop("stopwords", None)
super(Dictionary.Writer, self).__init__(*args, **kwargs)

def __enter__(self):
super(Dictionary.Writer, self).__enter__()
self.data = DictionaryData()
# Allow an initial set of stopwords to be given
if self.stopwords is not None:
self.data.stopwords = set(self.stopwords)
return self

def __exit__(self, exc_type, exc_val, exc_tb):
@@ -106,11 +116,15 @@ def add_documents(self, documents, prune_at=2000000):
self.data.add_documents(documents, prune_at=prune_at)

def filter(self, threshold=None, no_above=None, limit=None):
rem_high, rem_low = self.filter_high_low(threshold=threshold, no_above=no_above, limit=limit)
return rem_high, rem_low

def filter_high_low(self, threshold=None, no_above=None, limit=None):
if threshold is None:
threshold = 0
if no_above is None:
no_above = 1.
return self.data.filter_extremes(no_below=threshold, no_above=no_above, keep_n=limit)
return self.data.filter_high_low_extremes(no_below=threshold, no_above=no_above, keep_n=limit)

def run_browser(self, reader, opts):
"""
@@ -129,11 +143,17 @@ class DictionaryData(object):
Dictionary encapsulates the mapping between normalized words and their integer ids.
This is taken almost directly from Gensim.
We also store a set of stopwords. These can be set explicitly (see `add_stopwords()`),
and will also include any words that are removed as a result of filters on the basis
that they're too common. This means that we can tell which words are OOV because we've
never seen them (or not seen them often) and which are common but filtered.
"""
def __init__(self):
self.token2id = {} # token -> tokenId
self._id2token = {} # reverse mapping for token2id; only formed on request, to save memory
self.dfs = {} # document frequencies: tokenId -> in how many documents this token appeared
self.stopwords = set() # Set of words to always skip

self.num_docs = 0 # number of documents processed
self.num_pos = 0 # total number of corpus positions
@@ -171,12 +191,36 @@ def __str__(self):
def refresh_id2token(self):
self._id2token = dict((id, token) for (token, id) in self.token2id.items())

def add_stopwords(self, new_stopwords):
"""
Add some stopwords to the list.
Raises an error if a stopword is in the dictionary. We don't remove the
term here, because that would end up changing IDs of other words unexpectedly.
Instead, we leave it to the user to ensure a stopword is removed before
being added to the list.
Terms already in the stopword list will not be added to the dictionary
later.
"""
self.stopwords.update(new_stopwords)
# Make sure the stopword isn't already in the vocab
for stopword in new_stopwords:
# We could remove the stopword here, but that would end up changing IDs of
# other words. Instead, we leave it to the user to ensure a stopword is removed
# before being added to the list
if stopword in self.token2id:
raise ValueError("tried to add a stopword '{}' that's already in the vocab".format(stopword))

def add_term(self, term):
"""
Add a term to the dictionary, without any occurrence count. Note that if you run threshold-based
filters after adding a term like this, it will get removed. If the term is in
the stopword list, it is not added and -1 is returned.
"""
if term in self.stopwords:
return -1
if term not in self.token2id:
new_id = len(self.token2id)
self.token2id[term] = new_id
@@ -236,9 +280,11 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
missing = dict((w, freq) for w, freq in counter.items() if w not in token2id)
if allow_update:
for w in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
# Don't add a new term if it's in the stopword list
if w not in self.stopwords:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)

result = dict((token2id[w], freq) for w, freq in counter.items() if w in token2id)

@@ -271,21 +317,51 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
**Note**: Due to the gap shrinking, the same word may have a different word id before and after the call
to this function!
"""
removed_freq, removed_rare = self.filter_high_low_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
return removed_freq + removed_rare

def filter_high_low_extremes(self, no_below=5, no_above=0.5, keep_n=100000, add_stopwords=True):
"""
Filter out tokens that appear in
1. fewer than `no_below` documents (absolute number) or
2. more than `no_above` documents (fraction of total corpus size, *not* absolute number).
3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`).
This is the same as filter_extremes(), but returns two separate lists: terms
removed because they're too frequent, and terms removed because they're not
frequent enough.
If `add_stopwords=True` (default), any frequent words filtered out will be added to the
stopwords list.
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold

# determine which tokens to keep
# Keep a record of what's removed
# Terms that appear too often
removed_freq = [(term, v, self.dfs[v]) for (term, v) in self.token2id.items() if self.dfs.get(v, 0) > no_above_abs]
# And those that don't meet the min threshold
removed_rare = [(term, v, self.dfs.get(v, 0)) for (term, v) in self.token2id.items() if self.dfs.get(v, 0) < no_below]
# Get the other IDs, which we're keeping
good_ids = (v for v in self.token2id.values() if no_below <= self.dfs.get(v, 0) <= no_above_abs)
good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)

# Keep only the top n
if keep_n is not None:
removed_rare.extend((self.id2token[v], v, self.dfs[v]) for v in good_ids[keep_n:])
good_ids = good_ids[:keep_n]

# Convert to set for (much) faster inclusion check
good_ids = set(good_ids)
# Keep a record of what items we remove, along with their counts
removed = [(token, id, self.dfs[id]) for (token, id) in self.token2id.items() if id not in good_ids]
# do the actual filtering, then rebuild dictionary to remove gaps in ids
# Do the actual filtering, then rebuild dictionary to remove gaps in ids
self.filter_tokens(good_ids=good_ids)
return removed

# Add frequent terms to the stopword list
if add_stopwords:
self.stopwords.update((token for token, id, freq in removed_freq))
return removed_freq, removed_rare

def filter_tokens(self, bad_ids=None, good_ids=None):
"""
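To see the new behaviour of `DictionaryData` on its own, here is a small, hedged example (toy data; the values in the comments are approximate and the exact term IDs depend on insertion order):

    from pimlico.datatypes.dictionary import DictionaryData

    data = DictionaryData()
    data.add_documents([
        ["the", "cat", "sat"],
        ["the", "dog", "sat"],
        ["the", "cat", "ran"],
    ])
    # "the" appears in all 3 docs; with no_above=0.7 (i.e. in more than 2 docs)
    # it is removed as too frequent, while "dog" and "ran" (1 doc each) fall
    # below no_below=2 and are removed as rare
    removed_freq, removed_rare = data.filter_high_low_extremes(no_below=2, no_above=0.7)
    # removed_freq ~ [("the", <id>, 3)]; removed_rare contains "dog" and "ran"
    print(data.stopwords)        # {"the"}: frequent removals are recorded as stopwords
    print(data.add_term("the"))  # -1: stopwords are never (re-)added to the vocab
    data.doc2bow(["the", "cat"], allow_update=True)  # "the" is ignored, only "cat" counted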
38 changes: 28 additions & 10 deletions src/python/pimlico/modules/corpora/vocab_builder/execute.py
@@ -13,6 +13,12 @@ def execute(self):
input_docs = self.info.get_input("text")
oov_token = self.info.options["oov"]

# Read an optional list of stopwords
stopwords = self.info.get_input("stopwords")
if stopwords is not None:
stopwords = stopwords.get_list()
self.log.info("Initial list of {:,} stopwords".format(len(stopwords)))

prune_at = self.info.options["prune_at"] or None
if prune_at is not None:
self.log.info("Pruning if dictionary size reaches {}".format(prune_at))
@@ -21,7 +27,9 @@
pbar = get_progress_bar(len(input_docs), title="Counting")

# Prepare dictionary writers for the term and feature vocabs
with self.info.get_output_writer("vocab") as vocab_writer:
# Set the list of stopwords initially, so that these terms will be
# ignored while building the vocab
with self.info.get_output_writer("vocab", stopwords=stopwords) as vocab_writer:
# Input is given for every document in a corpus
# Update the term vocab with all terms in each doc
vocab_writer.add_documents(
@@ -30,21 +38,25 @@
)

# Filter the vocab according to the options set
self.log.info("Built dictionary of %d terms, applying filters" % len(vocab_writer.data))
self.log.info("Built dictionary of {:,} terms, applying filters".format(len(vocab_writer.data)))

self.log.info("Feature vocab filters: %s" % ", ".join("%s=%s" % (k, v) for (k, v) in [
self.log.info("Feature vocab filters: {}".format(", ".join("{}={}".format(k, v) for (k, v) in [
("threshold", self.info.options["threshold"]),
("max proportion", self.info.options["max_prop"]),
("limit", self.info.options["limit"]),
] if v is not None))
removed = vocab_writer.filter(
] if v is not None)))
removed_freq, removed_rare = vocab_writer.filter_high_low(
self.info.options["threshold"],
self.info.options["max_prop"],
self.info.options["limit"]
)
show_removed = removed[:30] + [("...", None, None)] if len(removed) > 30 else removed
self.log.info("Filters removed %d items from vocabulary: %s" % (
len(removed), ", ".join(char for (char, __, __) in show_removed)
show_removed_freq = removed_freq[:30] + [("...", None, None)] if len(removed_freq) > 30 else removed_freq
show_removed_rare = removed_rare[:30] + [("...", None, None)] if len(removed_rare) > 30 else removed_rare
self.log.info("Filters removed {:,} frequent items from vocabulary: {}".format(
len(removed_freq), ", ".join(char for (char, __, __) in show_removed_freq)
))
self.log.info("Filters removed {:,} rare items from vocabulary: {}".format(
len(removed_rare), ", ".join(char for (char, __, __) in show_removed_rare)
))

if self.info.options["include"] is not None:
@@ -69,6 +81,12 @@ def execute(self):
else 0 for dn, doc in pbar(input_docs) if not is_invalid_doc(doc)), 0
)
vocab_writer.data.dfs[oov_id] = oov_count
self.log.info("Added OOV token '%s' with count of %d" % (oov_token, oov_count))
self.log.info("Added OOV token '{}' with count of {:,}".format(oov_token, oov_count))

self.log.info("Outputting vocab ({} terms)".format(len(vocab_writer.data)))

stopwords = list(vocab_writer.data.stopwords)

self.log.info("Outputting vocab (%d terms)" % len(vocab_writer.data))
self.log.info("Final list of {:,} stopwords".format(len(stopwords)))
with self.info.get_output_writer("stopwords") as stopwords_writer:
stopwords_writer.write_list(stopwords)
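Downstream modules can then pick up the augmented stopword list from the new `stopwords` output. A minimal hedged sketch (the receiving module's input wiring is assumed), mirroring how the executor above reads its own optional stopword input:

    # Hedged sketch: inside another module's executor that takes the vocab
    # builder's "stopwords" output as an input
    stopwords = set(self.info.get_input("stopwords").get_list())
    self.log.info("Using {:,} stopwords from the vocab builder".format(len(stopwords)))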
12 changes: 11 additions & 1 deletion src/python/pimlico/modules/corpora/vocab_builder/info.py
@@ -9,7 +9,13 @@
Similar to :mod:`pimlico.modules.features.vocab_builder`, which builds two vocabs, one for terms and one for
features.
An optional list of stopwords may be given as input; these will be ignored, even if they're found in the corpus.
The filter to remove frequent words (controlled by `max_prop`) will potentially add further
stopwords, so the resulting list is output as `stopwords`.
"""
from pimlico.datatypes import StringList

from pimlico.core.modules.base import BaseModuleInfo
from pimlico.core.modules.options import comma_separated_strings
from pimlico.datatypes.corpora import GroupedCorpus
@@ -21,7 +27,11 @@ class ModuleInfo(BaseModuleInfo):
module_type_name = "vocab_builder"
module_readable_name = "Corpus vocab builder"
module_inputs = [("text", GroupedCorpus(TokenizedDocumentType()))]
module_outputs = [("vocab", Dictionary())]
module_optional_inputs = [("stopwords", StringList())]
module_outputs = [
("vocab", Dictionary()),
("stopwords", StringList()),
]
module_options = {
"threshold": {
"help": "Minimum number of occurrences required of a term to be included",
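For context, a hypothetical pipeline-config sketch of how the new optional input and output might be wired up. This assumes Pimlico's INI-style pipeline config; the module names, option values and connection syntax shown here are illustrative, not taken from this commit:

    # Hypothetical wiring (all names illustrative)
    [vocab]
    type=pimlico.modules.corpora.vocab_builder
    input_text=tokenize
    input_stopwords=initial_stopwords
    threshold=5
    max_prop=0.5
    # Later modules can consume the augmented list via this module's "stopwords" output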
