
Commit

Added new embedding mappers
New datatypes and modules to produce them. These provide functions to
map words to embeddings, supporting various types of embeddings that
are not restricted to a fixed vocabulary.
markgw committed Oct 2, 2020
1 parent 74f74b2 commit 7110eb5
Showing 13 changed files with 400 additions and 6 deletions.
16 changes: 11 additions & 5 deletions docs/modules/pimlico.modules.corpora.vocab_builder.rst
@@ -15,6 +15,10 @@ ID to every distinct word seen in the corpus, optionally applying thresholds so
Similar to :mod:`pimlico.modules.features.vocab_builder`, which builds two vocabs, one for terms and one for
features.

A list of stopwords may be specified; these are ignored even if they are found in the corpus.
The filter that removes frequent words (controlled by `max_prop`) may add further
stopwords, so the resulting full list is produced as the `stopwords` output.
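For instance, a pipeline section for this module might look like the following, in the style of the example configs elsewhere in these docs. Only `max_prop` itself is named above; the input name and the threshold value are illustrative assumptions:

.. code-block:: ini

   [my_vocab_builder_module]
   type=pimlico.modules.corpora.vocab_builder
   input_text=module_a.some_output
   max_prop=0.5

Here `max_prop=0.5` would treat words occurring in more than half of the corpus as too frequent, adding them to the `stopwords` output.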


Inputs
======
@@ -28,11 +32,13 @@ Inputs
Outputs
=======

+-------+---------------------------------------------------------------+
| Name | Type(s) |
+=======+===============================================================+
| vocab | :class:`dictionary <pimlico.datatypes.dictionary.Dictionary>` |
+-------+---------------------------------------------------------------+
+-----------+---------------------------------------------------------------+
| Name | Type(s) |
+===========+===============================================================+
| vocab | :class:`dictionary <pimlico.datatypes.dictionary.Dictionary>` |
+-----------+---------------------------------------------------------------+
| stopwords | :class:`string_list <pimlico.datatypes.core.StringList>` |
+-----------+---------------------------------------------------------------+


Options
50 changes: 50 additions & 0 deletions docs/modules/pimlico.modules.embeddings.mappers.fasttext.rst
@@ -0,0 +1,50 @@
fastText to doc\-embedding mapper
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. py:module:: pimlico.modules.embeddings.mappers.fasttext

+------------+---------------------------------------------+
| Path | pimlico.modules.embeddings.mappers.fasttext |
+------------+---------------------------------------------+
| Executable | yes |
+------------+---------------------------------------------+

Use trained fastText embeddings to map words to their embeddings,
including OOVs, using sub-word information.

First train a fastText model using the fastText training module. Then
use this module to produce a doc-embeddings mapper.
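The sub-word idea behind fastText's OOV handling can be illustrated with a toy sketch. This is not fastText's actual algorithm (real n-gram vectors are learned during training, not random); it only shows how averaging character n-gram vectors lets a mapper produce an embedding for any word, including OOVs:

```python
import numpy as np

def char_ngrams(word, n=3):
    """All character n-grams of a word, with boundary markers as in fastText."""
    padded = "<" + word + ">"
    return [padded[i:i + n] for i in range(len(padded) - n + 1)]

# Toy n-gram embedding table: in fastText these vectors are learned, not random
rng = np.random.default_rng(0)
ngram_vectors = {}

def ngram_vector(ng):
    # Assign each n-gram a vector on first use, so repeated n-grams share vectors
    if ng not in ngram_vectors:
        ngram_vectors[ng] = rng.normal(size=4)
    return ngram_vectors[ng]

def word_vector(word):
    """Average the vectors of the word's character n-grams: OOVs still get a vector."""
    return np.mean([ngram_vector(ng) for ng in char_ngrams(word)], axis=0)

vec = word_vector("unseenword")  # works even though the word was never "trained"
```

Because every word decomposes into character n-grams, no token is ever unmappable, which is exactly why this module, unlike the fixed-embeddings mapper, can handle OOVs.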


*This module does not support Python 2, so can only be used when Pimlico is being run under Python 3*

Inputs
======

+------------+--------------------------------------------------------------------------------+
| Name | Type(s) |
+============+================================================================================+
| embeddings | :class:`fasttext_embeddings <pimlico.datatypes.embeddings.FastTextEmbeddings>` |
+------------+--------------------------------------------------------------------------------+

Outputs
=======

+--------+------------------------------------------------------------------------------------------+
| Name | Type(s) |
+========+==========================================================================================+
| mapper | :class:`fasttext_doc_embeddings_mapper <pimlico.datatypes.embeddings.FastTextDocMapper>` |
+--------+------------------------------------------------------------------------------------------+

Example config
==============

This is an example of how this module can be used in a pipeline config file.

.. code-block:: ini

   [my_fasttext_doc_mapper_module]
   type=pimlico.modules.embeddings.mappers.fasttext
   input_embeddings=module_a.some_output
50 changes: 50 additions & 0 deletions docs/modules/pimlico.modules.embeddings.mappers.fixed.rst
@@ -0,0 +1,50 @@
Fixed embeddings to doc\-embedding mapper
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. py:module:: pimlico.modules.embeddings.mappers.fixed

+------------+------------------------------------------+
| Path | pimlico.modules.embeddings.mappers.fixed |
+------------+------------------------------------------+
| Executable | yes |
+------------+------------------------------------------+

Use trained fixed word embeddings to map words to their embeddings.
OOVs are left unmapped, since there is no way to produce embeddings for them.

First train or load embeddings using another module.
Then use this module to produce a doc-embeddings mapper.


*This module does not support Python 2, so can only be used when Pimlico is being run under Python 3*

Inputs
======

+------------+---------------------------------------------------------------+
| Name | Type(s) |
+============+===============================================================+
| embeddings | :class:`embeddings <pimlico.datatypes.embeddings.Embeddings>` |
+------------+---------------------------------------------------------------+

Outputs
=======

+--------+---------------------------------------------------------------------------------------------------------+
| Name | Type(s) |
+========+=========================================================================================================+
| mapper | :class:`fixed_embeddings_doc_embeddings_mapper <pimlico.datatypes.embeddings.FixedEmbeddingsDocMapper>` |
+--------+---------------------------------------------------------------------------------------------------------+

Example config
==============

This is an example of how this module can be used in a pipeline config file.

.. code-block:: ini

   [my_fixed_embeddings_doc_mapper_module]
   type=pimlico.modules.embeddings.mappers.fixed
   input_embeddings=module_a.some_output
17 changes: 17 additions & 0 deletions docs/modules/pimlico.modules.embeddings.mappers.rst
@@ -0,0 +1,17 @@
Doc embedding mappers
~~~~~~~~~~~~~~~~~~~~~


.. py:module:: pimlico.modules.embeddings.mappers

Produce datatypes that can map tokens in documents to their embeddings.



.. toctree::
   :maxdepth: 2
   :titlesonly:

   pimlico.modules.embeddings.mappers.fasttext
   pimlico.modules.embeddings.mappers.fixed
1 change: 1 addition & 0 deletions docs/modules/pimlico.modules.embeddings.rst
@@ -19,6 +19,7 @@ provided by sklearn.
   :titlesonly:

   pimlico.modules.embeddings.fasttext
   pimlico.modules.embeddings.mappers
   pimlico.modules.embeddings.normalize
   pimlico.modules.embeddings.store_embeddings
   pimlico.modules.embeddings.store_tsv
185 changes: 184 additions & 1 deletion src/python/pimlico/datatypes/embeddings.py
@@ -12,7 +12,7 @@
"""
from __future__ import absolute_import

from builtins import str
from builtins import str, input
from past.builtins import basestring
from builtins import object

@@ -350,3 +350,186 @@ def save_model(self, model):
            model_path = os.path.join(self.data_dir, "model.bin")
            model.save_model(model_path)
            self.task_complete("model")


class DocEmbeddingsMapper(PimlicoDatatype):
    """
    Abstract datatype.

    An embedding loader provides a method to take a list of tokens (e.g. a tokenized
    document) and produce an embedding for each token. It will not necessarily be able
    to produce an embedding for *any* given term, so might return None for some tokens.

    This is more general than the :class:`~.Embeddings` datatype, as it allows this
    method to potentially produce embeddings for an infinite set of terms. Conversely,
    it is not able to say which set of terms it can produce embeddings for.

    It provides a unified interface to composed embeddings, like fastText, which can
    use sub-word information to produce embeddings of OOVs; context-sensitive
    embeddings, like BERT, which take into account the context of a token; and fixed
    embeddings, which just return a fixed embedding for in-vocab terms.

    Some subtypes are just wrappers for fixed sets of embeddings.
    """
    datatype_name = "doc_embeddings_mapper"

    def get_software_dependencies(self):
        return super(DocEmbeddingsMapper, self).get_software_dependencies() + [numpy_dependency]

    def run_browser(self, reader, opts):
        """
        Simple tool to display embeddings for the words of user-entered sentences.
        """
        print("Enter a sentence to see its word vectors. Ctrl+D to exit")
        try:
            while True:
                input_text = input("> ")
                sentence = input_text.split()
                embeddings = reader.get_embeddings(sentence)
                for w, (word, embedding) in enumerate(zip(sentence, embeddings)):
                    print("{} {}: {}".format(w, word, embedding))
        except EOFError:
            print("Exiting")

    class Reader:
        def get_embeddings(self, tokens):
            """
            Subclasses should produce a list, with an item for each token. The
            item may be None, or a numpy array containing a vector for the token.

            :param tokens: list of strings
            :return: list of embeddings
            """
            raise NotImplementedError("abstract datatype does not implement get_embeddings")


class FastTextDocMapper(DocEmbeddingsMapper):
    datatype_name = "fasttext_doc_embeddings_mapper"

    def get_software_dependencies(self):
        return super(FastTextDocMapper, self).get_software_dependencies() + [PythonPackageOnPip("fasttext")]

    class Reader:
        @cached_property
        def model(self):
            import fasttext

            model_path = os.path.join(self.data_dir, "model.bin")
            return fasttext.load_model(model_path)

        def get_embeddings(self, tokens):
            return [self.model.get_word_vector(token) for token in tokens]

    class Writer:
        required_tasks = ["model"]

        def save_model(self, model):
            model_path = os.path.join(self.data_dir, "model.bin")
            model.save_model(model_path)
            self.task_complete("model")


class FixedEmbeddingsDocMapper(DocEmbeddingsMapper):
    datatype_name = "fixed_embeddings_doc_embeddings_mapper"

    class Reader(object):
        class Setup(object):
            def get_required_paths(self):
                return ["vectors.npy", "vocab.csv"]

        @cached_property
        def vectors(self):
            import numpy
            with io.open(os.path.join(self.data_dir, "vectors.npy"), "rb") as f:
                return numpy.load(f, allow_pickle=False)

        @cached_property
        def vector_size(self):
            return self.vectors.shape[1]

        @cached_property
        def word_counts(self):
            with io.open(os.path.join(self.data_dir, "vocab.csv"), "r", encoding="utf-8", newline="") as f:
                reader = csv.reader(f)
                return [(row[0], int(row[1])) for row in reader]

        @cached_property
        def index2vocab(self):
            return [Vocab(word, i, count=count) for i, (word, count) in enumerate(self.word_counts)]

        @cached_property
        def index2word(self):
            return [v.word for v in self.index2vocab]

        @cached_property
        def vocab(self):
            # Build the vocab by indexing the vocab items (in index2word) by word
            return dict((v.word, v) for v in self.index2vocab)

        def __len__(self):
            return len(self.index2vocab)

        def word_vec(self, word):
            """
            Accept a single word as input.
            Returns the word's representation in vector space, as a 1D numpy array.
            """
            try:
                word_id = self.vocab[word].index
            except KeyError as e:
                raise KeyError("word not in vocabulary: {}".format(e))
            return self.vectors[word_id]

        def __contains__(self, word):
            return word in self.vocab

        def get_embeddings(self, tokens):
            return [self.word_vec(token) if token in self else None for token in tokens]

    class Writer(object):
        required_tasks = ["vocab", "vectors"]

        def write_vectors(self, arr):
            """Write out vectors from a NumPy array."""
            import numpy
            with open(os.path.join(self.data_dir, "vectors.npy"), "wb") as f:
                numpy.save(f, arr, allow_pickle=False)
            self.task_complete("vectors")

        def write_word_counts(self, word_counts):
            """
            Write out vocab from a list of words with counts.

            :param word_counts: list of (unicode, int) pairs giving each word and its count.
                Vocab indices are determined by the order of words
            """
            with io.open(os.path.join(self.data_dir, "vocab.csv"), "w", encoding="utf-8", newline="") as f:
                writer = csv.writer(f)
                for word, count in word_counts:
                    writer.writerow([str(word), str(count)])
            self.task_complete("vocab")

        def write_vocab_list(self, vocab_items):
            """
            Write out vocab from a list of vocab items (see ``Vocab``).

            :param vocab_items: list of ``Vocab`` instances
            """
            self.write_word_counts([(v.word, v.count) for v in vocab_items])

        def write_keyed_vectors(self, *kvecs):
            """
            Write both vectors and vocabulary straight from Gensim's ``KeyedVectors``
            data structure. Can accept multiple objects, which will then be
            concatenated in the output.
            """
            import numpy
            if len(kvecs) > 1:
                vecs = numpy.vstack(tuple(kv.syn0 for kv in kvecs))
            else:
                vecs = kvecs[0].syn0
            self.write_vectors(vecs)
            self.write_word_counts([(w, kv.vocab[w].count) for kv in kvecs for w in kv.index2word])
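The storage format used by the fixed-embeddings reader and writer above, and the `get_embeddings` contract (one entry per token, `None` for OOVs), can be sketched outside Pimlico with plain `numpy` and `csv`. The file names `vectors.npy` and `vocab.csv` follow the code above; everything else here is illustrative:

```python
import csv
import io
import os
import tempfile

import numpy as np

data_dir = tempfile.mkdtemp()

# Write vectors.npy: one row per vocab entry, pickling disabled as in write_vectors()
vectors = np.array([[0.1, 0.2], [0.3, 0.4]])
with open(os.path.join(data_dir, "vectors.npy"), "wb") as f:
    np.save(f, vectors, allow_pickle=False)

# Write vocab.csv: word,count rows; the row order determines the vocab indices
with io.open(os.path.join(data_dir, "vocab.csv"), "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    for word, count in [("cat", 10), ("dog", 7)]:
        writer.writerow([word, str(count)])

# Read both files back, mirroring the reader's cached properties
with open(os.path.join(data_dir, "vectors.npy"), "rb") as f:
    loaded = np.load(f, allow_pickle=False)
with io.open(os.path.join(data_dir, "vocab.csv"), "r", encoding="utf-8", newline="") as f:
    word_to_index = {row[0]: i for i, row in enumerate(csv.reader(f))}

def get_embeddings(tokens):
    # One entry per token: a vector for in-vocab words, None for OOVs
    return [loaded[word_to_index[t]] if t in word_to_index else None for t in tokens]

embs = get_embeddings(["cat", "unicorn", "dog"])
```

Keeping the vectors and the vocabulary in separate files means the (small) vocab can be read eagerly while the (large) vector matrix is only loaded when first accessed, which is what the `cached_property` decorators in the reader achieve.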
5 changes: 5 additions & 0 deletions src/python/pimlico/modules/embeddings/mappers/__init__.py
@@ -0,0 +1,5 @@
"""Doc embedding mappers

Produce datatypes that can map tokens in documents to their embeddings.
"""
Empty file.
13 changes: 13 additions & 0 deletions src/python/pimlico/modules/embeddings/mappers/fasttext/execute.py
@@ -0,0 +1,13 @@
# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

from pimlico.core.modules.base import BaseModuleExecutor


class ModuleExecutor(BaseModuleExecutor):
    def execute(self):
        input_embeddings = self.info.get_input("embeddings")

        with self.info.get_output_writer("mapper") as writer:
            writer.save_model(input_embeddings.load_model())
