Replace spaces when storing w2v format

The word2vec format doesn't support spaces in words, since the vocab file format splits on spaces. Replace them when storing; this assumes the replacement is inverted when reading.
markgw · Sep 13, 2018 · f19bdf9 · f19bdf9
1 parent e86c091
commit f19bdf9
Showing 1 changed file with 17 additions and 0 deletions.
diff --git a/src/python/pimlico/modules/embeddings/store_word2vec/execute.py b/src/python/pimlico/modules/embeddings/store_word2vec/execute.py
@@ -7,6 +7,19 @@ def execute(self):
         embeddings = self.info.get_input("embeddings")
         # Convert to Gensim KeyedVectors
         keyed_vectors = embeddings.to_keyed_vectors()
+        self.log.info("{} vectors in input".format(keyed_vectors.vectors.shape[0]))
+
+        spaces = sum(1 for w in keyed_vectors.index2word if u" " in w or u"\u00A0" in w)
+        if spaces > 0:
+            self.log.info("Some words ({}) include spaces: replacing with '<space>', so they don't mess up the "
+                          "w2v vocab".format(spaces))
+            # If there are spaces in any of the words, this is a problem for the w2v format, since the vocab
+            # is stored in a space-separated file!
+            keyed_vectors.vocab = dict(
+                (replace_spaces(word), v) for (word, v) in keyed_vectors.vocab.items()
+            )
+            # Apply the same mapping to index2word
+            keyed_vectors.index2word = [replace_spaces(word) for word in keyed_vectors.index2word]
 
         # Output to the file
         with Word2VecFilesWriter(self.info.get_absolute_output_dir("embeddings")) as writer:
@@ -21,3 +34,7 @@ def execute(self):
             # Both files have been written
             writer.file_written(writer.filenames[0])
             writer.file_written(writer.filenames[1])
+
+
+def replace_spaces(word):
+    return word.replace(u" ", u"<space>").replace(u"\u00A0", u"<nbspace>")