Skip to content

Commit

Permalink
Replace spaces when storing w2v format
Browse files Browse the repository at this point in the history
The word2vec format doesn't support spaces in words, since the vocab file format splits on spaces. Replace them with placeholders when storing; this assumes the replacement is inverted when reading.
  • Loading branch information
Mark Granroth-Wilding committed Sep 13, 2018
1 parent e86c091 commit f19bdf9
Showing 1 changed file with 17 additions and 0 deletions.
17 changes: 17 additions & 0 deletions src/python/pimlico/modules/embeddings/store_word2vec/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@ def execute(self):
embeddings = self.info.get_input("embeddings")
# Convert to Gensim KeyedVectors
keyed_vectors = embeddings.to_keyed_vectors()
self.log.info("{} vectors in input".format(keyed_vectors.vectors.shape[0]))

spaces = sum(1 for w in keyed_vectors.index2word if u" " in w or u"\u00A0" in w)
if spaces > 0:
self.log.info("Some words ({}) include spaces: replacing with '<space>', so they don't mess up the "
"w2v vocab".format(spaces))
# If there are spaces in any of the words, this is a problem for the w2v format, since the vocab
# is stored in a space-separated file!
keyed_vectors.vocab = dict(
(replace_spaces(word), v) for (word, v) in keyed_vectors.vocab.items()
)
# Apply the same mapping to index2word
keyed_vectors.index2word = [replace_spaces(word) for word in keyed_vectors.index2word]

# Output to the file
with Word2VecFilesWriter(self.info.get_absolute_output_dir("embeddings")) as writer:
Expand All @@ -21,3 +34,7 @@ def execute(self):
# Both files have been written
writer.file_written(writer.filenames[0])
writer.file_written(writer.filenames[1])


def replace_spaces(word):
    """Return *word* with space characters swapped for visible placeholders.

    The word2vec vocab is stored in a space-separated file, so a word that
    itself contains a space would corrupt it. Ordinary spaces become
    ``<space>`` and non-breaking spaces (U+00A0) become ``<nbspace>``.
    Words without either character are returned unchanged.
    """
    # Applied in order: plain space first, then the non-breaking space
    substitutions = (
        (u" ", u"<space>"),
        (u"\u00A0", u"<nbspace>"),
    )
    for char, placeholder in substitutions:
        word = word.replace(char, placeholder)
    return word

0 comments on commit f19bdf9

Please sign in to comment.