Skip to content

Commit

Permalink
Added rusvectores embedding back to utils section #323
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jun 27, 2022
1 parent 2c84512 commit 2e425b1
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 0 deletions.
Empty file.
69 changes: 69 additions & 0 deletions arekit/contrib/utils/embeddings/rusvectores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import numpy as np
from arekit.common.text.stemmer import Stemmer
from arekit.contrib.networks.embeddings.base import Embedding
from gensim.models import KeyedVectors


class RusvectoresEmbedding(Embedding):

def __init__(self, matrix, words):
super(RusvectoresEmbedding, self).__init__(matrix=matrix,
words=words)

self.__index_without_pos = self.__create_terms_without_pos()
self.__stemmer = None
self.__lemmatize_by_default = True

@classmethod
def from_word2vec_format(cls, filepath, binary):
assert(isinstance(binary, bool))

w2v_model = KeyedVectors.load_word2vec_format(filepath, binary=binary)
words_count = len(w2v_model.wv.vocab)

return cls(matrix=np.array([vector for vector in w2v_model.syn0]),
words=[w2v_model.wv.index2word[index] for index in range(words_count)])

def set_stemmer(self, stemmer):
assert(isinstance(stemmer, Stemmer))
self.__stemmer = stemmer

def try_find_index_by_plain_word(self, word):
assert(isinstance(word, str))

temp = self.__lemmatize_by_default
self.__lemmatize_by_default = False
index = super(RusvectoresEmbedding, self).try_find_index_by_plain_word(word)
self.__lemmatize_by_default = temp

return index

def _handler(self, word):
return self.__try_find_word_index_pair_lemmatized(word, self.__lemmatize_by_default)

# region private methods

def __try_find_word_index_pair_lemmatized(self, term, lemmatize):
assert(isinstance(term, str))
assert(isinstance(lemmatize, bool))

if lemmatize:
term = self.__stemmer.lemmatize_to_str(term)

index = self.__index_without_pos[term] \
if term in self.__index_without_pos else None

return term, index

def __create_terms_without_pos(self):
d = {}
for word_with_pos, index in self.iter_vocabulary():
assert(isinstance(word_with_pos, str))
word = word_with_pos.split(u'_')[0]
if word in d:
continue
d[word] = index

return d

# endregion

0 comments on commit 2e425b1

Please sign in to comment.