In [1]:
import json

In [2]:
from gensim.models import Word2Vec

In [3]:
from gensim.models import KeyedVectors

In [4]:
# Read the contexts stored in torchat.json (UTF-8 encoded JSON).
with open('torchat.json', encoding='utf-8') as inp:
    torchat_contexts = json.load(inp)

In [5]:
# Read the contexts stored in vystupat.json (UTF-8 encoded JSON).
with open('vystupat.json', encoding='utf-8') as inp:
    vystupat_contexts = json.load(inp)

# Read the contexts stored in igrat.json (UTF-8 encoded JSON).
with open('igrat.json', encoding='utf-8') as inp:
    igrat_contexts = json.load(inp)

In [6]:
# Load pretrained word vectors from 'model.bin' (binary word2vec format).
model = KeyedVectors.load_word2vec_format('model.bin', binary=True)

In [12]:
# Scratch lookup of the Word2Vec.load docstring. The vectors used in this notebook
# are loaded with KeyedVectors.load_word2vec_format, not with this method.
help(Word2Vec.load)

Help on method load in module gensim.models.word2vec:

load(*args, **kwargs) method of builtins.type instance
    Load a previously saved :class:`~gensim.models.word2vec.Word2Vec` model.
    
    See Also
    --------
    :meth:`~gensim.models.word2vec.Word2Vec.save`
        Save model.
    
    Parameters
    ----------
    fname : str
        Path to the saved file.
    
    Returns
    -------
    :class:`~gensim.models.word2vec.Word2Vec`
        Loaded model.



In [52]:
# Check the dimensionality of a single word vector.
# NOTE(review): keys appear to follow a lemma_POS pattern — confirm against the model.
model['заяц_NOUN'].shape

(300,)

В используемой модели (с сайта RusVectōrēs) векторы слов 300-мерные.

In [23]:
# Copy the model's entity list into a plain Python list.
words_in_model = list(model.index2entity)

In [24]:
# Number of entries listed in model.index2entity.
len(words_in_model)

189193

In [7]:
from collections import defaultdict

In [35]:
# Scratch check: the factory passed to defaultdict supplies the starting value
# for a missing key, so the first increment of 'a' yields 4 + 1.
d = defaultdict(lambda: 4)
d['a'] = d['a'] + 1
d.items()

dict_items([('a', 5)])

In [8]:
# Document-frequency counter: words not seen yet start at 0.
dfs = defaultdict(int)

In [9]:
from math import log

In [32]:
# Scratch check: math.log with a single argument is the natural logarithm.
log(4)

1.3862943611198906

In [10]:
# Pool the contexts of all three verbs into one list (same order as concatenation).
all_contexts = [*torchat_contexts, *vystupat_contexts, *igrat_contexts]

In [11]:
# Document frequency: the number of contexts in which each word occurs
# (set() makes a word count once per context, however often it repeats).
# The counter is rebuilt here so that re-running this cell cannot double the
# counts and silently distort the IDF values derived from them.
dfs = defaultdict(int)
for context in all_contexts:
    for word in set(context[0]):
        dfs[word] += 1

In [12]:
# Total number of contexts; used as the corpus size in the IDF computation.
N = len(all_contexts)

In [13]:
# Inverse document frequency per word: ln(N / df).
idfs = {word: log(N / doc_freq) for word, doc_freq in dfs.items()}

In [14]:
import numpy as np

In [19]:
def encode_contexts(contexts, term, model, idf_weights=None):
    """Encode each context as a TF-IDF-weighted average of its word vectors.

    For every context, the words of ``context[0]`` that are present in
    ``model`` and differ from ``term`` are weighted by ``tf * idf``. The
    weights are normalised to sum to 1 and used to combine the word vectors.

    Parameters
    ----------
    contexts : sequence
        Items whose first element (``context[0]``) is a list of tokens.
    term : str
        Token to leave out of the weighted sum (the target word).
    model : mapping-like
        Must support ``word in model`` and ``model[word]``. If it exposes a
        ``vector_size`` attribute, that value is the output width; otherwise
        the width falls back to 300.
    idf_weights : dict, optional
        Token -> IDF weight. Defaults to the notebook-level ``idfs`` table.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(contexts), vector_size)``. A row stays all-zero
        when its context has no usable words or its weights sum to zero.

    Raises
    ------
    KeyError
        If a usable word has no entry in the IDF table.
    """
    if idf_weights is None:
        # Fall back to the IDF table built at notebook level.
        idf_weights = idfs
    vector_size = getattr(model, 'vector_size', 300)
    context_vectors = np.zeros((len(contexts), vector_size))
    for i, context in enumerate(contexts):
        # Count term frequencies in a single pass instead of calling
        # list.count() once per token (which is quadratic in context length).
        term_counts = {}
        for word in context[0]:
            if word in model and word != term:
                term_counts[word] = term_counts.get(word, 0) + 1
        doc_tfidfs = {word: idf_weights[word] * count
                      for word, count in term_counts.items()}
        total = sum(doc_tfidfs.values())
        if not total:
            # Either nothing usable is left or every IDF is 0; normalising
            # would divide by zero, so the row is left as zeros.
            continue
        for word, weight in doc_tfidfs.items():
            context_vectors[i] += model[word] * (weight / total)
    return context_vectors

In [20]:
# Encode every context from torchat.json; the target verb itself is left out of the weighted sum.
torchat_context_vectors = encode_contexts(torchat_contexts, 'торчать_VERB', model)

In [21]:
# Display the resulting matrix (one row per context).
torchat_context_vectors

array([[ 6.60220277e-01,  5.36946028e-01,  8.27821409e-02, ...,
         1.50436234e-01, -1.78850746e-01,  8.33026446e-01],
       [ 3.30332913e-01,  1.13299102e-01,  2.49291226e-01, ...,
         8.13163057e-01, -2.78794773e-01,  1.00880357e+00],
       [ 1.46440847e+00, -1.32067330e+00,  4.55866966e-01, ...,
         1.35255982e-02,  7.61189498e-01, -4.37579438e-01],
       ...,
       [-9.23769549e-04, -6.51350049e-01,  9.01471281e-01, ...,
         6.07399661e-02, -1.31308943e-01, -2.86413323e-01],
       [ 2.11645010e+00,  1.14646370e+00,  5.18576756e-01, ...,
        -1.45598428e+00,  1.48639170e-01, -3.74864049e-01],
       [ 1.14026469e+00, -4.41123918e-02,  1.13401303e+00, ...,
        -9.22222398e-01,  4.58757207e-03, -1.68906049e+00]])

In [22]:
# Persist the matrix; np.save appends ".npy" because the file name has no extension.
np.save('torchat_context_vectors', torchat_context_vectors)

In [89]:
# Sanity check: reload the saved file and inspect its shape.
np.load('torchat_context_vectors.npy').shape

(2564, 300)

In [23]:
# Encode the contexts from vystupat.json (target verb excluded) and save the matrix;
# np.save appends ".npy" to the file name.
vystupat_context_vectors = encode_contexts(vystupat_contexts, 'выступать_VERB', model)
np.save('vystupat_context_vectors', vystupat_context_vectors)

In [24]:
# Encode the contexts from igrat.json (target verb excluded) and save the matrix;
# np.save appends ".npy" to the file name.
igrat_context_vectors = encode_contexts(igrat_contexts, 'играть_VERB', model)
np.save('igrat_context_vectors', igrat_context_vectors)