In [34]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Toy corpus: four short Russian sentences fed to the vectorizers below.
# NOTE(review): the first sentence carries a stray double quote and the last
# one spells "преложение" (missing "д"); both are kept verbatim because the
# recorded outputs depend on the exact tokens.
corpus = [
    'Это первое предложение".',
    'Это предложение - второе предложение.',
    'Это третье.',
    'Ещё одно преложение, но не первое, не второе и не третье.',
]

In [21]:
# Bag of words: learn the vocabulary from the corpus and count token
# occurrences per sentence.
vectorizer1 = CountVectorizer()
X1 = vectorizer1.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert to a plain list to keep the printed format unchanged.
print(vectorizer1.get_feature_names_out().tolist())

['второе', 'ещё', 'не', 'но', 'одно', 'первое', 'предложение', 'преложение', 'третье', 'это']


In [22]:
# Convert the count matrix to a dense array for display: one row per corpus
# sentence, one column per vocabulary entry (same order as the feature names).
print(X1.toarray())

[[0 0 0 0 0 1 1 0 0 1]
 [1 0 0 0 0 0 2 0 0 1]
 [0 0 0 0 0 0 0 0 1 1]
 [1 1 3 1 1 1 0 1 1 0]]


In [36]:
# TF-IDF: same vocabulary as the bag-of-words model, but each count is
# re-weighted by the vectorizer's TF-IDF scheme.
vectorizer2 = TfidfVectorizer()
X2 = vectorizer2.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert to a plain list to keep the printed format unchanged.
print(vectorizer2.get_feature_names_out().tolist())

['второе', 'ещё', 'не', 'но', 'одно', 'первое', 'предложение', 'преложение', 'третье', 'это']


In [39]:
# Dense TF-IDF matrix rounded to two decimals so the printed table stays
# readable.
X2_ = X2.toarray().round(decimals=2)
print(X2_)

[[0.   0.   0.   0.   0.   0.61 0.61 0.   0.   0.5 ]
 [0.42 0.   0.   0.   0.   0.   0.84 0.   0.   0.34]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.78 0.63]
 [0.2  0.26 0.78 0.26 0.26 0.2  0.   0.26 0.2  0.  ]]


In [93]:
import gensim

In [60]:
# Word2vec from rusvectores.org: https://github.com/akutuzov/webvectors/blob/master/preprocessing/rusvectores_tutorial.ipynb
import os
import zipfile

import wget

model_url = 'http://vectors.nlpl.eu/repository/11/180.zip'
model_file = model_url.split('/')[-1]

# Download only when the archive is not already on disk. Re-running the cell
# unconditionally fetches the archive again, and the file wget writes may get
# a different (suffixed) name than the one derived from the URL, so the code
# below could end up reading a stale copy.
if not os.path.exists(model_file):
    # Use the path wget reports it actually wrote to.
    model_file = wget.download(model_url, out=model_file)

# The original cell bound the downloaded path to `m`; keep that name defined.
m = model_file

with zipfile.ZipFile(model_file, 'r') as archive:
    # Open the archive member in its own `with` so the stream is closed as
    # soon as the vectors have been read.
    with archive.open('model.bin') as stream:
        model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

In [78]:
# Write the loaded vectors to 'word2vec.model' in the working directory
# (presumably so later sessions can reload them without unpacking the zip).
model.save('word2vec.model')

In [88]:
# Reload the vectors from the 'word2vec.model' file; rebinds `model`.
model = gensim.models.KeyedVectors.load("word2vec.model")

In [89]:
# The key is written as "<lemma>_<POS tag>"; the similarity output at the
# end of the notebook shows keys of the same form. Check that the key exists
# in the model before looking up its vector.
word = 'предложение_NOUN'
print(word in model)

True


In [96]:
# Fetch the embedding stored for `word` and show its shape.
v = model[word]
print(v.shape)

(300,)


In [97]:
# Print the raw embedding values (long output: every component of the vector).
print(v)

[ 1.66909289e+00 -3.71764570e-01 -5.26434183e-01 -6.82842135e-01
 -1.62450612e+00 -2.20065808e+00 -1.81778371e+00  2.90364170e+00
  2.24821758e+00  2.97690058e+00 -2.98163795e+00  1.96871698e+00
 -7.99362600e-01  1.24590778e+00  8.56268406e-01 -4.60979305e-02
  9.01667953e-01  9.64356244e-01  4.00738083e-02 -1.62123907e+00
 -1.84248257e+00  1.25218070e+00 -6.53645933e-01  7.64003873e-01
 -4.76151798e-03  1.25383735e+00 -8.45721185e-01  1.32694411e+00
  3.45238388e-01 -2.56015253e+00 -8.94605875e-01  1.17479467e+00
 -2.53849792e+00  2.74415660e+00  2.49971822e-01  1.10538876e+00
  1.41485155e+00 -9.84229624e-01  1.09358573e+00 -1.39898881e-01
  4.11349297e+00 -1.47960913e+00  1.11645654e-01 -2.69390893e+00
  4.51827049e-01 -2.21500444e+00 -9.25164104e-01  2.12821412e+00
  3.34145951e+00  4.93367583e-01  1.41334996e-01 -7.48092055e-01
 -1.46801794e+00  3.24250722e+00 -6.94198906e-01  1.48963094e+00
 -1.25568449e+00  1.51824725e+00 -3.91350865e-01  2.60295320e+00
  1.76235747e+00  1.77682

In [92]:
# Nearest neighbours of `word` in the embedding space; the result is a list
# of (key, similarity score) pairs ordered from most to least similar.
model.most_similar(word)

[('заявление_NOUN', 0.5784456729888916),
 ('проект_NOUN', 0.5685489773750305),
 ('поправка_NOUN', 0.5616397261619568),
 ('просьба_NOUN', 0.5575388669967651),
 ('предлагать_VERB', 0.5502241253852844),
 ('рекомендация_NOUN', 0.5333122611045837),
 ('предложение_PROPN', 0.5310955047607422),
 ('законопроект_NOUN', 0.525166928768158),
 ('запрос_NOUN', 0.5148158073425293),
 ('решение_NOUN', 0.5138726234436035)]