In [1]:
import pandas as pd
import os
import pickle
import json

import numpy as np
from tqdm import tqdm_notebook
from itertools import combinations

from gensim.models import KeyedVectors
from ast import literal_eval

In [2]:
word2vec = KeyedVectors.load_word2vec_format(
    "gensim_models/skipgram_wikipedia_no_lemma/model.txt"
)
word2vec.init_sims(replace=True)

word_dict = pd.read_csv(
    "brown_corpus_tags.csv",
    sep=';',
    index_col="Unnamed: 0"
)["0"].apply(literal_eval).to_dict()
pos_dict = {
    tag: set() for word, tags in word_dict.items() for tag in tags
}
for tag in pos_dict:
    for word, tags in word_dict.items():
        if tag in tags:
            pos_dict[
                tag
            ].add(
                word
            )
pos_dict = {key: list(val) for key,val in pos_dict.items()}

  word2vec.init_sims(replace=True)


In [3]:
word2vec.vectors.shape

(199807, 300)

Попробуем заранее посчитать матрицу расстояний между словами, чтобы потом просто доставать слова из списка:

In [4]:
words = list(word_dict.keys())

In [5]:
len(words), len([word for word in words if word in word2vec.key_to_index])

(49814, 27923)

In [6]:
words = [word for word in words if word in word2vec.key_to_index]

In [7]:
%%time
word_embeddings = word2vec.vectors[
    [
        word2vec.key_to_index[word] for word in words
    ]
]

CPU times: total: 46.9 ms
Wall time: 36 ms


In [8]:
%%time
similarities = word_embeddings.dot(word_embeddings.T)

CPU times: total: 12.2 s
Wall time: 7.92 s


In [9]:
similarities.shape

(27923, 27923)

In [10]:
similarities.min(), similarities.max()

(-0.21397078, 1.000001)

In [11]:
((similarities+1)/2).min(), ((similarities+1)/2).max()

(0.3930146, 1.0000005)

Применим скейлинг от [-1,1] к [0,1]

In [12]:
similarities = (similarities + 1) / 2

Получим матрицу с нулями на месте главной диагонали и на месте пар слов, которые не могут быть одной части речи:

In [26]:
word_pair_matrix = np.zeros(shape=(len(words),len(words)))

for w1_id, word1 in tqdm_notebook(enumerate(words), total=len(words)):
    for w2_id, word2 in enumerate(words):
        if len(set(word_dict[word1])&set(word_dict[word2])) != 0:
            word_pair_matrix[w1_id][w2_id] = 1

for i in range(len(word_pair_matrix)):
    word_pair_matrix[i][i] = 0

np.save('word_pair_matrix.npy', word_pair_matrix)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for w1_id, word1 in tqdm_notebook(enumerate(words), total=len(words)):


  0%|          | 0/27923 [00:00<?, ?it/s]

In [13]:
word_pair_matrix = np.load("word_pair_matrix.npy")

Теперь домножим матрицу близости на эту матрицу:

In [14]:
similarities = similarities * word_pair_matrix

In [15]:
similarities

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.53045624,
        0.54268241],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.53045624, 0.        , ..., 0.        , 0.        ,
        0.62604725],
       [0.        , 0.54268241, 0.        , ..., 0.        , 0.62604725,
        0.        ]])

In [16]:
sinonyms = dict()

In [17]:
words = np.array(words)

In [25]:
sinonyms = dict()

with open("sinonyms.json",'w',encoding='utf8') as outp:
    json.dump(sinonyms, outp, indent=4, ensure_ascii=False)

In [30]:
for word_id, word in tqdm_notebook(enumerate(words), total=len(words)):
    word_sims = similarities[word_id]
    syn_ids = sorted(word_sims.nonzero()[0], key=lambda x: word_sims[x], reverse=True)
    syns = words[syn_ids].tolist()
    sinonyms[word] = syns

with open("sinonyms.json",'w',encoding='utf8') as outp:
    json.dump(sinonyms, outp, indent=4, ensure_ascii=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for word_id, word in tqdm_notebook(enumerate(words), total=len(words)):


  0%|          | 0/27923 [00:00<?, ?it/s]

Теперь у нас есть словарь с похожими словами (в порядке похожести по версии Word2Vec) для каждого слова из Brown Corpus

In [31]:
sinonyms["cat"]

['dog',
 'kitten',
 'rabbit',
 'pet',
 'poodle',
 'fox',
 'puppy',
 'monkey',
 'wolf',
 'goat',
 'elephant',
 'creature',
 'tiger',
 'hound',
 'panther',
 'goldfish',
 'weasel',
 'boar',
 'squirrel',
 'rat',
 'teddy',
 'mermaid',
 'beast',
 'pig',
 'dragon',
 'donkey',
 'cow',
 'doll',
 'hare',
 'snake',
 'rooster',
 'crocodile',
 'terrier',
 'hyena',
 'alligator',
 'collie',
 'owl',
 'walrus',
 'bird',
 'lioness',
 'duck',
 'demon',
 'monster',
 'otter',
 'rhinoceros',
 'pup',
 'parrot',
 'crow',
 'mouse',
 'ape',
 'spider',
 'lion',
 'deer',
 'flea',
 'anteater',
 'octopus',
 'dolphin',
 'cheetah',
 'coyote',
 'clown',
 'burglar',
 'vulture',
 'ghoul',
 'retriever',
 'kid',
 'fowl',
 'beggar',
 'goose',
 'redhead',
 'barking',
 'turtle',
 'stranger',
 'pony',
 'gnome',
 'girl',
 'maniac',
 'grizzly',
 'animal',
 'mastiff',
 'tortoise',
 'robot',
 'python',
 'tigress',
 'possum',
 'mink',
 'rodent',
 'sock',
 'ocelot',
 'grandma',
 'peacock',
 'devil',
 'ass',
 'paw',
 'pimp',
 'chick

In [32]:
sinonyms["percentage"]

['proportion',
 'percent',
 'fraction',
 'average',
 'amount',
 'ratio',
 'majority',
 'rate',
 'margin',
 'turnout',
 'total',
 'cent',
 'turnover',
 'fielding',
 'yardage',
 'likelihood',
 'probability',
 'weight',
 'efficiency',
 'population',
 'averaging',
 'expectancy',
 'incidence',
 'difference',
 'density',
 'minimum',
 'deficit',
 'variance',
 'goal',
 'expenditure',
 'median',
 'net',
 'number',
 'prevalence',
 'quantity',
 'vote',
 'decrease',
 'voter',
 'point',
 'offense',
 'minority',
 'share',
 'count',
 'reduction',
 'disparity',
 'increase',
 'income',
 'deduction',
 'mortality',
 'productivity',
 'size',
 'sum',
 'estimate',
 'overall',
 'salary',
 'risk',
 'field',
 'coefficient',
 'outlay',
 'calculation',
 'prospect',
 'taxpayer',
 'threshold',
 'depreciation',
 'throw',
 'penalty',
 'result',
 'attainment',
 'maximum',
 'fee',
 'minus',
 'record',
 'freshman',
 'concentration',
 'payroll',
 'portion',
 'chance',
 'yard',
 'price',
 'amassing',
 'attendance',
 'ent

In [33]:
sinonyms["reaches"]

['enters',
 'rises',
 'descends',
 'attains',
 'terminates',
 'empties',
 'drops',
 'plunges',
 'exceeds',
 'widens',
 'emerges',
 'joins',
 'grows',
 'meets',
 'merges',
 'lies',
 'disappears',
 'exits',
 'crosses',
 'turns',
 'separates',
 'departs',
 'goes',
 'leaves',
 'loses',
 'settles',
 'receives',
 'pulls',
 'pushes',
 'extends',
 'spreads',
 'falls',
 'shrinks',
 'consumes',
 'narrows',
 'diminishes',
 'gets',
 'clears',
 'flows',
 'climbs',
 'dominates',
 'lowers',
 'achieves',
 'absorbs',
 'comes',
 'fills',
 'bends',
 'arrives',
 'dries',
 'explodes',
 'broadens',
 'becomes',
 'drains',
 'brings',
 'travels',
 'migrates',
 'collapses',
 'pours',
 'wanders',
 'takes',
 'divides',
 'sits',
 'decreases',
 'eats',
 'passes',
 'regains',
 'tumbles',
 'completes',
 'originates',
 'connects',
 'expands',
 'sees',
 'stays',
 'drifts',
 'vanishes',
 'stretches',
 'occupies',
 'waits',
 'recovers',
 'ends',
 'splits',
 'veers',
 'sends',
 'puts',
 'closes',
 'gathers',
 'accumulates