In [1]:
import numpy as np
import os
import scipy
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.corpus import wordnet

In [2]:
# Load the pre-trained GloVe vectors straight from their text file.
# GloVe text files have no "<vocab_size> <vector_size>" header line, which the
# word2vec text format normally starts with; `no_header=True` tells gensim to
# work those values out from the file itself. This replaces the deprecated
# glove2word2vec() conversion through a temporary file.
glove_file = os.path.abspath('data/glove.6B.300d.txt')
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)
# Keep a copy in gensim's native format next to the source data.
model.save("data/vectors.kv")

  _ = glove2word2vec(glove_file, tmp_file)


In [3]:
# Order the vocabulary with gensim's sort_by_descending_frequency().
# Later cells depend on index order: they slice model.index_to_key and pass
# restrict_vocab=10_000 to most_similar.
model.sort_by_descending_frequency()

In [4]:
# Print the model's similarity score for a handful of hand-picked word pairs,
# one score per line, in the order listed here.
word_pairs = [
    ("king", "queen"),
    ("king", "jack"),
    ("king", "car"),
    ("king", "fish"),
    ("king", "turtle"),
    ("fish", "turtle"),
    ("goldfish", "koi"),
    ("king", "crown"),
    ("king", "throne"),
]
for left_word, right_word in word_pairs:
    print(model.similarity(left_word, right_word))

0.6336469
0.2231067
0.12324726
0.16535373
0.063374296
0.4094217
0.4485258
0.52248627
0.56064874


In [5]:
# List the vocabulary entries the model ranks as most similar to "fish";
# the result is a list of (word, score) pairs.
model.most_similar("fish")

[('salmon', 0.6541996598243713),
 ('tuna', 0.6304338574409485),
 ('shrimp', 0.6116418242454529),
 ('meat', 0.6113753914833069),
 ('seafood', 0.605174720287323),
 ('fishes', 0.6048167943954468),
 ('trout', 0.5963797569274902),
 ('eat', 0.5483304262161255),
 ('fishing', 0.5461738705635071),
 ('shark', 0.5428288578987122)]

In [6]:
def word_similarity(word1: str, word2: str, encoding_model) -> float:
    """Return the Euclidean distance between the vectors of two words.

    Despite the function's name, the result is a *distance*: 0.0 means the two
    vectors are identical, and larger values mean the words are further apart.

    Args:
        word1: First key to look up in ``encoding_model``.
        word2: Second key to look up in ``encoding_model``.
        encoding_model: Any object that maps a word to its vector through
            ``encoding_model[word]``.

    Returns:
        The value computed by ``scipy.spatial.distance.euclidean`` for the two
        looked-up vectors.

    Raises:
        Whatever ``encoding_model[word]`` raises when a word is missing.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.spatial` is reachable as an attribute on every SciPy version.
    from scipy.spatial.distance import euclidean

    return euclidean(encoding_model[word1], encoding_model[word2])

In [7]:
# Spot-check word_similarity on two example words.
# Note: word_similarity returns a Euclidean distance, so a smaller value means
# the two vectors are closer together.
word1 = "toyota"
word2 = "honda"

word_similarity(word1, word2, model)

5.5094475746154785

In [8]:
# List the vocabulary entries the model ranks as most similar to "corolla".
model.most_similar("corolla")

[('camry', 0.634617805480957),
 ('rav4', 0.5531828999519348),
 ('corollas', 0.5511929392814636),
 ('toyota', 0.5509843230247498),
 ('hatchback', 0.550690233707428),
 ('sedan', 0.5453417897224426),
 ('yaris', 0.5359853506088257),
 ('prius', 0.5339162349700928),
 ('sedans', 0.5198630094528198),
 ('tercel', 0.5056036114692688)]

In [9]:
# Query the neighbours of whichever word sits at vocabulary index 42000.
# NOTE(review): 42000 looks like an arbitrary probe position — confirm.
model.most_similar(model.index_to_key[42000])

[('peake', 0.48120614886283875),
 ('wallabies', 0.3934621810913086),
 ('croom', 0.3829837441444397),
 ('dervla', 0.38141125440597534),
 ('charlesworth', 0.3736904263496399),
 ('wallaby', 0.37211552262306213),
 ('alcock', 0.3676917254924774),
 ('mceuen', 0.36511439085006714),
 ('lomu', 0.3630010187625885),
 ('bilic', 0.361842542886734)]

In [10]:
# Peek at 15 consecutive vocabulary entries, starting at index i.
i = 5000
model.index_to_key[i:i+15]

['technique',
 'inspection',
 'situations',
 'symptoms',
 'risen',
 'properly',
 'ferry',
 'folk',
 'foods',
 'derby',
 '1951',
 'achievement',
 'demonstrated',
 'preserve',
 'dropping']

In [11]:
# Neighbour query for "king" using the keyword form, where the query words
# are passed as a list through `positive=`.
model.most_similar(positive=["king"])

[('queen', 0.6336469054222107),
 ('prince', 0.6196622252464294),
 ('monarch', 0.5899620652198792),
 ('kingdom', 0.5791267156600952),
 ('throne', 0.5606487393379211),
 ('ii', 0.5562329292297363),
 ('iii', 0.5503199100494385),
 ('crown', 0.5224862098693848),
 ('reign', 0.5217353701591492),
 ('kings', 0.5066401958465576)]

In [150]:
def generate_category(starting_word: str, model, n_words: int = 3, restrict_vocab: int = 10_000) -> list:
    """Grow a list of related entries outward from a single seed word.

    On every step the whole list built so far is passed to
    ``model.most_similar(positive=...)`` and the top-ranked result is appended.

    Args:
        starting_word: Seed word; it is always the first element of the result.
        model: Object exposing ``most_similar(positive=..., restrict_vocab=...)``.
        n_words: Number of entries to append after the seed (default 3).
        restrict_vocab: Forwarded unchanged to ``most_similar`` (default 10_000).

    Returns:
        A list whose first element is ``starting_word`` (a ``str``) and whose
        remaining elements are whatever ``most_similar`` returns as its first
        item. The recorded notebook output shows ``(word, score)`` tuples.

    Raises:
        IndexError: If ``most_similar`` returns an empty sequence.
    """
    words = [starting_word]
    for _ in range(n_words):
        # The top hit is appended exactly as returned, so from the second step
        # on the query list contains (word, score) tuples.
        # NOTE(review): gensim appears to read such tuples as (key, weight),
        # which would make each earlier score act as a weight — confirm that
        # this is intended rather than appending only the word.
        best_match = model.most_similar(positive=words, restrict_vocab=restrict_vocab)[0]
        words.append(best_match)
    return words

In [159]:
# Grow a list from the seed word "pair". Every item after the seed is the
# top most_similar hit, kept as a (word, score) tuple.
generate_category("pair", model)

['pair',
 ('pairs', 0.7135139107704163),
 ('two', 0.5374454259872437),
 ('three', 0.6627466082572937)]

In [88]:
from nltk.corpus import wordnet

def get_pos(word):
    """Collect the distinct part-of-speech tags over all WordNet synsets of ``word``.

    Returns:
        A set holding the value of ``synset.pos()`` for every synset that
        ``wordnet.synsets(word)`` yields; the set is empty when there are none.
    """
    return {sense.pos() for sense in wordnet.synsets(word)}

In [93]:
def enrich_with_wordnet(key: str, model, suppress_error=False) -> bool:
    """Attach WordNet part-of-speech tags and synsets to one vocabulary entry.

    Two attributes are stored on ``key`` through ``model.set_vecattr``:
    ``"pos"`` (the set of ``synset.pos()`` values) and ``"synsets"`` (the list
    returned by ``wordnet.synsets(key)``).

    Args:
        key: Vocabulary entry to enrich.
        model: Object exposing ``set_vecattr(key, attr, value)``.
        suppress_error: When true, a failure is not printed.

    Returns:
        ``True`` if both attributes were stored, ``False`` if any exception was
        raised along the way. A key with no WordNet synsets still counts as a
        success: it simply gets an empty set and an empty list.
    """
    try:
        # Query WordNet once and derive both attributes from the same result,
        # instead of looking the key up a second time for the POS tags.
        synsets = wordnet.synsets(key)
        # Part of speech
        model.set_vecattr(key, "pos", {synset.pos() for synset in synsets})
        # Synsets
        model.set_vecattr(key, "synsets", synsets)
        return True
    except Exception as exc:
        # Deliberately best-effort: one bad key must not abort a bulk pass.
        if not suppress_error:
            print("Failed to enrich \"%s\", %s" % (key, exc))
        return False

In [94]:
# Enrich a single entry, then read both attributes back to check that they
# were stored on the model.
enrich_with_wordnet("glove", model)
print(model.get_vecattr("glove", "pos"))
print(model.get_vecattr("glove", "synsets"))

{'n'}
[Synset('baseball_glove.n.01'), Synset('glove.n.02'), Synset('boxing_glove.n.01')]


In [95]:
# Enrich every vocabulary entry and count the calls that report success.
# Failures are counted as misses but not printed (suppress_error=True).
success_count = 0

for key in model.key_to_index:
    if enrich_with_wordnet(key, model, suppress_error=True):
        success_count += 1

print(f"{success_count} of {len(model.key_to_index)} words enriched")

400000 of 400000 words enriched


In [100]:
# Read back the "pos" and "synsets" attributes stored on the "fish" entry.
print(model.get_vecattr("fish", "pos"))
print(model.get_vecattr("fish", "synsets"))

{'v', 'n'}
[Synset('fish.n.01'), Synset('fish.n.02'), Synset('pisces.n.02'), Synset('pisces.n.01'), Synset('fish.v.01'), Synset('fish.v.02')]
