In [1]:
import spacy

# Word vectors take up a lot of space, so the en_core_web_sm model does not include them.
# To get word vectors you need to install the medium or large English model; this notebook
# uses the large one.
# Make sure you have run "python -m spacy download en_core_web_lg" to install the large English model.
nlp = spacy.load("en_core_web_lg")

In [15]:
# For each word, report whether it has a vector and whether it is out of vocabulary.
doc = nlp("dog cat banana kem")

for token in doc:
    print(f"{token.text} Vector :  {token.has_vector} , Out of vocabulary :  {token.is_oov}")

dog Vector :  True , Out of vocabulary :  False
cat Vector :  True , Out of vocabulary :  False
banana Vector :  True , Out of vocabulary :  False
kem Vector :  False , Out of vocabulary :  True


In [20]:
# Inspect the dimensionality of the first token's word vector.
doc[0].vector.shape

(300,)

In [21]:
# Reference word for the similarity comparisons.
# NOTE(review): despite its name, base_token holds the result of nlp(...) -- the same kind of
# object that is iterated token-by-token elsewhere in this notebook -- not a single token.
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [22]:
# Score every word in the sentence against the current reference held in base_token.
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    similarity = token.similarity(base_token)
    print(f"{token.text} <-> {base_token.text}:", similarity)

bread <-> bread: 1.0000000681995158
sandwich <-> bread: 0.6341067417450952
burger <-> bread: 0.47520687769584247
car <-> bread: 0.06451532596945217
tiger <-> bread: 0.04764612079317772
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.615036141030184


In [29]:
# Compare a list of character names against "Naruto".
# NOTE(review): the last two names come out as 0.0 -- presumably the model has no vector
# for them; confirm with token.has_vector.
doc = nlp("Lee Guy Gaara Hinata Sakura Kakashi Naruto Iruka Sasuke Itachi Shikamaru Shino Neji kiba inuzuka ")
base_token = nlp("Naruto")

for token in doc:
    similarity = token.similarity(base_token)
    print(f"{token.text} <--> {base_token.text}:", similarity)

Lee <--> Naruto: -0.031207492729277943
Guy <--> Naruto: 0.07498834099946865
Gaara <--> Naruto: 0.6078217113786216
Hinata <--> Naruto: 0.5995691966014622
Sakura <--> Naruto: 0.7116691288239683
Kakashi <--> Naruto: 0.6815041324966612
Naruto <--> Naruto: 0.9999999628773092
Iruka <--> Naruto: 0.5493015951216635
Sasuke <--> Naruto: 0.8054578404602226
Itachi <--> Naruto: 0.5625754305814314
Shikamaru <--> Naruto: 0.5379541700689966
Shino <--> Naruto: 0.5667605885674356
Neji <--> Naruto: 0.6709203505115122
kiba <--> Naruto: 0.0
inuzuka <--> Naruto: 0.0


  print(f"{token.text} <--> {base_token.text}:", token.similarity(base_token))


In [35]:
def print_similarity(base_word, words_to_compare):
    """Print how similar each word in ``words_to_compare`` is to ``base_word``.

    Both arguments are run through the module-level ``nlp`` pipeline. One line is
    printed per token of ``words_to_compare``, formatted as
    ``<token> <-> <base>:  <score>``. Nothing is returned.
    """
    reference = nlp(base_word)
    for candidate in nlp(words_to_compare):
        score = candidate.similarity(reference)
        print(f"{candidate.text} <-> {reference.text}: ", score)

In [36]:
# Compare "iphone" against a mix of brand, product and animal words.
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone:  0.4387907401919904
samsung <-> iphone:  0.6708590303423401
iphone <-> iphone:  0.9999999983096304
dog <-> iphone:  0.08211864726684225
kitten <-> iphone:  0.10222319084362265


In [37]:
# Fetch the vocabulary vectors for the four words of the analogy in one pass.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic: start from "king", subtract "man", add "woman".
result = king - man + woman

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is wrapped in a list so it is passed as a single-row input;
# the value displayed is the cosine similarity between result and the "queen" vector.
cosine_similarity([result], [queen])

array([[0.6178014]], dtype=float32)