In [22]:
import spacy

# Word vectors take up a lot of disk space, so the small model (en_core_web_sm)
# ships without them; the medium and large English models include them.
# This notebook uses the large model. Install it once beforehand with:
#     python -m spacy download en_core_web_lg
MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(MODEL_NAME)

# The model is pretrained, so its vectors can be used directly.
# Each word is represented by a 300-dimensional vector whose numeric values
# were learned by the model from its training data.

In [13]:
doc = nlp("dog cat banana raju")

for token in doc:
    # token.has_vector - True if a word vector is available for the token
    # token.is_oov     - True if the token is out of vocabulary
    fields = (token.text, "Vector:", token.has_vector, "OOV:", token.is_oov)
    print(*fields)

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
raju Vector: True OOV: False


In [19]:
# Only the last expression of a cell is echoed, so ask for the shape directly.
# (Make `doc[0].vector` the last line instead to see the raw values.)
doc[0].vector.shape

# A word vector is the numeric representation of a word in the model's vocabulary.
# `doc.vector` gives a single vector for the entire sentence.

(300,)

In [21]:
# NOTE: nlp() returns a Doc, so despite its name `base_token` is a one-token Doc.
# The name is kept because the similarity cell further down refers to it.
base_token = nlp("bread")

# The vector of the whole Doc has the same size as the vector of an
# individual word, so both shapes must match.
assert base_token.vector.shape == base_token[0].vector.shape

# Echo the vector of the word "bread" itself (the only token in the Doc).
base_token[0].vector

array([-2.1260e+00, -4.2692e+00, -7.9904e-01,  1.5811e+00,  2.6300e+00,
       -5.6096e+00,  1.6773e+00,  5.7257e+00, -5.3666e+00,  6.0170e-01,
        6.2670e+00,  1.0349e+00,  3.2992e+00,  1.7873e+00,  2.2344e+00,
       -6.3331e+00,  9.8316e-01, -9.6958e-01,  1.0844e+00, -2.2912e+00,
       -3.3732e+00,  2.5687e+00,  5.6621e-01, -2.7560e+00, -2.5407e+00,
        1.3260e+00, -4.4180e+00,  3.8633e+00, -2.5972e+00,  1.2347e+00,
        4.2847e+00, -2.4904e+00,  9.3384e-01, -1.5534e+00,  3.3076e-01,
       -4.1011e+00, -1.2641e-02,  4.2600e+00, -9.5077e-01,  9.2123e-01,
       -3.5137e-01,  7.0070e-01,  2.0004e+00,  5.8038e-01,  4.7097e+00,
        4.5480e-01, -3.4912e-01,  2.1022e+00, -2.2968e+00, -4.0087e+00,
       -3.1960e+00, -1.4585e+00,  5.1237e+00, -4.6956e+00, -4.1237e+00,
       -1.3773e+00, -8.5791e-01,  2.5150e+00,  4.5625e+00,  2.0744e+00,
        2.4309e+00, -1.5673e+00,  3.6562e+00, -7.8879e-01,  2.7344e+00,
        9.6651e-01, -7.0019e+00, -2.3392e+00,  1.6044e+00,  1.31

In [13]:
# Score every word below against `base_token`, which comes from an earlier cell.
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    score = token.similarity(base_token)
    print(f"{token.text} <-> {base_token.text}:", score)

bread <-> bread: 1.0
sandwich <-> bread: 0.6341067010130894
burger <-> bread: 0.47520687769584247
car <-> bread: 0.06451533308853552
tiger <-> bread: 0.04764611675903374
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.6150360888607199


In [23]:
def print_similarity(base_word, words_to_compare):
    """Print the similarity of each word in `words_to_compare` to `base_word`.

    Both arguments are passed through the module-level `nlp` pipeline.
    `words_to_compare` is processed as one text and iterated token by token;
    every token is compared with the processed `base_word` via `.similarity`.

    Args:
        base_word: Text used as the reference for every comparison.
        words_to_compare: Text whose tokens are compared one at a time.

    Returns:
        None. One line per token is written to stdout.
    """
    reference = nlp(base_word)
    for candidate in nlp(words_to_compare):
        score = candidate.similarity(reference)
        print(f"{candidate.text} <-> {reference.text}: ", score)

In [24]:
# Compare "iphone" with several other words, one output line per word.
print_similarity(
    base_word="iphone",
    words_to_compare="apple samsung iphone dog kitten",
)

apple <-> iphone:  0.4387907401919904
samsung <-> iphone:  0.670859081425417
iphone <-> iphone:  1.0
dog <-> iphone:  0.08211864228011527
kitten <-> iphone:  0.10222317834969896


In [27]:
# Look up the vector of each word directly in the model's vocabulary.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic for the analogy "king - man + woman"; the result is
# meant to be compared against `queen`.
result = king - man + woman

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs (one row per sample), so each
# vector is reshaped into a single-row matrix.
cosine_similarity(result.reshape(1, -1), queen.reshape(1, -1))

# Rule of thumb used here: a score above 0.5 is treated as a good similarity.

array([[0.61780137]], dtype=float32)