In [2]:
import spacy
# Load the large English pipeline; every cell below reads word vectors from `nlp`.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Vector of a text consisting of the single word "lion".
nlp("lion").vector

array([ 1.8963e-01, -4.0309e-01,  3.5350e-01, -4.7907e-01, -4.3311e-01,
        2.3857e-01,  2.6962e-01,  6.4332e-02,  3.0767e-01,  1.3712e+00,
       -3.7582e-01, -2.2713e-01, -3.5657e-01, -2.5355e-01,  1.7543e-02,
        3.3962e-01,  7.4723e-02,  5.1226e-01, -3.9759e-01,  5.1333e-03,
       -3.0929e-01,  4.8911e-02, -1.8610e-01, -4.1702e-01, -8.1639e-01,
       -1.6908e-01, -2.6246e-01, -1.5983e-02,  1.2479e-01, -3.7276e-02,
       -5.7125e-01, -1.6296e-01,  1.2376e-01, -5.5464e-02,  1.3244e-01,
        2.7519e-02,  1.2592e-01, -3.2722e-01, -4.9165e-01, -3.5559e-01,
       -3.0630e-01,  6.1185e-02, -1.6932e-01, -6.2405e-02,  6.5763e-01,
       -2.7925e-01, -3.0450e-03, -2.2400e-02, -2.8015e-01, -2.1975e-01,
       -4.3188e-01,  3.9864e-02, -2.2102e-01, -4.2693e-02,  5.2748e-02,
        2.8726e-01,  1.2315e-01, -2.8662e-02,  7.8294e-02,  4.6754e-01,
       -2.4589e-01, -1.1064e-01,  7.2250e-02, -9.4980e-02, -2.7548e-01,
       -5.4097e-01,  1.2823e-01, -8.2408e-02,  3.1035e-01, -6.33

The vector of a Document (_a collection of tokens_) is essentially the element-wise **average** of the vectors of all of its tokens.

In [5]:
# Vector of a whole multi-token document.
nlp("The quick brown fox jumped.").vector

array([-1.72348157e-01,  1.13993334e-02, -5.07186651e-02,  3.04736700e-02,
        1.02041634e-02,  1.36314198e-01, -1.43308327e-01, -1.56106679e-02,
        8.28423277e-02,  1.67149007e+00, -3.76483351e-01,  8.27163458e-04,
        3.39448266e-02, -1.83933794e-01, -2.17978001e-01,  2.91569922e-02,
        7.38043338e-02,  1.03464663e+00, -6.53124973e-02, -3.02769482e-01,
       -1.21348329e-01, -2.10911278e-02, -5.04233642e-03, -1.49620354e-01,
        7.19222352e-02, -1.14348307e-02, -2.94718355e-01, -9.38479975e-02,
        2.39413325e-02, -2.36319661e-01, -1.38145790e-01,  1.62768334e-01,
        8.31211656e-02, -1.94691680e-02,  3.39896716e-02, -9.83766690e-02,
        1.40833361e-02,  4.41392362e-02, -3.44334841e-02, -1.59693331e-01,
        1.34939671e-01, -5.46016656e-02,  4.42504995e-02, -1.69036329e-01,
        1.77600995e-01,  1.24370992e-01, -2.35389009e-01, -1.37921795e-02,
        1.09435163e-01, -3.67643349e-02, -1.80684015e-01,  5.96696734e-02,
        1.74656641e-02, -

In [9]:
# Dimensionality of the document vector.
nlp("The quick brown fox jumped.").vector.shape

(300,)

In [24]:
# Three related nouns whose tokens are compared with each other in the next cell.
doc = nlp("lion cat pet")

In [26]:
# Print the similarity score for every ordered pair of tokens in `doc`,
# including each token paired with itself.
for token1 in doc:
    for token2 in doc:
        print(token1.text, "AND", token2.text, ":", token1.similarity(token2))

lion AND lion : 1.0
lion AND cat : 0.5265437960624695
lion AND pet : 0.39923766255378723
cat AND lion : 0.5265437960624695
cat AND cat : 1.0
cat AND pet : 0.7505456805229187
pet AND lion : 0.39923766255378723
pet AND cat : 0.7505456805229187
pet AND pet : 1.0


Keep in mind that **words with opposite meanings that often appear in the same contexts** may still have **similar vectors**, as the next example shows.

In [12]:
# Words with related and opposite sentiment, compared with each other in the next cell.
doc = nlp("like love hate")

In [13]:
# Print the similarity score for every ordered pair of tokens in `doc`,
# including each token paired with itself.
for token1 in doc:
    for token2 in doc:
        print(token1.text, "AND", token2.text, ":", token1.similarity(token2))

like AND like : 1.0
like AND love : 0.6579040288925171
like AND hate : 0.6574651598930359
love AND like : 0.6579040288925171
love AND love : 1.0
love AND hate : 0.6393099427223206
hate AND like : 0.6574651598930359
hate AND love : 0.6393099427223206
hate AND hate : 1.0


In [14]:
# For each token, report: text, whether it has a vector, the vector's norm,
# and whether it is flagged as out-of-vocabulary.
tokens = nlp("dog cat margle")
for token in tokens:
    print(
        token.text,
        token.has_vector,
        token.vector_norm,
        token.is_oov,
    )

dog True 7.0336733 False
cat True 6.6808186 False
margle False 0.0 True


In [8]:
from scipy import spatial

In [9]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    SciPy's ``spatial.distance.cosine`` computes the cosine *distance*,
    so the similarity is obtained by subtracting that distance from 1.

    Args:
        vec1: First vector (1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``1 - spatial.distance.cosine(vec1, vec2)``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [10]:
# Unpack one vector per token, in text order: king, man, woman.
king, man, woman = (token.vector for token in nlp("king man woman"))

In [11]:
# Sanity check: inspect the shape of a single token's vector.
king.shape

(300,)

In [12]:
# Vector arithmetic for the analogy: take "king", remove "man", add "woman".
new_vector = (king - man) + woman

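Now, this **new_vector** should be close to the vectors of words such as **queen** or **princess**. (The input word **king** itself also ranks highly, because new_vector is built from it.)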

In [19]:
# Score every lowercase, alphabetic vocabulary entry that has a vector against
# new_vector, collecting (lexeme, similarity) pairs.
# NOTE(review): this only visits entries yielded by iterating nlp.vocab; depending
# on the spaCy version that may not cover every key in the vectors table — confirm.
computed_similarities = [
    (word, cosine_similarity(new_vector, word.vector))
    for word in nlp.vocab
    if word.has_vector and word.is_lower and word.is_alpha
]

In [20]:
# Rank the best matches first: sorted() orders ascending by its key, so negating
# the similarity (second element of each pair) yields descending similarity.
computed_similarities = sorted(
    computed_similarities,
    key=lambda pair: -pair[1],
)

In [22]:
# Show the text of the ten highest-ranked entries.
print([lexeme.text for lexeme, _ in computed_similarities[:10]])

['king', 'queen', 'prince', 'kings', 'princess', 'royal', 'throne', 'queens', 'monarch', 'kingdom']
