In [1]:
from pymagnitude import Magnitude, MagnitudeUtils

In [2]:
vectors = Magnitude(MagnitudeUtils.download_model('word2vec/heavy/GoogleNews-vectors-negative300.magnitude'))

In [3]:
vectors_light = Magnitude(MagnitudeUtils.download_model('word2vec/light/GoogleNews-vectors-negative300.magnitude'))

### Basic pymagnitude usage

In [4]:
vectors.most_similar("carrot")

[('carrots', 0.7685449),
 ('proverbial_carrot', 0.5643057),
 ('Carrot', 0.48995772),
 ('celery', 0.4753185),
 ('dangling_carrot', 0.47465426),
 ('Coarsely_grate', 0.47062412),
 ('carrot_dangling', 0.4619137),
 ('broccoli', 0.45493668),
 ('raisin_salad', 0.45256886),
 ('shredded_zucchini', 0.44899866)]

In [5]:
# only heavy model supports most_similar_approx
vectors.most_similar_approx("carrot")

[('carrots', 0.7685446999923613),
 ('celery', 0.47531832163405596),
 ('broccoli', 0.4549365083523469),
 ('ginger_puree', 0.4444111730874454),
 ('dangle_carrot', 0.4436831039658671),
 ('beefsteak_tomatoes', 0.4339051843958259),
 ('apple_slices', 0.4278265749573791),
 ('plums', 0.4260337278994655),
 ('cucumber', 0.4251839295296165),
 ('peppermint_candy', 0.42477266782699985)]

In [6]:
vectors.similarity("carrot", "fruit"), vectors.similarity("carrot", "vegetables")

(0.3729638, 0.27530932)

In [7]:
# it is possible to query a list of words
result = vectors.query(["I", "read", "a", "book"])
print(result.shape)
result

(4, 300)


array([[ 0.0332858 , -0.0021189 ,  0.0470522 , ..., -0.0028509 ,
         0.0179784 , -0.0435592 ],
       [-0.0131628 , -0.0455637 , -0.0483987 , ..., -0.0115428 ,
        -0.0429311 , -0.0289582 ],
       [-0.01974812,  0.08088748,  0.00189519, ...,  0.06656797,
         0.01162064,  0.01157236],
       [ 0.0448648 , -0.0103908 , -0.0178682 , ...,  0.0384555 ,
        -0.0229179 , -0.0020514 ]], dtype=float32)

In [8]:
import numpy as np

# Querying the words one by one and stacking the resulting vectors into a
# numpy array gives the same shape as the batched list query above.
result = [vectors.query(word) for word in ["I", "read", "a", "book"]]
np.array(result).shape

(4, 300)

## Get a vector from a sentence

### Using nltk

In [9]:
import numpy as np
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the 'punkt' tokenizer data and the 'stopwords' corpus that the
# nltk-based helper below relies on; quiet=True silences the progress log.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# NOTE(review): presumably `pass` is here so the cell does not end on the
# nltk.download(...) expression and echo its return value — confirm.
pass

In [10]:
def get_mean_vec_nltk(sentence):
    """Return the mean word vector of the meaningful words of ``sentence``.

    The sentence is stripped of every character in ``string.punctuation``,
    tokenized with nltk's ``word_tokenize`` and lower-cased token by token.
    English stopwords and tokens that are not keys of ``vectors`` are skipped.

    Side effects: prints every token that is kept, then the shape of the
    stacked vectors.

    Parameters
    ----------
    sentence : str
        Free text to embed.

    Returns
    -------
    The element-wise mean (``np.mean(..., axis=0)``) of the kept word vectors.
    If no token survives the filtering, the mean is taken over an empty list,
    which numpy evaluates to NaN with a RuntimeWarning.
    """
    # Remove punctuation first, so "Gin-ger," collapses to the single token "Ginger".
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(sentence)

    # Build the stopword lookup once per call: stopwords.words() returns a list,
    # so calling it inside the loop rebuilt it and scanned it linearly for
    # every single token.
    english_stopwords = set(stopwords.words('english'))

    vecs = []
    for word in tokens:
        word_lower = word.lower()
        if word_lower in english_stopwords:  # skip stopwords
            continue
        if word_lower in vectors:  # only keep words known to the model
            print(word_lower)
            vecs.append(vectors.query(word_lower))

    print(np.array(vecs).shape)
    return np.mean(vecs, axis=0)

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
mean_vec_nltk = get_mean_vec_nltk("Gin-ger, Carrot. soup!!!")
mean_vec_nltk

ginger
carrot
soup
(3, 300)


array([-3.82726006e-02,  6.88864961e-02,  1.13998661e-02,  8.47308636e-02,
        3.09115350e-02,  6.98080054e-03,  4.66029346e-02, -7.67960325e-02,
       -3.97804640e-02,  7.82321319e-02, -4.38365638e-02, -3.68401669e-02,
        2.72793253e-03,  6.29672036e-02, -1.17294811e-01,  6.74639642e-02,
       -5.92347346e-02, -5.35799796e-03, -6.85786009e-02, -1.31368697e-01,
        1.44090327e-02,  5.43683656e-02,  7.51434639e-02, -1.28464662e-02,
        1.44738005e-02, -6.27673343e-02, -3.92605029e-02,  4.99931984e-02,
       -4.53256965e-02,  5.21523356e-02,  1.01935333e-02,  2.69586984e-02,
        5.36336703e-03, -4.85099368e-02, -4.00059950e-03,  6.88883290e-03,
        1.79078002e-02,  2.27759965e-03,  2.61274334e-02,  5.48931360e-02,
       -1.12914657e-02, -7.56996945e-02,  2.07556989e-02,  2.77133379e-03,
       -4.64606611e-03, -1.21082805e-01, -6.41838014e-02, -3.85268331e-02,
        4.77593988e-02,  2.92357355e-02, -6.15815334e-02,  6.91258311e-02,
       -2.66396999e-02,  

### Using sklearn tokenizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

def get_mean_vec_tokenizer(sentence):
    """Return the mean word vector of ``sentence`` using scikit-learn's analyzer.

    Tokens come from the analyzer of ``CountVectorizer(stop_words="english")``.
    All tokens are sent to ``vectors.query`` in a single call; unlike the
    nltk-based variant there is no ``in vectors`` check, so every token the
    analyzer yields takes part in the mean.

    Side effects: prints the token list and the shape of the queried vectors.
    """
    analyzer = CountVectorizer(stop_words="english").build_analyzer()
    tokens = analyzer(sentence)
    print(tokens)

    token_vecs = vectors.query(tokens)
    print(token_vecs.shape)

    # Collapse the per-token rows into one sentence vector.
    return np.mean(token_vecs, axis=0)

In [14]:
mean_vec_tokenizer = get_mean_vec_tokenizer("Gin-ger, Carrot. soup!!!")
mean_vec_tokenizer

['gin', 'ger', 'carrot', 'soup']
(4, 300)


array([-0.01446415,  0.05315382,  0.0025164 ,  0.06717843, -0.01500367,
        0.00552063,  0.01404245, -0.05438235, -0.01472828,  0.06645077,
       -0.0081687 , -0.04797465, -0.05127674,  0.05805258, -0.07532087,
        0.05604888, -0.01085855,  0.02669287, -0.00987417, -0.08990412,
        0.00031438,  0.03512045,  0.05480458, -0.0165348 ,  0.02219485,
       -0.03278407, -0.0716916 ,  0.06664208, -0.05481673,  0.04298195,
       -0.00633855,  0.03806415, -0.00093902, -0.02398727, -0.0106668 ,
        0.01767807,  0.03986753,  0.0404997 ,  0.0355647 ,  0.00229858,
       -0.0026229 , -0.03088055,  0.01656757,  0.00391013, -0.0207961 ,
       -0.08400465, -0.05062438, -0.0233638 ,  0.0036168 ,  0.03652943,
       -0.03163557,  0.043674  , -0.0207148 ,  0.01630798, -0.0174227 ,
       -0.012678  , -0.02822323,  0.00759573, -0.0358003 , -0.08306959,
       -0.01682785,  0.02608917, -0.0430061 ,  0.05345447,  0.02478633,
       -0.0773314 , -0.0435354 , -0.07119825,  0.00759793,  0.02

### Using split()

In [15]:
def get_mean_vec_split(sentence):
    """Return the mean word vector of ``sentence`` split on whitespace.

    The sentence is tokenized with a plain ``str.split()``: there is no
    lower-casing, no punctuation stripping and no stopword removal. All
    tokens are queried from ``vectors`` in one call.

    Side effects: prints the token list and the shape of the queried vectors.
    """
    words = sentence.split()
    print(words)

    word_vecs = vectors.query(words)
    print(word_vecs.shape)

    # Average over the token axis to get a single vector.
    return np.mean(word_vecs, axis=0)

In [16]:
mean_vec_split = get_mean_vec_split("ginger carrot soup")
mean_vec_split

['ginger', 'carrot', 'soup']
(3, 300)


array([-3.82726006e-02,  6.88864961e-02,  1.13998661e-02,  8.47308636e-02,
        3.09115350e-02,  6.98080054e-03,  4.66029346e-02, -7.67960325e-02,
       -3.97804640e-02,  7.82321319e-02, -4.38365638e-02, -3.68401669e-02,
        2.72793253e-03,  6.29672036e-02, -1.17294811e-01,  6.74639642e-02,
       -5.92347346e-02, -5.35799796e-03, -6.85786009e-02, -1.31368697e-01,
        1.44090327e-02,  5.43683656e-02,  7.51434639e-02, -1.28464662e-02,
        1.44738005e-02, -6.27673343e-02, -3.92605029e-02,  4.99931984e-02,
       -4.53256965e-02,  5.21523356e-02,  1.01935333e-02,  2.69586984e-02,
        5.36336703e-03, -4.85099368e-02, -4.00059950e-03,  6.88883290e-03,
        1.79078002e-02,  2.27759965e-03,  2.61274334e-02,  5.48931360e-02,
       -1.12914657e-02, -7.56996945e-02,  2.07556989e-02,  2.77133379e-03,
       -4.64606611e-03, -1.21082805e-01, -6.41838014e-02, -3.85268331e-02,
        4.77593988e-02,  2.92357355e-02, -6.15815334e-02,  6.91258311e-02,
       -2.66396999e-02,  

## Find the most similar category

In [17]:
categories = ["vegetable", "fruit", "carrot", "yoghurt"]
categories_vecs = vectors.query(categories)
categories_vecs

array([[-0.0204747,  0.0424415,  0.0081236, ...,  0.0633307,  0.051394 ,
         0.0566992],
       [-0.0192403,  0.0223799, -0.0177912, ...,  0.0278542,  0.0076076,
         0.0801814],
       [ 0.0231406,  0.0913366,  0.0369331, ...,  0.0027393, -0.0161678,
        -0.0003304],
       [ 0.0293025, -0.0195826,  0.0554603, ...,  0.0597484,  0.0168668,
        -0.0405946]], dtype=float32)

In [18]:
from scipy import spatial

# Cosine similarity (1 - cosine distance) between the nltk-based sentence
# vector and each category vector, in the same order as `categories`.
similarities = [
    1 - spatial.distance.cosine(mean_vec_nltk, category_vec)
    for category_vec in categories_vecs
]
for category, similarity in zip(categories, similarities):
    print(category, similarity)

# The category with the highest similarity wins.
categories[np.array(similarities).argmax()]

vegetable 0.5529444217681885
fruit 0.4844015836715698
carrot 0.7097286581993103
yoghurt 0.5526320934295654


'carrot'

In [19]:
vectors.similarity("carrot", "carrot")

1.0

In [20]:
vectors.similarity(vectors.query("carrot"), "carrot")

1.0

In [21]:
vectors.most_similar_to_given("carrot", categories)

'carrot'

In [22]:
vectors.most_similar_to_given(vectors.query("carrot"), categories)

'carrot'

In [23]:
vectors.most_similar_to_given("ginger carrot soup", categories)

'yoghurt'

In [24]:
vectors.most_similar_to_given(mean_vec_nltk, categories)

'carrot'

In [25]:
vectors.most_similar_to_given(mean_vec_tokenizer, categories)

'carrot'

In [26]:
vectors.most_similar_to_given(mean_vec_split, categories)

'carrot'

<div class="alert alert-block alert-success">
Conclusion: averaging the vectors of the individual words works better than passing the whole sentence as a single key.
</div>

In [27]:
# Mapping from category keyword to the emoji shown for it.
# NOTE: this rebinds `categories`, which earlier cells bound to a plain list
# of four words; cells above that are re-run after this one will see the dict.
categories = {
    "television" : "📺",
    "mobile" : "📱",
    "computer" : "💻",
    "watch": "⌚️",
    "camera": "📷",
    "headphones": "🎧",
    "videogame" : "🎮",
    "paper": "📄",
    "pencil": "✏️",
    "shirt": "👕",
    "jeans": "👖",
    "shoes": "👟",
    "vegetable": "🥗",
    "fruit": "🍏",
    "carrot": "🥕",
    "yoghurt": "🥛",
    
}

In [28]:
def get_whole_sentence_category(sentence):
    """
    Pick the key of `categories` that is most similar to `sentence`.

    The sentence is handed to `vectors.most_similar_to_given` as one key,
    even when it is composed of multiple words.

    Returns a dict with the winning "category" and its "emoji".
    """
    candidate_keys = list(categories.keys())
    best_key = vectors.most_similar_to_given(sentence, candidate_keys)

    return {
        "category": best_key,
        "emoji": categories[best_key],
    }

In [29]:
get_whole_sentence_category("smartphone")

{'category': 'mobile', 'emoji': '📱'}

In [30]:
get_whole_sentence_category("carrot and fruit")

{'category': 'fruit', 'emoji': '🍏'}

In [31]:
get_whole_sentence_category("ginger carrot soup")

{'category': 'yoghurt', 'emoji': '🥛'}

In [32]:
def get_category_using_mean(sentence):
    """
    Pick the key of `categories` that is most similar to the mean word
    vector of `sentence`, as computed by `get_mean_vec_tokenizer`.

    Returns a dict with the winning "category" and its "emoji".
    """
    sentence_vec = get_mean_vec_tokenizer(sentence)
    candidate_keys = list(categories.keys())
    best_key = vectors.most_similar_to_given(sentence_vec, candidate_keys)

    return {
        "category": best_key,
        "emoji": categories[best_key],
    }

In [33]:
get_category_using_mean("smartphone")

['smartphone']
(1, 300)


{'category': 'mobile', 'emoji': '📱'}

In [34]:
get_category_using_mean("carrot and fruit")

['carrot', 'fruit']
(2, 300)


{'category': 'fruit', 'emoji': '🍏'}

In [35]:
get_category_using_mean("ginger carrot soup")

['ginger', 'carrot', 'soup']
(3, 300)


{'category': 'carrot', 'emoji': '🥕'}

## When categories also have multiple words

In [36]:
categories_multiword = {
    "video game" : "🎮",
    "ice cream": "🍦",
    "cream": "🥛",
    "vegetable": "🥗",
    "fruit": "🍏",
    "carrot soup": "🥕",
    "yoghurt": "🥛",
    "womans boot": "👢",
    "Woman's sandal": "👡",
}

In [58]:
categories_multiword_vecs = [get_mean_vec_tokenizer(key) for key in categories_multiword.keys()]
print(f"{len(categories_multiword_vecs)}x{len(categories_multiword_vecs[0])}")

def get_category_using_mean_for_sentence_and_categories(sentence):
    """
    Pick the key of `categories_multiword` whose mean word vector is most
    similar to the mean word vector of `sentence`.

    Relies on the module-level `categories_multiword_vecs` holding one mean
    vector per key of `categories_multiword`, in the dict's key order.

    Returns a dict with the winning "category" and its "emoji".
    """
    sentence_vec = get_mean_vec_tokenizer(sentence)

    # One similarity score per category, aligned with the dict's key order.
    scores = np.array([
        vectors.similarity(sentence_vec, category_vec)
        for category_vec in categories_multiword_vecs
    ])
    best_key = list(categories_multiword)[scores.argmax()]

    return {
        "category": best_key,
        "emoji": categories_multiword[best_key],
    }

['video', 'game']
(2, 300)
['ice', 'cream']
(2, 300)
['cream']
(1, 300)
['vegetable']
(1, 300)
['fruit']
(1, 300)
['carrot', 'soup']
(2, 300)
['yoghurt']
(1, 300)
['womans', 'boot']
(2, 300)
['woman', 'sandal']
(2, 300)
9x300


In [61]:
get_category_using_mean_for_sentence_and_categories("carrot")

['carrot']
(1, 300)


{'category': 'carrot soup', 'emoji': '🥕'}

In [40]:
get_category_using_mean_for_sentence_and_categories("Woman's boot")

['woman', 'boot']
(2, 300)


{'category': 'womans boot', 'emoji': '👢'}

In [41]:
def get_whole_sentence_category_2(sentence):
    """
    Same as `get_whole_sentence_category`, but the candidates are the keys
    of `categories_multiword`; both the sentence and the candidate keys are
    passed to pymagnitude as whole strings.

    Returns a dict with the winning "category" and its "emoji".
    """
    candidate_keys = list(categories_multiword.keys())
    best_key = vectors.most_similar_to_given(sentence, candidate_keys)

    return {
        "category": best_key,
        "emoji": categories_multiword[best_key],
    }

In [42]:
get_whole_sentence_category_2("carrot")

{'category': 'fruit', 'emoji': '🍏'}

In [43]:
get_whole_sentence_category_2("Woman's boot")

{'category': 'womans boot', 'emoji': '👢'}

In [44]:
def get_category_using_mean_for_sentence_but_not_for_categories(sentence):
    """
    Pick the key of `categories_multiword` most similar to the mean word
    vector of `sentence`; the category keys themselves are passed to
    pymagnitude as whole strings, not averaged.

    Returns a dict with the winning "category" and its "emoji".
    """
    sentence_vec = get_mean_vec_tokenizer(sentence)
    candidate_keys = list(categories_multiword.keys())
    best_key = vectors.most_similar_to_given(sentence_vec, candidate_keys)

    return {
        "category": best_key,
        "emoji": categories_multiword[best_key],
    }

In [45]:
get_category_using_mean_for_sentence_but_not_for_categories("carrot")

['carrot']
(1, 300)


{'category': 'fruit', 'emoji': '🍏'}

In [46]:
get_category_using_mean_for_sentence_but_not_for_categories("Woman's boot")

['woman', 'boot']
(2, 300)


{'category': "Woman's sandal", 'emoji': '👡'}

<div class="alert alert-block alert-success">
Conclusion: averaging the word vectors of each category works better than passing the whole category name as a single key.
</div>

### Cosine similarity and distance

In [47]:
cat_vec = vectors.query("cat")
dog_vec = vectors.query("dog")

In [48]:
# cosine distance
spatial.distance.cosine(cat_vec, dog_vec)

0.23905426263809204

In [49]:
# cosine similarity
similarity = 1 - spatial.distance.cosine(cat_vec, dog_vec)
similarity

0.760945737361908

In [50]:
# cosine distance
distance = 1 - similarity
distance

0.23905426263809204

In [51]:
# cosine similarity
vectors.similarity("cat", "dog")

0.76094574

In [52]:
# cosine similarity
vectors.similarity(cat_vec, dog_vec)

0.76094574

In [53]:
# pymagnitude distance: note this is NOT the cosine distance (1 - similarity).
# The value printed below equals sqrt(2 * (1 - cosine similarity)), which is
# the Euclidean distance between unit-length vectors
# (presumably the stored vectors are normalized — confirm in the pymagnitude docs).
vectors.distance("cat", "dog")

0.69145405

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity
cosine_similarity([cat_vec], [dog_vec])

array([[0.7609456]], dtype=float32)

In [55]:
from sklearn.metrics.pairwise import cosine_distances

# cosine distance
cosine_distances([cat_vec], [dog_vec])

array([[0.23905438]], dtype=float32)

In [56]:
# compute cosine similarity manually: dot product divided by the PRODUCT of both norms.
# The parentheses are required: `a / b * c` is evaluated as `(a / b) * c`, i.e. it
# would divide by ||cat|| and then multiply by ||dog||, which is only correct
# when both vectors happen to have unit length.
np.inner(cat_vec, dog_vec) / (np.linalg.norm(cat_vec) * np.linalg.norm(dog_vec))

0.76094574