In [1]:
from pymagnitude import Magnitude, MagnitudeUtils

In [2]:
vectors = Magnitude(MagnitudeUtils.download_model('word2vec/heavy/GoogleNews-vectors-negative300.magnitude'))

In [3]:
vectors.most_similar("carrot")

[('carrots', 0.7685449),
 ('proverbial_carrot', 0.5643057),
 ('Carrot', 0.48995772),
 ('celery', 0.4753185),
 ('dangling_carrot', 0.47465426),
 ('Coarsely_grate', 0.47062412),
 ('carrot_dangling', 0.4619137),
 ('broccoli', 0.45493668),
 ('raisin_salad', 0.45256886),
 ('shredded_zucchini', 0.44899866)]

In [6]:
vectors.most_similar_approx("carrot")

[('carrots', 0.7685446999923613),
 ('celery', 0.47531832163405596),
 ('broccoli', 0.4549365083523469),
 ('ginger_puree', 0.4444111730874454),
 ('dangle_carrot', 0.4436831039658671),
 ('beefsteak_tomatoes', 0.4339051843958259),
 ('apple_slices', 0.4278265749573791),
 ('plums', 0.4260337278994655),
 ('cucumber', 0.4251839295296165),
 ('peppermint_candy', 0.42477266782699985)]

In [5]:
vectors.similarity("carrot", "fruit"), vectors.similarity("carrot", "vegetables")

(0.3729638, 0.27530932)

In [6]:
categories = {
    "television" : "📺",
    "mobile" : "📱",
    "computer" : "💻",
    "watch": "⌚️",
    "camera": "📷",
    "headphones": "🎧",
    "videogame" : "🎮",
    "paper": "📄",
    "pencil": "✏️",
    "shirt": "👕",
    "jeans": "👖",
    "shoes": "👟",
}

In [7]:
list(categories.keys())

['television',
 'mobile',
 'computer',
 'watch',
 'camera',
 'headphones',
 'videogame',
 'paper',
 'pencil',
 'shirt',
 'jeans',
 'shoes']

In [8]:
most_similar = vectors.most_similar_to_given("smartphone", list(categories.keys()))
most_similar

'mobile'

In [9]:
emoji = categories[most_similar]
emoji

'📱'

In [10]:
def category(word):
    most_similar = vectors.most_similar_to_given(word, list(categories.keys()))
    emoji = categories[most_similar]
    return {
        "category": most_similar,
        "emoji": emoji,
    }

In [11]:
category("smartphone")

{'category': 'mobile', 'emoji': '📱'}

In [12]:
def category(word):
    most_similar = vectors.most_similar_to_given(word, list(categories.keys()))
    emoji = categories[most_similar]
    return {
        "category": most_similar,
        "emoji": emoji,
    }

In [13]:
item = "ginger carrot soup"

In [14]:
vectors.query(["I", "read", "a", "book"])

array([[ 0.0332858 , -0.0021189 ,  0.0470522 , ..., -0.0028509 ,
         0.0179784 , -0.0435592 ],
       [-0.0131628 , -0.0455637 , -0.0483987 , ..., -0.0115428 ,
        -0.0429311 , -0.0289582 ],
       [-0.01974812,  0.08088748,  0.00189519, ...,  0.06656797,
         0.01162064,  0.01157236],
       [ 0.0448648 , -0.0103908 , -0.0178682 , ...,  0.0384555 ,
        -0.0229179 , -0.0020514 ]], dtype=float32)

In [15]:
vectors.similarity("cat", "dog")

0.76094574

In [16]:
vectors.distance("cat", "dog")

0.69145405

In [17]:
from scipy import spatial

cat_vec = vectors.query("cat")
dog_vec = vectors.query("dog")
similarity = 1 - spatial.distance.cosine(cat_vec, dog_vec)
similarity

0.760945737361908

### Similarity between sentence and word

In [18]:
import numpy as np

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/petr.janik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/petr.janik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
def get_vectors_for(text):
    text = text.translate(str.maketrans('', '', string.punctuation)) # text without punctuation
    vecs = []
    tokenized = word_tokenize(text)
    for word in tokenized:
        print(word)
        word_lower = word.lower()
        if word_lower in stopwords.words('english'): # skip stopwords
            continue
        if word_lower in vectors:
            vecs.append(vectors.query(word_lower))
    return np.array(vecs)

In [20]:
sentence = "Gin-ger, Carrot. soup!!!"

In [21]:
# or simpler
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words="english")
tokenize = vectorizer.build_analyzer()
tokenize(sentence)

['gin', 'ger', 'carrot', 'soup']

In [22]:
vectors.query("soup")

array([-0.0632115,  0.0618373,  0.0077726,  0.1284843, -0.0010414,
        0.0331517,  0.0453474, -0.063555 ,  0.0324646,  0.101001 ,
       -0.046378 , -0.0515311,  0.0334952,  0.0573713, -0.0982527,
        0.0855417, -0.0443168, -0.0614938, -0.0443168, -0.1676479,
        0.0525617,  0.0384766,  0.0587455, -0.0982527,  0.0477522,
       -0.0934431,  0.0126251,  0.0549665, -0.0687082,  0.0529053,
       -0.0511876,  0.0518747,  0.0007247, -0.0233608,  0.0054752,
        0.0573713,  0.0178641,  0.0142569,  0.0184653,  0.0438015,
       -0.0893206, -0.083824 ,  0.0975656, -0.0135699, -0.0334952,
       -0.1703962, -0.1195522,  0.0198395,  0.0079873,  0.083824 ,
       -0.039679 ,  0.0189806, -0.0766096,  0.0050243, -0.0420837,
       -0.0285139,  0.0425991,  0.0213854, -0.007515 , -0.080045 ,
       -0.0290292,  0.0975656, -0.0360718,  0.0996268,  0.0831369,
       -0.1003139, -0.0594326, -0.1140556,  0.0395072, -0.0155452,
        0.0817627, -0.0166617,  0.0494699, -0.0009394, -0.0103

In [23]:
np.mean(get_vectors_for(sentence), axis=0)

Ginger
Carrot
soup


array([-3.82726006e-02,  6.88864961e-02,  1.13998661e-02,  8.47308636e-02,
        3.09115350e-02,  6.98080054e-03,  4.66029346e-02, -7.67960325e-02,
       -3.97804640e-02,  7.82321319e-02, -4.38365638e-02, -3.68401669e-02,
        2.72793253e-03,  6.29672036e-02, -1.17294811e-01,  6.74639642e-02,
       -5.92347346e-02, -5.35799796e-03, -6.85786009e-02, -1.31368697e-01,
        1.44090327e-02,  5.43683656e-02,  7.51434639e-02, -1.28464662e-02,
        1.44738005e-02, -6.27673343e-02, -3.92605029e-02,  4.99931984e-02,
       -4.53256965e-02,  5.21523356e-02,  1.01935333e-02,  2.69586984e-02,
        5.36336703e-03, -4.85099368e-02, -4.00059950e-03,  6.88883290e-03,
        1.79078002e-02,  2.27759965e-03,  2.61274334e-02,  5.48931360e-02,
       -1.12914657e-02, -7.56996945e-02,  2.07556989e-02,  2.77133379e-03,
       -4.64606611e-03, -1.21082805e-01, -6.41838014e-02, -3.85268331e-02,
        4.77593988e-02,  2.92357355e-02, -6.15815334e-02,  6.91258311e-02,
       -2.66396999e-02,  

In [24]:
sentence = "ginger carrot soup"
tokens = sentence.split()

In [25]:
mean_vec = np.mean(vectors.query(tokens), axis=0)
mean_vec

array([-3.82726006e-02,  6.88864961e-02,  1.13998661e-02,  8.47308636e-02,
        3.09115350e-02,  6.98080054e-03,  4.66029346e-02, -7.67960325e-02,
       -3.97804640e-02,  7.82321319e-02, -4.38365638e-02, -3.68401669e-02,
        2.72793253e-03,  6.29672036e-02, -1.17294811e-01,  6.74639642e-02,
       -5.92347346e-02, -5.35799796e-03, -6.85786009e-02, -1.31368697e-01,
        1.44090327e-02,  5.43683656e-02,  7.51434639e-02, -1.28464662e-02,
        1.44738005e-02, -6.27673343e-02, -3.92605029e-02,  4.99931984e-02,
       -4.53256965e-02,  5.21523356e-02,  1.01935333e-02,  2.69586984e-02,
        5.36336703e-03, -4.85099368e-02, -4.00059950e-03,  6.88883290e-03,
        1.79078002e-02,  2.27759965e-03,  2.61274334e-02,  5.48931360e-02,
       -1.12914657e-02, -7.56996945e-02,  2.07556989e-02,  2.77133379e-03,
       -4.64606611e-03, -1.21082805e-01, -6.41838014e-02, -3.85268331e-02,
        4.77593988e-02,  2.92357355e-02, -6.15815334e-02,  6.91258311e-02,
       -2.66396999e-02,  

In [26]:
categories = ["vegetable", "fruit", "carrot", "yoghurt"]
categories_vec = [vectors.query(category) for category in categories]

In [27]:
for category, category_vec in zip(categories, categories_vec):
    similarity = 1 - spatial.distance.cosine(mean_vec, category_vec)
    print(category, similarity)

vegetable 0.5529444217681885
fruit 0.4844015836715698
carrot 0.7097286581993103
yoghurt 0.5526320934295654


In [28]:
similarities = []
for i in range(len(categories_vec)):
    similarity = 1 - spatial.distance.cosine(mean_vec, categories_vec[i])
    similarities.append(similarity)
categories[np.array(similarities).argmax()]

'carrot'

In [29]:
vectors.similarity("carrot", "carrot")

1.0

In [30]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
vectors.most_similar_to_given(categories_vec[2], categories)

'carrot'

In [32]:
categories = {
    "television" : "📺",
    "mobile" : "📱",
    "computer" : "💻",
    "watch": "⌚️",
    "camera": "📷",
    "headphones": "🎧",
    "videogame" : "🎮",
    "paper": "📄",
    "pencil": "✏️",
    "shirt": "👕",
    "jeans": "👖",
    "shoes": "👟",
}

In [33]:
def category(sentence):
    tokens = tokenize(sentence)
    mean_vec = np.mean(vectors.query(tokens), axis=0)
    most_similar = vectors.most_similar_to_given(mean_vec, list(categories.keys()))
    emoji = categories[most_similar]
    return {
        "category": most_similar,
        "emoji": emoji,
    }

In [34]:
category("carrot and fruit")

{'category': 'pencil', 'emoji': '✏️'}

In [35]:
category("smartphone")

{'category': 'mobile', 'emoji': '📱'}

### Cosine similarity

In [36]:
vec_a = vectors.query("cat")
vec_b = vectors.query("dog")

In [37]:
# cosine similarity
similarity = 1 - spatial.distance.cosine(vec_a, vec_b)
similarity

0.760945737361908

In [38]:
# cosine distance
distance = 1 - similarity
distance

0.23905426263809204

In [39]:
vectors.similarity("cat", "dog")

0.76094574

In [40]:
vectors.distance("cat", "dog")

0.69145405

In [41]:
from sklearn.metrics.pairwise import cosine_distances
cosine_distances([vec_a], [vec_b])

array([[0.23905438]], dtype=float32)

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([vec_a], [vec_b])

array([[0.7609456]], dtype=float32)

In [43]:
# compute cosine similarity manually
np.inner(vec_a, vec_b) / np.linalg.norm(vec_a) * np.linalg.norm(vec_b)

0.76094574

In [99]:
categories_multiword = {
    "video game" : "🎮",
    "ice cream": "🍦",
    "cream": "🥛",
    "womans boot": "👢",
    "Woman's sandal": "👡",
}

In [100]:
def get_sentence_vector(sentence):
    tokens = tokenize(sentence)
    print(tokens)
    mean_vec = np.mean(vectors.query(tokens), axis=0)
    return mean_vec

In [101]:
categories_multiword_vecs = [get_sentence_vector(key) for key in categories_multiword.keys()]

['video', 'game']
['ice', 'cream']
['cream']
['womans', 'boot']
['woman', 'sandal']


In [102]:
def category_multiword(sentence):
    mean_vec = get_sentence_vector(sentence)  
    similarities = [vectors.similarity(mean_vec, category_vec) for category_vec in categories_multiword_vecs]
    most_similar = np.array(similarities).argmax()
    emoji = list(categories_multiword)[most_similar]
    return {
        "category": most_similar,
        "emoji": emoji,
    }

In [103]:
category_multiword("ice cream")

['ice', 'cream']


{'category': 1, 'emoji': 'ice cream'}

In [104]:
categories = categories_multiword

def category_simple(sentence):
    most_similar = vectors.most_similar_to_given(sentence, list(categories.keys()))
    emoji = categories[most_similar]
    return {
        "category": most_similar,
        "emoji": emoji,
    }

In [105]:
category_simple("ice cream")

{'category': 'ice cream', 'emoji': '🍦'}

In [106]:
category_simple("ice cream")

{'category': 'ice cream', 'emoji': '🍦'}

In [107]:
category_simple("Woman's boot")

{'category': 'womans boot', 'emoji': '👢'}