In [1]:
from pymagnitude import Magnitude, MagnitudeUtils

In [2]:
vectors = Magnitude(MagnitudeUtils.download_model('word2vec/heavy/GoogleNews-vectors-negative300.magnitude'))

In [3]:
vectors_light = Magnitude(MagnitudeUtils.download_model('word2vec/light/GoogleNews-vectors-negative300.magnitude'))

In [None]:
vectors.most_similar("carrot")

In [None]:
vectors.most_similar_approx("carrot")

In [None]:
vectors_light.most_similar_approx("carrot")

In [None]:
vectors.similarity("carrot", "fruit"), vectors.similarity("carrot", "vegetables")

In [None]:
# Label -> emoji mapping. The keys are the candidate labels handed to
# `vectors.most_similar_to_given`; the values are what gets reported for the
# chosen label.
categories = {
    "television" : "📺",
    "mobile" : "📱",
    "computer" : "💻",
    "watch": "⌚️",
    "camera": "📷",
    "headphones": "🎧",
    "videogame" : "🎮",
    "paper": "📄",
    "pencil": "✏️",
    "shirt": "👕",
    "jeans": "👖",
    "shoes": "👟",
}

In [None]:
list(categories.keys())

In [None]:
most_similar = vectors.most_similar_to_given("smartphone", list(categories.keys()))
most_similar

In [None]:
emoji = categories[most_similar]
emoji

In [None]:
def category(word):
    """Map ``word`` to the closest label in ``categories`` and its emoji.

    The candidate labels are the keys of the module-level ``categories``
    mapping; ``vectors.most_similar_to_given`` chooses one of them.

    Args:
        word: Query forwarded as the first argument of
            ``vectors.most_similar_to_given``.

    Returns:
        dict: ``{"category": <chosen label>, "emoji": categories[<chosen label>]}``.
    """
    labels = list(categories)
    best_label = vectors.most_similar_to_given(word, labels)
    return dict(category=best_label, emoji=categories[best_label])

In [None]:
category("smartphone")

In [None]:
# NOTE(review): `category` is also defined, with the same behavior, in an
# earlier cell of this notebook; one of the two definitions is likely redundant.
def category(word):
    """Pick the key of ``categories`` that is most similar to ``word``.

    Args:
        word: Query handed to ``vectors.most_similar_to_given`` together with
            the list of keys of the module-level ``categories`` mapping.

    Returns:
        dict: The winning key under ``"category"`` and its mapped value under
        ``"emoji"``.
    """
    candidates = [label for label in categories.keys()]
    winner = vectors.most_similar_to_given(word, candidates)
    symbol = categories[winner]
    result = {"category": winner}
    result["emoji"] = symbol
    return result

In [None]:
item = "ginger carrot soup"

In [None]:
vectors.query(["I", "read", "a", "book"])

In [None]:
vectors.similarity("cat", "dog")

In [None]:
vectors.distance("cat", "dog")

In [None]:
from scipy import spatial

# Cosine similarity worked out from the raw word vectors:
# `spatial.distance.cosine` yields a distance, so the similarity is 1 minus it.
cat_vec = vectors.query("cat")
dog_vec = vectors.query("dog")
similarity = 1 - spatial.distance.cosine(cat_vec, dog_vec)
similarity

### Similarity between a sentence and a word

In [None]:
import numpy as np

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
def get_vectors_for(text):
    """Return the embeddings of the meaningful, in-vocabulary tokens of ``text``.

    Processing steps, in order:
      1. every character in ``string.punctuation`` is deleted from ``text``;
      2. the result is split with ``word_tokenize``;
      3. each token is lower-cased;
      4. tokens found in the English stopword list are dropped;
      5. tokens for which ``token in vectors`` is false are dropped.

    Args:
        text (str): Raw input sentence.

    Returns:
        numpy.ndarray: ``np.array`` built from one ``vectors.query(token)``
        result per kept token, in input order. When no token is kept the
        array is empty.
    """
    # Fetched once per call and stored as a set, so the per-token check is a
    # hash lookup instead of a fresh `stopwords.words(...)` call plus list scan.
    english_stopwords = set(stopwords.words('english'))

    # Characters are removed outright, not replaced by spaces, so "Gin-ger"
    # becomes the single token "Ginger".
    cleaned = text.translate(str.maketrans('', '', string.punctuation))

    vecs = []
    for word in word_tokenize(cleaned):
        word_lower = word.lower()
        if word_lower in english_stopwords:  # skip stopwords
            continue
        if word_lower in vectors:  # skip out-of-vocabulary tokens
            vecs.append(vectors.query(word_lower))
    return np.array(vecs)

In [None]:
sentence = "Gin-ger, Carrot. soup!!!"

In [None]:
# or simpler
from sklearn.feature_extraction.text import CountVectorizer

# Build a CountVectorizer configured with English stop words and take its
# analyzer callable; later cells call it as `tokenize(<sentence>)`.
vectorizer = CountVectorizer(stop_words="english")
tokenize = vectorizer.build_analyzer()
tokenize(sentence)

In [None]:
vectors.query("soup")

In [None]:
np.mean(get_vectors_for(sentence), axis=0)

In [None]:
sentence = "ginger carrot soup"
tokens = sentence.split()

In [None]:
mean_vec = np.mean(vectors.query(tokens), axis=0)
mean_vec

In [None]:
# Candidate labels for the sentence-vs-word comparison, plus one queried
# vector per label (same order as `categories`).
# NOTE(review): this rebinds `categories`, which earlier cells define as a
# label -> emoji dict, to a plain list.
categories = ["vegetable", "fruit", "carrot", "yoghurt"]
categories_vec = [vectors.query(category) for category in categories]

In [None]:
# Print the cosine similarity between the sentence vector and every candidate.
# The loop variables avoid the name `category`: a top-level `for` target stays
# bound after the loop and would overwrite the notebook's `category` function.
for label, label_vec in zip(categories, categories_vec):
    similarity = 1 - spatial.distance.cosine(mean_vec, label_vec)
    print(label, similarity)

In [None]:
# Score every candidate vector against the sentence vector, then display the
# label whose score is highest (`categories` and `categories_vec` share order).
similarities = [
    1 - spatial.distance.cosine(mean_vec, candidate_vec)
    for candidate_vec in categories_vec
]
categories[np.array(similarities).argmax()]

In [None]:
vectors.similarity("carrot", "carrot")

In [None]:
string.punctuation

In [None]:
vectors.most_similar_to_given(categories_vec[2], categories)

In [None]:
# Rebind `categories` to the label -> emoji mapping (an earlier cell in this
# notebook assigns a plain list of labels to the same name).
categories = {
    "television" : "📺",
    "mobile" : "📱",
    "computer" : "💻",
    "watch": "⌚️",
    "camera": "📷",
    "headphones": "🎧",
    "videogame" : "🎮",
    "paper": "📄",
    "pencil": "✏️",
    "shirt": "👕",
    "jeans": "👖",
    "shoes": "👟",
}

In [None]:
def category(sentence):
    """Classify a whole sentence into one of the ``categories`` labels.

    The sentence is split with the module-level ``tokenize`` callable, the
    tokens are looked up with ``vectors.query``, and the results are averaged
    along axis 0. That mean vector is then matched against the keys of
    ``categories`` via ``vectors.most_similar_to_given``.

    Args:
        sentence: Text passed to ``tokenize``.

    Returns:
        dict: ``{"category": <chosen label>, "emoji": categories[<chosen label>]}``.
    """
    token_vectors = vectors.query(tokenize(sentence))
    sentence_vec = np.mean(token_vectors, axis=0)
    labels = list(categories)
    best_label = vectors.most_similar_to_given(sentence_vec, labels)
    return dict(category=best_label, emoji=categories[best_label])

In [None]:
category("carrot and fruit")

In [None]:
category("smartphone")

### Cosine similarity

In [None]:
vec_a = vectors.query("cat")
vec_b = vectors.query("dog")

In [None]:
# cosine similarity
similarity = 1 - spatial.distance.cosine(vec_a, vec_b)
similarity

In [None]:
# cosine distance
distance = 1 - similarity
distance

In [None]:
vectors.similarity("cat", "dog")

In [None]:
vectors.distance("cat", "dog")

In [None]:
from sklearn.metrics.pairwise import cosine_distances
cosine_distances([vec_a], [vec_b])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([vec_a], [vec_b])

In [None]:
# compute cosine similarity manually: dot(a, b) / (|a| * |b|)
# The product of the norms must be parenthesized; `x / p * q` evaluates as
# `(x / p) * q`, which only equals the cosine formula when `q` is 1.
np.inner(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

In [None]:
# Label -> emoji mapping whose labels may consist of several words.
# NOTE(review): "womans boot" is written without an apostrophe and in lower
# case while "Woman's sandal" has both — confirm whether that is intentional.
categories_multiword = {
    "video game" : "🎮",
    "ice cream": "🍦",
    "cream": "🥛",
    "womans boot": "👢",
    "Woman's sandal": "👡",
}

In [None]:
def get_sentence_vector(sentence):
    """Return the axis-0 mean of the vectors of the tokens in ``sentence``.

    Args:
        sentence: Text split by the module-level ``tokenize`` callable; the
            resulting tokens are looked up with ``vectors.query``.

    Returns:
        The result of ``np.mean(..., axis=0)`` over the queried token vectors.
    """
    token_vectors = vectors.query(tokenize(sentence))
    return np.mean(token_vectors, axis=0)

In [None]:
categories_multiword_vecs = [get_sentence_vector(key) for key in categories_multiword.keys()]

In [None]:
def category_multiword(sentence):
    """Classify ``sentence`` against the multi-word labels.

    The sentence vector from ``get_sentence_vector`` is scored with
    ``vectors.similarity`` against each entry of the precomputed
    ``categories_multiword_vecs``; the position of the highest score selects
    the key of ``categories_multiword`` at the same position.

    Args:
        sentence: Text forwarded to ``get_sentence_vector``.

    Returns:
        dict: ``{"category": <chosen label>, "emoji": categories_multiword[<chosen label>]}``.
    """
    query_vec = get_sentence_vector(sentence)
    scores = []
    for candidate_vec in categories_multiword_vecs:
        scores.append(vectors.similarity(query_vec, candidate_vec))
    best_index = np.array(scores).argmax()
    labels = list(categories_multiword)
    best_label = labels[best_index]
    return dict(category=best_label, emoji=categories_multiword[best_label])

In [None]:
category_multiword("ice cream")

In [None]:
# NOTE(review): rebinds the global `categories` to the multi-word mapping;
# every function in this notebook that reads `categories` at call time will
# see this mapping from here on.
categories = categories_multiword

def category_simple(sentence):
    """Match ``sentence`` directly against the keys of ``categories``.

    Unlike the tokenizing variants, the sentence is handed unchanged to
    ``vectors.most_similar_to_given`` along with the list of keys of the
    module-level ``categories`` mapping.

    Args:
        sentence: Query forwarded as-is to ``vectors.most_similar_to_given``.

    Returns:
        dict: ``{"category": <chosen label>, "emoji": categories[<chosen label>]}``.
    """
    labels = list(categories)
    best_label = vectors.most_similar_to_given(sentence, labels)
    return dict(category=best_label, emoji=categories[best_label])

In [None]:
category_simple("ice cream")

In [None]:
category_simple("ice cream")

In [None]:
category_simple("Woman's boot")