In [1]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [2]:
# Word -> embedding vector lookup table; starts empty and is filled by the
# GloVe file-loading loop in the next cell.
embeddings_dict = {}

In [3]:
# Fill embeddings_dict from the GloVe text file. Each line is split on
# whitespace: the first field is the word, the remaining fields are parsed as
# the float32 components of its vector.
with open("./glove.6B/glove.6B.50d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word, vector = values[0], np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = vector

In [4]:
def find_closest_embeddings(embedding):
    """Rank every word in ``embeddings_dict`` by distance to ``embedding``.

    Args:
        embedding: Query vector, compared against each stored vector with
            ``scipy.spatial.distance.euclidean``.

    Returns:
        A list of all keys of ``embeddings_dict``, ordered from the smallest
        Euclidean distance to the largest. Words at equal distance keep the
        dictionary's iteration order (the sort is stable).
    """
    def distance_to_query(word):
        return spatial.distance.euclidean(embeddings_dict[word], embedding)

    ranked_words = list(embeddings_dict)
    ranked_words.sort(key=distance_to_query)
    return ranked_words

In [5]:
# Vector-arithmetic query: twig - branch + hand, then print the five words
# whose embeddings are closest to the resulting vector.
query_vector = (
    embeddings_dict["twig"]
    - embeddings_dict["branch"]
    + embeddings_dict["hand"]
)
print(find_closest_embeddings(query_vector)[:5])

['fingernails', 'toenails', 'stringy', 'peeling', 'shove']


In [7]:
# code for Glove word embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
 
# Toy corpus of one-word "texts". This is a list rather than a set on purpose:
# the iteration order of a set of strings can change between interpreter runs
# (string hash randomization), which would change which word is assigned
# index 1 and make the printed results non-reproducible.
# NOTE(review): the order below is chosen to match the word -> index mapping
# shown in the recorded output; this relies on the tokenizer keeping
# first-seen order for words with equal counts -- confirm on re-run.
x = ['language', 'natural', 'leader', 'prime',
     'the', 'text']

# Fit the tokenizer on the corpus to build the word -> integer index dict.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

# Report the vocabulary size and the word -> index mapping.
print("Number of unique words in dictionary=",
      len(tokenizer.word_index))
print("Dictionary is = ", tokenizer.word_index)
 
# download glove and unzip it in Notebook.
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip
 
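# word_index: maps each vocabulary word to an integer id assigned in
# sequence (1, 2, 3, ...), e.g. 'the': 1.
# embedding matrix: maps each integer id to its dense vector (row 1 -> vector of word 1).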
def embedding_for_vocab(filepath, word_index,
                        embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Every non-blank line of the file is split on whitespace; the first field
    is the word and the remaining fields are its vector components.

    Args:
        filepath: Path to the UTF-8 encoded embedding text file.
        word_index: Mapping from word to integer row index. Presumably the
            indices run 1..len(word_index) with 0 reserved (as in a Keras
            ``Tokenizer.word_index``) -- verify against the caller.
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        A float64 array of shape ``(len(word_index) + 1, embedding_dim)``.
        Rows whose word never appears in the file are left as zeros.

    Raises:
        ValueError: If a vocabulary word's line holds fewer than
            ``embedding_dim`` components, or a component is not a number.
    """
    # One extra row because index 0 is reserved and not used for any word.
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size,
                                       embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue

            word, vector = fields[0], fields[1:]
            if word not in word_index:
                continue

            # Fail loudly instead of hitting a cryptic broadcast error, or
            # silently broadcasting a single value across the whole row.
            if len(vector) < embedding_dim:
                raise ValueError(
                    "Embedding for %r has %d components, expected at least %d"
                    % (word, len(vector), embedding_dim))

            idx = word_index[word]
            embedding_matrix_vocab[idx] = np.array(
                vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix_vocab
 
 
# Build the embedding matrix for the tokenizer's vocabulary from the
# 50-dimensional GloVe file, then show the row stored at index 1.
embedding_dim = 50
embedding_matrix_vocab = embedding_for_vocab(
    filepath='./glove.6B/glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

print("Dense vector for first word is => ", embedding_matrix_vocab[1])

Number of unique words in dictionary= 6
Dictionary is =  {'language': 1, 'natural': 2, 'leader': 3, 'prime': 4, 'the': 5, 'text': 6}
Dense vector for first word is =>  [-5.79900026e-01 -1.10100001e-01 -1.15569997e+00 -2.99059995e-03
 -2.06129998e-01  4.52890009e-01 -1.66710004e-01 -1.03820002e+00
 -9.92410004e-01  3.98840010e-01  5.92299998e-01  2.29900002e-01
  1.52129996e+00 -1.77640006e-01 -2.97259986e-01 -3.92349988e-01
 -7.84709990e-01  1.55939996e-01  6.90769970e-01  5.95369995e-01
 -4.43399996e-01  5.35139978e-01  3.28530014e-01  1.24370003e+00
  1.29719996e+00 -1.38779998e+00 -1.09249997e+00 -4.09249991e-01
 -5.69710016e-01 -3.46560001e-01  3.71630001e+00 -1.04890001e+00
 -4.67079997e-01 -4.47389990e-01  6.22999994e-03  1.96490008e-02
 -4.01609987e-01 -6.29130006e-01 -8.25060010e-01  4.55909997e-01
  8.26259971e-01  5.70909977e-01  2.11989999e-01  4.68650013e-01
 -6.00269973e-01  2.99199998e-01  6.79440022e-01  1.42379999e+00
 -3.21520008e-02 -1.26029998e-01]
