## Short Intro to Embeddings with TensorFlow

Goals
- Understand what an embedding is
- Perform an embedding lookup using TensorFlow
- Use pre-trained embeddings (GloVe)

In [None]:
import tensorflow as tf
import numpy as np
import os
# Report the installed TensorFlow version; the cells below call tf.placeholder and tf.Session.
print('Tensorflow version : {0}'.format(tf.__version__))

### Sample Data

In [None]:
embedding_size = 5
vocabulary_size = 10

# Build a toy embedding matrix: one row per vocabulary entry, one column per
# embedding dimension. A fixed seed keeps the printed values (and every lookup
# derived from them) identical each time the notebook is re-run from scratch.
rng = np.random.RandomState(42)
embedding = rng.rand(vocabulary_size, embedding_size)
print(embedding)

In [None]:
# Create a one-hot encoding for one element of the vocabulary.
# The vector length is taken from vocabulary_size (not a literal) so that it
# always matches the number of rows in the embedding matrix.
i = 4
one_hot = np.zeros(vocabulary_size)
one_hot[i] = 1.0
print(one_hot)

In [None]:
# Multiplying the one-hot row vector by the embedding matrix selects a single
# row of the matrix: the embedding of the vocabulary entry marked in one_hot.
embedding_vector = one_hot.dot(embedding)
print(embedding_vector)

In [None]:
# Cross-check against the embedding matrix: row i must equal the vector
# obtained from the one-hot dot product.
print(embedding[i])
# Fail loudly instead of relying on a visual comparison of the two printouts.
np.testing.assert_allclose(embedding_vector, embedding[i])

## TensorFlow Embedding Lookup

In [None]:
g = tf.Graph()
with g.as_default():
    # Placeholder for the vocabulary indices to look up (1-D, any length).
    x = tf.placeholder(name='x', dtype=tf.int32, shape=[None])

    # Non-trainable variable initialised from the sample embedding matrix.
    weights_initializer = tf.constant_initializer(embedding)
    embedding_weights = tf.get_variable(
        'embedding_weights',
        shape=(vocabulary_size, embedding_size),
        initializer=weights_initializer,
        trainable=False,
    )

    # Embedding lookup: gather the rows of embedding_weights selected by x.
    embedding_lookup = tf.nn.embedding_lookup(params=embedding_weights, ids=x)

In [None]:
# Fetch a single row (index 4) of the embedding matrix through the graph.
with tf.Session(graph=g) as sess:
    # The variable must be initialised in every new session before it is read.
    sess.run(tf.global_variables_initializer())
    single_row = sess.run(embedding_lookup, feed_dict={x: [4]})
    print(single_row)


In [None]:
# Fetch several rows (indices 2, 4 and 6) in one lookup.
with tf.Session(graph=g) as sess:
    # The variable must be initialised in every new session before it is read.
    sess.run(tf.global_variables_initializer())
    multiple_rows = sess.run(embedding_lookup, feed_dict={x: [2, 4, 6]})
    print(multiple_rows)



### Using GloVe Pre-Trained Model 

In [None]:
# Dimensions available for the 6B data: 50, 100, 200, 300.
EMBEDDING_DIMENSION = 100

# The weight file name encodes the dimension, e.g. glove.6B.100d.txt.
glove_file_name = 'glove.6B.{0}d.txt'.format(EMBEDDING_DIMENSION)
glove_weights_file_path = os.path.join('processed', 'glove', glove_file_name)
print('Using the following glove weight file : {0}'.format(glove_weights_file_path))

In [None]:
# look at some sample rows
!head -3 processed/glove/glove.6B.100d.txt

In [None]:
glove_weights = []
word2idx = {}
vocabulary_size = 40000  # limit vocab to top 40K terms
vocabulary = []

# Each line holds a word followed by its weights, separated by whitespace.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding (the published GloVe files are UTF-8 text).
with open(glove_weights_file_path, 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Blank line: nothing to parse, and it must not consume an index.
            continue

        word = values[0]  # word is the first symbol on each line
        # The next free row index is the number of words stored so far, which
        # keeps word2idx, vocabulary and glove_weights aligned row for row.
        word2idx[word] = len(vocabulary)
        vocabulary.append(word)
        # Remainder of the line is the weight vector for the word.
        glove_weights.append(np.asarray(values[1:], dtype=np.float32))

        if len(vocabulary) == vocabulary_size:
            break

glove_weights = np.asarray(glove_weights, dtype=np.float32)

# If the file held fewer rows than the requested limit, shrink vocabulary_size
# to the number actually loaded so later shape declarations match the matrix.
vocabulary_size = len(vocabulary)

In [None]:
# Shape of the loaded weight matrix; as the cell's last expression it is displayed by the notebook.
glove_weights.shape

In [None]:
# Query words whose nearest neighbours are reported further down.
# Alternative set to try: ["paris", "london", "rome", "berlin"]
words = ["man", "woman"]

# Translate each word into its row index in the loaded weight matrix.
words_indices = [word2idx[query_word] for query_word in words]
words_indices

In [None]:
g = tf.Graph()

with g.as_default():
    # Placeholder for the word indices to look up (1-D, any length).
    x = tf.placeholder(shape=[None], dtype=tf.int32, name='x')

    # Non-trainable variable initialised from the loaded GloVe matrix.
    # The shape is taken from the matrix itself rather than from
    # vocabulary_size / EMBEDDING_DIMENSION, so the variable can never disagree
    # with the values handed to the constant initializer (e.g. when the file
    # contained fewer rows than the requested vocabulary limit).
    weights_initializer = tf.constant_initializer(glove_weights)
    embedding_weights = tf.get_variable(
                            name='embedding_weights',
                            shape=glove_weights.shape,
                            initializer=weights_initializer,
                            trainable=False)
    # Embedding lookup: raw (un-normalised) vectors for the requested indices.
    embedding_lookup = tf.nn.embedding_lookup(embedding_weights, x)

    # Cosine similarity: scale every row to unit L2 norm, after which the dot
    # product of two rows equals the cosine of the angle between them.
    # NOTE: an all-zero row would have norm 0 and produce NaN after division.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_weights), 1, keepdims=True))
    normalized_embeddings = embedding_weights / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, x)
    # One row per requested index, one column per vocabulary entry.
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    


In [None]:
# Number of nearest neighbors to report per query word.
top_k = 10

with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    # Fetch the raw vectors and the similarity matrix in a single graph run.
    result, sim = sess.run([embedding_lookup, similarity],
                           feed_dict={x: words_indices})

print('Shape of Similarity Matrix: {0}'.format(sim.shape))

# `row` (not `i`) is used as the loop index so the notebook-level `i` set by
# the one-hot example earlier is not overwritten.
for row, word_index in enumerate(words_indices):
    # Vocabulary indices ordered from most to least similar.
    ranked = (-sim[row, :]).argsort()
    # Drop the query word itself explicitly instead of assuming it is ranked
    # first, then keep at most top_k neighbors (fewer for a small vocabulary).
    nearest = ranked[ranked != word_index][:top_k]

    log = 'Nearest to {0} :'.format(vocabulary[word_index])
    for neighbor_index in nearest:
        log = '{0} {1},'.format(log, vocabulary[neighbor_index])
    print(log)
