## Short Intro To Embeddings with Tensorflow

Goals
- Understand what an embedding is
- Perform an embedding lookup using TensorFlow
- Use pre-trained embeddings (GloVe)

In [1]:
import tensorflow as tf
import numpy as np
import os
# Show which TensorFlow build is installed; later cells call tf.placeholder
# and tf.Session, so the version matters when re-running this notebook.
print('Tensorflow version : {0}'.format(tf.__version__))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Tensorflow version : 1.13.2


### Sample Data

In [2]:
embedding_size = 5
vocabulary_size = 10

# Seed a dedicated generator so the toy matrix (and every output derived from
# it further down) is identical on each Restart & Run All.
SEED = 42
rng = np.random.RandomState(SEED)

# create a sample embedding matrix: one row of `embedding_size` uniform
# [0, 1) values per vocabulary entry -> shape (vocabulary_size, embedding_size)
embedding = rng.rand(vocabulary_size, embedding_size)
print(embedding)

[[0.62654171 0.50430026 0.41333328 0.40546354 0.18437554]
 [0.13246561 0.97271116 0.79417015 0.75506745 0.85404826]
 [0.78654521 0.83387791 0.6757732  0.56653559 0.43508573]
 [0.82106614 0.40988622 0.52995057 0.16870882 0.07009734]
 [0.45245634 0.27924294 0.31403412 0.22168672 0.92164942]
 [0.3997174  0.14001849 0.84149333 0.1336388  0.83283252]
 [0.96008953 0.30942925 0.85183737 0.68087703 0.20365221]
 [0.18846894 0.23434606 0.28537222 0.0658587  0.64548797]
 [0.82215281 0.00612114 0.48147152 0.28038952 0.70243643]
 [0.35311114 0.67124462 0.21654041 0.84316282 0.88596658]]


In [3]:
# create a one-hot encoding for one element of the vocabulary
i = 4
# Size the vector from the embedding matrix itself instead of a hard-coded 10,
# so the one-hot length always equals the number of embedding rows.
one_hot = np.zeros(embedding.shape[0])
one_hot[i] = 1.0
print(one_hot)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]


In [4]:
# Multiplying the one-hot vector by the embedding matrix keeps only the row
# whose position holds the 1.0, i.e. the embedding vector of that element.
embedding_vector = one_hot.dot(embedding)
print(embedding_vector)

[0.45245634 0.27924294 0.31403412 0.22168672 0.92164942]


In [5]:
# cross-validate against the embedding matrix: row i must be the same vector
# that the one-hot dot product produced
print(embedding[i])
# Make the check explicit so a mismatch fails loudly instead of relying on
# comparing two printed arrays by eye.
assert np.allclose(embedding_vector, embedding[i]), 'one-hot dot product does not match embedding[i]'

[0.45245634 0.27924294 0.31403412 0.22168672 0.92164942]


## Tensorflow Embedding Lookup

In [6]:
# Build the lookup ops in a dedicated graph rather than the default one.
g = tf.Graph()
with g.as_default():
    # 1-D placeholder that receives the int32 row indices to look up
    x = tf.placeholder(tf.int32, shape=[None], name='x')

    # initializer that fills the variable with the NumPy sample matrix
    weights_initializer = tf.constant_initializer(embedding)
    # non-trainable variable holding one row per vocabulary entry
    embedding_weights = tf.get_variable(
        'embedding_weights',
        (vocabulary_size, embedding_size),
        initializer=weights_initializer,
        trainable=False,
    )

    # embedding lookup: select the rows of embedding_weights indexed by x
    embedding_lookup = tf.nn.embedding_lookup(embedding_weights, x)

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
# Getting Single Row: feed one index and print the row that comes back
with tf.Session(graph=g) as sess:
    # variables have to be initialised in each new session before being read
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    single_row = sess.run(embedding_lookup, {x: [4]})
    print(single_row)


[[0.45245636 0.27924293 0.31403413 0.22168672 0.9216494 ]]


In [8]:
# Getting Multiple Rows: feed several indices, one output row per index
with tf.Session(graph=g) as sess:
    # variables have to be initialised in each new session before being read
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    multiple_rows = sess.run(embedding_lookup, {x: [2, 4, 6]})
    print(multiple_rows)



[[0.7865452  0.8338779  0.6757732  0.5665356  0.4350857 ]
 [0.45245636 0.27924293 0.31403413 0.22168672 0.9216494 ]
 [0.9600895  0.30942926 0.8518374  0.68087703 0.20365222]]


### Using GloVe Pre-Trained Model 

In [9]:
EMBEDDING_DIMENSION = 100  # Available dimensions for 6B data is 50, 100, 200, 300

# The file name carries the vector dimension; the file itself is expected
# under the relative directory processed/glove.
glove_file_name = 'glove.6B.{0}d.txt'.format(EMBEDDING_DIMENSION)
glove_weights_file_path = os.path.join('processed', 'glove', glove_file_name)
print('Using the following glove weight file : {0}'.format(glove_weights_file_path))

Using the following glove weight file : processed/glove/glove.6B.100d.txt


In [10]:
# look at some sample rows
!head -3 processed/glove/glove.6B.100d.txt

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062
, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158 

In [11]:
glove_weights = []
word2idx = {}
vocabulary_size = 40000 # limit vocab to top 40K terms
vocabulary = []


# Each line holds a word followed by its space-separated vector components.
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default text encoding.
with open(glove_weights_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split() # Word and weights separated by space
        if not values:
            continue # skip blank lines instead of failing on values[0]
        word = values[0] # Word is first symbol on each line
        word_weights = np.asarray(values[1:], dtype=np.float32) # Remainder of line is weights for word
        if word_weights.shape[0] != EMBEDDING_DIMENSION:
            # Fail here with a clear message rather than later when the ragged
            # rows are stacked into one matrix.
            raise ValueError('Expected {0} weights for word {1!r}, found {2}'.format(
                EMBEDDING_DIMENSION, word, word_weights.shape[0]))
        # The index is the row this word will occupy in glove_weights; taking
        # it from the list length keeps it aligned even when lines are skipped.
        word2idx[word] = len(vocabulary)
        vocabulary.append(word)
        glove_weights.append(word_weights)

        if len(vocabulary) == vocabulary_size:
            break
# Stack the per-word vectors into a single (words, dimension) float32 matrix.
glove_weights = np.asarray(glove_weights, dtype=np.float32)

In [12]:
# (rows, columns) of the loaded matrix: one row per loaded word
np.shape(glove_weights)

(40000, 100)

In [13]:
# Query words whose nearest neighbours will be looked up.
# Another set to try: ["paris", "london", "rome", "berlin"]
words = ["man", "woman"]
# Translate each word into its row index; an unknown word raises KeyError.
words_indices = list(map(word2idx.__getitem__, words))
words_indices

[300, 787]

In [14]:
g = tf.Graph()

with g.as_default():
    # provide input indices
    x = tf.placeholder(shape=[None], dtype=tf.int32, name='x')

    # create a constant initializer
    weights_initializer = tf.constant_initializer(glove_weights)
    # Take the variable shape from the loaded matrix itself: vocabulary_size is
    # only an upper limit, so a file with fewer rows would otherwise leave the
    # declared shape and the initial values out of step.
    embedding_weights = tf.get_variable(
                            name='embedding_weights',
                            shape=glove_weights.shape,
                            initializer=weights_initializer,
                            trainable=False)
    # embedding lookup: raw (un-normalised) vectors for the fed indices
    embedding_lookup = tf.nn.embedding_lookup(embedding_weights, x)

    # Cosine similarity: scale every row to unit L2 norm, then the dot product
    # between two rows is the cosine of the angle between them.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_weights), 1, keepdims=True))
    normalized_embeddings = embedding_weights / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, x)
    # One row per fed index, one column per vocabulary word; transpose_b lets
    # matmul consume the matrix transposed without a separate transpose op.
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    


In [15]:
top_k = 10 # number of nearest neighbors

with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    # Fetch the raw vectors and the similarity rows in a single run call
    # instead of executing the graph twice with the same feed.
    result, sim = sess.run([embedding_lookup, similarity], feed_dict={x : words_indices})

print('Shape of Similarity Matrix: {0}'.format(sim.shape))
# `row` (not `i`) so the loop does not overwrite the global `i` that earlier
# cells use to index the toy embedding matrix.
for row, word_index in enumerate(words_indices):
    # Indices of all vocabulary words, most similar first.
    ranked = (-sim[row, :]).argsort()
    # Drop the query word explicitly rather than assuming it is ranked first,
    # then keep at most top_k neighbours (fewer if the vocabulary is small).
    nearest = ranked[ranked != word_index][:top_k]
    neighbours = ''.join(' {0},'.format(vocabulary[idx]) for idx in nearest)
    print('Nearest to {0} :{1}'.format(vocabulary[word_index], neighbours))


Shape of Similarity Matrix: (2, 40000)
Nearest to man : woman, boy, one, person, another, old, life, father, turned, who,
Nearest to woman : girl, man, mother, boy, she, child, wife, her, herself, daughter,
