# NLP with TensorFlow part01
In this notebook we learn how to do NLP with TensorFlow through the following steps:
* word embeddings
* language model with rnn


In [1]:
import tensorflow as tf
import numpy as np

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

import copy, sys, time
if '../common' not in sys.path:
    sys.path.insert(0, '../common')

import helper
from gradient_check import rel_error
# Locations of the two parallel corpora (file names suggest English source / French target).
source_path = '../common/data/small_vocab_en'
target_path = '../common/data/small_vocab_fr'

# Load both files with the same helper; later cells call .split() on the results.
source_text, target_text = [helper.load_data(path) for path in (source_path, target_path)]


## Preprocessing data
The first step is to create lookup tables that map each word to an integer id and vice versa. Note that we always add some special tokens to the dictionary, e.g.
~~~~
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }
~~~~

In [2]:
def create_lookup_tables(text, special_codes):
    """Build word -> id and id -> word lookup tables for a corpus.

    Args:
        text: whitespace-separated corpus as a single string.
        special_codes: dict mapping reserved tokens (e.g. '<PAD>') to their
            fixed integer ids. It is copied, never modified.

    Returns:
        (vocab_to_int, int_to_vocab): two dicts that are inverse of each other.
        Reserved tokens keep the ids given in `special_codes`; corpus words get
        consecutive ids starting right after the largest reserved id.
    """
    vocab_to_int = dict(special_codes)

    # Start numbering from the passed-in codes, not from a notebook global.
    next_id = max(vocab_to_int.values()) + 1 if vocab_to_int else 0

    # Sort so ids are identical across kernel restarts (set order depends on
    # string hashing), and skip words that would overwrite a reserved token.
    vocab = sorted(set(text.split()) - set(vocab_to_int))

    for v_i, v in enumerate(vocab, next_id):
        vocab_to_int[v] = v_i

    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab

# Reserved tokens shared by both vocabularies; their ids come before any corpus word.
CODES = {
    '<PAD>': 0,
    '<EOS>': 1,
    '<UNK>': 2,
    '<GO>': 3,
}

# One pair of lookup tables per language: source (src_*) and target (des_*).
src_vocab_to_int, src_int_to_vocab = create_lookup_tables(source_text, CODES)
des_vocab_to_int, des_int_to_vocab = create_lookup_tables(target_text, CODES)

Given the lookup tables, we need to convert the text into ids

In [3]:
def text_to_ids(text, vocab_to_int, append_eos = False):
    """Convert newline-separated sentences into lists of word ids.

    Args:
        text: corpus string, one sentence per line, words separated by whitespace.
        vocab_to_int: dict mapping words to integer ids. Must contain '<EOS>'
            when `append_eos` is true.
        append_eos: when true, the '<EOS>' id is appended to every sentence.

    Returns:
        A list with one list of ids per non-empty line of `text`.

    Raises:
        KeyError: if a word is missing from `vocab_to_int` and the vocabulary
            has no '<UNK>' entry to fall back on, or if `append_eos` is true
            and '<EOS>' is missing.
    """
    eos = [vocab_to_int['<EOS>']] if append_eos else []

    # Out-of-vocabulary words map to '<UNK>' when the vocabulary reserves it;
    # otherwise the plain lookup keeps raising KeyError.
    unk_id = vocab_to_int.get('<UNK>')

    def word_id(word):
        if unk_id is None:
            return vocab_to_int[word]
        return vocab_to_int.get(word, unk_id)

    sequence_ids = []
    for sent in text.split('\n'):
        sent_ids = [word_id(w) for w in sent.split()]
        # blank lines yield no ids and are skipped
        if sent_ids:
            sequence_ids.append(sent_ids + eos)
    return sequence_ids

# Only the target side gets an <EOS> marker appended to each sentence.
src_seq_ids = text_to_ids(source_text, src_vocab_to_int)
des_seq_ids = text_to_ids(target_text, des_vocab_to_int, append_eos=True)

# Sentence lengths are computed once and reused to locate the longest and
# the shortest source sentence.
src_seq_lens = [len(s) for s in src_seq_ids]
i_max = np.argmax(src_seq_lens)
i_min = np.argmin(src_seq_lens)
print ('max len {:2d} at {}'.format(src_seq_lens[i_max], i_max))
print ('min len {:2d} at {}'.format(src_seq_lens[i_min], i_min))

max len 17 at 1
min len  3 at 5057


## Try word embedding with RNN
In this section, we want to implement the encoder part of the following schema
<img src="images/encoder_decoder.png" width="600"/>

We will use the following helper functions
* `helper.pad_sentence_batch`: pads the sentences of a batch so that they all have the same length
* [`tf.contrib.layers.embed_sequence`](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/embed_sequence) to map every word id of a sequence to its embedding vector (a table lookup; the RNN itself is added in a later step)

In [16]:
# Start from an empty default graph so that re-running this cell does not
# keep adding ops and variables to the previous one.
tf.reset_default_graph()

# The interactive session is what later cells use for sess.run(...) and .eval().
sess = tf.InteractiveSession()

# Batch of word-id sequences; both dimensions are left unspecified.
input_data = tf.placeholder(tf.int32, shape=[None, None])
src_vocab_size = len(src_vocab_to_int)
src_embed_dim = 2

print ('source vocab-size: {}'.format(src_vocab_size))

# Known embedding table: evenly spaced values in [0, 1], one row per word id.
# A constant initializer lets us control (and later check) the embedding weights.
embed_weights = np.linspace(
    0.0, 1.0, src_vocab_size * src_embed_dim, dtype=np.float32
).reshape(src_vocab_size, src_embed_dim)
embed_init = tf.constant_initializer(embed_weights)

# Embedding layer over the input ids, initialised with the table above.
embed_input = tf.contrib.layers.embed_sequence(
    input_data, src_vocab_size, src_embed_dim, initializer=embed_init
)

source vocab-size: 231


## Check embed layer
We run the embedding layer and expect the **embed-outputs** to match **embed_weights**. We only test two batches with different sequence lengths.

In [23]:
# we initilize our variable, another way is to use tf.assign
sess.run(tf.global_variables_initializer())

batch_size = 2
indices = [1, 5057]
batch_datas = []
for idx in indices:
    test_batch = np.array(helper.pad_sentence_batch(src_seq_ids[idx:idx+batch_size]))
    print (test_batch.shape)
    batch_datas.append(test_batch)
    embed_vals = sess.run(embed_input, feed_dict={input_data:test_batch})
    seq_len = test_batch.shape[1]
    w = 0
    while (w==0): 
        i = np.random.randint(batch_size)
        j = np.random.randint(seq_len)
        w = test_batch[i,j]
    print ('word[{},{}] = {}'.format(i, j, test_batch[i,j]))
    print ('embed_vals[{},{}] = {}'.format(i, j, embed_vals[i,j]))
    print ('embed_weight[{}] = {}'.format(test_batch[i,j], embed_weights[test_batch[i,j]]))
    print ('rel-err {:e}\n'.format(rel_error(embed_vals[i,j], embed_weights[test_batch[i,j]])))

(2, 17)
word[1,7] = 127
embed_vals[1,7] = [ 0.55097616  0.55314535]
embed_weight[127] = [ 0.55097616  0.55314535]
rel-err 0.000000e+00

(2, 9)
word[1,5] = 113
embed_vals[1,5] = [ 0.49023861  0.4924078 ]
embed_weight[113] = [ 0.49023861  0.4924078 ]
rel-err 0.000000e+00



## Implement encoder layer  
Given embed_input ($w_1,...,w_n$), we are ready to pass it through an RNN encoder. Since the sequence length is variable, we will use

* [`tf.nn.dynamic_rnn`](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) to unroll the RNN encoder over a variable number of time steps
* [`tf.contrib.rnn.BasicRNNCell`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicRNNCell) or [`tf.contrib.rnn.BasicLSTMCell`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicLSTMCell) to model a cell in our RNN

In [None]:
# Size of the hidden state of the encoder cell.
rnn_size = 4

# Basic RNN cell unrolled over the embedded input. Only the final state is kept;
# the per-step outputs are discarded. No sequence_length is passed here.
enc_cell = tf.contrib.rnn.BasicRNNCell(num_units=rnn_size)
_, enc_state = tf.nn.dynamic_rnn(cell=enc_cell, inputs=embed_input, dtype=tf.float32)

In [29]:
# print all variable
tvars = tf.global_variables()
sess.run(tf.global_variables_initializer())

for var in tvars:
    print(var.name)  # Prints the name of the variable alongside its value.

EmbedSequence/embeddings:0
rnn/basic_rnn_cell/weights:0
rnn/basic_rnn_cell/biases:0


## Inspect variables
We look at our trainable variables:
* embedding-weights variable: **EmbedSequence/embeddings:0**
* rnn-weights variable: **rnn/basic_rnn_cell/weights:0**
* rnn-biases variable: **rnn/basic_rnn_cell/biases:0**

In [27]:
def _var_named(name):
    """Return the first variable in `tvars` whose name equals `name` (IndexError if none)."""
    return [v for v in tvars if v.name == name][0]

rnn_ew = _var_named('EmbedSequence/embeddings:0')
rnn_w  = _var_named('rnn/basic_rnn_cell/weights:0')
rnn_b  = _var_named('rnn/basic_rnn_cell/biases:0')

# expected: rnn_ew.shape = (vocab_size = 231, embed_dim = 2)
print ('rnn_ew has shape {}'.format(rnn_ew.get_shape().as_list()))

# expected: rnn_w.shape = (embed_dim + rnn_size, rnn_size)
print (rnn_w.get_shape().as_list())

# expected: rnn_b.shape = (rnn_size)
print (rnn_b.get_shape().as_list())

rnn_ew has shape [231, 2]
[6, 4]
[4]


## RNN encoder
Let's run the RNN encoder on some input data to verify that it follows these dynamics:
$$
h_0 = (0,\ldots,0) \in \mathbb{R}^H, x_t \in \mathbb{R}^D, W \in \mathbb{R}^{(D+H)\times H}, b \in \mathbb{R}^H
$$
with update rule
$$
h_t = \tanh\left( x_{t} \times W[0:D,:] +  h_{t-1}\times W[D:,:] + b\right)
$$

In [61]:
# let run rnn now, we reduce the dimension to verify it easier
seq_in = batch_datas[0][:,0:2]
enc_in  = sess.run(embed_input, feed_dict={input_data : seq_in})
enc_out = sess.run(enc_state, feed_dict={input_data : seq_in})
print ('encoder input:  {}'.format(enc_in.shape))
print ('encoder output: {}'.format(enc_out.shape))

w_v = rnn_w.eval()
b_v = rnn_b.eval()
print (w_v.shape)
print (enc_in[0,0,:]) 
print (enc_out)
h0 = np.zeros((2,4), dtype=np.float32)
h1 = np.tanh(enc_in[:,0,:].dot(w_v[0:2,:]) + h0.dot(w_v[2:,:]) + b_v)
h2 = np.tanh(enc_in[:,1,:].dot(w_v[0:2,:]) + h1.dot(w_v[2:,:]) + b_v)
print (h2)
print ('\nrel-error: {:e}'.format(rel_error(enc_out, h2)))

encoder input:  (2, 2, 2)
encoder output: (2, 4)
(6, 4)
[ 0.29934925  0.30151844]
[[-0.2107414  -0.20193748  0.57699555  0.60662717]
 [-0.29763207 -0.18281864  0.62321222  0.66580057]]
[[-0.21074142 -0.2019375   0.57699555  0.60662705]
 [-0.29763207 -0.18281864  0.62321222  0.66580051]]

rel-error: 9.825582e-08


We can see that the RNN works as expected. The small remaining error (about 1e-7) is float32 round-off: TensorFlow and NumPy use different math backends (Eigen vs. e.g. MKL), so the results need not match bit for bit.

Now let's look at LSTM.