In [1]:
import time
import numpy as np
import tensorflow as tf

In [2]:
import sys
sys.path.insert(0, './data/')
import reader

In [None]:
#Toy LSTM cell, 1 layer, 5 hidden units (h and c of size (1,5)
LSTMCellSize = 5
LSTMCell = tf.keras.layers.LSTMCell(LSTMCellSize)
h = tf.zeros(shape=[1,LSTMCellSize], dtype=tf.float32)
c = tf.ones(shape=[1,LSTMCellSize], dtype=tf.float32)
state = (h,c)

In [None]:
sample_input = tf.constant([[1,2,3,4,5,6,7]],dtype=tf.float32)

In [None]:
output, newState = LSTMCell(sample_input, state)

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(newState))
    print(sess.run(output))

In [None]:
#Toy LSTM cell, 2 layers, 7 hidden units + 4 hidden units
l1 = 7
l2 = 4
cells = []

cell1 = tf.contrib.rnn.LSTMCell(l1)
cells.append(cell1)
h1 = tf.zeros(shape=[1,l1], dtype=tf.float32)
c1 = tf.ones(shape=[1,l1], dtype=tf.float32)
state1 = (h1,c1)

cell2 = tf.contrib.rnn.LSTMCell(l2)
cells.append(cell2)
h2 = tf.zeros(shape=[1,l2], dtype=tf.float32)
c2 = tf.ones(shape=[1,l2], dtype=tf.float32)
state2 = (h1,c1)

stackedLSTM = tf.contrib.rnn.MultiRNNCell(cells)

input_dimension = 10
data = tf.placeholder(tf.float32, shape=[None, None, input_dimension])

output, newState = tf.nn.dynamic_rnn(stackedLSTM, data, dtype=tf.float32)

In [None]:
nBatches = 3 
nSteps = 2
sample_input = np.array([[[i+10*j+100*k for i in range(input_dimension)] for j in range(nBatches)] for k in range(nSteps)])

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    #print(sess.run(newState, feed_dict={data: sample_input}))
    #print(sess.run(output, feed_dict={data: sample_input}))
    print("Input shape: ", sample_input.shape)
    print("First cell: ", sess.run(newState, feed_dict={data: sample_input})[0][1].shape)
    print("Second cell: ", sess.run(newState, feed_dict={data: sample_input})[1][1].shape)
    print("Output shape: ", sess.run(output, feed_dict={data: sample_input}).shape)

LANGUAGE MODELLING:

In [3]:
#Setting parameters:
#Initial weight scale
init_scale = 0.1
#Initial learning rate
learning_rate = 1.0
#Maximum permissible norm for the gradient (For gradient clipping -- another measure against Exploding Gradients)
max_grad_norm = 5
#The number of layers in our model
num_layers = 2
#The total number of recurrence steps, also known as the number of layers when our RNN is "unfolded"
num_steps = 20
#The number of processing units (neurons) in the hidden layers
hidden_size_l1 = 256
hidden_size_l2 = 128
#The maximum number of epochs trained with the initial learning rate
max_epoch_decay_lr = 4
#The total number of epochs in training
max_epoch = 15
#The probability for keeping data in the Dropout Layer (This is an optimization, but is outside our scope for this notebook!)
#At 1, we ignore the Dropout Layer wrapping.
keep_prob = 1
#The decay for the learning rate
decay = 0.5
#The size for each batch of data
batch_size = 30
#The size of our vocabulary
vocab_size = 10000
embeding_vector_size = 200
#Training flag to separate training from testing
is_training = 1
#Data directory for our dataset
data_dir = "data/simple-examples/data/"

{batch:30x}(steps:20x)200 input units 
-> {batch:30x}[256] Weight -> 256 Hidden units (first layer) 
-> {batch:30x}[128] Weight matrix  -> 128 Hidden_units (second layer) ->  
Softmax layer -> {batch:30x}(steps:20x)200 unit output

$$Softmax = [600(batch:30 \times steps:20) \times 128] * [128 \times 10000] + [1 \times 10000] \Longrightarrow [600 \times 10000]$$

In [4]:
session = tf.InteractiveSession()

In [5]:
#One step manually
raw_data = reader.ptb_raw_data(data_dir)
train_data, valid_data, test_data, vocab, word_to_id = raw_data

def id_to_word(id_list):
    line = []
    for w in id_list:
        for word, wid in word_to_id.items():
            if wid == w:
                line.append(word)
    return line            
                
print("Total number of train words: ", len(train_data))
print("First 10 word ids ", train_data[0:10])
print("Corresponding words ", id_to_word(train_data[0:10]))


Total number of train words:  929589
First 10 word ids  [9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983]
Corresponding words  ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec']


In [120]:
#We will read 1 mini-batch
itera = reader.ptb_iterator(train_data, batch_size, num_steps)
first_touple = itera.__next__()
x = first_touple[0]
y = first_touple[1]

print("Batch size: ", x.shape)
print("First 3 sentences (ids): ", x[0:3])
print("First sentence: ", id_to_word(x[0]))
print("Targets for 1st sentence: ", id_to_word(y[0]))

Batch size:  (30, 20)
First 3 sentences (ids):  [[9970 9971 9972 9974 9975 9976 9980 9981 9982 9983 9984 9986 9987 9988
  9989 9991 9992 9993 9994 9995]
 [2654    6  334 2886    4    1  233  711  834   11  130  123    7  514
     2   63   10  514    8  605]
 [   0 1071    4    0  185   24  368   20   31 3109  954   12    3   21
     2 2915    2   12    3   21]]
First sentence:  ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim']
Targets for 1st sentence:  ['banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food']


In [7]:
_input_data = tf.placeholder(tf.int32, [batch_size, num_steps]) #[30#20] 1 batch of (30) 20-word sentences
_targets = tf.placeholder(tf.int32, [batch_size, num_steps]) #[30#20]
feed_dict = {_input_data:x, _targets:y}
print("Last sentence in the current batch:")
print(id_to_word(session.run(_input_data, feed_dict)[30-1]))

Last sentence in the current batch:
['also', 'returned', 'contributions', 'he', 'received', 'from', 'mr.', 'keating', 'a', 'year', 'ago', '<eos>', 'sens.', 'john', 'glenn', 'd.', 'ohio', 'john', '<unk>', 'r.']


<h4>_initial_state</h4>

For each LCTM, there are 2 state matrices, c\_state and m\_state.  c_state and m_state represent "Memory State" and "Cell State". 
Each hidden layer, has a vector of size 30, which keeps the states. so, for 256/128 hidden units in each LSTM

In [8]:
#Two LSTM cells
cells = []

cell1 = tf.contrib.rnn.LSTMCell(hidden_size_l1)
cell2 = tf.contrib.rnn.LSTMCell(hidden_size_l2)

stackedLSTMCells = tf.contrib.rnn.MultiRNNCell((cell1, cell2))

_initial_state = stackedLSTMCells.zero_state(batch_size, tf.float32)
_initial_state

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


(LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/LSTMCellZeroState/zeros:0' shape=(30, 256) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/LSTMCellZeroState/zeros_1:0' shape=(30, 256) dtype=float32>),
 LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/LSTMCellZeroState_1/zeros:0' shape=(30, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/LSTMCellZeroState_1/zeros_1:0' shape=(30, 128) dtype=float32>))

<h3>Embeddings</h3>
We have to convert the words in our dataset to vectors of numbers. The traditional approach is to use one-hot encoding method that is usually used for converting categorical values to numerical values. However, One-hot encoded vectors are high-dimensional, sparse and in a big dataset, computationally inefficient. So, we use word2vec approach. It is, in fact, a layer in our LSTM network, where the word IDs will be represented as a dense representation before feeding to the LSTM. 

The embedded vectors also get updated during the training process of the deep neural network.
We create the embeddings for our input data. <b>embedding_vocab</b> is matrix of [10000x200] for all 10000 unique words.

<b>embedding_lookup()</b> finds the embedded values for our batch of 30x20 words. It  goes to each row of <code>input_data</code>, and for each word in the row/sentence, finds the correspond vector in <code>embedding_dic<code>. <br>
It creates a [30x20x200] tensor, so, the first element of <b>inputs</b> (the first sentence), is a matrix of 20x200, which each row of it, is vector representing a word in the sentence.

In [9]:
embedding_vocab = tf.get_variable("embedding_vocab", [vocab_size, embeding_vector_size])  #[10000x200]
inputs = tf.nn.embedding_lookup(embedding_vocab, _input_data)  #_input_data(30,20) => inputs(30, 20, 200) 

<h3>Constructing Recurrent Neural Networks</h3>
<b>tf.nn.dynamic_rnn()</b> creates a recurrent neural network using <b>stacked_lstm</b>. 

The input should be a Tensor of shape: [batch_size, max_time, embedding_vector_size], in our case it would be (30, 20, 200)

This method, returns a pair (outputs, new_state) where:
<ul>
    <li><b>outputs</b>: is a length T list of outputs (one for each input), or a nested tuple of such elements.</li>
    <li><b>new_state</b>: is the final state.</li>
</ul>

In [10]:
outputs, new_state =  tf.nn.dynamic_rnn(stackedLSTMCells, inputs, initial_state=_initial_state)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [51]:
session.run(tf.global_variables_initializer())

print("_input_data: 1 num_steps-word Batch, Before embedding")
print(session.run(_input_data, feed_dict).shape)
print("inputs: 1 um_steps-word Batch, After embedding")
print(session.run(inputs, feed_dict).shape)

print("_initial_state: First cell")
print(session.run(_initial_state, feed_dict)[0][1].shape)
print("_initial_state: Second cell")
print(session.run(_initial_state, feed_dict)[1][1].shape)

print("Going through the stacked cells... One 200-embedded word per time, 20 times, for all 30 batches")
print("PER BATCH: Each of the 20 (200-embedded) word will go through the 256/128-node cell")

print("new_state: First cell")
print(session.run(new_state, feed_dict)[0][1].shape)
print("new_state: Second cell")
print(session.run(new_state, feed_dict)[1][1].shape)

print("outputs")
print(session.run(outputs, feed_dict).shape)

_input_data: 1 num_steps-word Batch, Before embedding
(30, 20)
inputs: 1 um_steps-word Batch, After embedding
(30, 20, 200)
_initial_state: First cell
(30, 256)
_initial_state: Second cell
(30, 128)
Going through the stacked cells... One 200-embedded word per time, 20 times, for all 30 batches
PER BATCH: Each of the 20 (200-embedded) word will go through the 256/128-node cell
new_state: First cell
(30, 256)
new_state: Second cell
(30, 128)
outputs
(30, 20, 128)


we need to flatten the outputs to be able to connect it softmax layer. Lets reshape the output tensor from  [30 x 20 x 128] to [600 x 128]

<b>Notice:</b> Imagine our output is 3-d tensor as following (of course each <code>sen_x_word_y</code> is a an embedded vector by itself): 
<ul>
    <li>sentence 1: [[sen1word1], [sen1word2], [sen1word3], ..., [sen1word20]]</li> 
    <li>sentence 2: [[sen2word1], [sen2word2], [sen2word3], ..., [sen2word20]]</li>   
    <li>sentence 3: [[sen3word1], [sen3word2], [sen3word3], ..., [sen3word20]]</li>  
    <li>...  </li>
    <li>sentence 30: [[sen30word1], [sen30word2], [sen30word3], ..., [sen30word20]]</li>   
</ul>
Now, the flattened would convert this 3-dim tensor to:

[ [sen1word1], [sen1word2], [sen1word3], ..., [sen1word20],[sen2word1], [sen2word2], [sen2word3], ..., [sen2word20], ..., [sen30word20] ]

<h3>logistic unit</h3>
Now, we create a logistic unit to return the probability of the output word in our vocabulary with 1000 words. 
$$Softmax = [600(batch:30 \times steps:20) \times 128] * [128 \times 10000] + [1 \times 10000] \Longrightarrow [600 \times 10000]$$

In [57]:
output = tf.reshape(outputs, [-1, hidden_size_l2])
softmax_w = tf.get_variable("softmax_w", [hidden_size_l2, vocab_size]) #[128x1000]
softmax_b = tf.get_variable("softmax_b", [vocab_size]) #[1x10000]
logits = tf.matmul(output, softmax_w) + softmax_b #600x10000
prob = tf.nn.softmax(logits)

ValueError: Variable softmax_w already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "C:\Users\Nidhal\Anaconda3\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()
  File "C:\Users\Nidhal\Anaconda3\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "C:\Users\Nidhal\Anaconda3\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "C:\Users\Nidhal\Anaconda3\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\Nidhal\Anaconda3\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)


In [62]:
session.run(tf.global_variables_initializer())

print("_input_data: 1 num_steps-word Batch, Before embedding")
print(session.run(_input_data, feed_dict).shape)
print("inputs: 1 um_steps-word Batch, After embedding")
print(session.run(inputs, feed_dict).shape)

print("_initial_state: First cell")
print(session.run(_initial_state, feed_dict)[0][1].shape)
print("_initial_state: Second cell")
print(session.run(_initial_state, feed_dict)[1][1].shape)

print("Going through the stacked cells... One 200-embedded word per time, 20 times, for all 30 batches")
print("PER BATCH: Each of the 20 (200-embedded) word will go through the 256/128-node cell")

print("new_state: First cell")
print(session.run(new_state, feed_dict)[0][1].shape)
print("new_state: Second cell")
print(session.run(new_state, feed_dict)[1][1].shape)

print("outputs")
print(session.run(outputs, feed_dict).shape)
print("output (without S!)")
print(session.run(output, feed_dict).shape)

print("softmax_w")
print(session.run(softmax_w, feed_dict).shape)
print("softmax_b")
print(session.run(softmax_b, feed_dict).shape)
print("logits: output*sotfmaw_w + sotfmax_b")
print(session.run(logits, feed_dict).shape)
print("prob: Softmax(logits)")
print(session.run(prob, feed_dict).shape)

_input_data: 1 num_steps-word Batch, Before embedding
(30, 20)
inputs: 1 um_steps-word Batch, After embedding
(30, 20, 200)
_initial_state: First cell
(30, 256)
_initial_state: Second cell
(30, 128)
Going through the stacked cells... One 200-embedded word per time, 20 times, for all 30 batches
PER BATCH: Each of the 20 (200-embedded) word will go through the 256/128-node cell
new_state: First cell
(30, 256)
new_state: Second cell
(30, 128)
outputs
(30, 20, 128)
output (without S!)
(600, 128)
softmax_w
(128, 10000)
softmax_b
(10000,)
logits: output*sotfmaw_w + sotfmax_b
(600, 10000)
prob: Softmax(logits)
(600, 10000)


<h3>Prediction</h3>
What is the word correspond to the probability output? Lets use the maximum probability:

In [78]:
print("Predicted")
print(id_to_word(np.argmax(session.run(prob, feed_dict)[0:20], axis=1)))
print("Truth")
print(id_to_word(session.run(_targets, feed_dict)[0]))

Predicted
['million', 'handicapped', 'handicapped', 'handicapped', 'handicapped', 'handicapped', 'dreyfus', 'dreyfus', 'approved', 'approved', 'approved', 'settlement', 'settlement', 'settlement', 'settlement', 'settlement', 'fly', 'roles', 'rivals', 'fly']
Truth
['banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food']


<h4>Objective function</h4>

Now we have to define our objective function, to calculate the similarity of predicted values to ground truth, and then, penalize the model with the error. Our objective is to minimize loss function, that is, to minimize the average negative log probability of the target words:

$$\text{loss} = -\frac{1}{N}\sum_{i=1}^{N} \ln p_{\text{target}_i}$$

This function is already implemented and available in TensorFlow through <b>sequence_loss_by_example</b>. It calculates the weighted cross-entropy loss for <b>logits</b> and the <b>target</b> sequence.  

The arguments of this function are:  
<ul>
    <li>logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].</li>  
    <li>targets: List of 1D batch-sized int32 Tensors of the same length as logits.</li>   
    <li>weights: List of 1D batch-sized float-Tensors of the same length as logits.</li> 
</ul>

In [79]:
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(_targets, [-1])],[tf.ones([batch_size * num_steps])])
cost = tf.reduce_sum(loss) / batch_size

In [81]:
print("The first 10 log-perplexities")
print(session.run(loss, feed_dict)[:10])
print("Cost")
print(session.run(cost, feed_dict))

The first 10 log-perplexities
[9.227823 9.206076 9.21901  9.198953 9.217763 9.219764 9.196862 9.213927
 9.202095 9.208251]
Cost
184.25726


<h3>Training</h3>

To do training for our network, we have to take the following steps:
<ol>
    <li>Define the optimizer.</li>
    <li>Extract variables that are trainable.</li>
    <li>Calculate the gradients based on the loss function.</li>
    <li>Apply the optimizer to the variables/gradients tuple.</li>
</ol>

In [135]:
#Define Optimizer
lr = tf.Variable(0.1, trainable=False)
optimizer = tf.train.GradientDescentOptimizer(lr)
#Extract variables that are trainable
tvars = tf.trainable_variables()
#Calculate the gradients based on the loss function
grad_t_list = tf.gradients(cost, tvars)
#Clip norms
grads, _ = tf.clip_by_global_norm(grad_t_list, max_grad_norm)
#Apply the optimizer
train_op = optimizer.apply_gradients(zip(grads, tvars))

In [152]:
session.run(tf.global_variables_initializer())
session.run(train_op, feed_dict)

In [153]:
print("The first 10 log-perplexities")
print(session.run(loss, feed_dict)[:10])
print("Cost")
print(session.run(cost, feed_dict))

The first 10 log-perplexities
[9.216266  9.210908  9.209421  9.208688  9.191307  9.193218  9.19128
 9.219045  9.2014475 9.191616 ]
Cost
183.68549
