In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import warnings; warnings.filterwarnings('ignore')
import tensorflow as tf

In [3]:
import reader

## Set Configs

In [4]:
init_scale = 0.1 # initial weight scale
learning_rate = 1.0 # initial learning rate
max_grad_norm = 5 # maximum permissible norm for the gradient clipping
num_layers = 2 # the number of layers in our model
num_steps = 20 # the total number of recurrence steps, also known as the number of layers when our RNN is unfolded
hidden_size_l1 = 256 # the number of processing units (neurons) in the hidden layers
hidden_size_l2 = 128
max_epoch_decay_lr = 4 # the maximum number of epochs trained with the initial learning rate
num_epochs = 15 # the total number of epochs in training
keep_prob = 1 # at 1, we ignore the Dropout Layer wrapping
decay = 0.5 # the decay for the learning rate
batch_size = 60 # the size for each batch of data
vocab_size = 10000 # the size of our vocabulary
embedding_vector_size = 200
is_training = 1 # training flag to separate training from testing

## Create Interactive Session

In [5]:
sess = tf.InteractiveSession()

## Load Datasets

In [6]:
data_dir = './datasets/data/simple-examples/data/' # data directory for our datasets

In [7]:
# reads the data and separates it into training, validation and testing data
raw_data = reader.ptb_raw_data(data_dir)
train_data, valid_data, test_data, vocab, word_to_id = raw_data

In [8]:
def id_to_word(id_list):
    line = []
    for w in id_list:
        for word, wid in word_to_id.items():
            if wid == w:
                line.append(word)
    return line

In [9]:
print('Total of Training Data:', len(train_data))
print('Word Examples:', id_to_word(train_data[0:8]))

Total of Training Data: 929589
Word Examples: ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano']


## Set Placeholders

In [10]:
iterator = reader.ptb_iterator(train_data, batch_size, num_steps)
first_tupple = iterator.__next__()
X = first_tupple[0]
y = first_tupple[1]

In [11]:
input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
targets = tf.placeholder(tf.int32, [batch_size, num_steps])

## Create LSTM Cell

In [12]:
LSTM_cells = []

In [13]:
cell = tf.contrib.rnn.BasicLSTMCell(hidden_size_l1, forget_bias=0.0)
LSTM_cells.append(cell)

In [14]:
cell = tf.contrib.rnn.BasicLSTMCell(hidden_size_l2, forget_bias=0.0)
LSTM_cells.append(cell)

In [15]:
stacked_LSTM = tf.contrib.rnn.MultiRNNCell(LSTM_cells)

In [16]:
initial_state = stacked_LSTM.zero_state(batch_size, tf.float32)

## Create The Embeddings

In [17]:
embedding_vocab = tf.get_variable('embedding_vocab', [vocab_size, embedding_vector_size])

In [18]:
inputs = tf.nn.embedding_lookup(embedding_vocab, input_data)

In [19]:
lstm_out, hidden_state = tf.nn.dynamic_rnn(stacked_LSTM, inputs, initial_state=initial_state)

In [20]:
output = tf.reshape(lstm_out, [-1, hidden_size_l2])

## Set Operation

In [21]:
softmax_W = tf.get_variable('softmax_W', [hidden_size_l2, vocab_size])
softmax_b = tf.get_variable('softmax_b', [vocab_size])

In [22]:
logits_op = tf.matmul(output, softmax_W) + softmax_b
logits_op = tf.reshape(logits_op, [batch_size, num_steps, vocab_size])

In [23]:
probs_op = tf.nn.softmax(logits_op)

In [24]:
words_op = tf.argmax(probs_op, axis=2)

## Set Loss Function

In [25]:
loss_op = tf.contrib.seq2seq.sequence_loss(logits_op, targets, tf.ones([batch_size, num_steps], dtype=tf.float32), 
                                           average_across_timesteps=False, average_across_batch=True)

In [26]:
loss_op = tf.reduce_sum(loss_op)

## Set Optimizer

In [27]:
lr = tf.Variable(0.0, trainable=False)

In [28]:
train_vars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(loss_op, train_vars), max_grad_norm)

In [29]:
optimizer_op = tf.train.GradientDescentOptimizer(lr)
optimizer_op = optimizer_op.apply_gradients(zip(grads, train_vars))

## Train The Graph

In [30]:
def run_model(data, optimizer_op, verbose=False):
    
    epoch_size = ((len(data) // batch_size) - 1) // num_steps
    start_time = time.time()

    losses = 0.0
    iters = 0

    state = sess.run(initial_state)

    for step, (X, y) in enumerate(reader.ptb_iterator(data, batch_size, num_steps)):

        loss, state, words, _ = sess.run([loss_op, hidden_state, words_op, optimizer_op], 
                                          feed_dict={input_data: X, targets: y, initial_state: state})
        losses += loss
        iters += num_steps

        if verbose and step % (epoch_size // 10) == 10:
            speed = iters * batch_size / (time.time() - start_time)
            print(f'Iteration: {step}/ {epoch_size}, Perplexity: {np.exp(losses/iters):.3f}, Speed: {speed:.0f} wps')

    perplexity = np.exp(losses / iters)
    
    return perplexity

In [31]:
with tf.Session() as sess:

    init_op = tf.global_variables_initializer()
    
    sess.run(init_op) # run the init_op using an interactive session

    initializer = tf.random_uniform_initializer(-init_scale, init_scale)
    
    for i_epoch in range(1, num_epochs+1):
    
        # define the decay for this epoch
        lr_decay = decay ** max(i_epoch - max_epoch_decay_lr, 0.0)

        learning_rate = tf.assign(lr, learning_rate*lr_decay); learning_rate = sess.run(learning_rate)
        print(f'Epoch: {i_epoch}, Learning Rate: {learning_rate:.3f}')

        # run the loop for this epoch in the training model
        train_perplexity = run_model(train_data, optimizer_op, verbose=True)
        print(f'Epoch {i_epoch}, Train Perplexity: {train_perplexity:3f}')
        
        # run the loop for this epoch in the validation model
        valid_perplexity = run_model(valid_data, tf.no_op())
        print(f'Epoch {i_epoch}, Valid Perplexity: {valid_perplexity:3f}')
        
    # run the loop in the testing model to see how effective was our training
    test_perplexity = run_model(test_data, tf.no_op())
    print(f'Training LSTM Model is done. Test Perplexity: {test_perplexity:.3f}')

Epoch: 1, Learning Rate: 1.000
Iteration: 10/ 774, Perplexity: 4403.066, Speed: 1705 wps
Iteration: 87/ 774, Perplexity: 1289.160, Speed: 1591 wps
Iteration: 164/ 774, Perplexity: 1041.208, Speed: 1472 wps
Iteration: 241/ 774, Perplexity: 892.711, Speed: 1253 wps
Iteration: 318/ 774, Perplexity: 800.996, Speed: 1197 wps
Iteration: 395/ 774, Perplexity: 727.190, Speed: 1294 wps
Iteration: 472/ 774, Perplexity: 669.708, Speed: 1375 wps
Iteration: 549/ 774, Perplexity: 616.279, Speed: 1438 wps
Iteration: 626/ 774, Perplexity: 571.154, Speed: 1460 wps
Iteration: 703/ 774, Perplexity: 533.093, Speed: 1486 wps
Epoch 1, Train Perplexity: 504.587058
Epoch 1, Valid Perplexity: 306.678596
Epoch: 2, Learning Rate: 1.000
Iteration: 10/ 774, Perplexity: 318.602, Speed: 2045 wps
Iteration: 87/ 774, Perplexity: 272.840, Speed: 2033 wps
Iteration: 164/ 774, Perplexity: 260.731, Speed: 2044 wps
Iteration: 241/ 774, Perplexity: 248.666, Speed: 1997 wps
Iteration: 318/ 774, Perplexity: 244.548, Speed: 19

Iteration: 87/ 774, Perplexity: 94.775, Speed: 1941 wps
Iteration: 164/ 774, Perplexity: 93.197, Speed: 1933 wps
Iteration: 241/ 774, Perplexity: 90.799, Speed: 1930 wps
Iteration: 318/ 774, Perplexity: 91.356, Speed: 1905 wps
Iteration: 395/ 774, Perplexity: 89.596, Speed: 1783 wps
Iteration: 472/ 774, Perplexity: 88.881, Speed: 1820 wps
Iteration: 549/ 774, Perplexity: 86.366, Speed: 1824 wps
Iteration: 626/ 774, Perplexity: 84.359, Speed: 1807 wps
Iteration: 703/ 774, Perplexity: 82.909, Speed: 1788 wps
Epoch 13, Train Perplexity: 81.872793
Epoch 13, Valid Perplexity: 125.723708
Epoch: 14, Learning Rate: 0.000
Iteration: 10/ 774, Perplexity: 107.831, Speed: 1890 wps
Iteration: 87/ 774, Perplexity: 94.775, Speed: 1951 wps
Iteration: 164/ 774, Perplexity: 93.197, Speed: 1998 wps
Iteration: 241/ 774, Perplexity: 90.799, Speed: 1951 wps
Iteration: 318/ 774, Perplexity: 91.356, Speed: 1948 wps
Iteration: 395/ 774, Perplexity: 89.596, Speed: 1927 wps
Iteration: 472/ 774, Perplexity: 88.88

---