In [10]:
import time
import numpy as np
import tensorflow as tf

<h3> Get Data: </h3>
Download the Penn Treebank helper archive hosted by IBM (it contains the <code>reader.py</code> module used to load the data):

In [21]:
!mkdir data
!wget -q -O data/ptb.zip https://ibm.box.com/shared/static/z2yvmhbskc45xd2a9a4kkn6hg4g4kj5r.zip
!unzip -o data/ptb.zip -d data
!cp data/ptb/reader.py .

import reader

mkdir: cannot create directory ‘data’: File exists
Archive:  data/ptb.zip
  inflating: data/ptb/reader.py      
  inflating: data/__MACOSX/ptb/._reader.py  
  inflating: data/__MACOSX/._ptb     


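Download and extract the <code>simple-examples</code> archive, which provides the data files read from <code>data/simple-examples/data/</code> below: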

In [12]:
!wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 
!tar xzf simple-examples.tgz -C data/

--2019-03-08 18:12:29--  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
Resolving www.fit.vutbr.cz (www.fit.vutbr.cz)... 147.229.9.23, 2001:67c:1220:809::93e5:917
Connecting to www.fit.vutbr.cz (www.fit.vutbr.cz)|147.229.9.23|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34869662 (33M) [application/x-gtar]
Saving to: ‘simple-examples.tgz.8’


2019-03-08 18:12:40 (3.30 MB/s) - ‘simple-examples.tgz.8’ saved [34869662/34869662]



<h3> Defining Hyperparameters: </h3>
Here, we define the model's hyperparameters in one place so that we can easily experiment with them:

In [13]:
# --- Weight initialisation and optimisation ---------------------------------
init_scale = 0.1            # weights are drawn uniformly from [-init_scale, init_scale]
learning_rate = 1.0         # learning rate used before any decay is applied
max_grad_norm = 5           # global-norm threshold used for gradient clipping

# --- Learning-rate schedule --------------------------------------------------
max_epoch_decay_lr = 4      # the learning rate starts decaying once the epoch index exceeds this
decay = 0.5                 # multiplicative decay applied per epoch after that point
max_epoch = 15              # total number of training epochs

# --- Network architecture ----------------------------------------------------
num_layers = 2              # number of stacked LSTM layers
hidden_size_l1 = 256        # units in the first LSTM layer
hidden_size_l2 = 128        # units in the second LSTM layer
embedding_vector_size = 200 # dimensionality of each word embedding
vocab_size = 10000          # number of distinct words in the vocabulary
keep_prob = 1               # dropout keep probability; dropout is only added when this is < 1

# --- Batching ----------------------------------------------------------------
batch_size = 60             # sequences per batch
num_steps = 20              # unrolled time steps per sequence

# --- Miscellaneous -----------------------------------------------------------
is_training = 1             # training flag (NOTE(review): not referenced by the code shown in this notebook)
data_dir = "data/simple-examples/data/"  # directory holding the extracted data files

<h3> LSTM Model </h3>
The code in the other document shows how the model is built step by step. Here we combine all of those steps into a single class, <code>PTBModel</code>, that represents our model.

In [14]:
class PTBModel(object):
    """Two-layer LSTM language model expressed as a TensorFlow 1.x graph.

    The constructor reads the notebook-level hyperparameters (``batch_size``,
    ``num_steps``, ``hidden_size_l1``, ``hidden_size_l2``, ``vocab_size``,
    ``embedding_vector_size``, ``keep_prob`` and ``max_grad_norm``) and wires up:
    input/target placeholders -> embedding lookup -> stacked LSTM ->
    softmax projection -> sequence loss.

    When built with ``action_type == "is_training"`` the instance also owns a
    learning-rate variable, a gradient-clipped SGD training op and the op used
    by ``assign_lr``. Instances built with any other ``action_type`` expose
    neither ``lr`` nor ``train_op``.
    """

    def __init__(self, action_type):
        """Build the graph for this model.

        Args:
            action_type: mode string. Only the exact value ``"is_training"``
                enables dropout (and only if ``keep_prob < 1``) and creates the
                training ops; every other value builds an inference-only graph.
        """
        training = action_type == "is_training"

        # keep local copies of the hyperparameters for ease of use
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.hidden_size_l1 = hidden_size_l1
        self.hidden_size_l2 = hidden_size_l2
        self.vocab_size = vocab_size
        self.embedding_vector_size = embedding_vector_size

        # placeholders for the input word ids and the expected output word ids
        self._input_data = tf.placeholder(tf.int32, [self.batch_size, self.num_steps])  # [batch_size x num_steps]
        self._targets = tf.placeholder(tf.int32, [self.batch_size, self.num_steps])     # [batch_size x num_steps]

        #############################
        # Creating the LSTM structure
        #############################
        # One BasicLSTMCell per layer; forget_bias=0.0 means no bias is added to the forget gate.
        lstm_cell_l1 = tf.contrib.rnn.BasicLSTMCell(self.hidden_size_l1, forget_bias=0.0)
        lstm_cell_l2 = tf.contrib.rnn.BasicLSTMCell(self.hidden_size_l2, forget_bias=0.0)

        # Wrap each cell's output in dropout; skipped unless keep_prob was lowered below 1.
        if training and keep_prob < 1:
            lstm_cell_l1 = tf.contrib.rnn.DropoutWrapper(lstm_cell_l1, output_keep_prob=keep_prob)
            lstm_cell_l2 = tf.contrib.rnn.DropoutWrapper(lstm_cell_l2, output_keep_prob=keep_prob)

        # Stack the two cells sequentially into one multi-layer RNN cell.
        stacked_lstm = tf.contrib.rnn.MultiRNNCell([lstm_cell_l1, lstm_cell_l2])

        # The recurrent state starts as zeros; callers feed the previous batch's
        # final state back in through this same structure.
        self._initial_state = stacked_lstm.zero_state(self.batch_size, tf.float32)

        ##################################################################
        # Creating the word embeddings and pointing them to the input data
        ##################################################################
        with tf.device("/cpu:0"):
            # One trainable vector per vocabulary entry.
            embedding = tf.get_variable(
                "embedding", [self.vocab_size, self.embedding_vector_size])  # [vocab_size x embedding_vector_size]
            # Replace every word id in the input by its embedding vector.
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        # Dropout on the embedded inputs; skipped unless keep_prob was lowered below 1.
        if training and keep_prob < 1:
            inputs = tf.nn.dropout(inputs, keep_prob)

        ##################################################################################################
        # Instantiating our RNN model and retrieving the structure for returning the outputs and the state
        ##################################################################################################
        outputs, state = tf.nn.dynamic_rnn(stacked_lstm, inputs, initial_state=self._initial_state)

        #######################################################################
        # Creating a logistic unit to return the probability of the output word
        #######################################################################
        # Flatten to one row per (sequence, time step) so a single matmul projects
        # every top-layer output onto the vocabulary.
        output = tf.reshape(outputs, [-1, self.hidden_size_l2])
        softmax_w = tf.get_variable("softmax_w", [self.hidden_size_l2, self.vocab_size])  # [hidden_size_l2 x vocab_size]
        softmax_b = tf.get_variable("softmax_b", [self.vocab_size])                       # [vocab_size]
        logits = tf.matmul(output, softmax_w) + softmax_b
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, self.vocab_size])
        prob = tf.nn.softmax(logits)
        # Most probable word id at every position.
        self._output_words = tf.argmax(prob, axis=2)

        ###############################################################
        # Defining the loss and cost functions for our model's learning
        ###############################################################
        # Use the contrib sequence loss with uniform weights: averaged over the
        # batch but not over the time steps.
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            self._targets,
            tf.ones([self.batch_size, self.num_steps], dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True)

        self._cost = tf.reduce_sum(loss)

        # Store the final state
        self._final_state = state

        # everything after this point is only relevant for training
        if not training:
            return

        #############################################
        # Create the training operation for our model
        #############################################
        # Non-trainable variable holding the current learning rate.
        self._lr = tf.Variable(0.0, trainable=False)
        # Build the learning-rate update once, here. Calling tf.assign inside
        # assign_lr would add a brand-new op to the graph on every call.
        self._new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

        # All variables marked as trainable (i.e. everything except _lr).
        tvars = tf.trainable_variables()
        # Clip the gradients by their global norm.
        grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars), max_grad_norm)
        # Plain gradient descent driven by the learning-rate variable.
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    #########################################
    # Helper functions for our LSTM RNN class
    #########################################

    def assign_lr(self, session, lr_value):
        """Set the learning-rate variable to ``lr_value`` using ``session``.

        Only available on a model built with ``"is_training"``; on any other
        instance this raises ``AttributeError``.
        """
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    @property
    def input_data(self):
        """Placeholder for the input word ids."""
        return self._input_data

    @property
    def targets(self):
        """Placeholder for the target word ids."""
        return self._targets

    @property
    def initial_state(self):
        """Zero state of the stacked LSTM; also used as a feed target."""
        return self._initial_state

    @property
    def cost(self):
        """Scalar cost: the batch-averaged sequence loss summed over time steps."""
        return self._cost

    @property
    def final_state(self):
        """State of the stacked LSTM after the last time step."""
        return self._final_state

    @property
    def final_output_words(self):
        """Most probable word id at every position of the batch."""
        return self._output_words

    @property
    def lr(self):
        """Learning-rate variable (training model only)."""
        return self._lr

    @property
    def train_op(self):
        """Training operation (training model only)."""
        return self._train_op

With that, the actual structure of our Recurrent Neural Network with Long Short-Term Memory is finished. What remains is to create the code that runs it through time -- that is, the <code>run_one_epoch</code> function to be run at each epoch and a <code>main</code> script which ties all of this together.

What our <code>run_one_epoch</code> function should do is take our input data and feed it to the relevant operations. It returns the perplexity computed from the accumulated cost over the epoch.

In [15]:
# our function takes the parameters: the current session, the model instance, the data to be fed, the operation to be run
def run_one_epoch(session, m, data, eval_op, verbose=False):
    """Run the model once over ``data`` and return the resulting perplexity.

    Args:
        session: session used to execute the graph.
        m: model exposing ``batch_size``, ``num_steps``, ``initial_state``,
            ``cost``, ``final_state``, ``input_data`` and ``targets``.
        data: sequence handed to ``reader.ptb_iterator`` (presumably the word
            ids of one data split -- confirm against ``reader``).
        eval_op: extra operation run with every batch, e.g. ``m.train_op`` to
            train or a no-op to only evaluate.
        verbose: when true, print perplexity and speed about ten times per epoch.

    Returns:
        ``np.exp(total_cost / total_steps)`` accumulated over the whole epoch.
    """
    # number of batches in one pass over the data
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps

    # Report roughly ten times per epoch. The interval is clamped to 1 so short
    # data sets (epoch_size < 10) no longer cause a modulo by zero, and
    # `10 % log_every` keeps the usual "first report at step 10" offset whenever
    # the interval is larger than 10.
    log_every = max(epoch_size // 10, 1)
    log_offset = 10 % log_every

    start_time = time.time()
    costs = 0.0
    iters = 0

    state = session.run(m.initial_state)

    for step, (x, y) in enumerate(reader.ptb_iterator(data, m.batch_size, m.num_steps)):
        # Carry the recurrent state over from the previous batch. The predicted
        # words are never used in this function, so they are not fetched.
        cost, state, _ = session.run(
            [m.cost, m.final_state, eval_op],
            {m.input_data: x,
             m.targets: y,
             m.initial_state: state})

        # running totals used for the perplexity
        costs += cost
        iters += m.num_steps

        if verbose and step % log_every == log_offset:
            # guard against a zero elapsed time on coarse clocks
            elapsed = max(time.time() - start_time, 1e-9)
            print("Itr %d of %d, perplexity: %.3f speed: %.0f wps" % (step , epoch_size, np.exp(costs / iters), iters * m.batch_size / elapsed))

    # Perplexity over the epoch, used to track how the model is evolving
    return np.exp(costs / iters)

Now we write the <code>main</code> script to tie everything together. The code first reads the data from the data directory using the <code>reader</code> helper module; the following section then trains the model on the training subset and evaluates it on the validation and testing subsets.

In [19]:
# Load everything found under data_dir through the reader helper, then keep the
# first three entries as the training, validation and testing splits. The last
# two entries of the 5-tuple are discarded.
raw_data = reader.ptb_raw_data(data_dir)
(train_data, valid_data, test_data, _, _) = raw_data

<h3> Running the Model: </h3>

In [22]:
# Build the graph, train for max_epoch epochs (validating after each one),
# evaluate once on the test split and checkpoint the trained weights.
with tf.Graph().as_default(), tf.Session() as session:
    initializer = tf.random_uniform_initializer(-init_scale, init_scale)

    # Instantiates the model for training.
    # tf.variable_scope adds a prefix to the variables created with tf.get_variable
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        m = PTBModel("is_training")

    # The validation and testing models reuse the variables of the training model.
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mvalid = PTBModel("is_validating")
        mtest = PTBModel("is_testing")

    # Create the graph-level helpers once. Calling tf.no_op() inside the epoch
    # loop would add a new op to the graph on every epoch.
    eval_op = tf.no_op()
    saver = tf.train.Saver()

    # initialize all variables:
    tf.global_variables_initializer().run()

    for i in range(max_epoch):
        # The learning rate is constant while i <= max_epoch_decay_lr and is
        # multiplied by `decay` for every epoch after that.
        lr_decay = decay ** max(i - max_epoch_decay_lr, 0.0)

        # Set the decayed learning rate as the learning rate for this epoch
        m.assign_lr(session, learning_rate * lr_decay)

        print("Epoch %d: Learning Rate: %.3f" % (i + 1, session.run(m.lr)))

        # Run the loop for this epoch in the training model
        train_perplexity = run_one_epoch(session, m, train_data, m.train_op, verbose=True)
        print("Epoch %d : Train Perplexity: %.3f" % (i + 1, train_perplexity))

        # Run the loop for this epoch in the validation model
        valid_perplexity = run_one_epoch(session, mvalid, valid_data, eval_op)
        print("Epoch %d : Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

    # Run the loop in the testing model to see how effective our training was:
    test_perplexity = run_one_epoch(session, mtest, test_data, eval_op)

    print("Test Perplexity: %.3f" % test_perplexity)

    # Persist the trained variables while the session is still open. The
    # variables only live inside this session, so they cannot be saved once
    # the `with` block has exited.
    checkpoint_path = saver.save(session, data_dir + "ptb_lstm.ckpt")
    print("Model checkpoint saved to: %s" % checkpoint_path)

Epoch 1: Learning Rate: 1.000
Itr 10 of 774, perplexity: 5085.076 speed: 1355 wps
Itr 87 of 774, perplexity: 1305.121 speed: 1363 wps
Itr 164 of 774, perplexity: 998.157 speed: 1359 wps
Itr 241 of 774, perplexity: 825.813 speed: 1356 wps
Itr 318 of 774, perplexity: 727.719 speed: 1357 wps
Itr 395 of 774, perplexity: 650.444 speed: 1357 wps
Itr 472 of 774, perplexity: 587.534 speed: 1355 wps
Itr 549 of 774, perplexity: 532.543 speed: 1355 wps
Itr 626 of 774, perplexity: 488.792 speed: 1356 wps
Itr 703 of 774, perplexity: 454.458 speed: 1357 wps
Epoch 1 : Train Perplexity: 430.138
Epoch 1 : Valid Perplexity: 235.147
Epoch 2: Learning Rate: 1.000
Itr 10 of 774, perplexity: 271.578 speed: 1333 wps
Itr 87 of 774, perplexity: 237.939 speed: 1345 wps
Itr 164 of 774, perplexity: 228.008 speed: 1347 wps
Itr 241 of 774, perplexity: 218.474 speed: 1350 wps
Itr 318 of 774, perplexity: 215.992 speed: 1352 wps
Itr 395 of 774, perplexity: 210.086 speed: 1352 wps
Itr 472 of 774, perplexity: 205.738 sp

In [24]:
# NOTE(review): this cell raises AttributeError. PTBModel is a plain Python
# class wrapping TensorFlow graph ops and defines no save_weights() method.
# It also runs after the `with tf.Session() as session:` block has exited, so
# the session that held the trained variables is already closed. Persisting
# the weights presumably needs a tf.train.Saver used inside that session --
# TODO confirm and replace these calls.
m.save_weights("data/simple-examples/data/")
mtest.save_weights("data/simple-examples/data/")

AttributeError: 'PTBModel' object has no attribute 'save_weights'

In [23]:
# NOTE(review): this cell raises AttributeError. PTBModel defines no save()
# method, and both calls target the same file name, so the second would
# overwrite the first even if the method existed. The cell also runs after the
# training session has been closed. TODO: replace with a checkpoint written
# from inside the session (presumably via tf.train.Saver -- confirm).
m.save("data/simple-examples/data/train_model.h5")
mtest.save("data/simple-examples/data/train_model.h5")

AttributeError: 'PTBModel' object has no attribute 'save'