[View in Colaboratory](https://colab.research.google.com/github/lverwimp/RNN_language_modeling/blob/master/rnn_lms.ipynb)

Imports:

In [0]:
import tensorflow as tf
import numpy as np
import urllib, collections

Some global variables:

In [0]:
BATCH_SIZE = 20
NUM_STEPS = 20

Get training, validation and test data:

In [0]:
train_url = 'http://homes.esat.kuleuven.be/~lverwimp/course_speech_recognition/train.txt'
valid_url = 'http://homes.esat.kuleuven.be/~lverwimp/course_speech_recognition/valid.txt'
test_url = 'http://homes.esat.kuleuven.be/~lverwimp/course_speech_recognition/test.txt'
train_file = urllib.urlopen(train_url).read()
valid_file = urllib.urlopen(valid_url).read()
test_file = urllib.urlopen(test_url).read()

The data looks like this:

In [7]:
print('{0}...'.format(valid_file[:500]))

 consumers may want to move their telephones a little closer to the tv set 
 <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> 
 two weeks ago viewers of several nbc <unk> consumer segments started calling a N number for advice on various <unk> issues 
 and the new syndicated reality show hard copy records viewers ' opinions for possible airing on the next day 's show 
 interactive telephone technology...


<unk\> is a symbol for the unknown words class, 'N' is a symbol used for the numbers class.
  
Convert data to correct format:

In [0]:
# convert the string to a list and replace newlines with the end-of-sentence symbol
train_text = [w for w in train_file.replace('\n',' <eos>').split(' ')]
valid_text = [w for w in valid_file.replace('\n',' <eos>').split(' ')]
test_text = [w for w in test_file.replace('\n',' <eos>').split(' ')]

# count the frequencies of the words in the training data
counter = collections.Counter(train_text)

# sort according to decreasing frequency
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

# words = list of all the words (in decreasing frequency)
items, _ = list(zip(*count_pairs))

# make a dictionary with a mapping from each word to an id; word with highest frequency gets lowest id etc.
item_to_id = dict(zip(items, range(len(items))))
id_to_item = dict(zip(range(len(items)), items))
vocab_size = len(item_to_id)

# convert the words to indices
train_ids_large = [item_to_id[item] for item in train_text]
valid_ids_large = [item_to_id[item] for item in valid_text]
test_ids_large = [item_to_id[item] for item in test_text]

# take a smaller subset to speed up training
train_ids = train_ids_large[:200000]
valid_ids = valid_ids_large[:20000]
test_ids = test_ids_large[:20000]

Once the data is converted to ids, it looks like this:

In [52]:
print(valid_ids[:100])

[2, 1133, 94, 359, 6, 330, 52, 9837, 7, 327, 2477, 6, 0, 663, 389, 2, 3, 1, 1, 2975, 2159, 10, 382, 1069, 2348, 90, 100, 848, 199, 1, 12, 0, 3384, 1120, 8, 4, 73, 21, 212, 347, 37, 259, 1, 1, 2, 3, 76, 423, 196, 3918, 5, 250, 1796, 1, 581, 3529, 893, 2375, 7, 4, 298, 12, 2710, 17, 1187, 1, 251, 2, 3, 9, 0, 36, 9923, 3748, 465, 711, 2999, 2038, 3918, 135, 6146, 12, 495, 5895, 17, 0, 131, 273, 10, 465, 2, 3, 9959, 733, 504, 31, 642, 7, 36, 6499]


Class for the language model:

In [0]:
class rnn_lm(object):
  '''
  This is a class to build and execute a recurrent neural network language model.
  '''
  
  def __init__(self,
              cell='LSTM',
              vocab_size=10000,
              embedding_size=64,
              hidden_size=128,
              dropout_rate=0.5,
              is_training=True):
    self.which_cell = cell
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.is_training = is_training
    self.batch_size = BATCH_SIZE
    self.num_steps = NUM_STEPS
    self.max_grad_norm = 5
    self.lr = 1
    
    self.init_graph()
    
    self.output, self.state = self.feed_to_network()
    
    self.loss = self.calc_loss(self.output)
    
    if self.is_training:
      self.update_params(self.loss)
    
    
  def init_graph(self):
    '''
    This function initializes all elements of the network.
    '''
    
    self.inputs = tf.placeholder(dtype=tf.int32, shape=[self.batch_size, self.num_steps])
    self.targets = tf.placeholder(dtype=tf.int32, shape=[self.batch_size, self.num_steps])
    
    # input embedding weights
    self.embedding = tf.get_variable("embedding", 
                                     [self.vocab_size, self.embedding_size], 
                                     dtype=tf.float32)
    
    # hidden layer
    if self.which_cell == 'LSTM':
      self.basic_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_size)
    elif self.which_cell == 'RNN':
      self.basic_cell = tf.contrib.rnn.BasicRNNCell(self.hidden_size)
    elif self.which_cell == 'GRU':
      self.basic_cell = tf.contrib.rnn.GRUCell(self.hidden_size)
    else:
      raise IOError("Specify which type of RNN you want to use: RNN, GRU or LSTM.")
      
    # apply dropout  
    self.cell = tf.contrib.rnn.DropoutWrapper(self.basic_cell, 
                                              output_keep_prob=self.dropout_rate)
    
    # initial state contains all zeros
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)
    
    # output weight matrix and bias
    self.softmax_w = tf.get_variable("softmax_w",
                                     [self.hidden_size, self.vocab_size], 
                                     dtype=tf.float32)
    self.softmax_b = tf.get_variable("softmax_b",
                                     [self.vocab_size], 
                                     dtype=tf.float32)
    
    self.initial_state = self.cell.zero_state(self.batch_size, dtype=tf.float32)
    
    
  def feed_to_network(self):
    '''
    This function feeds the input to the network and returns the output and the state.
   
    '''
    
    # map input indices to continuous input vectors
    inputs = tf.nn.embedding_lookup(self.embedding, self.inputs)

	  # use dropout on the input embeddings
    inputs = tf.nn.dropout(inputs, self.dropout_rate)
    
    state = self.initial_state
    
    # feed inputs to network: outputs = predictions, state = new hidden state
    outputs, state = tf.nn.dynamic_rnn(self.cell, inputs, sequence_length=None, initial_state=state)
    
    output = tf.reshape(tf.concat(outputs, 1), [-1, self.hidden_size])
    
    return output, state
    
  
  def calc_loss(self, output):
    
    # calculate logits
    logits = tf.matmul(output, self.softmax_w) + self.softmax_b
      
    # calculate cross entropy loss
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.reshape(self.targets, [-1]), logits=logits)
      
    # average loss per batch
    avg_loss = tf.reduce_sum(loss) / self.batch_size
    
    return avg_loss
  
  def update_params(self, loss):
    
    # calculate gradients for all trainable variables 
    # + clip them if their global norm > 5 (prevents exploding gradients)
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(loss, tf.trainable_variables()), self.max_grad_norm)
    
    # optimize with stochastic gradient descent
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    
    # update the weights
    self.train_op = optimizer.apply_gradients(
				zip(grads, tf.trainable_variables()))

Create a class that will generate mini-batches from the data.

In [0]:
class batchGenerator(object):
  '''
  This class generates batches for a dataset.
  Input argument:
    data: list of indices (word ids)
  '''
  
  def __init__(self, data):
    '''
    Prepares a dataset.
    '''
  
    data_array = np.array(data)

    len_batch_instance = len(data) / BATCH_SIZE

    data_array = data_array[:BATCH_SIZE*len_batch_instance]

    # divide data in BATCH_SIZE parts
    self.data_reshaped = np.reshape(data_array, (BATCH_SIZE, len_batch_instance))

    # number of mini-batches that can be generated
    self.num_batches_in_data = len_batch_instance / NUM_STEPS - 1
    
    self.curr_idx = 0
  
  def generate(self):
    '''
    Generates
      input_batch: numpy array or None, if the end of the dataset is reached
      target_batch: numpy array or None, if the end of the dataset is reached
      end_reached: boolean, True is end of dataset is reached
    '''
    
    if self.curr_idx >= self.num_batches_in_data:
      return None, None, True

    # input: take slice of size BATCH_
    input_batch = self.data_reshaped[:,self.curr_idx*BATCH_SIZE:self.curr_idx*BATCH_SIZE+BATCH_SIZE]
    
    # target = input shifted 1 time step
    target_batch = self.data_reshaped[:,self.curr_idx*BATCH_SIZE+1:self.curr_idx*BATCH_SIZE+BATCH_SIZE+1]    

    self.curr_idx += 1
    
    return input_batch, target_batch, False
  


Here is an example of how batchGenerator can be used. You will notice that the target batch contains the same indices as the input batch, but shifted one (time) step to the right.

In [77]:
generator = batchGenerator(valid_ids)
input_batch, target_batch, end_reached = generator.generate()
print('This is what an input batch looks like:\n{0}'.format(input_batch))
print('And this is what a target batch looks like:\n{0}'.format(target_batch))

This is what an input batch looks like:
[[   2 1133   94  359    6  330   52 9837    7  327 2477    6    0  663
   389    2    3    1    1 2975]
 [  30   15   10 1540    6   26   44    4    4  626 2039    1  173    2
     3   65   47  584    6  189]
 [   1   95 2469   11  390    1   47  214 9936   20    1   80   26 1950
    67    0    1    5    1  175]
 [ 146 2370   16    2    3  640  748 3382 4740 2785   57  336  562    8
  1120   23    7   13    4   49]
 [  47 3280   11   24 2785   37   55    5    0   62 1379 1557  280    1
    23    0  948    8 6393   37]
 [  15  176   33 1056    6  330  122 1296   28    1 1853    8  133 8666
  9744    2    3   68   24 2670]
 [1094  485    1 3218   94   26  659    6  621   11 1030   30 1371   37
  6552 8336  128   20   39    1]
 [   2    3   36  632 4168   83 3089   96 3254 2369    0  695  904   12
    28  463    5 1768    1    1]
 [ 607  363    2    3    1 2848   10    1 3404  193    7 5443    2    3
   190    1   35    6   26   75]
 [   6  300   3

This is a function that does one pass over the whole dataset. If we are training the model, it will update the parameters and return the perplexity. Otherwise, it will just return the perplexity.

In [0]:
def run_epoch(session, rnn, data, is_training=True):
    '''
    This function runs a single epoch (pass) over the data,
    updating the model parameters if we are training,
    and returns the perplexity.
    Input arguments:
      rnn: object of the rnn_lm class
      data: list of word indices
      is_training: boolean, True is we are training the model
    Returns:
      ppl: float, perplexity of the dataset
    '''
  
    generator = batchGenerator(data)
      
    state = session.run(rnn.initial_state)
    sum_loss = 0.0
    iters = 0
      
    while True:

      input_batch, target_batch, end_reached = generator.generate()
        
      if end_reached:
        break

      feed_dict = {rnn.inputs: input_batch,
                  rnn.targets: target_batch,
                  rnn.initial_state : state}

      fetches = {'loss': rnn.loss,
                'state': rnn.state}
      
      if is_training:
        fetches['train_op'] = rnn.train_op

      result = session.run(fetches, feed_dict)
        
      state = result['state']
      loss = result['loss']

      sum_loss += loss
      iters += NUM_STEPS
        
    ppl = np.exp(sum_loss / iters)
    
    return ppl

This function can be called to build, train and test models with different parameter settings. 

In [0]:
def run_lm():
  '''
  Creates training, validation and/or test models,
  trains, validates and/or tests the model.
  '''
  
  with tf.Graph().as_default():

      with tf.variable_scope("Model"):
        rnn_train = rnn_lm(vocab_size=vocab_size)
      with tf.variable_scope("Model", reuse=True):
        rnn_valid = rnn_lm(vocab_size=vocab_size, is_training=False)
      with tf.variable_scope("Model", reuse=True):
        rnn_test = rnn_lm(vocab_size=vocab_size, is_training=False)
      

      sv = tf.train.Supervisor()

      with sv.managed_session(config=tf.ConfigProto()) as session:
        
        for i in xrange(5):
          
          print('Epoch {0}'.format(i+1))

          train_ppl = run_epoch(session, rnn_train, train_ids)
          print('train_ppl: {0}'.format(train_ppl))

          valid_ppl = run_epoch(session, rnn_valid, valid_ids, is_training=False)
          print('valid_ppl: {0}'.format(valid_ppl))

        test_ppl = run_epoch(session, rnn_test, test_ids, is_training=False)
        print('test_ppl: {0}'.format(test_ppl))
        
        
      
      
        
     

      

In [83]:
run_lm()

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Starting queue runners.
Epoch 1
train_ppl: 486.357779209
valid_ppl: 339.133497603
Epoch 2
train_ppl: 268.892308079
valid_ppl: 298.921601973
Epoch 3
train_ppl: 220.752845014
valid_ppl: 279.741526476
Epoch 4
train_ppl: 196.242662901
valid_ppl: 275.163753012
Epoch 5
train_ppl: 181.286802735
valid_ppl: 272.032202024
test_ppl: 219.384786118
