[View in Colaboratory](https://colab.research.google.com/github/lverwimp/RNN_language_modeling/blob/master/rnn_lms.ipynb)

Imports:

In [0]:
import tensorflow as tf
import numpy as np
import urllib, collections

Enable eager execution in TensorFlow:

In [0]:
tf.enable_eager_execution()

Some global variables:

In [0]:
BATCH_SIZE = 20
NUM_STEPS = 20

Get training, validation and test data:

In [0]:
train_url = 'http://homes.esat.kuleuven.be/~lverwimp/course_speech_recognition/train.txt'
valid_url = 'http://homes.esat.kuleuven.be/~lverwimp/course_speech_recognition/valid.txt'
test_url = 'http://homes.esat.kuleuven.be/~lverwimp/course_speech_recognition/test.txt'
train_file = urllib.urlopen(train_url).read()
valid_file = urllib.urlopen(valid_url).read()
test_file = urllib.urlopen(test_url).read()

The data looks like this:

In [107]:
print('{0}...'.format(valid_file[:500]))

 consumers may want to move their telephones a little closer to the tv set 
 <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> 
 two weeks ago viewers of several nbc <unk> consumer segments started calling a N number for advice on various <unk> issues 
 and the new syndicated reality show hard copy records viewers ' opinions for possible airing on the next day 's show 
 interactive telephone technology...


<unk\> is a symbol for the unknown words class, 'N' is a symbol used for the numbers class.
  
Convert data to correct format:

In [0]:
# convert the string to a list and replace newlines with the end-of-sentence symbol
train_text = [w for w in train_file.replace('\n',' <eos>').split(' ')]
valid_text = [w for w in valid_file.replace('\n',' <eos>').split(' ')]
test_text = [w for w in test_file.replace('\n',' <eos>').split(' ')]

# count the frequencies of the words in the training data
counter = collections.Counter(train_text)

# sort according to decreasing frequency
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

# words = list of all the words (in decreasing frequency)
items, _ = list(zip(*count_pairs))

# make a dictionary with a mapping from each word to an id; word with highest frequency gets lowest id etc.
item_to_id = dict(zip(items, range(len(items))))

# convert the words to indices
train_ids = [item_to_id[item] for item in train_text]
valid_ids = [item_to_id[item] for item in valid_text]
test_ids = [item_to_id[item] for item in test_text]

Once the data is converted to ids, it looks like this:

In [109]:
print(valid_ids[:100])

[2, 1133, 94, 359, 6, 330, 52, 9837, 7, 327, 2477, 6, 0, 663, 389, 2, 3, 1, 1, 2975, 2159, 10, 382, 1069, 2348, 90, 100, 848, 199, 1, 12, 0, 3384, 1120, 8, 4, 73, 21, 212, 347, 37, 259, 1, 1, 2, 3, 76, 423, 196, 3918, 5, 250, 1796, 1, 581, 3529, 893, 2375, 7, 4, 298, 12, 2710, 17, 1187, 1, 251, 2, 3, 9, 0, 36, 9923, 3748, 465, 711, 2999, 2038, 3918, 135, 6146, 12, 495, 5895, 17, 0, 131, 273, 10, 465, 2, 3, 9959, 733, 504, 31, 642, 7, 36, 6499]


Class for the language model:

In [0]:
class rnn_lm(object):
  '''
  This is a class to build and execute a recurrent neural network language model.
  '''
  
  def __init__(self,
              cell='LSTM',
              vocab_size=10000,
              embedding_size=64,
              hidden_size=128,
              dropout_rate=0.5):
    self.which_cell = cell
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.batch_size = BATCH_SIZE
    self.max_grad_norm = 5
    self.lr = 1
    
  def build_training_graph(self):
    '''
    This function builds the training graph.
    '''
    
    # input embedding weights
    self.embedding = tf.get_variable("embedding", 
                                     [self.vocab_size, self.embedding_size], 
                                     dtype=tf.float32)
    
    # hidden layer
    if self.which_cell == 'LSTM':
      self.cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_size)
    elif self.which_cell == 'RNN':
      self.cell = tf.contrib.rnn.BasicRNNCell(self.hidden_size)
    elif self.which_cell == 'GRU':
      self.cell = tf.contrib.rnn.GRUCell(self.hidden_size)
    else:
      raise IOError("Specify which type of RNN you want to use: RNN, GRU or LSTM.")
      
    # apply dropout  
    self.cell = tf.contrib.rnn.DropoutWrapper(self.cell, 
                                              output_keep_prob=self.dropout_rate)
    
    # initial state contains all zeros
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)
    
    # output weight matrix and bias
    self.softmax_w = tf.get_variable("softmax_w",
                                     [self.hidden_size, self.vocab_size], 
                                     dtype=tf.float32)
    self.softmax_b = tf.get_variable("softmax_b",
                                     [self.vocab_size], 
                                     dtype=tf.float32)
    
    return self.initial_state
    
  def train_model(self, inputs, targets, state):
    
    # Step 1: feed the inputs to model and calculate output logits
    output, state = self.feed_to_network(inputs, state)
    
    # Step 2: calculate loss
    self.calc_loss(output, targets)
    
    # Step 3: calculate gradients and update the parameters of the network
    #self.update_params(loss)
    
    return state
    
    
  def feed_to_network(self, inputs, state):
    
    # map input indices to continuous input vectors
    inputs = tf.nn.embedding_lookup(self.embedding, inputs)

	  # use dropout on the input embeddings
    inputs = tf.nn.dropout(inputs, self.dropout_rate)
    
    # feed inputs to network: outputs = predictions, state = new hidden state
    outputs, state = tf.nn.dynamic_rnn(self.cell, inputs, sequence_length=None, initial_state=state)
    
    output = tf.reshape(tf.concat(outputs, 1), [-1, self.hidden_size])
    
    # calculate logits
    #logits = tf.matmul(output, self.softmax_w) + self.softmax_b
    
    return output, state
  
  def calc_loss(self, output, targets):
    
    with tf.GradientTape() as tape:
      logits = tf.matmul(output, self.softmax_w) + self.softmax_b
      
      # calculate cross entropy loss
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.reshape(targets, [-1]), logits=logits)
      
      # average loss per batch
      avg_loss = tf.reduce_sum(loss) / self.batch_size
    
    grads = tape.gradient(avg_loss, tf.trainable_variables())
    print('grads: {0}'.format(grads))
    print(tf.trainable_variables())
    
    grads = tf.clip_by_global_norm(grads, self.max_grad_norm)
    print('grads: {0}'.format(grads))
    
    # optimize with stochastic gradient descent
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    
    optimizer.apply_gradients(
				zip(grads, tf.trainable_variables()))
    
    #return avg_loss
  
  def update_params(self, cost):
    
    # calculte gradients for all trainable variables 
    # + clip them if their global norm > 5 (prevents exploding gradients)
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(cost, tf.trainable_variables()), self.max_grad_norm)
    
    # optimize with stochastic gradient descent
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    
    optimizer.apply_gradients(
				zip(grads, tf.trainable_variables()))

Create network:

In [0]:
def create_batches(data, vocab_size, rnn, state):
  
  data_array = np.array(data)
  
  len_batch_instance = vocab_size / BATCH_SIZE
  
  data_array = data_array[:BATCH_SIZE*len_batch_instance]
  
  # divide data in BATCH_SIZE parts
  data_reshaped = np.reshape(data_array, (BATCH_SIZE, len_batch_instance))
  
  # number of mini-batches that can be generated
  num_batches_in_data = len_batch_instance / NUM_STEPS

  # generate mini-batches
  for i in range(num_batches_in_data):
    input_batch = data_reshaped[:,i*BATCH_SIZE:i*BATCH_SIZE+BATCH_SIZE]
    # target = input shifted 1 time step
    target_batch = data_reshaped[:,i*BATCH_SIZE+1:i*BATCH_SIZE+BATCH_SIZE+1]
    
    state = rnn.train_model(input_batch, target_batch, state)
  


In [134]:
rnn = rnn_lm()

init_state = rnn.build_training_graph()

create_batches(train_ids, len(item_to_id), rnn, init_state)

grads: []
[]
grads: ([], <tf.Tensor: id=10808, shape=(), dtype=float32, numpy=0.0>)


ValueError: ignored