Basic seq2seq model with Tensorflow
======================

In [2]:
__author__ = 'Nicholas Tomlin'
__version__ = "CSLI Summer 2018 internship"

### Imports 
Tested with Tensorflow 1.8.0. Using the Dense layer for seq2seq inference decoder, which will be described below. We'll need to add the `src/models/` file to our Python path to import the base RNN model.

In [80]:
import numpy as np
import tensorflow as tf
import warnings
import random
from tensorflow.python.layers.core import Dense

In [81]:
import sys
sys.path.append('../src/models/')
from tf_rnn_classifier import TfRNNClassifier

### Basic seq2seq class definition
We build a single graph which includes embeddings, encoding, and two separate decoding functions. One decoding function is used during training, and the other is used for inference (prediction). 

In [120]:
class TfEncoderDecoder(TfRNNClassifier):
    def __init__(self, max_input_length=5, max_output_length=6, num_layers=2, **kwargs):
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
        self.num_layers = num_layers
        super(TfEncoderDecoder, self).__init__(**kwargs)

    def build_graph(self):
        self._define_embedding()
        self._init_placeholders()
        self._init_embedding()
        self.encoding_layer()
        self.decoding_layer()


    def _init_placeholders(self):
        self.encoder_inputs = tf.placeholder(
            shape=[None, None],
            dtype=tf.int32,
            name="encoder_inputs")

        self.encoder_lengths = tf.placeholder(
            shape=[None,],
            dtype=tf.int32,
            name="encoder_lengths")

        self.decoder_inputs= tf.placeholder(
            shape=[None, None],
            dtype=tf.int32,
            name="decoder_inputs")

        self.decoder_targets = tf.placeholder(
            shape=[None, None],
            dtype=tf.int32,
            name="decoder_targets")

        self.decoder_lengths = tf.placeholder(
            shape=[None,],
            dtype=tf.int32,
            name="decoder_lengths")

    def _init_embedding(self):
#         self.embedded_encoder_inputs = tf.nn.embedding_lookup(self.embedding, self.encoder_inputs)
#         self.embedded_decoder_inputs = tf.nn.embedding_lookup(self.embedding, self.decoder_inputs)
        self.embedded_encoder_inputs = tf.contrib.layers.embed_sequence(
            self.encoder_inputs,
            self.vocab_size,
            self.embed_dim)
        decoder_embedding_space = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_dim]))
        self.embedded_decoder_inputs = tf.nn.embedding_lookup(
            decoder_embedding_space,
            self.decoder_inputs)


    def encoding_layer(self):
#         encoder_cell = tf.nn.rnn_cell.LSTMCell(
#             self.hidden_dim, 
#             activation=self.hidden_activation)
        encoder_cell = tf.contrib.rnn.MultiRNNCell([
            tf.nn.rnn_cell.LSTMCell(
            self.hidden_dim, 
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2)) for _ in range(self.num_layers)])
        
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            cell=encoder_cell,
            inputs=self.embedded_encoder_inputs,
            sequence_length=self.encoder_lengths,
            dtype=tf.float32,
            scope="encoding_layer")
        
        self.encoder_final_state = encoder_final_state

    def decoding_layer(self):
        self.decoding_training()
        self.decoding_inference()

    def decoding_training(self):
#         self.decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_dim, activation=self.hidden_activation)
        self.decoder_cell = tf.contrib.rnn.MultiRNNCell([
            tf.nn.rnn_cell.LSTMCell(
            self.hidden_dim, 
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2)) for _ in range(self.num_layers)])
        
        self.output_layer = Dense(
            self.vocab_size,
            kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
        
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=self.embedded_decoder_inputs,
            sequence_length=self.decoder_lengths,
            time_major=False)
        
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            self.decoder_cell,
            training_helper,
            self.encoder_final_state,
            self.output_layer)
        
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            impute_finished=True,
            maximum_iterations=self.max_output_length)[0]
        
        self.training_outputs = training_decoder_output
        self.training_logits = training_decoder_output.rnn_output
#         self.decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_dim, activation=self.hidden_activation)

#         decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
#             self.decoder_cell,
#             self.embedded_decoder_inputs,
#             initial_state=self.encoder_final_state,
#             time_major=True,
#             dtype=tf.float32,
#             scope="decoding_layer")
#         decoder_logits = tf.contrib.layers.linear(decoder_outputs, self.vocab_size)
        
#         self.training_outputs = decoder_outputs
#         self.training_logits = decoder_logits

    def decoding_inference(self):
        start_tokens = tf.tile(
            input=tf.constant([2], dtype=tf.int32), # TODO: don't hardcode start token like this (2)
            multiples=[self.batch_size],
            name='start_tokens')

        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding=self.embedding,
            start_tokens=start_tokens,
            end_token=3) # TODO: don't hardcode end token like this (3)

        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
            self.decoder_cell,
            helper,
            self.encoder_final_state,
            self.output_layer)

        inference_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            inference_decoder,
            impute_finished=True,
            maximum_iterations=self.max_output_length)[0]

        self.inference_decoder_output = inference_decoder_output 
        self.inference_logits = inference_decoder_output.sample_id

    def prepare_output_data(self, y):
        return y
    
    def prepare_data(self, X, max_length):
        new_X = np.zeros((len(X), max_length), dtype='int')
        ex_lengths = []
        index = dict(zip(self.vocab, range(len(self.vocab))))
        unk_index = index['$UNK']
        for i in range(new_X.shape[0]):
            ex_len = min([len(X[i]), max_length])
            ex_lengths.append(max_length)
            vals = X[i][-self.max_length: ]
            vals = [index.get(w, unk_index) for w in vals]
            temp = np.zeros((max_length,), dtype='int')
            temp[0: len(vals)] = vals
            new_X[i] = temp
        return new_X, ex_lengths

    def get_cost_function(self, **kwargs):
        masks = tf.sequence_mask(self.decoder_lengths, self.max_output_length, dtype=tf.float32, name='masks')
        cost = tf.contrib.seq2seq.sequence_loss(
            self.training_logits,
            self.decoder_targets,
            masks)
        return cost
#         return tf.reduce_mean(
#             tf.nn.softmax_cross_entropy_with_logits_v2(
#                 logits=self.training_logits,
#                 labels=tf.one_hot(self.decoder_targets, depth=self.vocab_size, dtype=tf.float32)))

#     def get_optimizer(self):
#         optimizer = tf.train.AdamOptimizer(self.eta)
#         gradients = optimizer.compute_gradients(self.cost)
#         capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
#         return optimizer.apply_gradients(capped_gradients)
# #         return tf.train.GradientDescentOptimizer(
# #             self.eta).minimize(self.cost)

    def predict(self, X):
        X, x_lengths = self.prepare_data(X, self.max_input_length)
        length = X.shape[1]
        X.resize((self.batch_size, length))
        answer_logits = self.sess.run(self.inference_logits, {self.encoder_inputs: X, 
                                      self.decoder_lengths: [6]*self.batch_size, 
                                      self.encoder_lengths: [5]*self.batch_size})[0] 
        return answer_logits

    def train_dict(self, X, y):
        decoder_inputs = [["<GO>"] + list(seq) for seq in y]
        decoder_targets = [list(seq) + ["<EOS>"] for seq in y]

        encoder_inputs, _ = self.prepare_data(X, self.max_input_length)
        decoder_inputs, _ = self.prepare_data(decoder_inputs, self.max_output_length)
        decoder_targets, _ = self.prepare_data(decoder_targets, self.max_output_length)
        encoder_lengths = [len(seq) for seq in encoder_inputs]
        decoder_lengths = [len(seq) for seq in decoder_inputs]
        
        return {self.encoder_inputs: encoder_inputs,
            self.decoder_inputs: decoder_inputs,
            self.decoder_targets: decoder_targets,
            self.encoder_lengths: encoder_lengths,
            self.decoder_lengths: decoder_lengths}

### Simple test dataset
Generate a dataset of "ab" strings that translates "a" to "b" and vice versa. For example:
 * "aaab" -> "bbba"
 * "bb" -> "aa"

Also need to define the vocab set. The superclass `TfModelBase` will take care of preprocessing.

In [121]:
vocab = ['<PAD>', '$UNK', '<GO>', '<EOS>', 'a', 'b']

train = []
for i in range(100):
    input_string = ""
    output_string = ""
    length = random.randint(1,5)
    for char in range(length):
        if (random.random() > 0.5):
            input_string += "a"
            output_string += "b"
        else:
            input_string += "b"
            output_string += "a"
        train.append([np.asarray(list(input_string)), np.asarray(list(output_string))])

In [122]:
train[:5]

[[array(['a'], dtype='<U1'), array(['b'], dtype='<U1')],
 [array(['a', 'a'], dtype='<U1'), array(['b', 'b'], dtype='<U1')],
 [array(['a', 'a', 'a'], dtype='<U1'), array(['b', 'b', 'b'], dtype='<U1')],
 [array(['a', 'a', 'a', 'a'], dtype='<U1'),
  array(['b', 'b', 'b', 'b'], dtype='<U1')],
 [array(['b'], dtype='<U1'), array(['a'], dtype='<U1')]]

In [123]:
test = [[np.asarray(list('ab')), np.asarray(list('ba'))],
        [np.asarray(list('ba')), np.asarray(list('ab'))]]

Now we can instantiate the class and test it:

In [124]:
seq2seq = TfEncoderDecoder(
    vocab=vocab, max_iter=10, max_length=6, eta=0.1)

X, y = zip(*train);
seq2seq.fit(X, y);

Iteration 10: loss: 1.5913816690444946

In [119]:
X_test, _ = zip(*test)
print('\nPredictions:', seq2seq.predict(X_test))


Predictions: [5 4 4 0 0 0]
