# Hierarchical RNN for dialogue
The baseline hierarchical model from "Hierarchical Text Generation and Planning for Strategic Dialogue."

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
import random
import json
from tensorflow.python.layers.core import Dense

In [2]:
import sys
sys.path.append("../src/models/")
sys.path.append('../src/models/agents/')
sys.path.append('../src/data/')
from agent import Agent
from parse import SentenceParser

In [3]:
# Hyperparameters and preprocessing settings shared by the cells below.
train_iterations = 300   # handed to the agent constructors as max_iter
learning_rate = 0.1      # handed to the agent constructors as eta
max_input_length = 6     # handed to the agent constructors as max_input_length
max_output_length = 20   # handed to the agent constructors as max_output_length
unk_threshold = 20       # handed to SentenceParser as unk_threshold

### Sentence-level parsing
The final action data is ignored for now; each training example is a list of utterances.

In [4]:
# Configure the sentence-level parser with the raw-data input directory and a
# scratch output directory, then report the vocabulary size it exposes.
parser = SentenceParser(
    unk_threshold=unk_threshold,
    input_directory="../data/raw/",
    output_directory="../data/tmp/",
)
print("Vocab size: {}".format(parser.vocab_size))

Vocab size: 502


In [5]:
# Run the parser configured above.
# NOTE(review): presumably reads from input_directory and writes the parsed
# examples under output_directory -- confirm in parse.py.
parser.parse()

### Hierarchical Agent

In [12]:
class HierarchicalAgent(Agent):
    """Hierarchical encoder -> context -> decoder dialogue model (TF 1.x graph mode).

    build_graph() wires three recurrent stages:

    1. encoding_layer: a multi-layer LSTM run over every leading-axis slice of
       the embedded encoder inputs; only the top layer's final hidden state is
       kept.
    2. context_layer: a second multi-layer LSTM run over those final states.
    3. decoding_layer: a teacher-forced multi-layer GRU decoder seeded with
       the context outputs, followed by a dense projection to the vocabulary.

    The attributes hidden_dim, num_layers, batch_size, vocab_size,
    max_output_length, embedded_encoder_inputs and embedded_decoder_inputs, as
    well as _define_embedding(), are not defined in this class; they are
    expected to be provided by the Agent base class.
    """

    # Sequence length handed to the context RNN for every batch element.
    # This used to be a literal 10 inside context_layer(); override it on a
    # subclass or an instance to use a different number of turns.
    max_turns = 10

    def _init_placeholders(self):
        """Create the int32 placeholders that are fed at run time.

        The token placeholders (encoder_inputs, decoder_inputs,
        decoder_targets) are rank 3 and the length placeholders
        (encoder_lengths, decoder_lengths) are rank 2, all with fully dynamic
        dimensions.
        NOTE(review): the layout is assumed to be (batch, turn, token) and
        (batch, turn) respectively -- confirm against the feeding code in Agent.
        """
        self.encoder_inputs = tf.placeholder(
            shape=[None, None, None],
            dtype=tf.int32,
            name="encoder_inputs")

        self.encoder_lengths = tf.placeholder(
            shape=[None, None],
            dtype=tf.int32,
            name="encoder_lengths")

        self.decoder_inputs = tf.placeholder(
            shape=[None, None, None],
            dtype=tf.int32,
            name="decoder_inputs")

        self.decoder_targets = tf.placeholder(
            shape=[None, None, None],
            dtype=tf.int32,
            name="decoder_targets")

        self.decoder_lengths = tf.placeholder(
            shape=[None, None],
            dtype=tf.int32,
            name="decoder_lengths")

    def encoding_layer(self):
        """Build the utterance encoder and apply it to every leading-axis slice.

        Sets encoder_cell and encoder_final_states; the latter stacks the
        results of encode_step() over the leading axis of the inputs.
        """
        # reuse=tf.AUTO_REUSE because encode_step() invokes this cell from
        # inside tf.map_fn.
        self.encoder_cell = tf.contrib.rnn.MultiRNNCell([
            tf.nn.rnn_cell.LSTMCell(
                self.hidden_dim,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=1),
                reuse=tf.AUTO_REUSE)
            for _ in range(self.num_layers)])

        self.encoder_final_states = tf.map_fn(
            fn=self.encode_step,
            elems=(self.embedded_encoder_inputs, self.encoder_lengths),
            dtype=tf.float32)

    def encode_step(self, args):
        """Encode one leading-axis slice with the shared encoder cell.

        Args:
            args: pair (embedded inputs, sequence lengths) for one slice, as
                delivered by tf.map_fn.

        Returns:
            The hidden state (index 1 of the state pair) of the last LSTM
            layer's final state.
        """
        embedded_encoder_inputs, encoder_lengths = args[0], args[1]

        _, encoder_final_state = tf.nn.dynamic_rnn(
            cell=self.encoder_cell,
            inputs=embedded_encoder_inputs,
            sequence_length=encoder_lengths,
            dtype=tf.float32)

        # [-1] selects the top layer, [1] the hidden part of its (c, h) state.
        return encoder_final_state[-1][1]

    def context_layer(self):
        """Run the context LSTM over the encoder's final states.

        Sets context_cell and context_outputs (the per-step outputs of the
        context RNN). Every batch element is given sequence length max_turns.
        """
        self.context_cell = tf.contrib.rnn.MultiRNNCell([
            tf.nn.rnn_cell.LSTMCell(
                self.hidden_dim,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            for _ in range(self.num_layers)])

        context_outputs, _ = tf.nn.dynamic_rnn(
            cell=self.context_cell,
            inputs=self.encoder_final_states,
            sequence_length=[self.max_turns] * self.batch_size,
            dtype=tf.float32,
            scope="context_layer")

        self.context_outputs = context_outputs

    def decoding_layer(self):
        """Create the GRU decoder cell and vocabulary projection, then build
        the training-time decoding graph."""
        self.decoder_cell = tf.contrib.rnn.MultiRNNCell([
            tf.nn.rnn_cell.GRUCell(self.hidden_dim) for _ in range(self.num_layers)])

        self.output_layer = Dense(
            units=self.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

        self.decoding_training()
        # The inference-time decoding path is currently disabled:
        # self.decoding_inference()

    def decoding_training(self):
        """Set decoding_logits by running decoding_training_step() on every
        leading-axis slice of the context outputs, decoder inputs and lengths."""
        self.decoding_logits = tf.map_fn(
            fn=self.decoding_training_step,
            elems=(self.context_outputs, self.embedded_decoder_inputs, self.decoder_lengths),
            dtype=tf.float32)

    def decoding_training_step(self, args):
        """Teacher-forced decoding for one leading-axis slice.

        Args:
            args: triple (context state, embedded decoder inputs, decoder
                lengths) for one slice, as delivered by tf.map_fn.

        Returns:
            The decoder's rnn_output after the dense projection, i.e. the
            vocabulary logits.

        NOTE(review): the time dimension of the result follows the longest
        length in the slice (capped at max_output_length), while tf.map_fn
        needs every slice to return the same shape -- feeds probably have to
        be padded consistently; confirm.
        """
        context_state, embedded_decoder_inputs, decoder_lengths = args[0], args[1], args[2]

        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=embedded_decoder_inputs,
            sequence_length=decoder_lengths,
            time_major=False)

        # MultiRNNCell expects one state per stacked cell. Seeding every layer
        # with the context state matches the former hard-coded pair when
        # num_layers == 2 and also works for any other depth.
        initial_state = tuple(context_state for _ in range(self.num_layers))

        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=training_helper,
            initial_state=initial_state,
            output_layer=self.output_layer)

        training_outputs = tf.contrib.seq2seq.dynamic_decode(
            decoder=training_decoder,
            impute_finished=True,
            maximum_iterations=self.max_output_length)[0]

        return training_outputs.rnn_output  # logits

    def build_graph(self):
        """Assemble the graph: placeholders, embedding, encoder, context and
        decoder, in that order."""
        self._init_placeholders()
        self._define_embedding()
        self.encoding_layer()
        self.context_layer()
        self.decoding_layer()

In [13]:
# Clear the default graph first so that re-running this cell starts from an
# empty graph, then create the agent from the notebook's hyperparameters.
tf.reset_default_graph()
a = HierarchicalAgent(
    vocab=parser.vocab,
    max_iter=train_iterations,
    eta=learning_rate,
    max_input_length=max_input_length,
    max_output_length=max_output_length,
    hidden_dim=64,
)

In [14]:
# Construct the TensorFlow graph for agent `a` (placeholders, embedding,
# encoder, context and decoder layers).
a.build_graph()

## External RNN code
Code adapted from a TensorFlow implementation (repository linked below) of "A Hierarchical Recurrent Encoder-Decoder for Generative Context-Aware Query Suggestion"
by Sordoni et al. (2015).

Paper: https://arxiv.org/abs/1507.02221
Repository: https://github.com/tscheepers/hred-attention-tensorflow

In [9]:
sys.path.append("../src/models/external/")
import layers

In [10]:
class HierarchicalAgent(Agent):
    """HRED-style agent assembled from the external ``layers`` helpers.

    Each recurrent stage is a ``tf.scan`` over the leading axis of its inputs
    that steps one of the ``layers.gru_layer_*`` functions. Those helpers live
    outside this notebook, so their exact semantics are not described here.

    NOTE(review): an earlier cell of this notebook defines a class with the
    same name; running this cell rebinds ``HierarchicalAgent`` to this
    implementation.
    """

    def encoding_layer(self):
        """Build the "<eos>" mask and the sentence-level encoder.

        Sets ``eoq_mask`` (1.0 where the input token differs from "<eos>",
        0.0 where it equals it, with an extra axis inserted at position 2),
        ``sentence_encoder`` (the packed scan output) and
        ``encoder_final_states`` (the first of the two tensors obtained by
        unstacking that output along axis 1).
        """
        is_not_eos = tf.not_equal(self.encoder_inputs, self.vocab.index("<eos>"))
        self.eoq_mask = tf.expand_dims(tf.cast(is_not_eos, tf.float32), 2)

        def sentence_step(carry, step_input):
            # Only the second slot of the packed carry is fed back into the layer.
            return layers.gru_layer_with_reset(
                carry[1],
                step_input,
                name='sentence_encoder',
                x_dim=self.embed_dim,
                y_dim=self.hidden_dim,
                reuse=tf.AUTO_REUSE)

        packed_zero_state = tf.zeros((2, self.batch_size, self.hidden_dim))
        self.sentence_encoder = tf.scan(
            sentence_step,
            (self.embedded_encoder_inputs, self.eoq_mask),
            initializer=packed_zero_state)

        first_slot, _ = tf.unstack(self.sentence_encoder, axis=1)
        self.encoder_final_states = first_slot

    def context_layer(self):
        """Build the context-level encoder on top of the sentence encoder.

        Sets ``context_final_states`` to the first of the two tensors obtained
        by unstacking the packed scan output along axis 1.
        """
        def context_step(carry, step_input):
            # As in the sentence encoder, only carry[1] is passed on.
            return layers.gru_layer_with_retain(
                carry[1],
                step_input,
                name='context_encoder',
                x_dim=self.hidden_dim,
                y_dim=self.context_dim,
                reuse=tf.AUTO_REUSE)

        packed_zero_state = tf.zeros((2, self.batch_size, self.context_dim))
        packed_context = tf.scan(
            context_step,
            (self.encoder_final_states, self.eoq_mask),
            initializer=packed_zero_state)

        first_slot, _ = tf.unstack(packed_context, axis=1)
        self.context_final_states = first_slot

    def decoding_layer(self):
        """Build the decoder and the training logits.

        The decoder scans over the embedded encoder inputs, the "<eos>" mask
        and the context states. Its output and the embeddings are flattened to
        two dimensions, passed through ``layers.output_layer`` and
        ``layers.logits_layer``, and the result is reshaped to
        (steps, batch_size, vocab_size) and stored in ``training_logits``.
        """
        # The number of scan steps is the dynamic size of the inputs' leading axis.
        num_of_steps = tf.shape(self.encoder_inputs)[0]

        def decoder_step(carry, step_input):
            # Here the whole carry is passed on; it is not a packed pair.
            return layers.gru_layer_with_state_reset(
                carry,
                step_input,
                name='decoder',
                x_dim=self.embed_dim,
                h_dim=self.context_dim,
                y_dim=self.hidden_dim,
                reuse=tf.AUTO_REUSE)

        zero_state = tf.zeros((self.batch_size, self.hidden_dim))
        decoder = tf.scan(
            decoder_step,
            (self.embedded_encoder_inputs, self.eoq_mask, self.context_final_states),
            initializer=zero_state)

        flat_states = tf.reshape(decoder, (-1, self.hidden_dim))
        flat_embeddings = tf.reshape(self.embedded_encoder_inputs, (-1, self.embed_dim))

        projected = layers.output_layer(
            flat_embeddings,
            flat_states,
            x_dim=self.embed_dim,
            h_dim=self.hidden_dim,
            y_dim=self.vocab_size,
            reuse=tf.AUTO_REUSE)

        flat_logits = layers.logits_layer(
            projected,
            x_dim=self.vocab_size,
            y_dim=self.vocab_size,
            reuse=tf.AUTO_REUSE)

        logits_shape = (num_of_steps, self.batch_size, self.vocab_size)
        self.training_logits = tf.reshape(flat_logits, logits_shape)

    def build_graph(self):
        """Assemble the graph: placeholders, embedding, encoder, context and
        decoder, in that order."""
        # Size of the context encoder state (hard-coded here).
        self.context_dim = 101

        build_steps = (
            self._init_placeholders,
            self._define_embedding,
            self.encoding_layer,
            self.context_layer,
            self.decoding_layer,
        )
        for build_step in build_steps:
            build_step()

In [11]:
# Create agent `b` from the most recently defined HierarchicalAgent class,
# using the notebook's hyperparameters.
b = HierarchicalAgent(
    vocab=parser.vocab,
    max_iter=train_iterations,
    eta=learning_rate,
    max_input_length=max_input_length,
    max_output_length=max_output_length,
    hidden_dim=64,
)

In [12]:
# b.build_graph()

### Model training

In [13]:
# Read the training file one JSON document per line and keep, for each example,
# its "input" field together with the whitespace-split tokens of its first output.
train_data = []
with open("../data/processed/train.txt", "r") as train_file:
    for line in train_file:
        train_example = json.loads(line)
        example_input = train_example["input"]
        first_output_tokens = train_example["output"][0].split()
        train_data.append((example_input, first_output_tokens))

In [16]:
# Transpose the (input, tokens) pairs into two parallel tuples. The token
# sequences are then reused as the model inputs too, so X and y refer to the
# same tuple and the parsed "input" fields are not used.
_parsed_inputs, y = zip(*train_data)
X = y

In [17]:
# Fit agent `b` on (X, y); save_path is presumably where Agent.fit writes the
# trained model -- confirm.
# NOTE(review): as recorded in this cell's output, the call currently raises
# "ValueError: setting an array element with a sequence.". Things to check:
# the b.build_graph() call in the cell above is commented out, and X / y hold
# the raw token lists produced by str.split().
b.fit(X, y, save_path="../models/tmp")

ValueError: setting an array element with a sequence.