In [1]:
import numpy as np
import tensorflow as tf
import Queue, sys

In [2]:
# Don't use all the VRAM!
# GPU variant, kept for reference: caps this process at 15% of GPU memory
# (per_process_gpu_memory_fraction=0.15). Uncomment both lines and drop the
# plain Session below when running on a GPU.
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15)
#sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# We're on CPU!
# Default session with no explicit config; every later cell runs ops and
# restores checkpoints through this notebook-level `sess`.
sess = tf.Session()

In [3]:
# --- Data ---------------------------------------------------------------
# Encoder and decoder sequences are both 16 timesteps long; one sequence is
# processed per batch.
seq_length, out_seq_length = 16, 16
batch_size = 1

# 26 symbols plus index 0, which is reserved for padding.
vocab_size = 26 + 1
embedding_dim = 26

# --- Network ------------------------------------------------------------
# Width of the LSTM memory and number of stacked recurrent layers.
memory_dim = 200
num_layers = 1

# --- Training -----------------------------------------------------------
epochs = 10

First build input placeholders and constants. The `seq2seq` API generally deals with lists of tensors, where each tensor represents a single timestep. An input to an embedding encoder, for example, would be a list of `seq_length` tensors, each of which is of dimension `batch_size` (specifying the embedding indices to input at a particular timestep).

We allocate a `labels` placeholder using the same convention. A `weights` constant specifies cross-entropy weights for each label at each timestep.

In [4]:
# One int32 placeholder per timestep. Each placeholder is a 1-D vector holding
# the symbol index fed at that timestep for every sequence in the batch.
enc_inp = [tf.placeholder(tf.int32, shape=(None,), name="inp%i" % t) for t in range(seq_length)]
labels = [tf.placeholder(tf.int32, shape=(None,), name="labels%i" % t) for t in range(out_seq_length)]

# Per-label cross-entropy weights: all ones, same shape as the labels.
weights = [tf.ones_like(labels_t, dtype=tf.float32) for labels_t in labels]

# Inputs for decoding one step at a time. `use_initial` > 0 selects the
# encoder-derived state and zero attention; otherwise the decoder resumes
# from the supplied state / attention of the previous step.
use_initial = tf.placeholder(tf.int32, shape=[], name="use_initial")
# Width is derived from the config (vocab_size + 1, i.e. the number of decoder
# symbols used when building the graph) instead of a hard-coded 28, so it
# stays in sync with the arrays fed during validation.
supplied_prev  = tf.placeholder(tf.float32, shape=(1, vocab_size + 1), name="supplied_prev")
# NOTE(review): memory_dim*2 presumably matches the LSTM state layout
# (cell state and hidden state concatenated) -- confirm against the TF version.
supplied_state = tf.placeholder(tf.float32, shape=(1, memory_dim*2), name="supplied_state")
supplied_attns = tf.placeholder(tf.float32, shape=(1, memory_dim), name="supplied_attns")

# Decoder input: prepend some "GO" token and drop the final
# token of the decoder output
dec_inp = [tf.placeholder(tf.int32, shape=(None,), name="dec_inp%i" % t) for t in range(out_seq_length)]

# Initial memory value for recurrence.
prev_mem = tf.zeros((batch_size, memory_dim))

Build the sequence-to-sequence graph.

There is a **lot** of complexity hidden in these two calls, and it's certainly worth digging into both in order to really understand how this is working.

In [5]:
# Basic building block: one LSTM layer of width `memory_dim`.
constituent_cell = tf.nn.rnn_cell.BasicLSTMCell(memory_dim)

# Only wrap in a MultiRNNCell when more than one layer is requested;
# otherwise the single LSTM cell is used directly.
cell = (
    tf.nn.rnn_cell.MultiRNNCell([constituent_cell] * num_layers)
    if num_layers > 1
    else constituent_cell
)

# Library call kept for reference (no teacher forcing, with attention):
#ntf_dec_outputs, ntf_dec_memory = tf.nn.seq2seq.embedding_attention_seq2seq(enc_inp, dec_inp, cell, vocab_size+1, vocab_size+1, embedding_dim, feed_previous=True)



In [6]:
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import rnn_cell, rnn, nn_ops, math_ops, embedding_ops, array_ops

# Short alias for the underscore-prefixed (private) `_linear` helper of
# rnn_cell. It is called below as linear(inputs, output_size, True).
# NOTE(review): private API -- may move or disappear in other TF versions.
linear = rnn_cell._linear

In [7]:
def _extract_argmax_and_embed(embedding, output_projection=None,
                              update_embedding=True):
  """Get a loop_function that extracts the previous symbol and embeds it.
  Args:
    embedding: embedding tensor for symbols.
    output_projection: None or a pair (W, B). If provided, each fed previous
      output will first be multiplied by W and added B.
    update_embedding: Boolean; if False, the gradients will not propagate
      through the embeddings.
  Returns:
    A loop function.
  """
  def loop_function(prev, _):
    if output_projection is not None:
      prev = nn_ops.xw_plus_b(
          prev, output_projection[0], output_projection[1])
    prev_symbol = math_ops.argmax(prev, 1)
    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
      emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
  return loop_function

In [8]:
# Hand-expanded single-step version of the embedding-attention seq2seq graph.
# The encoder runs over the whole input sequence; the decoder is built for
# exactly ONE timestep, whose starting state and attention are selected at run
# time through `use_initial` (encoder state / zeros vs. the supplied_* feeds).
# Scope names and statement order determine the variable names, which have to
# match the checkpoint restored later, so do not reorder casually.
with variable_scope.variable_scope("embedding_attention_seq2seq"):
    num_encoder_symbols = vocab_size + 1
    num_decoder_symbols = vocab_size + 1
    embedding_size      = embedding_dim

    # Encoder.
    # The embedding wrapper turns integer symbol ids into embeddings before
    # they reach the recurrent cell; rnn.rnn unrolls it over the enc_inp list.
    encoder_cell = rnn_cell.EmbeddingWrapper(cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size)
    encoder_outputs, encoder_state = rnn.rnn(encoder_cell, enc_inp, dtype=dtypes.float32)

    # First calculate a concatenation of encoder outputs to put attention on.
    # Each output is reshaped to [-1, 1, cell.output_size] and the pieces are
    # joined along axis 1 (this concat call takes the axis as first argument).
    top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs]
    attention_states = array_ops.concat(1, top_states)

    # Decoder.
    # NOTE: this rebinds the notebook-level name `cell`; re-running this cell
    # without re-creating `cell` first would wrap the already wrapped cell.
    cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
        
    with variable_scope.variable_scope("embedding_attention_decoder"):
        embedding = variable_scope.get_variable("embedding", [num_decoder_symbols, embedding_size])
        # loop_function is only referenced by the commented-out line further
        # down; the active graph always takes its input from dec_inp[0].
        loop_function = _extract_argmax_and_embed(embedding, None, True)
        emb_inp = [embedding_ops.embedding_lookup(embedding, i) for i in dec_inp]
        
        decoder_inputs = emb_inp
        initial_state = encoder_state
        # Output size of the projection-wrapped cell.
        # NOTE(review): presumably equals num_decoder_symbols -- confirm.
        output_size = cell.output_size
        
        # Attention Decoder
        with variable_scope.variable_scope("attention_decoder"):
            batch_size_ = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
            # Static sizes of axis 1 and axis 2 of attention_states.
            attn_length = attention_states.get_shape()[1].value
            attn_size = attention_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
            hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])
            hidden_features = []
            v = []
            attention_vec_size = attn_size  # Size of query vectors for attention.
            
            # The loop runs exactly once (a single attention read), creating
            # the variables AttnW_0 and AttnV_0.
            for a in xrange(1):
                k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
                hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

            # Starting state of this decoding step: the encoder's final state
            # when use_initial > 0, otherwise the state fed via supplied_state.
            state = tf.cond(use_initial > 0, lambda: initial_state, lambda: supplied_state)

            def attention(query):
                """Put attention masks on hidden using hidden_features and query.

                Returns a list with one attention-weighted read of `hidden`,
                reshaped to [-1, attn_size].
                """
                ds = []  # Results of attention reads will be stored here.
                
                for a in xrange(1):
                    with variable_scope.variable_scope("Attention_%d" % a):
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                        # Rebinds the loop variable `a` to the mask tensor;
                        # harmless only because the loop runs a single time.
                        a = nn_ops.softmax(s)
                        
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
                        
                return ds

            outputs = []
            prev = None
            batch_attn_size = array_ops.pack([batch_size_, attn_size])
            # Starting attention vector: zeros when use_initial > 0, otherwise
            # the value fed via supplied_attns.
            attns = [tf.cond(use_initial > 0, lambda: array_ops.zeros(batch_attn_size, dtype=dtypes.float32), lambda: supplied_attns) for _ in xrange(1)]
            
            for a in attns:  # Ensure the second shape of attention vectors is set.
                a.set_shape([None, attn_size])
            
            # This scope only wraps a plain Python assignment now; the
            # alternative that fed back the previous output is commented out.
            with variable_scope.variable_scope("loop_function", reuse=True):
                #inp = tf.cond(use_initial > 0, lambda: decoder_inputs[0], lambda: loop_function(supplied_prev, 0))
                inp = decoder_inputs[0]
                
            input_size = inp.get_shape().with_rank(2)[1]
            
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)
                
            # Merge the embedded input with the attention vector into a single
            # tensor of width input_size.
            x = linear([inp] + attns, input_size, True)
            
            # Run the RNN.
            # Exactly one cell step; `state` is rebound to the new state.
            cell_output, state = cell(x, state)
            
            # Run the attention mechanism.
            attns = attention(state)
            
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
                
        # Tensors fetched by the decoding loop: this step's output, the new
        # state and the new attention read.
        ntf_dec_outputs, ntf_dec_state, ntf_dec_attns = output, state, attns[0]

In [9]:
# Saver covering every variable currently in the graph; max_to_keep=5 limits
# how many checkpoint files it retains. It must be created after the graph is
# built so that tf.all_variables() sees the model's variables.
saver = tf.train.Saver(tf.all_variables(), max_to_keep=5)

# Restore variables
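Optionally restore variables from a saved checkpoint: set `resume_at` to the number of the checkpoint to load, or to `0` to start from freshly initialized variables.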

In [10]:
# Number of the checkpoint to resume from; 0 means "do not restore".
resume_at = 650

if resume_at > 0:
    # Checkpoint files are named by suffixing the number to a fixed prefix.
    checkpoint_path = 'checkpoints/saved-model-1off-attn-' + str(resume_at)
    saver.restore(sess, checkpoint_path)

# Train

Do not initialize variables if restoring from a saved file.  
**Warning:** epoch numbers start from 0, and *will* overwrite your old saves!

In [11]:
# Initialize from scratch whenever no checkpoint was restored. The restore
# cell only loads a checkpoint for resume_at > 0, so this condition is its
# exact complement: a negative value now initializes instead of silently
# leaving the variables uninitialized.
if resume_at <= 0:
    sess.run(tf.initialize_all_variables())

In [12]:
# Training split: inputs (train_x) and their targets (train_y), which must
# contain the same number of examples.
train_x, train_y = np.load('data/mutated-train.npy'), np.load('data/fixes-train.npy')

assert len(train_x) == len(train_y)
num_train = len(train_x)
print(num_train)

# Validation split, loaded and checked the same way.
valid_x, valid_y = np.load('data/mutated-validation.npy'), np.load('data/fixes-validation.npy')

assert len(valid_x) == len(valid_y)
num_validation = len(valid_x)
print(num_validation)

2080
640


In [80]:
def _decode_prefix(feed_dict, prefix):
    """Replay `prefix` through the one-step decoder and return the next logits.

    Args:
        feed_dict: feeds for the encoder input placeholders; it is copied, not
            modified.
        prefix: list of symbol ids already chosen for this hypothesis.

    Returns:
        1-D array with the decoder's output scores for the symbol that follows
        `prefix` (first row of the fetched output).
    """
    step_feed = dict(feed_dict)

    # First step: start from the encoder state and feed symbol 0 as the
    # initial decoder input. The supplied_* values are ignored on this branch
    # but are still fed, as in the original code.
    step_feed.update({
        use_initial: 1,
        dec_inp[0]: [0],
        supplied_state: np.zeros((1, memory_dim * 2)),
        supplied_prev: np.zeros((1, vocab_size + 1)),
        supplied_attns: np.zeros((1, memory_dim)),
    })
    fetches = (ntf_dec_outputs, ntf_dec_state, ntf_dec_attns)
    step_output, step_state, step_attns = sess.run(fetches, step_feed)

    # Later steps: resume from the previous step's state and attention, and
    # feed the symbol this hypothesis chose at the previous position.
    for symbol in prefix:
        step_feed.update({
            use_initial: 0,
            supplied_state: step_state,
            supplied_prev: step_output,
            supplied_attns: step_attns,
            dec_inp[0]: [symbol],
        })
        step_output, step_state, step_attns = sess.run(fetches, step_feed)

    return step_output[0]


def validate_batch(batch_id, beam_width=30, branch_factor=5):
    """Beam-search decode one validation batch.

    Fixes relative to the previous version:
      * every hypothesis now carries its own symbol sequence (before, a single
        array was shared and overwritten by all beam entries, and it recorded
        the per-step argmax instead of the symbols the beam actually chose);
      * hypotheses are scored with a numerically stable log-softmax of the
        decoder output rather than a min-shifted linear normalization, which
        assigned log(0) to the smallest score and divided by zero when all
        scores were equal.

    Args:
        batch_id: index of the batch inside valid_x / valid_y.
        beam_width: maximum number of hypotheses kept per step (was a
            hard-coded 30).
        branch_factor: number of best next symbols expanded per hypothesis
            (was a hard-coded 5).

    Returns:
        (X, Y, Y_hats): X is the input batch transposed to
        seq_length x batch_size, Y is the target sequence of the first example
        in the batch, and Y_hats is a list of decoded sequences (numpy arrays
        of length out_seq_length), best-scoring first.

    NOTE: the decoder placeholders have a fixed leading dimension of 1, so
    only batch_size == 1 is supported.
    """
    X = valid_x[batch_id*batch_size:(batch_id+1)*batch_size]
    Y = valid_y[batch_id*batch_size:(batch_id+1)*batch_size]

    # Dimshuffle to seq_len * batch_size
    X = np.array(X).T
    Y = np.array(Y).T

    feed_dict = {enc_inp[t]: X[t] for t in range(seq_length)}

    def prune(entries):
        # Keep the beam_width hypotheses with the highest log-probability.
        return sorted(entries, key=lambda entry: entry[0], reverse=True)[:beam_width]

    # Each beam entry is (cumulative log-probability, chosen symbols so far).
    beam = [(0.0, [])]

    # Do it one symbol at a time
    for _ in range(out_seq_length):
        candidates = []

        for score, prefix in prune(beam):
            logits = _decode_prefix(feed_dict, prefix)

            # Stable log-softmax: shift by the maximum before exponentiating.
            shifted = logits - np.max(logits)
            log_probs = shifted - np.log(np.sum(np.exp(shifted)))

            k = min(branch_factor, len(log_probs))
            for best in np.argpartition(log_probs, -k)[-k:]:
                candidates.append((score + log_probs[best], prefix + [int(best)]))

        beam = candidates

    Y_hats = [np.array(symbols) for _, symbols in prune(beam)]

    return X, Y.T[0], Y_hats

# Full Sequence Accuracy

In [81]:
# Count a validation example as correct when ANY of the decoded candidates
# reproduces the target sequence exactly.
accurate = total = 0

for i in [64, 412, 571]:
    X, Y, Y_hats = validate_batch(i)

    if any(np.array_equal(Y, candidate) for candidate in Y_hats):
        accurate += 1

    total += batch_size
    print("%d/%d" % (accurate, total))

0/1
0/2
0/3


In [82]:
# Fraction of evaluated examples decoded exactly; the float conversion forces
# true division under Python 2.
accurate / float(total)

0.0