In [None]:
import numpy as np
import tensorflow as tf#r.12
import pandas as pd

### Loading toy data (vocabulary size 20)

In [None]:
# Each line of the data files is parsed as one sequence of whitespace-separated
# integer token ids. `with` closes the file handles as soon as parsing is done
# (the original left both files open for the lifetime of the kernel).
with open("data/s2s_sources.txt", 'r') as source_file:
    source = [[int(word) for word in line.split()] for line in source_file]

with open("data/s2s_targets.txt", 'r') as target_file:
    target = [[int(word) for word in line.split()] for line in target_file]

### Hyper Parameters

In [None]:
# Special token ids; they sit directly after the data token ids
# (presumably 0-19 -- verify against the data files).
EOS, GO, PAD = 20, 21, 22

# 20 data tokens + the 3 special tokens above.
ENCODER_VOCABSIZE = DECODER_VOCABSIZE = 20 + 3

# The decoder is twice as wide as the encoder because its initial state is the
# concatenation of the forward and backward encoder states.
ENCODER_CELLSIZE = 512
DECODER_CELLSIZE = 2 * 512

BATCHSIZE = 100  # sequences per training mini-batch
NLAYERS = 3      # stacked GRU layers in both encoder and decoder

In [None]:
#inverse encoder input: TBD
#r1.2 takes care of variable size input sequences
#make model such that it is independent of batch_size

### Encoder

In [None]:
# Padded source token ids: [ BATCHSIZE, MAX_SEQLEN ]
encoder_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="encoder_inputs")
# Unpadded length of every source sequence: [ BATCHSIZE ]
encoder_input_length = tf.placeholder(dtype=tf.int32, shape=[None], name="encoder_input_length")

# Trainable embedding table; one row of width ENCODER_CELLSIZE per vocabulary entry.
encoder_embeddings = tf.get_variable(
    name="encoder_embedding_matrix",
    shape=[ENCODER_VOCABSIZE, ENCODER_CELLSIZE],
    dtype=tf.float32,
    initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32),
)
# Embedded source tokens: [ BATCHSIZE, MAX_SEQLEN, ENCODER_CELLSIZE ]
encoder_inputs_embedded = tf.nn.embedding_lookup(params=encoder_embeddings, ids=encoder_inputs)

In [None]:
# Stack NLAYERS GRU layers of width ENCODER_CELLSIZE into one multi-layer cell.
e_cells = []
for _layer in range(NLAYERS):
    e_cells.append(tf.contrib.rnn.GRUCell(ENCODER_CELLSIZE))
encoder_cell = tf.contrib.rnn.MultiRNNCell(e_cells)

In [None]:
# The backward direction needs its own cell object. Since TF 1.2 an RNNCell
# caches the variable scope of its first use, so passing the same
# `encoder_cell` as both cell_fw and cell_bw makes the backward pass silently
# reuse the forward weights instead of learning its own.
encoder_cell_bw = tf.contrib.rnn.MultiRNNCell(
    [tf.contrib.rnn.GRUCell(ENCODER_CELLSIZE) for _ in range(NLAYERS)])

# outputs:       (fw, bw) pair, each [ BATCHSIZE, MAX_SEQLEN, ENCODER_CELLSIZE ]
# output_states: (fw, bw) pair, each a tuple with one final state per layer
(outputs, output_states) = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=encoder_cell,
    cell_bw=encoder_cell_bw,
    inputs=encoder_inputs_embedded,
    sequence_length=encoder_input_length,
    dtype=tf.float32)

In [None]:
# Join forward and backward outputs on the feature axis:
# [ BATCHSIZE, MAX_SEQLEN, 2*ENCODER_CELLSIZE ]
encoder_outputs = tf.concat(outputs, axis=2)

# Per layer, join the forward and backward final states:
# list of NLAYERS tensors, each [ BATCHSIZE, 2*ENCODER_CELLSIZE ]
fw_final_states, bw_final_states = output_states[0], output_states[1]
encoder_final_states = [
    tf.concat(layer_pair, axis=1)
    for layer_pair in zip(fw_final_states, bw_final_states)
]

## Decoder

In [None]:
# Padded decoder input token ids: [ BATCHSIZE, SEQLEN ]
decoder_input = tf.placeholder(dtype=tf.int32, shape=[None, None], name='decoder_input')
# Length of every decoder input sequence: [ BATCHSIZE ]
decoder_input_length = tf.placeholder(dtype=tf.int32, shape=[None], name='decoder_input_length')

# Trainable embedding table; one row of width DECODER_CELLSIZE per vocabulary entry.
decoder_embeddings = tf.get_variable(
    name="decoder_embedding_matrix",
    shape=[DECODER_VOCABSIZE, DECODER_CELLSIZE],
    dtype=tf.float32,
    initializer=tf.contrib.layers.xavier_initializer(),
)

decoder_input_embed = tf.nn.embedding_lookup(params=decoder_embeddings, ids=decoder_input)

# Padded target token ids and their lengths.
decoder_targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='decoder_targets')
decoder_targets_length = tf.placeholder(dtype=tf.int32, shape=[None], name='decoder_targets_length')

# Batch size, fed as a one-element vector; used to build the attention
# wrapper's zero state.
batch_size_t = tf.placeholder(dtype=tf.int32, shape=[1], name="batch_size_t")

In [None]:
# Bahdanau (additive) attention over the encoder outputs; positions beyond each
# sequence's true length are masked out via memory_sequence_length.
attention_mech = tf.contrib.seq2seq.BahdanauAttention(DECODER_CELLSIZE, encoder_outputs,memory_sequence_length=encoder_input_length)
# Author's notes on the BahdanauAttention arguments:
#   num_units: memory (hs) and query (ht) are first projected to num_units (W * hs, W * ht)
#   memory: the memory to query; usually the output of an RNN encoder
#   normalize
#   probability_fn: converts the score to probabilities; the default is tf.nn.softmax
#   score_mask_value: (optional) mask value for the score before `probability_fn`;
#       default is -inf; only used if `memory_sequence_length` is not None
#
# def __call__(query, previous_alignments):
#   score = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])  # v * tanh(W * hs + W * ht)
#   alignments = self._probability_fn(score, previous_alignments)  # previous_alignments are ignored in BahdanauAttention

# Apply the attention wrapper to the top-most decoder cell only.
d_cells = [tf.contrib.rnn.GRUCell(DECODER_CELLSIZE) for _ in range(NLAYERS)]
top_d_cell = tf.contrib.seq2seq.AttentionWrapper(d_cells[-1], attention_mech,output_attention=False)
d_cells[-1] = top_d_cell

# Author's notes on what AttentionWrapper does per step:
#   Step 1: mix the `inputs` and the previous step's `attention` output via `cell_input_fn`:
#           array_ops.concat([inputs, state.attention], -1)
#   Step 2: call the wrapped `cell` with this input and its previous state
#   Step 3: score and align the cell's output with `attention_mechanism`:
#           alignments = self._attention_mechanism(query=cell_output, previous_alignments=state.alignments)
#   Step 5: compute the context vector as the inner product between the alignments and the
#           attention mechanism's values (memory): sigma(a(s) * hs)
#   Step 6: if attention_layer_size is not None:
#               attention = DenseLayer(attention_layer_size)(array_ops.concat([cell_output, context], 1))
#           else:
#               attention = context
#   output_attention == True returns (attention, next_state), otherwise (output, next_state)

# The decoder starts from the encoder's final states. The top layer is wrapped,
# so its plain GRU state has to be placed inside an AttentionWrapperState.
# Copy the list first: assigning into an alias would overwrite
# encoder_final_states[-1] as well, corrupting that list and breaking a re-run
# of this cell.
decoder_initial_states = list(encoder_final_states)
top_d_state = top_d_cell.zero_state(batch_size_t, tf.float32)
top_d_state = top_d_state.clone(cell_state=decoder_initial_states[-1])
decoder_initial_states[-1] = top_d_state
decoder_initial_states = tuple(decoder_initial_states)

decoder_cell = tf.contrib.rnn.MultiRNNCell(d_cells)

In [None]:
# Projection from the decoder cell output to one logit per vocabulary entry.
decoder_output_layer = tf.contrib.keras.layers.Dense(DECODER_VOCABSIZE)

# The training helper feeds the embedded decoder input at every step.
# Author's notes on TrainingHelper:
#   sample(time, outputs) -> argmax(output, -1)
#   next_inputs(time, outputs, state) -> (allFinished?, decoder_input_embed[time+1], state)
decoder_helper = tf.contrib.seq2seq.TrainingHelper(
    inputs=decoder_input_embed,
    sequence_length=decoder_input_length)

# Author's notes on BasicDecoder.step(time, inputs, state):
#   step 1: cell_outputs, cell_state = self._cell(inputs, state)
#   step 2: if self._output_layer is not None: cell_outputs = self._output_layer(cell_outputs)
#   step 3: sample_ids = self._helper.sample(time=time, outputs=cell_outputs, state=cell_state)  # just argmax
#   step 4: (finished, next_inputs, next_state) = self._helper.next_inputs(
#               time=time, outputs=cell_outputs, state=cell_state, sample_ids=sample_ids)
decoder = tf.contrib.seq2seq.BasicDecoder(
    cell=decoder_cell,
    helper=decoder_helper,
    initial_state=decoder_initial_states,
    output_layer=decoder_output_layer)

In [None]:
# Unroll the training decoder over time; yields the outputs, the final decoder
# state and the decoded length of every sequence.
training_decode_result = tf.contrib.seq2seq.dynamic_decode(decoder)
final_outputs, final_state, final_sequence_lengths = training_decode_result

In [None]:
# Output of the decoder's Dense projection: one unnormalised score per
# vocabulary entry at every decoding step.
Ylogits = final_outputs.rnn_output

In [None]:
# Predicted token id at every step. Softmax is monotonic, so the argmax of the
# logits is already the argmax of the probabilities; the extra softmax op only
# added work to the graph.
Y_pred = tf.argmax(Ylogits, axis=2)

In [None]:
# Per-position loss weights: 1.0 for real target tokens, 0.0 for padding.
# Built from the fed target lengths rather than tf.ones([BATCHSIZE, ...]) so that
#   * the graph works for any batch size, not only BATCHSIZE, and
#   * PAD positions no longer contribute to the loss.
# maxlen follows the time dimension of the logits so the shapes always agree.
loss_weights = tf.sequence_mask(decoder_targets_length,
                                maxlen=tf.shape(Ylogits)[1],
                                dtype=tf.float32,
                                name="loss_weights")

In [None]:
# Weighted cross-entropy between the decoder logits and the target token ids.
loss = tf.contrib.seq2seq.sequence_loss(
    logits=Ylogits,
    targets=decoder_targets,
    weights=loss_weights)

# One Adam step (learning rate 1e-4) on that loss.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
train_op = optimizer.minimize(loss)

## Training

In [None]:
def batch(inputs, pad_value=None):
    """Pad a list of integer sequences into one batch-major int32 matrix.

    Args:
        inputs: list of sequences (lists of ints), possibly of different lengths.
        pad_value: value used to fill positions past the end of a sequence.
            Defaults to the module-level PAD token id.

    Returns:
        np.ndarray of shape [len(inputs), length of the longest sequence] and
        dtype int32. An empty `inputs` yields an array of shape (0, 0).
    """
    if pad_value is None:
        pad_value = PAD

    batch_size = len(inputs)
    # default=0 keeps an empty batch from raising inside max().
    max_sequence_length = max((len(seq) for seq in inputs), default=0)

    inputs_batch_major = np.full([batch_size, max_sequence_length], pad_value, dtype=np.int32)
    for i, seq in enumerate(inputs):
        if len(seq):
            # One slice assignment per row replaces the element-by-element copy.
            inputs_batch_major[i, :len(seq)] = seq

    return inputs_batch_major

def rnn_minibatch_sequencer(X, Y, batch_size, epochs):
    """Yield padded training mini-batches for `epochs` passes over the data.

    Only full batches are produced; trailing examples that do not fill a whole
    batch are skipped. For every target sequence the decoder input is the
    sequence prefixed with GO and the decoder target is the sequence followed
    by EOS.

    Yields:
        Tuple of (padded encoder inputs, encoder lengths, padded decoder
        inputs, decoder input lengths, padded decoder targets, decoder target
        lengths, one-element array holding batch_size, epoch index).
    """
    total = len(X)
    for epoch in range(epochs):
        n_batches = int(total / batch_size)
        for k in range(n_batches):
            window = slice(k * batch_size, (k + 1) * batch_size)
            src_seqs = X[window]
            tgt_seqs = Y[window]

            src_lens = [len(seq) for seq in src_seqs]
            dec_in_seqs = [[GO] + seq for seq in tgt_seqs]
            dec_in_lens = [len(seq) for seq in dec_in_seqs]
            dec_tgt_seqs = [(seq + [EOS]) for seq in tgt_seqs]
            dec_tgt_lens = [len(seq) for seq in dec_tgt_seqs]

            yield (batch(src_seqs),
                   np.array(src_lens),
                   batch(dec_in_seqs),
                   np.array(dec_in_lens),
                   batch(dec_tgt_seqs),
                   np.array(dec_tgt_lens),
                   np.array([batch_size]),
                   epoch)
        

In [None]:
#a,b,c,d,e,f,g,h = rnn_minibatch_sequencer(source, target, BATCHSIZE,10).__next__()


In [None]:
# Open an interactive session and run the initializer for every variable
# defined in the graph so far.
sess = tf.InteractiveSession()
inn = tf.global_variables_initializer()
sess.run(inn)

In [None]:
# Train for 10 epochs. Descriptive names replace the single letters a..h; in
# the original, `c` held the decoder-input batch and was then rebound to the
# loss value inside the same iteration.
for (enc_in, enc_len, dec_in, dec_in_len,
     dec_tgt, dec_tgt_len, batch_size_arr, epoch) in rnn_minibatch_sequencer(source, target, BATCHSIZE, 10):
    feed_dict = {encoder_inputs: enc_in,
                 encoder_input_length: enc_len,
                 decoder_input: dec_in,
                 decoder_input_length: dec_in_len,
                 decoder_targets: dec_tgt,
                 decoder_targets_length: dec_tgt_len,
                 batch_size_t: batch_size_arr}

    # One optimisation step; also fetch the predictions and loss for logging.
    _, pred, batch_loss = sess.run([train_op, Y_pred, loss], feed_dict=feed_dict)

    print("epoch {} loss {}".format(epoch, batch_loss))
    print(pred[:5])

    # Debugging aids (left disabled):
    #e_s = sess.run([encoder_final_states],feed_dict=feed_dict)
    #print(e_s)
    #logi = sess.run([Ylogits],feed_dict=feed_dict)
    #print(logi)
    

In [None]:
# Display the list bound to encoder_final_states (one entry per encoder layer).
encoder_final_states

## Inference

Using a greedy decoder.

In [None]:
# Start token id for every sequence in the batch: [ BATCHSIZE ]
decoder_start_token = tf.placeholder(tf.int32, [None], name="decoder_start_token")
decoder_end_token = EOS

# The greedy helper feeds the embedding of the previous step's argmax token
# back in as the next input.
# Author's notes on GreedyEmbeddingHelper:
#   initialize -> finished, first_inputs = embedding_lookup(start_token)
#   next_inputs(time, outputs, state, sample_ids) ->
#       (finished = sample_id == end_token, next_input = embedding_lookup(sample_ids), state)
#   sample_id is the argmax of the output. No softmax is needed for that: the
#   argmax of the logits equals the argmax of the probabilities.
decoder_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,decoder_start_token,decoder_end_token)

# Reuses the trained decoder cell, initial state and output projection.
decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, decoder_helper, decoder_initial_states,decoder_output_layer)

# Greedy decoding only stops when EOS is emitted, so without a cap it can loop
# forever on a model that never produces EOS. Twice the longest source length
# is a deliberately generous bound; adjust if outputs can be longer than that.
final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
    decoder,
    maximum_iterations=2 * tf.reduce_max(encoder_input_length))

In [None]:
#YlogitsTest = final_outputs.rnn_output
#Y_pred_test = tf.argmax(tf.nn.softmax(YlogitsTest),2)

In [None]:
# Example source sequence as a batch of one: shape (1, sequence length), int32.
input_seq = np.fromiter(map(int, '1 2 3 4 5 6'.split()), dtype=np.int32)
input_seq = input_seq[np.newaxis, :]

In [None]:
# Length of the single example sequence, as a one-element vector.
input_seq_len = np.array([input_seq.shape[1]])

In [None]:
# Feed the single example through the greedy decoder, starting from GO.
greedy_feed = {
    decoder_start_token: np.array([GO]),
    encoder_inputs: input_seq,
    encoder_input_length: input_seq_len,
    batch_size_t: np.array([1]),
}
greedy_fetches = [final_outputs.sample_id, final_sequence_lengths]
predicted_seq, predicted_seq_len = sess.run(greedy_fetches, feed_dict=greedy_feed)

# First (and only) prediction of the batch together with its decoded length.
print("{}, {}".format(predicted_seq[0], predicted_seq_len[0]))

## Inference

Using a beam search decoder.

In [None]:
beam_width = 5
# Beam search over the trained decoder cell, embeddings and output projection.
beamsearch_decoder = tf.contrib.seq2seq.BeamSearchDecoder(decoder_cell,
                                                          decoder_embeddings,
                                                          decoder_start_token,
                                                          decoder_end_token,
                                                          decoder_initial_states,
                                                          beam_width,
                                                          decoder_output_layer)

# Author's notes on the shapes the beam search decoder expects:
#   np.tile(decoder_start_token, beam_width) is applied internally, therefore
#   when decoder_start_token is [batch_size]
#   decoder_initial_states must be [batch_size*beam_width, cell_size],
#   and so encoder_inputs must be [batch_size*beam_width, seq_len].
# The feed below achieves that by tiling the single example beam_width times.

# Cap the number of decoding steps so the search cannot run forever if no beam
# ever emits EOS. Twice the longest source length is a deliberately generous
# bound; adjust if outputs can be longer than that.
final_outputs_bs, final_state_bs, final_sequence_lengths_bs = tf.contrib.seq2seq.dynamic_decode(
    beamsearch_decoder,
    maximum_iterations=2 * tf.reduce_max(encoder_input_length))

predicted_seq_bs, predicted_seq_len_bs = sess.run([final_outputs_bs, final_sequence_lengths_bs],\
         feed_dict={decoder_start_token:np.array([GO]) , 
                    batch_size_t:np.array([beam_width]),
                  encoder_inputs: np.tile(input_seq,(beam_width,1)),
                  encoder_input_length:np.tile(input_seq_len,beam_width)})

# Predicted ids of the first batch entry (transposed), with the decoded lengths.
print("{}, {}".format(predicted_seq_bs.predicted_ids[0].T, predicted_seq_len_bs[0]))

In [None]:
# Write all variables of the current session to a checkpoint and report
# the path returned by the saver.
saver = tf.train.Saver()
save_path = saver.save(sess, save_path="seq2seq_reverse.ckpt")
print("Model saved in file: %s" % save_path)