# QA using Seq2Seq Model with Context

###### Inspired from - Searching for one

In [1]:
import tensorflow as tf
from gensim.models import Word2Vec
import csv
from tensorflow.contrib import rnn
import pandas as pd
import numpy as np
from tensorflow.python.layers.core import Dense
import prepare_babi as pb
from functools import cmp_to_key
import random


In [2]:
# Parameters
learning_rate = 0.001
training_iters = 200000
batch_size = 15
display_step = 10
num_layers = 10

# Network Parameters
#n_steps = 12 # size of the sentences. Greater than will be clipped, less than will be padded by zero vector
answer_steps = 1
n_hidden = 128 # hidden layer num of features
vocab_size=100

# tf Graph input
#x = tf.placeholder("float", [None, None, n_input])
#y = tf.placeholder("float", [None, n_classes])
with open('data/qa2_two-supporting-facts_train.txt') as inp:
    data = pb.prepare_dataset(inp)

with open('data/qa2_two-supporting-facts_train.txt') as test_inp:
    test_data = pb.prepare_dataset(test_inp)


In [3]:
def get_cells():
    stacked_cells = []
    for i in range(num_layers):
        stacked_cells.append(tf.contrib.rnn.DropoutWrapper(rnn.GRUCell(n_hidden)))
    cell = rnn.MultiRNNCell(stacked_cells)
    return cell

In [4]:
#input_file = '/Users/mayoor/dev/qa/data/sample_cleaned.csv'

special_words = ['<UNK>','<EOS>','<GO>','<PAD>']

#with open(input_file) as csv_file:
#    data = pd.read_csv(csv_file,delimiter=',')
#print(data.head())

def get_all_vocab(data):
    vocab = []
    for st, q, ans in data:
        vocab += [w for w in q.split(' ')]
        vocab += [w for w in ans.split(' ')]
        for s in st:
            vocab += [w for w in s.split(' ')]
    return vocab

def get_all_lines_for_vectorization(data):
    lines = []
    for st, q, ans in data:
        for l in st:
            lines.append(l)
        lines.append(q)
        lines.append(ans)
    return lines

def get_tokenized_lines(data):
    lines = get_all_lines_for_vectorization(data)
    return [l.split(' ') for l in lines]


def getStoryQuestionAnswerLines(data):
    data_list = []
    for st, q, ans in data:
        words = " ".join(st).split(" ")
        data_list.append((words, q.split(' '), ans.split(' ')))
    return data_list

    

In [5]:
consol_list = get_all_vocab(data)
#print (len(consol_list))

#print (get_all_lines_for_vectorization(data))
word_ind = {}
for w in consol_list:
    if w not in word_ind:
        word_ind[w]=len(word_ind)

for d in special_words:
    word_ind[d]=len(word_ind)
    
print ('Total vocab size is : %s' % len(word_ind))
n_input = len(word_ind) #WordVector 

inverted_index = {}
for k in word_ind:
    inverted_index[word_ind[k]]=k

#y_data = [[1,0] if p == 'positive' else [0,1] for p in sentiment_list]
vector = Word2Vec(get_tokenized_lines(data) + [[d] for d in special_words], size=100, iter=50, min_count=1)
print ('Vector Size is %d' % len(vector.wv.vocab))
padding_vector = np.zeros(100)


Total vocab size is : 38
Vector Size is 38


In [6]:
vector.similar_by_word('mary')

[('john', 0.9948188662528992),
 ('daniel', 0.9919648170471191),
 ('sandra', 0.982620358467102),
 ('<PAD>', 0.07228146493434906),
 ('<EOS>', 0.07159482687711716),
 ('', 0.03617631644010544),
 ('is', 0.03473497927188873),
 ('where', 0.024126915261149406),
 ('<UNK>', -0.01084035262465477),
 ('garden', -0.023263201117515564)]

In [7]:
#Construct Embedding matrix - numpy matrix
vocab_size = len(vector.wv.vocab)
embeddings = np.zeros((vocab_size,100), dtype=np.float32)
for i in range(vocab_size):
    embeddings[i] = vector[inverted_index[i]]

print (embeddings)

[[ -1.14756741e-01  -1.27138031e+00   5.09362996e-01 ...,   3.83990496e-01
   -1.84832013e+00   1.86611605e+00]
 [ -4.09306958e-02  -1.70652354e+00  -5.55993319e-01 ...,   6.04525357e-02
   -8.37356389e-01   7.04301178e-01]
 [  3.57263237e-01   6.41192123e-02   6.06143296e-01 ...,  -3.21395040e-01
   -5.36073893e-02  -7.98516691e-01]
 ..., 
 [  4.88842791e-03  -4.15823795e-03   4.46959678e-03 ...,  -4.61358810e-03
    4.32615215e-03   1.83976453e-03]
 [  2.07603094e-03   1.85016659e-03   2.89434125e-03 ...,   2.23601446e-03
    2.63146474e-03  -3.05674947e-03]
 [ -3.39788618e-03  -4.63959156e-03   3.25530372e-03 ...,  -6.26060573e-05
    3.01822321e-03   1.54330861e-03]]


In [8]:
#Convert sentences to numbers and pad
def convert_tonum_pad(sentences, max_size):
    converted_sents = []
    for _s in sentences:
        words = _s
        _s_num = [word_ind[w] for w in words]
        if len(_s_num)<max_size:
            _s_num = _s_num + [word_ind['<PAD>']]*(max_size-len(_s_num))
        if len(_s_num)>max_size:
            _s_num = _s_num[0:max_size]
        converted_sents.append(_s_num)
    return converted_sents


In [9]:
#story, question, answer = getStoryQuestionAnswerLines(data)
def getAllData(data):
    train_tuple = getStoryQuestionAnswerLines(data)
    sorted_data_list = sorted(train_tuple, key=cmp_to_key(lambda x,y: -1 if len(x[0]) < len(y[0]) else 0))
    story = []
    question = []
    answer = []
    for t in sorted_data_list:
        story.append(t[0])
        question.append(t[1])
        answer.append(t[2])
    return story, question, answer


In [10]:
#add EOS to the question to indicate it is end of sentence.
def append_eos_to_question(sentences):
    eos_sent = []
    for sent in sentences:
        sent.append[word_ind['<EOS>']]
        eos_sent.append(sent)
    return eos_sent

In [11]:
def encoder(enc_input, batch_size):
    encoder_cells = get_cells()
    initial_state = encoder_cells.zero_state(batch_size, tf.float32)
    outputs, state = tf.nn.dynamic_rnn(encoder_cells, enc_input, initial_state=initial_state, swap_memory=True)
    return outputs, state

In [12]:
def bidirectional_encoder(enc_input, batch_size):
    encoder_cells_fw = get_cells()
    encoder_cells_bw = get_cells()
    initial_state_fw = encoder_cells.zero_state(batch_size, tf.float32)
    initial_state_bw = encoder_cells_bw.zero_state(batch_size, tf.float32)
    outputs, state = tf.nn.bidirectional_dynamic_rnn(encoder_cells_fw, encoder_cells_bw, enc_input, initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw, swap_memory=True)
    outputs = tf.concat(outputs,2)
    return outputs, state

In [13]:
def decoder_training(dec_input, decoder_cell, initial_state, fc_layer, ans_length):
    
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_input, sequence_length=ans_length, time_major=False)
    #print (dec_input.get_shape())
    #print (decoder_cell.get_shape())
    #print (initial_state.get_shape())
    #print (ans_length.get_shape())
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,helper,initial_state,fc_layer)
    training_logits, training_state, training_seq_len = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=answer_steps)
    return training_logits

In [14]:
def decoder_testing(decoder_cell, initial_state, embedding, start_token, end_token, batch_size, fc_layer):

    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding, start_tokens, end_token)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,helper,initial_state, fc_layer)
    testing_logits, testing_state, testing_seq_len = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=answer_steps)
    return testing_logits

In [15]:
def build_decode_layer(enc_output, enc_state, dec_input, embeddings, batch_size, time_step, ans_length):
    decoder_cell = get_cells()

    attention = tf.contrib.seq2seq.BahdanauAttention(n_hidden, tf.concat(enc_output,1), time_step, normalize=False, name='BahdanauAttention')
    #print (enc_output[0].get_shape(), len(enc_output))
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell, attention_mechanism=attention, attention_layer_size=n_hidden)
    
    #initial_state = tf.contrib.seq2seq.AttentionWrapperState(enc_state[0],
    #                                                         decoder_cell, 12, None, None)#.zero_state( batch_size, 
                                                                                             # tf.float32)
    initial_state = decoder_cell.zero_state( batch_size, tf.float32)
    
    initial_state = initial_state.clone(cell_state=enc_state)

    fully_connected_output_layer = Dense(len(word_ind),
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    with tf.variable_scope("decode"):
        training_logits = decoder_training(dec_input, 
                                            decoder_cell, 
                                            initial_state, 
                                            fully_connected_output_layer, ans_length)
                                                   
    with tf.variable_scope("decode", reuse=True):
        inference_logits = decoder_testing(decoder_cell, initial_state, 
                                            embeddings, 
                                            word_ind['<GO>'], 
                                            word_ind['<EOS>'],
                                            batch_size,
                                            fully_connected_output_layer)
    return training_logits, inference_logits
    

In [16]:
def qa_model(enc_input, dec_input, batch_size, embeddings, time_step, ans_length):
    encoder_output, encode_state = encoder(enc_input, batch_size)
    training_logits, inference_logits = build_decode_layer(encoder_output, encode_state, dec_input, embeddings, batch_size, time_step, ans_length)
    
    return training_logits, inference_logits
    

In [17]:
train_graph = tf.Graph()

with train_graph.as_default():
    encoder_input = tf.placeholder(shape=(None,None,100),dtype=tf.float32, name="encode_input")
    decoder_input = tf.placeholder(shape=(None,answer_steps,100),dtype=tf.float32, name="decode_input")
    target = tf.placeholder(shape=(None,None),dtype=tf.int32, name="target")
    time_step = tf.placeholder(tf.int32, (None,), name='time_step')
    ans_length = tf.placeholder(tf.int32, (None,), name='ans_length')
    
    #e_inp = tf.unstack(encoder_input, n_steps, axis=1)
    #d_inp = tf.unstack(decoder_input, answer_steps, 1)

    training_logits, inference_logits = qa_model(encoder_input, decoder_input, batch_size, embeddings, time_step, ans_length)#(x, weights, biases)

    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(ans_length, answer_steps, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            target,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

Graph is built.


In [18]:
def get_next_batch(batch_size, story, question, answer):
    max_data = len(story)
    #indexes = np.random.randint(max_data-batch_size, size=batch_size)
    global index
    if index+batch_size+1 > max_data:
        index = max_data-batch_size-1
    if index >= max_data-1:
        index = 0
    indexes = range(index, index+batch_size)
    index += batch_size
    
    story_question_batch = [story[ix]+question[ix] for ix in list(indexes)]
    answer_batch = [answer[iy] for iy in list(indexes)]
    story_question_max_len = max([len(s) for s in story_question_batch])
    answer_max_len = max([len(s) for s in answer_batch])
    prepared_data_question = convert_tonum_pad(story_question_batch, story_question_max_len)
    prepared_data_answer = convert_tonum_pad(answer_batch,answer_max_len)

    return np.asarray(prepared_data_question), np.asarray(prepared_data_answer), story_question_max_len, answer_max_len, answer_batch


In [19]:
def convert_sent_to_vec(sentences, eos=False):
    vec_sentences = []
    for sent in sentences:
        if eos == True:
            st = list(sent)
            st.append(word_ind['<EOS>'])
        else:
            st = sent
        vec_sentences.append([embeddings[s] for s in st])
    return np.asarray(vec_sentences)
    

In [20]:
index = 0
story, question, answer = getAllData(data)

test_x, test_y, xlen, ylen, test_y_actual = get_next_batch(10, story, question, answer)
x = convert_sent_to_vec(test_x,True)
y = convert_sent_to_vec(test_y)
print (x.shape)
print (y.shape)

(10, 25, 100)
(10, 1, 100)


In [21]:
def process_dec_input(target, batch_size):
    begin_token = np.tile(np.array(word_ind['<GO>']),batch_size).reshape(batch_size,1)
    z = np.delete(target,target.shape[1]-1,1)
    return np.hstack((begin_token,z))
    

In [24]:
# Launch the graph
training_iters = 9000
story, question, answer = getAllData(data)
test_story, test_question, test_answer = getAllData(test_data)
checkpoint_file = 'model/seqqacontext/seqqacontext.ckpt'
with tf.Session(graph=train_graph) as sess:
    index = 0
    sess.run(tf.global_variables_initializer())
    step = 1
    # Keep training until reach max iterations
    saver = tf.train.Saver() 
    try:
        saver.restore(sess,checkpoint_file)
    except:
        print ('No previously saved checkpoint found')
    while step * batch_size < training_iters:

        batch_x, batch_y, x_max_len, y_max_len, y_actual_train = get_next_batch(batch_size, story, question, answer)
        n_steps = x_max_len +1
        answer_steps = y_max_len
        
        x = convert_sent_to_vec(batch_x,True)
        dec_inp = process_dec_input(batch_y, batch_size)
        y = convert_sent_to_vec(dec_inp)
        old_loss = 5

        # Run optimization op (backprop)
        sess.run([train_op ], feed_dict={encoder_input: x, decoder_input: y, target:batch_y, time_step:[n_steps]*batch_size, ans_length:[answer_steps]*batch_size})
        if step % display_step == 0:

            # Calculate batch loss
            loss = sess.run(cost, feed_dict={encoder_input: x, decoder_input: y, target:batch_y, time_step:[n_steps]*batch_size, ans_length:[answer_steps]*batch_size})
            print ("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) )
            if loss < old_loss:
                saver.save(sess, checkpoint_file)
            old_loss = loss
            
        step += 1
    print ("Optimization Finished!")
    if loss < old_loss:
        saver.save(sess, checkpoint_file)

    

INFO:tensorflow:Restoring parameters from model/seqqacontext/seqqacontext.ckpt
No previously saved checkpoint found
Iter 150, Minibatch Loss= 1.856541
Iter 300, Minibatch Loss= 2.227746
Iter 450, Minibatch Loss= 1.717780
Iter 600, Minibatch Loss= 1.943280
Iter 750, Minibatch Loss= 1.898941
Iter 900, Minibatch Loss= 1.751568
Iter 1050, Minibatch Loss= 1.505340
Iter 1200, Minibatch Loss= 1.363502
Iter 1350, Minibatch Loss= 1.347718
Iter 1500, Minibatch Loss= 1.344355
Iter 1650, Minibatch Loss= 1.340787
Iter 1800, Minibatch Loss= 1.328789
Iter 1950, Minibatch Loss= 1.340639
Iter 2100, Minibatch Loss= 1.336262
Iter 2250, Minibatch Loss= 1.327319
Iter 2400, Minibatch Loss= 2.514742
Iter 2550, Minibatch Loss= 5.276697
Iter 2700, Minibatch Loss= 1.350912
Iter 2850, Minibatch Loss= 1.343240
Iter 3000, Minibatch Loss= 1.340944
Iter 3150, Minibatch Loss= 1.340742
Iter 3300, Minibatch Loss= 1.337883
Iter 3450, Minibatch Loss= 1.284769
Iter 3600, Minibatch Loss= 1.037672
Iter 3750, Minibatch Loss=

In [27]:
index=0
correct = 0
total = 0
testing_iter = 900
checkpoint = "model/seqqacontext/seqqacontext.ckpt"

graph = tf.Graph()
with tf.Session(graph=graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    encoder_input = graph.get_tensor_by_name('encode_input:0')
    inference_logits = graph.get_tensor_by_name('predictions:0')
    time_step = graph.get_tensor_by_name('time_step:0')
    ans_length = graph.get_tensor_by_name('ans_length:0')

    for te in range(testing_iter):
        batch_x, batch_y, x_max_len, y_max_len, y_actual = get_next_batch(batch_size, test_story, test_question, test_answer)
        n_steps = x_max_len +1
        answer_steps = y_max_len
        x = convert_sent_to_vec(batch_x,True)
        #print (x.shape)
        #x_input = np.tile(x,(15,1,1))
        #print (x_input.shape)

        #an_logit = sess.run(inference_logits, feed_dict={encoder_input: x_input, time_step:[n_steps]*batch_size, ans_length:[answer_steps]*batch_size})
        an_logit = sess.run(inference_logits, feed_dict={encoder_input: x, time_step:[n_steps]*batch_size, ans_length:[answer_steps]*batch_size})
        total += len(an_logit)
        for i in range(len(an_logit)):
            fina_ans = an_logit[i]
            predicted = " ".join([inverted_index[ind] for ind in fina_ans.tolist()])
            actual = " ".join(y_actual[i])
            #print ('The question is: ', " ".join([inverted_index[ind] for ind in batch_x[i].tolist()]))
            #print ('The answer is: ', predicted)
            #print ('The right answer is: ', actual)
            if predicted.strip() == actual.strip():
                correct += 1
    print ('Training accuracy is '+ str((correct/total)*100))
    print ('{0} out of {1} predicted correctly'.format(correct, total))



INFO:tensorflow:Restoring parameters from model/seqqacontext/seqqacontext.ckpt
Training accuracy is 94.17777777777778
12714 out of 13500 predicted correctly
