# QA using Seq2Seq Model

###### Inspired by - Searching for one

In [1]:
import tensorflow as tf
from gensim.models import Word2Vec
import csv
from tensorflow.contrib import rnn
import pandas as pd
import numpy as np
from tensorflow.python.layers.core import Dense

In [40]:
# --- Training hyper-parameters ---
learning_rate = 0.001     # step size handed to the Adam optimizer
training_iters = 200000   # example budget; reassigned before the training loop runs
batch_size = 15           # rows per mini-batch (also fixes the graph's batch dimension)
display_step = 10         # report the loss every `display_step` steps
num_layers = 10           # number of stacked GRU layers built by get_cells()

# --- Network shape ---
# Encoder length in tokens. Questions are cut or padded with the <PAD> id
# to n_steps - 1 and an <EOS> token fills the final slot.
n_steps = 12
answer_steps = 6          # decoder length; answers are cut / padded to this size
n_hidden = 128            # GRU hidden units per layer
vocab_size = 100          # reassigned to the real vocabulary size further down



In [3]:
def get_cells():
    """Build a fresh stack of recurrent cells.

    Returns:
        An ``rnn.MultiRNNCell`` made of ``num_layers`` ``GRUCell(n_hidden)``
        layers, each wrapped in a ``DropoutWrapper``.
    """
    # NOTE(review): DropoutWrapper is created without any keep-prob argument,
    # so it presumably applies no dropout at all — confirm against TF 1.x defaults.
    layers = [
        tf.contrib.rnn.DropoutWrapper(rnn.GRUCell(n_hidden))
        for _ in range(num_layers)
    ]
    return rnn.MultiRNNCell(layers)

In [85]:
input_file = '/Users/mayoor/dev/qa/data/sample_cleaned.csv'

# Special tokens are appended after the corpus words, so they get the last ids.
special_words = ['<UNK>','<EOS>','<GO>','<PAD>']

with open(input_file) as csv_file:
    data = pd.read_csv(csv_file, delimiter=',')

required_data = data[['Question', 'Answer']]
req_data = required_data.drop_duplicates()

# Strip surrounding whitespace ONCE, at the source. Every later tokenisation
# (the Word2Vec corpus, the word -> id table and convert_tonum_pad) splits these
# same strings, so all of them see exactly the same tokens. Previously only the
# Word2Vec corpus was stripped, so a row with leading/trailing spaces put an
# empty-string token into word_ind that Word2Vec never saw, and building the
# embedding matrix then failed with a KeyError.
question_list = [q.strip() for q in req_data['Question'].tolist()]
answer_list = [a.strip() for a in req_data['Answer'].tolist()]

# Lower-cased, space-separated tokens per question / answer.
sents = [s.lower().split(' ') for s in question_list]
ans = [s.lower().split(' ') for s in answer_list]

# Question lengths in tokens, printed to help choose n_steps.
sent_length = [len(s) for s in sents]
print(sorted(sent_length))

consol_list = question_list + answer_list

# word -> id, ids assigned in order of first appearance (questions, then answers).
word_ind = {}
for tokens in sents + ans:
    for w in tokens:
        if w not in word_ind:
            word_ind[w] = len(word_ind)

for d in special_words:
    word_ind[d] = len(word_ind)

print('Total vocab size is : %s' % len(word_ind))
n_input = len(word_ind)  # vocabulary size including the special tokens

# id -> word, used to decode predictions and to order the embedding matrix.
inverted_index = {idx: word for word, idx in word_ind.items()}

# Train 100-dimensional word vectors on the questions, the answers and one
# single-token "sentence" per special word, so every id has a vector.
vector = Word2Vec(sents + ans + [[d] for d in special_words], size=100, iter=50, min_count=1)
print('Vector Size is %d' % len(vector.wv.vocab))
padding_vector = np.zeros(100)

[3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 11, 11]
Total vocab size is : 150
Vector Size is 150


In [5]:
# Sanity check of the trained vectors: query the words most similar to 'antietam'.
# NOTE(review): the recorded scores are all ~0.99, which suggests the vectors are
# barely differentiated on this small corpus — confirm before relying on them.
vector.similar_by_word('antietam')

[('lincoln', 0.9906107783317566),
 ('the', 0.9902212619781494),
 ('did', 0.990150511264801),
 ('in', 0.9899835586547852),
 ('by', 0.989866316318512),
 ('rebellion', 0.989572286605835),
 ('of', 0.9895237684249878),
 ('and', 0.9894744157791138),
 ('led', 0.9893594980239868),
 ('that', 0.9891302585601807)]

In [6]:
# Embedding matrix as a numpy array: row i holds the Word2Vec vector of the
# word whose id is i (looked up through inverted_index).
vocab_size = len(vector.wv.vocab)
embeddings = np.zeros((vocab_size, 100), dtype=np.float32)
for i, row in enumerate(embeddings):
    # `row` is a view into `embeddings`, so this writes the matrix in place.
    row[:] = vector[inverted_index[i]]

print(embeddings)

[[ 0.04407597  0.05686799  0.04348769 ...,  0.00313386  0.04321606
   0.02507165]
 [ 0.01697349  0.02046946  0.00885339 ...,  0.0053508   0.00901733
   0.00411669]
 [ 0.06637037  0.07851136  0.0585053  ...,  0.00499923  0.05207355
   0.04243772]
 ..., 
 [ 0.00070225 -0.00194354  0.00174642 ...,  0.00302336 -0.00367921
  -0.00225345]
 [-0.00383884 -0.00479192  0.00487459 ..., -0.00045088  0.00294408
  -0.00367758]
 [-0.00047537 -0.00471014  0.0048356  ...,  0.00405425 -0.00191541
   0.00328755]]


In [7]:
#Convert sentences to numbers and pad
def convert_tonum_pad(sentences, max_size):
    """Turn raw sentences into fixed-length lists of word ids.

    Each sentence is lower-cased and split on single spaces. Words missing
    from ``word_ind`` map to the ``<UNK>`` id instead of raising ``KeyError``.
    The id list is then truncated to ``max_size`` or right-padded with the
    ``<PAD>`` id so every row has exactly ``max_size`` entries.

    Args:
        sentences: iterable of strings.
        max_size: number of ids in every returned row.

    Returns:
        A list with one list of ``max_size`` ints per input sentence.
    """
    pad_id = word_ind['<PAD>']
    unk_id = word_ind['<UNK>']
    converted_sents = []
    for sentence in sentences:
        ids = [word_ind.get(w, unk_id) for w in sentence.lower().split(' ')]
        ids = ids[:max_size]
        # A non-positive repeat count yields an empty list, so full rows stay as is.
        ids += [pad_id] * (max_size - len(ids))
        converted_sents.append(ids)
    return converted_sents


In [8]:
# Questions use one slot fewer than n_steps; convert_sent_to_vec(..., True)
# later appends an <EOS> id, bringing them to n_steps tokens.
prepared_data_question = convert_tonum_pad(question_list, max_size=n_steps - 1)
# Answers are cut / padded straight to the decoder length.
prepared_data_answer = convert_tonum_pad(answer_list, max_size=answer_steps)


In [9]:
#add EOS to the question to indicate it is end of sentence.
def append_eos_to_question(sentences):
    """Return copies of the id sequences with the ``<EOS>`` id appended.

    The original used ``sent.append[...]`` (subscript instead of call), which
    raised ``TypeError`` on every invocation. The input sequences are no
    longer mutated; new lists are returned instead.

    Args:
        sentences: iterable of id sequences (lists of ints).

    Returns:
        A list of new lists, each ending with ``word_ind['<EOS>']``.
    """
    eos_id = word_ind['<EOS>']
    return [list(sent) + [eos_id] for sent in sentences]

In [10]:
def encoder(enc_input, batch_size):
    """Run a freshly built GRU stack over the encoder input.

    Args:
        enc_input: input tensor handed to ``tf.nn.dynamic_rnn`` (in this file
            the float32 ``encode_input`` placeholder).
        batch_size: batch dimension used to build the zero initial state.

    Returns:
        The ``(outputs, state)`` pair produced by ``tf.nn.dynamic_rnn``.
    """
    cell = get_cells()
    zero_state = cell.zero_state(batch_size, tf.float32)
    return tf.nn.dynamic_rnn(
        cell,
        enc_input,
        initial_state=zero_state,
        swap_memory=True,
    )

In [11]:
def decoder_training(dec_input, decoder_cell, initial_state, fc_layer, ans_length):
    """Build the training-time decoder, which is fed ``dec_input`` step by step.

    Args:
        dec_input: batch-major decoder input tensor for the ``TrainingHelper``.
        decoder_cell: recurrent cell driven by the decoder.
        initial_state: state the decoder starts from.
        fc_layer: output layer applied to each decoder step.
        ans_length: per-example sequence lengths for the helper.

    Returns:
        The first element of ``dynamic_decode``'s result (the decoder outputs);
        the final state and sequence lengths are discarded.
    """
    train_helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=dec_input,
        sequence_length=ans_length,
        time_major=False,
    )
    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, train_helper, initial_state, fc_layer)
    # Decoding is capped at the module-level answer_steps.
    decoded = tf.contrib.seq2seq.dynamic_decode(
        train_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=answer_steps,
    )
    return decoded[0]

In [12]:
def decoder_testing(decoder_cell, initial_state, embedding, start_token, end_token, batch_size, fc_layer):
    """Build the inference-time decoder using a ``GreedyEmbeddingHelper``.

    Args:
        decoder_cell: recurrent cell driven by the decoder.
        initial_state: state the decoder starts from.
        embedding: embedding table handed to the helper.
        start_token: id fed at the first step for every row.
        end_token: id that marks the end of a sequence.
        batch_size: number of rows; ``start_token`` is tiled this many times.
        fc_layer: output layer applied to each decoder step.

    Returns:
        The first element of ``dynamic_decode``'s result (the decoder outputs);
        the final state and sequence lengths are discarded.
    """
    go_ids = tf.tile(
        tf.constant([start_token], dtype=tf.int32),
        [batch_size],
        name='start_tokens',
    )
    greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding, go_ids, end_token)
    infer_decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, greedy_helper, initial_state, fc_layer)
    # Decoding is capped at the module-level answer_steps.
    decoded = tf.contrib.seq2seq.dynamic_decode(
        infer_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=answer_steps,
    )
    return decoded[0]

In [13]:
def build_decode_layer(enc_output, enc_state, dec_input, embeddings, batch_size, time_step, ans_length):
    """Build the attention decoder in both its training and inference forms.

    Args:
        enc_output: encoder outputs, used as the attention memory.
        enc_state: final encoder state, used as the decoder's initial cell state.
        dec_input: decoder input tensor for the training decoder.
        embeddings: embedding table used by the inference decoder.
        batch_size: batch dimension for the zero state and the start tokens.
        time_step: passed to ``BahdanauAttention`` as its third positional
            argument (the memory sequence lengths).
        ans_length: per-example answer lengths for the training helper.

    Returns:
        ``(training_logits, inference_logits)`` as returned by
        ``decoder_training`` and ``decoder_testing``.
    """
    base_cell = get_cells()

    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
        n_hidden,
        tf.concat(enc_output, 1),
        time_step,
        normalize=False,
        name='BahdanauAttention',
    )
    attn_cell = tf.contrib.seq2seq.AttentionWrapper(
        cell=base_cell,
        attention_mechanism=attention_mechanism,
        attention_layer_size=n_hidden,
    )

    # Start from the wrapper's zero state, then swap in the encoder's final
    # state as the wrapped cell's state.
    zero_state = attn_cell.zero_state(batch_size, tf.float32)
    initial_state = zero_state.clone(cell_state=enc_state)
    print(initial_state)

    # Projects every decoder step onto the vocabulary.
    output_layer = Dense(
        len(word_ind),
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1),
    )

    with tf.variable_scope("decode"):
        training_logits = decoder_training(
            dec_input, attn_cell, initial_state, output_layer, ans_length)

    # Same scope with reuse=True so inference shares the training variables.
    with tf.variable_scope("decode", reuse=True):
        inference_logits = decoder_testing(
            attn_cell,
            initial_state,
            embeddings,
            word_ind['<GO>'],
            word_ind['<EOS>'],
            batch_size,
            output_layer,
        )

    return training_logits, inference_logits
    

In [14]:
def qa_model(enc_input, dec_input, batch_size, embeddings, time_step, ans_length):
    """Wire the encoder into the attention decoder.

    Args:
        enc_input: encoder input tensor.
        dec_input: decoder input tensor used during training.
        batch_size: batch dimension shared by encoder and decoder.
        embeddings: embedding table used by the inference decoder.
        time_step: forwarded to ``build_decode_layer``.
        ans_length: per-example answer lengths.

    Returns:
        ``(training_logits, inference_logits)`` from ``build_decode_layer``.
    """
    enc_outputs, enc_final_state = encoder(enc_input, batch_size)
    return build_decode_layer(
        enc_outputs,
        enc_final_state,
        dec_input,
        embeddings,
        batch_size,
        time_step,
        ans_length,
    )
    

In [15]:
train_graph = tf.Graph()

with train_graph.as_default():
    # Inputs: embedded question / decoder sequences, target word ids, and the
    # per-example encoder and answer lengths.
    encoder_input = tf.placeholder(tf.float32, shape=(None, n_steps, 100), name="encode_input")
    decoder_input = tf.placeholder(tf.float32, shape=(None, answer_steps, 100), name="decode_input")
    target = tf.placeholder(tf.int32, shape=(None, None), name="target")
    time_step = tf.placeholder(tf.int32, (None,), name='time_step')
    ans_length = tf.placeholder(tf.int32, (None,), name='ans_length')

    training_logits, inference_logits = qa_model(
        encoder_input, decoder_input, batch_size, embeddings, time_step, ans_length)

    # Expose the tensors under stable names: per-step logits for training and
    # the sampled word ids for inference.
    training_logits = tf.identity(training_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # Weights for sequence_loss, built from the answer lengths.
    masks = tf.sequence_mask(ans_length, answer_steps, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, target, masks)

        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Clip every available gradient element-wise to [-5, 5] before applying.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [
            (tf.clip_by_value(g, -5., 5.), v)
            for g, v in gradients
            if g is not None
        ]
        train_op = optimizer.apply_gradients(capped_gradients)

print("Graph is built.")

AttentionWrapperState(cell_state=(<tf.Tensor 'rnn/while/Exit_2:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_3:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_4:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_5:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_6:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_7:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_8:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_9:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_10:0' shape=(15, 128) dtype=float32>, <tf.Tensor 'rnn/while/Exit_11:0' shape=(15, 128) dtype=float32>), attention=<tf.Tensor 'AttentionWrapperZeroState/zeros_1:0' shape=(15, 128) dtype=float32>, time=<tf.Tensor 'AttentionWrapperZeroState/zeros:0' shape=() dtype=int32>, alignments=<tf.Tensor 'AttentionWrapperZeroState/zeros_2:0' shape=(15, 12) dtype=float32>, alignment_history=())
Graph is built.


In [16]:
def get_next_batch(batch_size):
    """Draw a random mini-batch of prepared question / answer id rows.

    Row indices are sampled with ``np.random.randint``, so the same example
    may appear more than once in a batch.

    Args:
        batch_size: number of rows to draw.

    Returns:
        ``(questions, answers)`` as numpy arrays; row ``i`` of each comes from
        the same example.
    """
    picks = np.random.randint(len(question_list), size=batch_size)
    questions = np.asarray([prepared_data_question[p] for p in picks])
    answers = np.asarray([prepared_data_answer[p] for p in picks])
    return questions, answers


In [23]:
def convert_sent_to_vec(sentences, eos=False):
    """Replace every word id with its embedding vector.

    Args:
        sentences: iterable of id sequences (lists or numpy rows).
        eos: when truthy, the ``<EOS>`` id is appended to each sequence before
            the lookup, so each output row is one step longer.

    Returns:
        A numpy array built from the per-sentence lists of ``embeddings`` rows.
        The input sequences are never modified.
    """
    vec_sentences = []
    for sent in sentences:
        # Always work on a copy so the caller's row is neither aliased nor
        # extended when the EOS id is appended.
        token_ids = list(sent)
        if eos:
            token_ids.append(word_ind['<EOS>'])
        vec_sentences.append([embeddings[t] for t in token_ids])
    return np.asarray(vec_sentences)
    

In [26]:
# Smoke test: embed one random batch and check the resulting shapes.
test_x, test_y = get_next_batch(10)
x = convert_sent_to_vec(test_x, eos=True)   # questions gain a trailing <EOS>
y = convert_sent_to_vec(test_y)
print(x.shape)
print(y.shape)

(10, 12, 100)
(10, 6, 100)


In [82]:
def process_dec_input(target, batch_size):
    """Build decoder inputs by shifting the targets one step to the right.

    The last column of ``target`` is dropped and a column holding the
    ``<GO>`` id is placed in front, so the width stays the same.

    Args:
        target: 2-D numpy array of word ids, one row per example.
        batch_size: number of rows in ``target``.

    Returns:
        A 2-D numpy array with the same shape as ``target``.
    """
    go_column = np.full((batch_size, 1), word_ind['<GO>'])
    last_col = target.shape[1] - 1
    shifted = np.delete(target, last_col, 1)
    return np.hstack((go_column, shifted))
    

In [83]:
# Smoke test: show the decoder inputs derived from one random answer batch.
test_x, test_y = get_next_batch(10)
print(test_y.shape)
new_y = process_dec_input(test_y, batch_size=10)
print(new_y)

(10, 6)
[[148 110 111 112 149 149]
 [148 104 149 149 149 149]
 [148 141 142 143 144 145]
 [148  84  85 149 149 149]
 [148 113 149 149 149 149]
 [148 104 149 149 149 149]
 [148 104 149 149 149 149]
 [148 114  62 149 149 149]
 [148   3   7   8 106   3]
 [148  54 149 149 149 149]]


In [86]:
# Launch the graph: train, then decode one random question as a demonstration.
training_iters = 4500
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    step = 1
    # Keep training until the number of examples seen reaches training_iters.
    while step * batch_size < training_iters:

        batch_x, batch_y = get_next_batch(batch_size)
        x = convert_sent_to_vec(batch_x, True)
        dec_inp = process_dec_input(batch_y, batch_size)
        y = convert_sent_to_vec(dec_inp)

        # One feed dict shared by the optimisation step and the loss report.
        feed = {
            encoder_input: x,
            decoder_input: y,
            target: batch_y,
            time_step: [n_steps] * batch_size,
            ans_length: [answer_steps] * batch_size,
        }

        # Run optimization op (backprop)
        sess.run([train_op], feed_dict=feed)
        if step % display_step == 0:
            # Loss on the same batch, evaluated after the update. The old
            # message repeated this value under the label "Training Accuracy";
            # no accuracy op exists in the graph, so only the loss is reported.
            loss = sess.run(cost, feed_dict=feed)
            print("Iter " + str(step * batch_size) + ", Minibatch Loss= " +
                  "{:.6f}".format(loss))
        step += 1
    print("Optimization Finished!")

    # Inference demo. The graph is built for a fixed batch_size, so the single
    # question is tiled to a full batch (previously a hard-coded 15) and only
    # the first prediction is read back.
    batch_x, batch_y = get_next_batch(1)
    x = convert_sent_to_vec(batch_x, True)
    print(x.shape)
    x_input = np.tile(x, (batch_size, 1, 1))
    print(x_input.shape)

    an_logit = sess.run(
        inference_logits,
        feed_dict={
            encoder_input: x_input,
            time_step: [n_steps] * batch_size,
            ans_length: [answer_steps] * batch_size,
        })

    fina_ans = an_logit[0]
    print(batch_x[0])
    print(fina_ans)

    print('The question is: ', [inverted_index[ind] for ind in batch_x[0].tolist()])
    print('The answer is: ', [inverted_index[ind] for ind in fina_ans.tolist()])

    # TODO: add a held-out evaluation loop once an `accuracy` op is defined in
    # the graph (the previous commented-out draft referenced one that does not exist).

Iter 150, Minibatch Loss= 2.605524, Training Accuracy= 2.60552
Iter 300, Minibatch Loss= 1.717576, Training Accuracy= 1.71758
Iter 450, Minibatch Loss= 1.411055, Training Accuracy= 1.41106
Iter 600, Minibatch Loss= 1.325067, Training Accuracy= 1.32507
Iter 750, Minibatch Loss= 1.916545, Training Accuracy= 1.91654
Iter 900, Minibatch Loss= 1.495909, Training Accuracy= 1.49591
Iter 1050, Minibatch Loss= 1.646598, Training Accuracy= 1.64660
Iter 1200, Minibatch Loss= 1.406796, Training Accuracy= 1.40680
Iter 1350, Minibatch Loss= 0.789078, Training Accuracy= 0.78908
Iter 1500, Minibatch Loss= 1.578618, Training Accuracy= 1.57862
Iter 1650, Minibatch Loss= 1.013793, Training Accuracy= 1.01379
Iter 1800, Minibatch Loss= 1.165276, Training Accuracy= 1.16528
Iter 1950, Minibatch Loss= 1.143929, Training Accuracy= 1.14393
Iter 2100, Minibatch Loss= 1.186432, Training Accuracy= 1.18643
Iter 2250, Minibatch Loss= 1.394265, Training Accuracy= 1.39426
Iter 2400, Minibatch Loss= 1.477121, Training 