In [1]:
import tensorflow as tf

In [2]:
import re
import tarfile
import collections
import numpy as np
import pandas as pd
from tqdm import *
import math

# Memory Networks

# preprocessing for memory network

In [3]:
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    Runs of non-word characters act as separators; the capturing group keeps
    them in the split result, and whitespace-only pieces are then dropped, so
    punctuation survives as its own token:
    ``'Bob went home.'`` -> ``['Bob', 'went', 'home', '.']``.
    """
    # The separator must not be optional: '(\W+)?' can match the empty string,
    # and from Python 3.7 on re.split() splits on empty matches, producing None
    # groups (AttributeError on .strip()) and per-character pieces.
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

In [6]:
def parse_stories(lines):
    """Parse numbered story lines into (story, question, answer) triples.

    Each line has the form ``"<id> <text>"``. A line whose id is 1 starts a new
    story. A line containing tabs is a question, split into
    ``question \\t answer \\t supporting``; every other line is a statement.

    lines -- iterable of bytes (decoded as UTF-8) or str lines; blank lines
             are skipped.

    Returns a list of ``(substory, question_tokens, answer)`` where `substory`
    is the list of tokenized statements seen so far in the current story, each
    prefixed with an ``"<position>:"`` token, and `answer` is the raw answer string.
    """
    data = []
    story = []
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        line = line.strip()
        if not line:
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' in line:
            # The supporting-fact ids are parsed but not used.
            q, a, _supporting = line.split('\t')
            # Empty placeholders (left by earlier questions) are filtered out,
            # but they keep each statement's position index stable.
            substory = [[str(i) + ":"] + x for i, x in enumerate(story) if x]
            data.append((substory, tokenize(q), a))
            story.append('')
        else:
            story.append(tokenize(line))
    return data

In [7]:
tar = tarfile.open("babi_tasks_1-20_v1-2.tar.gz")

In [8]:
challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    'two_supporting_facts_1k': 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt',
}
# challenge_type = 'single_supporting_fact_10k'

challenge_type = 'two_supporting_facts_1k'

challenge = challenges[challenge_type]

In [None]:
def get_stories(f):
    """Read every line from the file-like `f` and return its parsed stories.

    The result is a fresh list of ``(story, question, answer)`` tuples built
    from what `parse_stories` returns for ``f.readlines()``.
    """
    parsed = parse_stories(f.readlines())
    triples = []
    for story, q, answer in parsed:
        triples.append((story, q, answer))
    return triples

In [None]:
train_stories = get_stories(tar.extractfile(challenge.format('train')))
test_stories = get_stories(tar.extractfile(challenge.format('test')))

In [None]:
test_stories[200]

In [None]:
stories = train_stories + test_stories

In [None]:
story_maxlen = max((len(s) for x, _, _ in stories for s in x))
story_maxsents = max((len(x) for x, _, _ in stories))
query_maxlen = max(len(x) for _, x, _ in stories)

story_maxlen , story_maxsents , query_maxlen

In [None]:
def do_flatten(el):
    """Return True when `el` is an iterable that `flatten` should descend into.

    Strings and bytes are iterable but are treated as leaves.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import Iterable
    return isinstance(el, Iterable) and not isinstance(el, (str, bytes))

def flatten(l):
    """Lazily yield the leaves of an arbitrarily nested iterable, depth first."""
    for el in l:
        if do_flatten(el):
            yield from flatten(el)
        else:
            yield el

In [None]:
vocab = sorted(set(flatten(stories)))
vocab.insert(0, '<PAD>')
vocab_size = len(vocab)

In [None]:
story_maxsents, vocab_size, story_maxlen, query_maxlen, len(train_stories), len(test_stories)

In [None]:
test_stories[534][0] 

In [None]:
word_idx = dict((c, i) for i, c in enumerate(vocab))

In [None]:
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    """Map stories, queries and answers to word indices and pad them.

    data         -- iterable of (story, query, answer); a story is a list of
                    token lists, a query is a token list, an answer is one token.
    word_idx     -- dict from token to integer id.
    story_maxlen -- `maxlen` passed to pad_sequences for every story sentence.
    query_maxlen -- `maxlen` passed to pad_sequences for the queries.

    Returns (stories, queries, answers): a list with one padded 2-D array per
    story, one padded array holding all queries, and an array with one
    single-element row per answer.
    """
    pad = tf.contrib.keras.preprocessing.sequence.pad_sequences

    story_ids, query_ids, answer_ids = [], [], []
    for story, query, answer in data:
        story_ids.append([[word_idx[tok] for tok in sent] for sent in story])
        query_ids.append([word_idx[tok] for tok in query])
        answer_ids.append([word_idx[answer]])

    padded_stories = [pad(sents, maxlen=story_maxlen) for sents in story_ids]
    padded_queries = pad(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))

In [None]:
inputs_train, queries_train, answers_train = vectorize_stories(train_stories, 
     word_idx, story_maxlen, query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories, 
     word_idx, story_maxlen, query_maxlen)

In [None]:
def stack_inputs(inputs, maxsents=None, maxlen=None):
    """Pad every story to the same number of sentences and stack them.

    inputs   -- sequence of 2-D integer arrays, one per story, each with
                `maxlen` columns and at most `maxsents` rows.
    maxsents -- target number of rows per story; defaults to the notebook-level
                `story_maxsents`.
    maxlen   -- number of columns of the zero rows appended; defaults to the
                notebook-level `story_maxlen`.

    Returns one array of shape (len(inputs), maxsents, maxlen). Zero rows are
    appended after each story's own rows. The `inputs` sequence itself is left
    untouched, so re-running the calling cell is safe.
    """
    if maxsents is None:
        maxsents = story_maxsents
    if maxlen is None:
        maxlen = story_maxlen
    padded = [np.concatenate([it, np.zeros((maxsents - it.shape[0], maxlen), 'int')])
              for it in inputs]
    return np.stack(padded)
inputs_train = stack_inputs(inputs_train)
inputs_test = stack_inputs(inputs_test)

In [None]:
inputs_train.shape, inputs_test.shape

In [None]:
print( queries_train.shape)

queries_train = queries_train[:,np.newaxis,:]
queries_test = queries_test[:,np.newaxis,:]

queries_train.shape , queries_test.shape

In [None]:
answers_train = answers_train.reshape( (-1) )
answers_test = answers_test.reshape( (-1) ) 

# Single Hop model

In [None]:
tf.reset_default_graph()

# Placeholder widths come from the preprocessing cells so the graph always matches
# the arrays produced for whichever challenge was selected.
story = tf.placeholder( tf.int64 , shape = [None,story_maxsents,story_maxlen] )
quest = tf.placeholder( tf.int64 , shape = [None,1,query_maxlen] )
answer = tf.placeholder( tf.int64 , shape = [None] )
is_training = tf.placeholder( tf.bool , shape = [] )

emb_mat = tf.get_variable( "emb_mat" , shape=[vocab_size,20] , dtype=tf.float32  )
emb_mat_2 = tf.get_variable( "emb_mat_2" , shape=[vocab_size,20] , dtype=tf.float32 )

# Sentence / question embeddings: sum of the word embeddings along the word axis.
z = tf.nn.embedding_lookup( emb_mat , story )
sent_emb1 = tf.reduce_sum( z, axis=2)

y = tf.nn.embedding_lookup( emb_mat , quest )
quest_emb = tf.reduce_sum( y, axis=2)

# Attention over sentences: dot product with the question, softmax along the sentence axis.
u = tf.matmul( sent_emb1 , quest_emb ,transpose_b=True)
weights = tf.nn.softmax(u,dim=1)

# Second embedding of the story, summed per sentence like sent_emb1.
# (This reduction was missing, which left `sent_emb2` undefined below.)
zz = tf.nn.embedding_lookup( emb_mat_2 , story )
sent_emb2 = tf.reduce_sum( zz, axis=2)

# Attention-weighted sum of the second sentence embeddings, plus the question embedding.
score = tf.reduce_sum( weights * sent_emb2 , axis = 1)
score = tf.expand_dims( score , axis = 1) + quest_emb
score = tf.squeeze(score , axis=1 )

ans = tf.layers.dense( score , vocab_size )

loss = tf.losses.softmax_cross_entropy(  tf.one_hot(answer,vocab_size) , ans )

train_op = tf.train.AdamOptimizer().minimize(loss)

# Number of correctly predicted answers in the batch (a count, not a ratio).
correct_indices = tf.argmax( ans , axis = 1)
correct = tf.reduce_sum( tf.cast( tf.equal( correct_indices , answer ) , dtype=tf.float32 ) )

In [None]:
def run_model(q1,q2,q3,session,l,t,c,saver,file="my_model",batch_size=64,epoch=5):
    """Run `epoch` passes over the data in mini-batches and print loss/accuracy per pass.

    q1, q2, q3 -- arrays fed to the notebook-level `story`, `quest` and `answer`
                  placeholders; all three are indexed with the same batch indices.
    session    -- session used for every `run` call and for checkpointing.
    l, t, c    -- fetched each step: loss, the op run for its side effect (train op),
                  and the count of correct predictions in the batch.
    saver      -- saver used to checkpoint after every epoch, or None to skip saving.
    file       -- checkpoint name; written under "tmp/" with the epoch index as global step.

    The sample order is shuffled once, before the first epoch.
    """
    train_indices = np.arange( q1.shape[0] )
    np.random.shuffle( train_indices )

    for e in tqdm_notebook( range(epoch) ,leave=False):

        correct = 0
        losses = []
        for cc in tqdm_notebook( range(int(math.ceil(q1.shape[0]/batch_size))) ,leave=False):

            start_idx = (cc*batch_size)%q1.shape[0]
            idx = train_indices[ start_idx : start_idx+batch_size]

            feed_dict = { story : q1[idx] , quest : q2[idx] , answer : q3[idx] , is_training: True }
            actual_batch_size = q1[idx].shape[0]

            lr, corr, _ = session.run( [l,c,t] , feed_dict=feed_dict )

            # Weight each batch loss by its size so a short final batch does not skew
            # the epoch average (assumes `l` is a per-batch mean).
            losses.append(lr*actual_batch_size)
            correct += corr

        if saver is not None:
            path = "tmp/" + file
            # Save through the session that was passed in, not a notebook-global `sess`.
            saver.save(session , path , global_step = e)

        total_correct = correct/q1.shape[0]
        total_loss = np.sum(losses)/q1.shape[0]
        print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}".format(total_loss,total_correct,e+1))

In [None]:
saver = tf.train.Saver() 
with tf.Session() as sess:
    tf.global_variables_initializer().run()
#     for i in range(10):
#         q,w,za = sess.run([weights,correct,train_op],feed_dict={ story:inputs_train[:5],quest:queries_train[:5],answer:answers_train[:5] })
#         print(q)
#         print(w)
#     saver.restore(sess,"tmp/nlp1-4")
    run_model(inputs_train,queries_train,answers_train,sess,loss,train_op,correct,saver,file="nlp1",batch_size=32)

# multi hop
##### Set `challenge_type` to 'two_supporting_facts_1k' and re-run the preprocessing cells before building this model

In [None]:
inputs_train.shape, inputs_test.shape

In [None]:
queries_train.shape , answers_train.shape

In [None]:
tf.reset_default_graph()
tf.set_random_seed(2)

# Placeholder widths come from the preprocessing cells so the graph always matches
# the arrays produced for whichever challenge was selected.
story = tf.placeholder( tf.int64 , shape = [None,story_maxsents,story_maxlen] )
quest = tf.placeholder( tf.int64 , shape = [None,1,query_maxlen] )
answer = tf.placeholder( tf.int64 , shape = [None] )
is_training = tf.placeholder( tf.bool , shape = [] )

with tf.variable_scope("emb"):
    emb_mat_A = tf.get_variable( "emb_mat_A" , shape=[vocab_size,30] )
    emb_mat_C = tf.get_variable( "emb_mat_C" , shape=[vocab_size,30] )

# Sentence embeddings under A and C, and the question embedding under A:
# each is the sum of word embeddings along the word axis.
z = tf.nn.embedding_lookup( emb_mat_A , story )
sent_emb_A = tf.reduce_sum( z, axis=2)

z = tf.nn.embedding_lookup( emb_mat_C , story )
sent_emb_C = tf.reduce_sum( z, axis=2)

z = tf.nn.embedding_lookup( emb_mat_A , quest )
quest_emb = tf.reduce_sum( z, axis=2)

num_supporting_facts = 2

# A_i = A , for all i
# C_i = C , for all i

for hop in range( num_supporting_facts ):

    # Attention of the current query over the sentences (softmax along the sentence axis).
    weights = tf.matmul( sent_emb_A , quest_emb , transpose_b=True)
    softmax_weights = tf.nn.softmax( weights, dim=1)

    o = tf.reduce_sum( softmax_weights * sent_emb_C,axis=1 , keep_dims=True)

    # The dense layer "H" is created on the first hop and reused on later hops;
    # the next query is H(query) + o.
    check = None if hop==0 else True
    quest_emb = tf.expand_dims( tf.layers.dense(tf.squeeze(quest_emb,axis=1),30,name="H",reuse=check),axis=1 )+o

ans = tf.layers.dense( tf.squeeze( quest_emb , axis =1) , vocab_size )

loss = tf.losses.softmax_cross_entropy(  tf.one_hot(answer,vocab_size) , ans )
train_op = tf.train.RMSPropOptimizer(0.005).minimize(loss)

# Number of correctly predicted answers in the batch (a count, not a ratio).
correct_indices = tf.argmax( ans , axis = 1)
correct = tf.reduce_sum( tf.cast( tf.equal( correct_indices , answer ) , dtype=tf.float32 ) )

In [None]:
saver = tf.train.Saver() 
with tf.Session() as sess:
    tf.global_variables_initializer().run()
#     for i in range(50):
#         q,w = sess.run([correct,train_op],feed_dict={ story:inputs_train[:100],quest:queries_train[:100],answer:answers_train[:100] })
#         print(q)
#     saver.restore(sess,"tmp/nlp_self_paper-19")
    for loop in range(3):
        run_model(inputs_train,queries_train,answers_train,sess,loss,train_op,correct,saver,file="nlp2",batch_size=32,epoch=10)
        print("validation:--")
        run_model(inputs_test,queries_test,answers_test,sess,loss,loss,correct,saver=None,batch_size=32,epoch=1)

#     run_model(inputs_train,queries_train,answers_train,sess,loss,train_op,correct,saver,file="nlp_self_paper",batch_size=32,epoch=20)


# Attention

# Spelling Bee CMU preprocessing

In [3]:
# Read the pronouncing dictionary, keeping only lines that start with an upper-case
# letter. Each kept line is split on a double space into (word, phoneme string).
# The context manager makes sure the file handle is closed after reading.
with open("cmudict-0.7b", encoding='latin1') as cmudict_file:
    lines = [l.strip().split("  ") for l in cmudict_file
             if re.match('^[A-Z]', l)]
lines = [(w, ps.split()) for w, ps in lines]
lines[0], lines[-1]

(('A', ['AH0']), ('ZYWICKI', ['Z', 'IH0', 'W', 'IH1', 'K', 'IY0']))

In [4]:
phonemes = ["_"] + sorted(set(p for w, ps in lines for p in ps))
phonemes[:5]

['_', 'AA0', 'AA1', 'AA2', 'AE0']

In [5]:
len(phonemes)

70

In [6]:
p2i = dict((v, k) for k,v in enumerate(phonemes))
letters = "_abcdefghijklmnopqrstuvwxyz*"
l2i = dict((v, k) for k,v in enumerate(letters))

In [7]:
maxlen=15
pronounce_dict = {w.lower(): [p2i[p] for p in ps] for w, ps in lines if (5<=len(w)<=maxlen) and re.match("^[A-Z]+$", w)}
len(pronounce_dict)

108006

In [8]:
maxlen_p = max([len(v) for k,v in pronounce_dict.items()])

## maxlen is the maximum word length (in letters) and maxlen_p is the maximum pronunciation length (in phonemes)

In [9]:
maxlen_p , maxlen

(16, 15)

In [10]:
pairs = np.random.permutation(list(pronounce_dict.keys()))
n = len(pairs)
input_ = np.zeros((n, maxlen_p), np.int32)
labels_ = np.zeros((n, maxlen), np.int32)

for i, k in enumerate(pairs):
    for j, p in enumerate(pronounce_dict[k]): input_[i][j] = p
    for j, letter in enumerate(k): labels_[i][j] = l2i[letter]

In [11]:
go_token = l2i["*"]
# Decoder input = the labels shifted right by one step with the go token in front.
# np.full keeps the labels' integer dtype; np.ones(...) * go_token would promote
# the whole concatenated array to float64.
dec_input_ = np.concatenate([np.full((n, 1), go_token, dtype=labels_.dtype), labels_[:,:-1]], axis=1)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
(input_train, input_test, labels_train, labels_test, dec_input_train, dec_input_test
    ) = train_test_split(input_, labels_, dec_input_, test_size=0.1)

In [14]:
input_train.shape , labels_train.shape

((97205, 16), (97205, 15))

In [15]:
dec_input_train.shape , labels_train.shape

((97205, 15), (97205, 15))

In [16]:
input_vocab_size, output_vocab_size = len(phonemes), len(letters)
input_vocab_size, output_vocab_size

(70, 28)

#### without teacher forcing
https://www.quora.com/What-is-the-teacher-forcing-in-RNN

In [19]:
dim = 240

In [1]:
tf.reset_default_graph()

# Placeholder widths follow the preprocessing cells: maxlen_p phoneme ids in,
# maxlen letter ids out (the rest of this cell already relies on maxlen).
X = tf.placeholder( tf.int64 , shape=[None,maxlen_p] )
Y = tf.placeholder( tf.int64 , shape=[None,maxlen] )

emb_mat = tf.get_variable( "emb_mat" , shape=[input_vocab_size,120] )

emb_x = tf.nn.embedding_lookup( emb_mat , X )

# The encoder RNNs below run time-major, so move the time axis to the front.
emb_x_time_major = tf.transpose( emb_x , [1,0,2])

cell_fw = tf.contrib.rnn.BasicLSTMCell(dim)
cell_bw = tf.contrib.rnn.BasicLSTMCell(dim)

bi_rnn_out , bi_rnn_state = tf.nn.bidirectional_dynamic_rnn( cell_fw , cell_bw , emb_x_time_major , 
                                                            time_major=True, dtype=tf.float32)
bi_rnn_out = tf.concat(bi_rnn_out, 2)

cell_second_layer = tf.contrib.rnn.BasicLSTMCell(dim)
rnn_out , rnn_state = tf.nn.dynamic_rnn(cell_second_layer , bi_rnn_out,
                                        time_major=True, dtype=tf.float32)
# Output of the last time step (time is axis 0 here).
rnn_out_last = rnn_out[-1]

# The decoder input is just the final context vector repeated once per output step (maxlen).
# Ideally we would concatenate the context vector at each time step with the
# previous time step's predicted output, starting from the start token
# concatenated with the context vector, but that is quite inconvenient
# (if not impossible) to write in TF.

repeat_rnn_last = tf.stack( [rnn_out_last]*maxlen , axis=1 )

last_cells = tf.contrib.rnn.BasicLSTMCell(dim)

final_out , final_out_state = tf.nn.dynamic_rnn( last_cells ,  repeat_rnn_last , dtype=tf.float32 , scope="rnn2")

result = tf.layers.dense( final_out , output_vocab_size )

loss = tf.losses.softmax_cross_entropy(  tf.one_hot( Y,output_vocab_size ) , result )
train_op = tf.train.AdamOptimizer().minimize(loss)

# Matching output positions summed over the batch, divided by maxlen.
correct_indices = tf.argmax( result , axis = 2 )
correct = tf.reduce_sum( tf.cast( tf.equal( correct_indices , Y ) , dtype=tf.float32 ) ) / maxlen

NameError: name 'tf' is not defined

In [77]:
def run_model(q1,q2,q3,session,l,t,c,saver,file="my_model",batch_size=64,epoch=5):
    """Run `epoch` passes over the data in mini-batches and print loss/accuracy per pass.

    q1, q2, q3 -- arrays fed to the notebook-level `X`, `Y` and `X_y` placeholders;
                  all three are indexed with the same batch indices. `seq_len` is
                  fed a constant 15 for every sample.
    session    -- session used for every `run` call and for checkpointing.
    l, t, c    -- fetched each step: loss, the op run for its side effect (train op),
                  and the accuracy tensor accumulated over the batch.
    saver      -- saver used to checkpoint after every epoch, or None to skip saving.
    file       -- checkpoint name; written under "tmp/" with the epoch index as global step.

    Batches are taken in dataset order; this variant does not shuffle.
    """
    train_indices = np.arange( q1.shape[0] )

    for e in tqdm_notebook( range(epoch) ,leave=False):

        correct = 0
        losses = []
        for cc in tqdm_notebook( range(int(math.ceil(q1.shape[0]/batch_size))) ,leave=False):

            start_idx = (cc*batch_size)%q1.shape[0]
            idx = train_indices[ start_idx : start_idx+batch_size]

            feed_dict = { X : q1[idx] , Y : q2[idx] , X_y : q3[idx] , seq_len : [15]*len(idx) }
            actual_batch_size = q1[idx].shape[0]

            lr, corr, _ = session.run( [l,c,t] , feed_dict=feed_dict )

            # Weight each batch loss by its size so a short final batch does not skew
            # the epoch average (assumes `l` is a per-batch mean).
            losses.append(lr*actual_batch_size)
            correct += corr

        if saver is not None:
            path = "tmp/" + file
            # Save through the session that was passed in, not a notebook-global `sess`.
            saver.save(session , path , global_step = e)

        total_correct = correct/q1.shape[0]
        total_loss = np.sum(losses)/q1.shape[0]
        print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}".format(total_loss,total_correct,e+1))

In [None]:
saver = tf.train.Saver() 
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    # NOTE(review): this call passes two data arrays followed by `sess`, while the
    # run_model definitions in this notebook take three data arrays (q1, q2, q3)
    # before the session, so the positional arguments look shifted by one.
    # Confirm which run_model this cell was meant to call.
    run_model(input_test,labels_test,sess,loss,train_op,correct,saver,file="nlp_pro",batch_size=512,epoch=1)
    
#     saver.restore(sess,"tmp/nlp_pro-3")
#     for i in range(10):
#     q = sess.run( correct_indices ,feed_dict={ X:input_train[:1000],Y:labels_train[:1000] })
#     print( np.sum( np.sum( np.equal(q,labels_train[:1000] ) ,axis=1) == 15) )

#     print( ''.join( [letters[i] for i in a] ) , ''.join( [letters[i] for i in b] ) )
#         print(q,w)
        
#     saver.restore(sess,"tmp/nlp_pro-3")    

# Attention

The best way to understand the encoder decoder model is to first study 

# Neural Machine Translation by Jointly Learning to Align and Translate
https://arxiv.org/abs/1409.0473
# Chris manning's nlp course
https://www.youtube.com/watch?v=IxQtK2SjWWM&list=PL3FW7Lu3i5Jsnh1rnUwq_TcylNr7EkRe6&index=11
# And by far the best video I have ever come across, by my hero Quoc V. Le
https://www.youtube.com/watch?v=G5RY_SUJih4

# It is also well worth studying just the README of the NMT implementation by Google; I later replicated most of their work below in a neater form that is much easier to understand

https://github.com/tensorflow/nmt/blob/master/README.md

But it is best to experiment yourself with why creating two separate graphs is better than mixing everything up in one.
# One way to convince yourself that we can save from one graph and load into another: print the trainable variables in both cases and observe that they are identical, so it is correct to restore the checkpoint into the later graph

In [73]:
dim = 11

In [17]:
from tensorflow.python.layers.core import Dense

In [79]:
tf.reset_default_graph()

# Placeholder widths follow the preprocessing cells: maxlen_p phoneme ids for the
# encoder input, maxlen letter ids for the decoder input and the labels.
X = tf.placeholder( tf.int32 , shape=[None,maxlen_p] )
X_y = tf.placeholder( tf.int32 , shape=[None,maxlen] )
seq_len = tf.placeholder( tf.int32 , shape=[None] )

Y = tf.placeholder( tf.int32 , shape=[None,maxlen] )

emb_mat = tf.get_variable( "emb_mat" , shape=[input_vocab_size,120] )
emb_out = tf.get_variable( "emb_out" , shape=[output_vocab_size,120] )

emb_x = tf.nn.embedding_lookup( emb_mat , X )
emb_x_y = tf.nn.embedding_lookup( emb_out , X_y )

with tf.name_scope("Encoder"):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(dim)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(dim)

    enc_rnn_out , enc_rnn_state = tf.nn.bidirectional_dynamic_rnn( cell_fw , cell_bw , emb_x , dtype=tf.float32)
    # Concatenate forward and backward outputs along the feature axis (2*dim features).
    enc_rnn_out = tf.concat(enc_rnn_out, 2)

    # Merge the forward/backward final states into one LSTM state of width 2*dim,
    # which is the width of the decoder cell built in the next cell.
    c = tf.concat([enc_rnn_state[0][0],enc_rnn_state[1][0]],axis=1)
    h = tf.concat([enc_rnn_state[0][1],enc_rnn_state[1][1]],axis=1)

    enc_rnn_state = tf.contrib.rnn.LSTMStateTuple(c,h)

In [80]:
with tf.name_scope("Decoder"):
    
    mem_units = 2*dim
    dec_cell = tf.contrib.rnn.BasicLSTMCell( 2*dim )

    attn_mech = tf.contrib.seq2seq.BahdanauAttention( num_units = mem_units,  memory = enc_rnn_out, normalize=True)
    attn_cell = tf.contrib.seq2seq.AttentionWrapper( cell = dec_cell,attention_mechanism = attn_mech ) 

    # Start from the wrapper's zero state, with the LSTM cell state replaced by the
    # merged encoder final state.
    batch_size = tf.shape(enc_rnn_out)[0]
    initial_state = attn_cell.zero_state( batch_size = batch_size , dtype=tf.float32 )
    initial_state = initial_state.clone(cell_state = enc_rnn_state)

    # One projection layer shared by the training and inference decoders.
    out_layer = Dense( output_vocab_size )
    
    with tf.variable_scope("Training"):
    
        # Teacher forcing: the decoder reads the embedded, right-shifted labels.
        helper = tf.contrib.seq2seq.TrainingHelper( inputs = emb_x_y , sequence_length = seq_len )
        decoder = tf.contrib.seq2seq.BasicDecoder( cell = attn_cell, helper = helper, initial_state = initial_state ,output_layer=out_layer ) 
        outputs, final_state, final_sequence_lengths= tf.contrib.seq2seq.dynamic_decode(decoder=decoder,impute_finished=True)

        training_logits = tf.identity(outputs.rnn_output )
        training_pred = tf.identity(outputs.sample_id )
    
    with tf.variable_scope("Inference"):
        
        # Token ids are looked up in the letter vocabulary instead of being hard-coded:
        # "*" is the go token prepended to the decoder input and "_" is the padding
        # letter, which doubles as the end token.
        start_tokens = tf.tile(tf.constant([l2i["*"]], dtype=tf.int32), [batch_size], name='start_tokens')
        end_token = l2i["_"]
        
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(emb_out,start_tokens, end_token)
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell,inference_helper,initial_state, out_layer)
        inference_out,_,__ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,impute_finished=True,maximum_iterations=maxlen)
        
        inference_logits = inference_out.rnn_output 
        inference_pred = inference_out.sample_id 
        
with tf.name_scope("Training_op"):
    
    loss = tf.losses.softmax_cross_entropy(  tf.one_hot( Y,output_vocab_size ) , training_logits )
    train_op = tf.train.AdamOptimizer().minimize(loss)

    # Matching output positions summed over the batch, divided by maxlen.
    correct = tf.reduce_sum( tf.cast( tf.equal( training_pred , Y ) , dtype=tf.float32 ) ) / maxlen

with tf.name_scope("Inference_op"):
    
    correct_inference = tf.reduce_sum( tf.cast( tf.equal( inference_pred , Y ), dtype=tf.float32 ) ) / maxlen 

In [22]:
for i in tf.get_collection("trainable_variables"):
    print(i)

<tf.Variable 'emb_mat:0' shape=(70, 120) dtype=float32_ref>
<tf.Variable 'emb_out:0' shape=(28, 120) dtype=float32_ref>
<tf.Variable 'bidirectional_rnn/fw/basic_lstm_cell/kernel:0' shape=(131, 44) dtype=float32_ref>
<tf.Variable 'bidirectional_rnn/fw/basic_lstm_cell/bias:0' shape=(44,) dtype=float32_ref>
<tf.Variable 'bidirectional_rnn/bw/basic_lstm_cell/kernel:0' shape=(131, 44) dtype=float32_ref>
<tf.Variable 'bidirectional_rnn/bw/basic_lstm_cell/bias:0' shape=(44,) dtype=float32_ref>
<tf.Variable 'memory_layer/kernel:0' shape=(22, 22) dtype=float32_ref>
<tf.Variable 'Training/decoder/attention_wrapper/basic_lstm_cell/kernel:0' shape=(164, 88) dtype=float32_ref>
<tf.Variable 'Training/decoder/attention_wrapper/basic_lstm_cell/bias:0' shape=(88,) dtype=float32_ref>
<tf.Variable 'Training/decoder/attention_wrapper/bahdanau_attention/query_layer/kernel:0' shape=(22, 22) dtype=float32_ref>
<tf.Variable 'Training/decoder/attention_wrapper/bahdanau_attention/attention_v:0' shape=(22,) dtyp

In [81]:
saver = tf.train.Saver()
with tf.Session() as sess:
    tf.global_variables_initializer().run()
#     saver.restore(sess,'tmp/nlp_seq2seq_final-0')
    run_model( input_train,labels_train,dec_input_train,sess,loss,train_op,correct,saver,file="nlp_sof")
#     q,w = sess.run( [correct_inference,inference_pred] ,feed_dict={ X:input_test[:1000],Y:labels_test[:1000] })
#     print(q)
#     print( np.sum( np.sum( np.equal(w,labels_test[:1000] ) ,axis=1) == 15) )
#     print(w[:10])
#     print( ''.join( [letters[i] for i in a] ) , ''.join( [letters[i] for i in b] ) )
#     print(q,w)
        
    
#     for i in range(100):
#         feed_dict = { X:input_train[:10] , X_y:dec_input_train[:10] , Y:labels_train[:10] , seq_len:[15]*10 }
#         q,_,__ = sess.run([loss,correct,train_op] , feed_dict=feed_dict )
#         print( q,_ )
#         if i==99:
#             print( sess.run( [training_pred] , feed_dict=feed_dict ) )

Epoch 1, Overall loss = 3.33 and accuracy of 0.016


Epoch 2, Overall loss = 3.29 and accuracy of 0.133


Epoch 3, Overall loss = 3.25 and accuracy of 0.471


Epoch 4, Overall loss = 3.21 and accuracy of 0.476


Epoch 5, Overall loss = 3.17 and accuracy of 0.476


In [91]:
w[:10]

array([[22,  5, 18,  9, 20, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [23,  9, 12,  9, 19, 20,  1, 14,  0,  0,  0,  0,  0,  0,  0],
       [19, 21, 20, 20,  4, 15, 15, 14,  0,  0,  0,  0,  0,  0,  0],
       [12,  1,  3, 20, 20, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  5, 12,  3, 14,  5, 18,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  9,  9,  3, 14,  9,  1, 14, 19,  0,  0,  0,  0,  0,  0],
       [ 6, 15, 13,  9, 14, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  1,  2, 18,  5,  3, 11,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 2,  1, 14,  7, 15, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  5, 18, 19, 20,  9, 14, 14,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

In [92]:
for a,b in zip( w[:100] , labels_test[:100] ):
    k,l = ''.join( [letters[i] for i in a] ) , ''.join( [letters[i] for i in b] ) 
    print(  k,l,k==l )
        

veritt_________ verit__________ False
wilistan_______ williston______ False
suttdoon_______ southdown______ False
lacttr_________ lacter_________ False
celcner________ kelchner_______ False
ciicnians______ chechnyans_____ False
fominn_________ foaming________ False
labreck________ labrecque______ False
bangoo_________ bongo__________ False
gerstinn_______ gerstein_______ False
geaaroo________ geoffroy_______ False
lahhi__________ layhee_________ False
swadd__________ swayed_________ False
tinnn__________ tingen_________ False
pilts__________ piltz__________ False
vanalstinn_____ vanalstine_____ False
aamarck________ avmark_________ False
garder_________ gaarder________ False
sferical_______ spherical______ False
palppttting____ palpitating____ False
menn___________ menia__________ False
prlaactan______ prolactin______ False
storms_________ stormes________ False
sider__________ sidor__________ False
mmprttntlly____ importantly____ False
bottnatt_______ boitnott_______ False
plattinn____

# A clever way to implement two separate graphs with the same trainable variables

The thing I was stuck on for a long time: for beam search decoding we are supposed to tile the encoder outputs and states `beam_width` times, but we are not supposed to do so with the start tokens. This is really confusing and lacks consistency and clarity; the wrapper functions should do the tiling of the encoder outputs and states themselves. Because of this I was stuck and wrongly assumed that the attention variables would be different, which wasn't the case. Once someone pointed it out in the GitHub issue I had posted, I was able to see the mistake.

# tensorflow / nmt replicated in a simple way

### Greedy decoding and beam search with beam width one are equivalent

Some obvious pointers, which are not so obvious to most of us:

Beam search only makes sense at inference time.
Using a bidirectional LSTM on the decoder side doesn't make sense to me.
##### For the decoder we can use the encoder's final state as the initial state, but that doesn't improve performance, as mentioned by Chris Manning in CS224n lecture 10, so most of the figures shown for seq2seq models are misleading
eg , 
https://github.com/tensorflow/nmt/blob/master/nmt/g3doc/img/seq2seq.jpg
### its better you see the models as,
https://syncedreview.com/2017/07/11/massive-exploration-of-neural-machine-translation-architectures/

https://devblogs.nvidia.com/parallelforall/introduction-neural-machine-translation-gpus-part-2/

In [57]:
def get_placeholders():
    """Create the int32 placeholders for one seq2seq graph in the current default graph.

    Widths come from the preprocessing cells (`maxlen_p`, `maxlen`) instead of
    being hard-coded.

    Returns (X, Y, X_y, seq_len):
      X       -- [batch, maxlen_p] encoder input ids
      Y       -- [batch, maxlen] target ids
      X_y     -- [batch, maxlen] decoder input ids
      seq_len -- [batch] sequence lengths
    """
    X = tf.placeholder( tf.int32 , shape=[None,maxlen_p] )
    X_y = tf.placeholder( tf.int32 , shape=[None,maxlen] )
    seq_len = tf.placeholder( tf.int32 , shape=[None] )

    Y = tf.placeholder( tf.int32 , shape=[None,maxlen] )

    return ( X , Y , X_y , seq_len )

In [59]:
def _decoder( encoder_outputs , encoder_state ,source_sequence_length , batch_size , mode , beam_width ):
    """Build the attention-wrapped decoder cell and its initial state.

    In "infer" mode the encoder outputs, the source lengths and the encoder
    state are each tiled `beam_width` times with `tile_batch`, and the batch
    size used for the zero state is multiplied by `beam_width`. In any other
    mode they are used as given.

    Returns (cell, decoder_initial_state): a BasicLSTMCell of 2*dim units
    wrapped in an AttentionWrapper over Bahdanau attention, and that cell's
    zero state with its `cell_state` replaced by the (possibly tiled) encoder state.
    """
    attn_units = 2*dim
    tile = tf.contrib.seq2seq.tile_batch

    attn_memory = encoder_outputs
    attn_lengths = source_sequence_length
    start_state = encoder_state
    state_batch = batch_size

    if mode == "infer":
        attn_memory = tile( encoder_outputs, multiplier=beam_width )
        attn_lengths = tile( source_sequence_length, multiplier=beam_width )
        start_state = tile( encoder_state, multiplier=beam_width )
        state_batch = batch_size * beam_width

    attention = tf.contrib.seq2seq.BahdanauAttention( num_units = attn_units,
                                                      memory = attn_memory,
                                                      normalize = True,
                                                      memory_sequence_length = attn_lengths )

    wrapped_cell = tf.contrib.seq2seq.AttentionWrapper( tf.contrib.rnn.BasicLSTMCell( 2*dim ),
                                                        attention,
                                                        attention_layer_size = attn_units,
                                                        name = "attention" )

    zero_state = wrapped_cell.zero_state( state_batch, tf.float32 )
    return wrapped_cell, zero_state.clone( cell_state = start_state )

In [87]:
def Decoder( mode , enc_rnn_out , enc_rnn_state , seq_len ,emb_x_y , emb_out):
    """Build the decoder under the "Decoder" variable scope.

    mode          -- "train" builds a teacher-forced BasicDecoder; any other value
                     builds a BeamSearchDecoder with beam width 3.
    enc_rnn_out   -- encoder outputs, used as attention memory.
    enc_rnn_state -- encoder final state, used as the decoder's initial cell state.
    seq_len       -- lengths given to TrainingHelper and, via `_decoder`, to the
                     attention mechanism as the memory sequence length.
    emb_x_y       -- embedded decoder inputs (train mode only).
    emb_out       -- output-side embedding matrix (used by beam search).

    Returns (logits, sample_ids). In train mode these are the decoder's
    `rnn_output` and `sample_id`; otherwise `logits` is a `tf.no_op()` and
    `sample_ids` is the beam search `predicted_ids`.
    """
    with tf.variable_scope("Decoder") as decoder_scope:

        out_layer = Dense( output_vocab_size )
        batch_size = tf.shape(enc_rnn_out)[0]
        beam_width = 3

        # NOTE(review): seq_len doubles as the source (memory) length here and as
        # the decoder length below - confirm that is intended.
        cell , initial_state = _decoder( enc_rnn_out ,enc_rnn_state , seq_len , batch_size , mode , beam_width )

        if mode == "train":

            helper = tf.contrib.seq2seq.TrainingHelper( inputs = emb_x_y , sequence_length = seq_len )
            decoder = tf.contrib.seq2seq.BasicDecoder( cell = cell, helper = helper, initial_state = initial_state,output_layer=out_layer) 
            outputs, final_state, final_sequence_lengths= tf.contrib.seq2seq.dynamic_decode(decoder=decoder,
                                                                                        maximum_iterations=maxlen,
                                                                                           scope=decoder_scope)

            logits = outputs.rnn_output
            sample_ids = outputs.sample_id

        else:

            # Token ids are looked up in the letter vocabulary instead of being
            # hard-coded: "*" is the go token and "_" (padding) is the end token.
            # The start tokens use the un-tiled batch size; only the encoder outputs
            # and state are tiled per beam (inside _decoder).
            start_tokens = tf.tile(tf.constant([l2i["*"]], dtype=tf.int32), [ batch_size ] )
            end_token = l2i["_"]

            my_decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell = cell,
                                                               embedding = emb_out,
                                                               start_tokens = start_tokens,
                                                               end_token = end_token,
                                                               initial_state = initial_state,
                                                               beam_width = beam_width,
                                                               output_layer = out_layer )

            outputs, t1 , t2 = tf.contrib.seq2seq.dynamic_decode(  my_decoder,
                                                                   maximum_iterations=maxlen,scope=decoder_scope )

            # No logits are returned from beam search; a no-op keeps the return shape.
            logits = tf.no_op()
            sample_ids = outputs.predicted_ids
        
    return logits , sample_ids

In [100]:
def construct_graph(mode,inp):
    """Build embeddings, the bidirectional encoder and the decoder for one mode.

    mode -- "train" adds the loss, an Adam train op and an accuracy tensor;
            any other value builds the decoder only.
    inp  -- the (X, Y, X_y, seq_len) placeholder tuple.

    Returns (train_op, loss, correct, sample_ids, logits); the first three are
    None when mode is not "train".
    """
    X , Y , X_y , seq_len = inp

    emb_mat = tf.get_variable( "emb_mat" , shape=[input_vocab_size,120] )
    emb_out = tf.get_variable( "emb_out" , shape=[output_vocab_size,120] )

    emb_x = tf.nn.embedding_lookup( emb_mat , X )
    emb_x_y = tf.nn.embedding_lookup( emb_out , X_y )

    with tf.name_scope("Encoder"):
        fw_cell = tf.contrib.rnn.BasicLSTMCell(dim)
        bw_cell = tf.contrib.rnn.BasicLSTMCell(dim)

        (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            fw_cell , bw_cell , emb_x , dtype=tf.float32)

        # Forward and backward halves are joined along the feature axis.
        enc_rnn_out = tf.concat([out_fw, out_bw], 2)
        enc_rnn_state = tf.contrib.rnn.LSTMStateTuple(
            tf.concat([state_fw[0], state_bw[0]], axis=1),
            tf.concat([state_fw[1], state_bw[1]], axis=1))

    logits , sample_ids = Decoder(mode, enc_rnn_out , enc_rnn_state , seq_len , emb_x_y, emb_out)

    if mode != "train":
        # Inference graph: nothing to optimise and no accuracy tensor is built.
        return None , None , None , sample_ids , logits

    loss = tf.losses.softmax_cross_entropy(  tf.one_hot( Y,output_vocab_size ) , logits )
    train_op = tf.train.AdamOptimizer().minimize(loss)

    # Matching output positions summed over the batch, divided by maxlen.
    matches = tf.cast( tf.equal( sample_ids , Y ) , dtype=tf.float32 )
    correct = tf.reduce_sum( matches ) / maxlen

    return train_op , loss , correct , sample_ids , logits

In [28]:
dim = 11

In [85]:
def run_model(inp,q1,q2,q3,session,l,t,c,saver,file="my_model",batch_size=64,epoch=5):
    """Run `epoch` passes over the data in mini-batches and print loss/accuracy per pass.

    inp        -- placeholder tuple; inp[0..2] are fed q1, q2, q3 and inp[3] is fed
                  a constant 15 for every sample.
    session    -- session used for every `run` call and for checkpointing.
    l, t, c    -- fetched each step: loss, the op run for its side effect (train op),
                  and the accuracy tensor accumulated over the batch.
    saver      -- saver used to checkpoint after every epoch, or None to skip saving.
    file       -- checkpoint name; written under "tmp/" with the epoch index as global step.

    Batches are taken in dataset order (no shuffling).
    """
    n_samples = q1.shape[0]
    order = np.arange( n_samples )

    for e in tqdm_notebook( range(epoch) ,leave=False):

        correct_total = 0
        weighted_losses = []
        n_batches = int(math.ceil(n_samples/batch_size))

        for step in tqdm_notebook( range(n_batches) ,leave=False):

            lo = (step*batch_size) % n_samples
            batch = order[ lo : lo+batch_size ]

            feeds = { inp[0]: q1[batch] , inp[1]: q2[batch] , inp[2]: q3[batch] , inp[3]: [15]*len(batch) }

            batch_loss, batch_correct, _ = session.run( [l,c,t] , feed_dict=feeds )

            # Scale by the number of rows in this batch so the epoch figure is a
            # per-sample average even when the last batch is short.
            weighted_losses.append( batch_loss*q1[batch].shape[0] )
            correct_total += batch_correct

        if saver is not None:
            saver.save(session , "tmp/" + file , global_step = e)

        epoch_accuracy = correct_total/n_samples
        epoch_loss = np.sum(weighted_losses)/n_samples
        print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}".format(epoch_loss,epoch_accuracy,e+1))

In [105]:
tf.reset_default_graph()

# Two separate graphs are built from the same construct_graph code: one for
# training, one for beam-search inference. A checkpoint written from the
# training graph is restored into the inference graph.
train_graph = tf.Graph()
infer_graph = tf.Graph()

with train_graph.as_default():
    
    train_input = get_placeholders()
    train_op, loss , correct,sample_ids,logits = construct_graph("train",train_input)
    initializer = tf.global_variables_initializer()
    train_saver = tf.train.Saver()
    
with infer_graph.as_default():
    
    infer_input = get_placeholders()
    _ , _ , correct_test , pred_ids,_ = construct_graph("infer",infer_input)
    infer_saver = tf.train.Saver()
    
train_sess = tf.Session(graph=train_graph)
infer_sess = tf.Session(graph=infer_graph)

train_sess.run(initializer)

ckpt_name = "nlp_nmt"
n_epochs = 3
run_model( train_input,input_train,labels_train,dec_input_train,train_sess,loss,train_op,correct,train_saver,file=ckpt_name,epoch=n_epochs)

# run_model checkpoints to "tmp/<file>-<epoch index>", so the last epoch's checkpoint
# is numbered n_epochs-1. Deriving the path keeps it in sync with the epoch count.
infer_saver.restore(infer_sess, "tmp/{}-{}".format(ckpt_name, n_epochs-1) )

n_eval = 100
q = infer_sess.run( pred_ids,feed_dict={ infer_input[0]:input_test[:n_eval],infer_input[1]:labels_test[:n_eval],infer_input[2]:dec_input_test[:n_eval],infer_input[3]:[15]*n_eval })
# print(q)
# print(w)

Epoch 1, Overall loss = 1.11 and accuracy of 0.683


Epoch 2, Overall loss = 0.463 and accuracy of 0.864


Epoch 3, Overall loss = 0.34 and accuracy of 0.896
INFO:tensorflow:Restoring parameters from tmp/nlp_nmt-2


In [109]:
z = np.transpose( q,(2,0,1) )
z[1][:10]

array([[ 2, 12,  5, 13,  5, 19,  8,  0, -1, -1, -1, -1, -1, -1],
       [26,  9,  9, 18,  9, 14, 19,  0, -1, -1, -1, -1, -1, -1],
       [21, 14,  4,  5, 18, 11, 15,  5, 20,  0, -1, -1, -1, -1],
       [ 1, 12,  2, 21, 13,  0, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 5,  7, 26, 21, 12, 20,  5,  4,  0, -1, -1, -1, -1, -1],
       [14, 15, 21, 20, 18,  1, 20,  9, 22,  5,  0, -1, -1, -1],
       [ 2,  5,  9,  3, 15, 14, 15, 18,  0, -1, -1, -1, -1, -1],
       [19,  8, 15,  1,  2,  0, -1, -1, -1, -1, -1, -1, -1, -1],
       [11, 15, 14,  6, 15, 18, 13,  0, -1, -1, -1, -1, -1, -1],
       [13,  9, 14, 14,  9, 22,  1, 14,  0, -1, -1, -1, -1, -1]], dtype=int32)

In [110]:
for cc in range(3):
    for a,b in zip(z[cc],labels_test[:100]):
        print( ''.join( [letters[i] for i in a] ) , ''.join( [letters[i] for i in b] ) )
    

blemish_****** blemish________
ziirans_****** zaireans_______
undercote_**** undercoat______
albam_******** album__________
exulted_****** exulted________
nutrative_**** nutritive______
bichonor_***** baikonur_______
shoba_******** shobe__________
conform_****** conform________
minivan_****** minivan________
troiano_****** troiano________
munic_******** munich_________
saliont_****** salient________
jalif_******** jolliff________
padolski_***** podolsky_______
pablisity_**** publicity______
garfield_***** garfield_______
padle_******** paddle_________
goberstin_**** goberstein_____
churchous_**** churchhouse____
pichan_******* pichon_________
marmer_******* marmor_________
indistinked_** indistinct_____
bottomfish_*** bottomfish_____
meramontes_*** miramontes_____
taveras_****** taveras________
astentatious_* ostentatious___
steage_******* steege_________
zipphal_****** zipfel_________
show_********* schow__________
sudennas_***** suddenness_____
barkes_******* barkus_________
furino_*

## English is a hard language to read and pronounce because spelling and pronunciation differ so much; I suspect these results would be much better for my mother tongues, Hindi / Nepali :p 