## NMT for English to Hindi

In [72]:
import os
import tensorflow as tf
import random
import numpy as np
import math
from tensorflow.python.layers.core import Dense

### Loading data

In [73]:
english = []
hindi = []
with open(os.path.join('hin-eng', 'hin.txt')) as f:
    for line in f:
        eng, hin = line.split('\t')
        # hindi is having a newline character '\n' at the end so excluding it
        hin = hin[:-1]
        english.append(eng.lower())
        hindi.append(hin)
        
    assert len(english) == len(hindi)
    
    print(english[:10])
    print(hindi[:10])
    
    print('\nlength of english sentences:', len(english))
    print('length of hindi sentences:', len(hindi))

['wow!', 'help!', 'jump.', 'jump.', 'jump.', 'hello!', 'hello!', 'cheers!', 'cheers!', 'got it?']
['वाह!', 'बचाओ!', 'उछलो.', 'कूदो.', 'छलांग.', 'नमस्ते।', 'नमस्कार।', 'वाह-वाह!', 'चियर्स!', 'समझे कि नहीं?']

length of english sentences: 2867
length of hindi sentences: 2867


### Getting the length of unique sentences

In [74]:
unique_eng_sentences = set(english)
unique_hin_sentences = set(hindi)

print(len(unique_eng_sentences))
print(len(unique_hin_sentences))

2660
2788


### Creating the dictionary

In [75]:
eng_words = []
hin_words = []

for sent in english:
    sentence = sent.split(' ')
    eng_words.extend(sentence)

unq_eng_words = set(eng_words)
print('unique english words :', len(unq_eng_words))

for sent in hindi:
    sentence = sent.split(' ')
    hin_words.extend(sentence)

unq_hin_words = set(hin_words)
print('unique hindi words:', len(unq_hin_words))

eng_dict = {'<sos>':0, '<eos>':1, '<PAD>':2}

for wrd in unq_eng_words:
    eng_dict[wrd] = len(eng_dict)
rev_eng_dict = dict(zip(eng_dict.values(), eng_dict.keys()))

print('\nenglish dict:\n', list(eng_dict.items())[0:10])
print('\nreverse english dict:\n', list(rev_eng_dict.items())[0:10])

hin_dict = {'<sos>':0, '<eos>':1, '<PAD>':2}

for wrd in unq_hin_words:
    hin_dict[wrd] = len(hin_dict)
    
rev_hin_dict = dict(zip(hin_dict.values(), hin_dict.keys()))

print('\nhindi dict:\n', list(hin_dict.items())[0:10])
print('\nreverse hindi dict:\n', list(rev_hin_dict.items())[0:10])

eng_vocab_size = len(eng_dict)
hin_vocab_size = len(hin_dict) 

print('\neng vocab size:', eng_vocab_size)
print('hin vocab size:', hin_vocab_size)


unique english words : 3307
unique hindi words: 3156

english dict:
 [('street', 3), ('money?', 4), ('entered', 5), ('slowly?', 6), ('empty', 8), ('kilo?', 9), ('get', 10), ('burning.', 1110), ('scolded', 11), ('amateur', 12)]

reverse english dict:
 [(0, '<sos>'), (1, '<eos>'), (2, '<PAD>'), (3, 'street'), (4, 'money?'), (5, 'entered'), (6, 'slowly?'), (7, 'need'), (8, 'empty'), (9, 'kilo?')]

hindi dict:
 [('सुलाना', 8), ('मेज़', 6), ('सुख', 7), ('अजीब', 5), ('मांग', 9), ('इकट्ठा', 10), ('पूरा', 11), ('उसने', 15), ('विदेश', 16), ('तकलीफ', 17)]

reverse hindi dict:
 [(0, '<sos>'), (1, '<eos>'), (2, '<PAD>'), (3, 'थी।'), (4, 'I'), (5, 'अजीब'), (6, 'मेज़'), (7, 'सुख'), (8, 'सुलाना'), (9, 'मांग')]

eng vocab size: 3310
hin vocab size: 3159


### Creating train and test set

In [76]:
test_ind = [random.randint(0, len(english)) for i in range(50)]

english_train = [english[i] for i in range(len(english)) if i not in test_ind]
hindi_train = [hindi[i] for i in range(len(hindi)) if i not in test_ind]

english_test = [english[i] for i in test_ind]
hindi_test = [hindi[i] for i in test_ind]

print('length of english train:', len(english_train))
print('length of hindi train:', len(hindi_train))

print('length of english test:', len(english_test))
print('length of hindi test:', len(hindi_test))

for i in range(10):
    print('\neng:', english_test[i])
    print('hin:', hindi_test[i])

length of english train: 2817
length of hindi train: 2817
length of english test: 50
length of hindi test: 50

eng: what have you come here for?
hin: आप लोग यहाँ क्या करने के लिए आए हैं?

eng: what time does your watch say it is now?
hin: तुम्हारी घड़ी अब क्या समय बता रही है?

eng: both of my sisters are married.
hin: मेरी दोनो बहनें शादीशुदा हैं।

eng: these are our books.
hin: ये हमारी किताबें हैं।

eng: three men broke out of prison yesterday.
hin: कल तीन आदमी जेल से भाग गए।

eng: i heard someone knocking.
hin: मुझे किसी के खटखटाने की आवाज़ सुनाई दी।

eng: he came to see you yesterday.
hin: वह कल तुमसे मिलने आया था।

eng: according to him, she is not coming.
hin: उसके हिसाब से वह नहीं आ रही।

eng: this is the village where my father was born.
hin: यह वह गाँव है जहाँ मेरे पिताजी पैदा हुए थे।

eng: i was waiting for a taxi.
hin: मैं टैक्सी का इंतेज़ार कर रहा था।


### Max no. of words in a sentence

In [77]:
en = []
hi = []

for sent in english:
    sentence = sent.split(' ')
    en.append(sentence)

for sent in hindi:
    sentence = sent.split(' ')
    hi.append(sentence)
    
print('max length of english sentence:', len(max(en, key = len)))
print('max length of hindi sentence:', len(max(hi, key = len)))


max length of english sentence: 22
max length of hindi sentence: 25


### Converting from words to numbers

In [78]:
# +2 for <sos> and <eos>
eng_max_len = len(max(en, key = len)) + 2
hin_max_len = len(max(hi, key = len)) + 2

train_input = []
train_output = []

for eng_sent, hin_sent in zip(english_train, hindi_train):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    train_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    train_output.append(numeric_hin_sent)


test_input = []
test_output = []

for eng_sent, hin_sent in zip(english_test, hindi_test):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    test_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    test_output.append(numeric_hin_sent)



### Checking if every thing is working fine

In [88]:
wrd_sent = []
for i in range(10):
    num_sent = test_input[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(eng_max_len):
        wrd_sent.append(rev_eng_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []

wrd_sent = []    
for i in range(10):
    num_sent = test_input[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(eng_max_len):
        wrd_sent.append(rev_eng_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []


numeric and word sequence:
 [0, 1335, 168, 767, 29, 757, 2991, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'what', 'have', 'you', 'come', 'here', 'for?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 1335, 1238, 1207, 1974, 2561, 1527, 1599, 357, 1174, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'what', 'time', 'does', 'your', 'watch', 'say', 'it', 'is', 'now?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 2740, 2395, 2895, 1497, 110, 866, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'both', 'of', 'my', 'sisters', 'are', 'married.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and

In [89]:
wrd_sent = []
for i in range(10):
    num_sent = test_output[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(hin_max_len):
        wrd_sent.append(rev_hin_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []

wrd_sent = []    
for i in range(10):
    num_sent = test_output[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(hin_max_len):
        wrd_sent.append(rev_hin_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []


numeric and word sequence:
 [0, 1899, 1224, 2876, 3089, 2099, 2140, 3091, 2599, 656, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'आप', 'लोग', 'यहाँ', 'क्या', 'करने', 'के', 'लिए', 'आए', 'हैं?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 1553, 836, 917, 3089, 1091, 1466, 417, 2731, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'तुम्हारी', 'घड़ी', 'अब', 'क्या', 'समय', 'बता', 'रही', 'है?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 263, 357, 2079, 1135, 1586, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'मेरी', 'दोनो', 'बहनें', 'शादीशुदा', 'हैं।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '

### Making train and test i/p and o/p numpy arrays

In [41]:
train_input = np.array(train_input, dtype=np.int32)
train_output = np.array(train_output, dtype=np.int32)

test_input = np.array(test_input,dtype=np.int32)
test_output = np.array(test_output,dtype=np.int32)

### Word embeddings using skip-gram

In [42]:
batch_size = 64
embedding_size = 128

sentences = []
window_size = 1
data = []

num_sampled = 20

for sent in english:
    sentence = sent.split(' ')
    sentences.append(sentence)
    


for sent in sentences:
    for ind, wrd in enumerate(sent):
        for cont_wrd in sent[max(ind - window_size, 0) : min(ind + window_size, len(sent)) + 1]:
            if wrd not in cont_wrd:
                data.append([wrd, cont_wrd])
                
data[0:10]

[['got', 'it?'],
 ['it?', 'got'],
 ["i'm", 'ok.'],
 ['ok.', "i'm"],
 ['come', 'in.'],
 ['in.', 'come'],
 ['get', 'out!'],
 ['out!', 'get'],
 ['go', 'away!'],
 ['away!', 'go']]

In [43]:
data_train = []
data_label = []

for inp in data:
    
    data_train.append(eng_dict[inp[0]])
    data_label.append(eng_dict[inp[1]])

print('data train: {}' .format(data_train[0:10]))
print('data label: {}' .format(data_label[0:10]))

data train: [1455, 434, 2457, 2679, 29, 1769, 10, 89, 394, 2682]
data label: [434, 1455, 2679, 2457, 1769, 29, 89, 10, 2682, 394]


### Define i/p and o/p

In [44]:
tf.reset_default_graph()
train_dataset = tf.placeholder(tf.int32, shape = [batch_size])
train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

### Model Parameters and Other Variables

In [45]:
embeddings = tf.Variable(tf.random_uniform([eng_vocab_size, embedding_size], -1.0, 1.0))

softmax_weights = tf.Variable(tf.truncated_normal([eng_vocab_size, embedding_size],
                                stddev=0.5 / math.sqrt(embedding_size))
                                )
softmax_biases = tf.Variable(tf.random_uniform([eng_vocab_size],0.0,0.01))


### Computations

In [46]:
embed = tf.nn.embedding_lookup(embeddings, train_dataset)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights=softmax_weights, biases=softmax_biases, inputs=embed,
                    labels=train_labels, num_sampled=num_sampled, num_classes=eng_vocab_size)
                    )

### Optimizer

In [47]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

### Running the skip gram algorithm

In [16]:

num_steps = 10000
skip_losses = []

with tf.Session() as session:
    tf.global_variables_initializer().run()
    average_loss = 0
    
    for step in range(num_steps):
        for batch_idx in range(len(data_train) // batch_size):
            
            batch_data = data_train[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_data = np.asarray(batch_data)
            
            batch_label = data_label[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_label = np.asarray(batch_label)
            batch_label = batch_label.reshape((batch_size, 1))
            
            
            feed_dict = {train_dataset: batch_data, train_labels: batch_label}
            _, l = session.run([optimizer, loss], feed_dict = feed_dict)
            
            average_loss += l
            
            if (step + 1) % 200 == 0:
                if step > 0:
                    average_loss / 2000
                skip_losses.append(average_loss)
        if (step + 1) % 200 == 0:        
            print('Average loss at step %d: %f' % (step+1, average_loss))
            average_loss = 0
            
    embeddings_eng = embeddings.eval()

np.save('eng-embeddings.npy', embeddings_eng)


Average loss at step 200: 19205.195217
Average loss at step 400: 15323.026056
Average loss at step 600: 14694.902463
Average loss at step 800: 14464.209200
Average loss at step 1000: 14124.243824
Average loss at step 1200: 13918.376618
Average loss at step 1400: 13726.682820
Average loss at step 1600: 13738.026134
Average loss at step 1800: 13619.683788
Average loss at step 2000: 13598.833608
Average loss at step 2200: 13503.690409
Average loss at step 2400: 13394.978613
Average loss at step 2600: 13423.626555
Average loss at step 2800: 13325.297071
Average loss at step 3000: 13338.561650
Average loss at step 3200: 13290.745211
Average loss at step 3400: 13216.879252
Average loss at step 3600: 13187.814978
Average loss at step 3800: 13183.623336
Average loss at step 4000: 13103.349944
Average loss at step 4200: 13151.529444
Average loss at step 4400: 13082.631723
Average loss at step 4600: 13068.537695
Average loss at step 4800: 13083.604540
Average loss at step 5000: 13072.776033
Aver

### Word embeddings Hindi

In [48]:
batch_size = 64
embedding_size = 128

sentences = []
window_size = 1
data_hindi = []

num_sampled = 20

for sent in hindi:
    sentence = sent.split(' ')
    sentences.append(sentence)
    


for sent in sentences:
    for ind, wrd in enumerate(sent):
        for cont_wrd in sent[max(ind - window_size, 0) : min(ind + window_size, len(sent)) + 1]:
            if wrd not in cont_wrd:
                data_hindi.append([wrd, cont_wrd])
                
data_hindi[0:10]

[['समझे', 'कि'],
 ['कि', 'समझे'],
 ['कि', 'नहीं?'],
 ['नहीं?', 'कि'],
 ['मैं', 'ठीक'],
 ['ठीक', 'मैं'],
 ['ठीक', 'हूँ।'],
 ['हूँ।', 'ठीक'],
 ['बहुत', 'बढ़िया!'],
 ['बढ़िया!', 'बहुत']]

In [49]:
data_hindi_train = []
data_hindi_label = []

for inp in data_hindi:
    
    data_hindi_train.append(hin_dict[inp[0]])
    data_hindi_label.append(hin_dict[inp[1]])

print('data train: {}' .format(data_hindi_train[0:10]))
print('data label: {}' .format(data_hindi_label[0:10]))

data train: [2611, 3069, 3069, 441, 1581, 190, 190, 2834, 1012, 2983]
data label: [3069, 2611, 441, 3069, 190, 1581, 2834, 190, 2983, 1012]


### Define i/p and o/p

In [50]:
tf.reset_default_graph()
train_hindi_dataset = tf.placeholder(tf.int32, shape = [batch_size])
train_hindi_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

### Model parameters and other variables

In [51]:
embeddings_hindi = tf.Variable(tf.random_uniform([hin_vocab_size, embedding_size], -1.0, 1.0))

softmax_weights_hindi = tf.Variable(tf.truncated_normal([hin_vocab_size, embedding_size],
                                stddev=0.5 / math.sqrt(embedding_size))
                                )
softmax_biases_hindi = tf.Variable(tf.random_uniform([hin_vocab_size],0.0,0.01))


### Computations

In [52]:
embed_hindi = tf.nn.embedding_lookup(embeddings_hindi, train_hindi_dataset)

loss_hindi = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights=softmax_weights_hindi, biases=softmax_biases_hindi, inputs=embed_hindi,
                    labels=train_hindi_labels, num_sampled=num_sampled, num_classes=hin_vocab_size)
                    )

### Optimizer

In [53]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss_hindi)

### Running the skip gram loss algorithm for hindi

In [23]:

num_steps = 10000
skip_losses_hindi = []

with tf.Session() as session:
    tf.global_variables_initializer().run()
    average_loss_hindi = 0
    
    for step in range(num_steps):
        for batch_idx in range(len(data_hindi_train) // batch_size):
            
            batch_data_hindi = data_hindi_train[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_data_hindi = np.asarray(batch_data_hindi)
            
            batch_label_hindi = data_hindi_label[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_label_hindi = np.asarray(batch_label_hindi)
            batch_label_hindi = batch_label_hindi.reshape((batch_size, 1))
            
            
            feed_dict_hindi = {train_hindi_dataset: batch_data_hindi, train_hindi_labels: batch_label_hindi}
            _, l_hindi = session.run([optimizer, loss_hindi], feed_dict = feed_dict_hindi)
            
            average_loss_hindi += l_hindi
            
            if (step + 1) % 200 == 0:
                if step > 0:
                    average_loss_hindi / 2000
                skip_losses_hindi.append(average_loss_hindi)
        if (step + 1) % 200 == 0:        
            print('Average loss at step %d: %f' % (step+1, average_loss_hindi))
            average_loss_hindi = 0
            
        embeddings_hin = embeddings_hindi.eval()

np.save('hin-embeddings.npy', embeddings_hin)


Average loss at step 200: 21199.986116
Average loss at step 400: 16691.466200
Average loss at step 600: 15741.420610
Average loss at step 800: 15352.804156
Average loss at step 1000: 15067.859991
Average loss at step 1200: 14828.038417
Average loss at step 1400: 14764.958345
Average loss at step 1600: 14553.244605
Average loss at step 1800: 14485.796032
Average loss at step 2000: 14287.486551
Average loss at step 2200: 14335.235766
Average loss at step 2400: 14216.886235
Average loss at step 2600: 14138.943719
Average loss at step 2800: 14170.188631
Average loss at step 3000: 14045.531303
Average loss at step 3200: 14050.835196
Average loss at step 3400: 13931.026932
Average loss at step 3600: 13968.309216
Average loss at step 3800: 13891.555790
Average loss at step 4000: 13844.941279
Average loss at step 4200: 13868.686502
Average loss at step 4400: 13820.967113
Average loss at step 4600: 13806.205545
Average loss at step 4800: 13781.171436
Average loss at step 5000: 13686.674879
Aver

### NMT using TF seq2seq library

### Data Generations for MT

In [54]:
emb_mat = np.load('eng-embeddings.npy')
input_size = emb_mat.shape[1]

class DataGeneratorMT(object):
    
    def __init__(self,batch_size,num_unroll,is_source, is_train):
        global input_size
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._cursor = [0 for offset in range(self._batch_size)]
        
        
        self._sent_ids = None
        
        self._is_source = is_source
        self._is_train = is_train
                
    def next_batch(self, sent_ids):
        
        
        if self._is_source:
            max_sent_length = eng_max_len
        else:
            max_sent_length = hin_max_len
            
        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size),dtype=np.float32)
        
        
        for b in range(self._batch_size):
            
            sent_id = sent_ids[b]
            
            
            if self._is_source:
                
                if self._is_train:
                    sent_text = train_input[sent_id]
                else:
                    sent_text = test_input[sent_id]
                             
                batch_data[b] = sent_text[self._cursor[b]]
                batch_labels[b] = sent_text[self._cursor[b]+1]
            
            else:
                
                if self._is_train:
                    sent_text = train_output[sent_id]
                else:
                    sent_text = test_output[sent_id]
                
                
                if sent_text[self._cursor[b]]!=hin_dict['<sos>']:
                    batch_data[b] = sent_text[self._cursor[b]]
                else:
                    batch_data[b] = sent_text[self._cursor[b]]
                
                batch_labels[b] = sent_text[self._cursor[b]+1]
            
            self._cursor[b] = (self._cursor[b]+1)%(max_sent_length-1)
             
        return batch_data,batch_labels
        
    def unroll_batches(self,sent_ids):
        
        
        if sent_ids is not None:
            
            self._sent_ids = sent_ids
            
            self._cursor = [0 for _ in range(self._batch_size)]
                
        unroll_data,unroll_labels = [],[]
        
        for ui in range(self._num_unroll):
            
            if self._is_source:
                data, labels = self.next_batch(self._sent_ids)
            else:
                data, labels = self.next_batch(self._sent_ids)
                    
            unroll_data.append(data)
            unroll_labels.append(labels)
        
        return unroll_data, unroll_labels, self._sent_ids
    
    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]
        
dg = DataGeneratorMT(batch_size=5,num_unroll=20,is_source=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Source data')
for _, lbl in zip(u_data,u_labels):
    print([rev_eng_dict[w] for w in lbl.tolist()])

dg = DataGeneratorMT(batch_size=5,num_unroll=20,is_source=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,2,3,4,5])
print('\nTarget data batch')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    print([rev_hin_dict[w] for w in lbl.tolist()])

Source data
['wow!', 'help!', 'jump.', 'jump.', 'jump.']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Target data batch
['वाह!', 'उछलो.', 'कूदो.', 'छलांग.', 'नमस्ते।']
['

### Hyperparameters

In [55]:
vocab_size = 3158
num_units = 128
batch_size = 4
source_sequence_length = 20
target_sequence_length = 22
learning_rate = 0.01

### Defining the TensorFlow inputs and outputs

In [56]:
tf.reset_default_graph()

enc_train_inputs = []
dec_train_inputs = []
dec_train_labels = []

encoder_emb_layer = tf.convert_to_tensor(np.load('eng-embeddings.npy'))
decoder_emb_layer = tf.convert_to_tensor(np.load('hin-embeddings.npy'))

for ui in range(source_sequence_length):
    enc_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))

for ui in range(target_sequence_length):
    dec_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))
    dec_train_labels.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_outputs_%d'%ui))
    
encoder_emb_inp = [tf.nn.embedding_lookup(encoder_emb_layer, src) for src in enc_train_inputs]
encoder_emb_inp = tf.stack(encoder_emb_inp)

decoder_emb_inp = [tf.nn.embedding_lookup(decoder_emb_layer, src) for src in dec_train_inputs]
decoder_emb_inp = tf.stack(decoder_emb_inp)

### Encoder

In [57]:
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)

encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_emb_inp, initial_state=initial_state,
    sequence_length=[source_sequence_length for _ in range(batch_size)], 
    time_major=True, swap_memory=True)

### Decoder

In [58]:
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

projection_layer = Dense(units=vocab_size, use_bias=True)

helper = tf.contrib.seq2seq.TrainingHelper(
    decoder_emb_inp, [target_sequence_length for _ in range(batch_size)], time_major=True)

decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, encoder_state,
        output_layer=projection_layer)

outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, output_time_major=True,
                swap_memory=True
                )

logits = outputs.rnn_output
train_prediction = outputs.sample_id

crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=dec_train_labels, logits=logits)
loss = tf.reduce_mean(crossent)

### Optimizer

In [59]:
optimizer = tf.train.AdamOptimizer(learning_rate)

gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

sess = tf.InteractiveSession()



### Running the Seq2seq NMT

In [60]:
loss_over_time = []
tf.global_variables_initializer().run()

src_word_embeddings = np.load('eng-embeddings.npy')
tgt_word_embeddings = np.load('hin-embeddings.npy')

num_steps = 10001
avg_loss = 0

enc_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=source_sequence_length,is_train=True, is_source=True)
dec_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=target_sequence_length,is_train=True, is_source=False)

for step in range(num_steps):

    
    print('.',end='')
    if (step+1)%100==0:
        print('')
        
    sent_ids = np.random.randint(low=0,high=train_input.shape[0],size=(batch_size))

    eu_data, eu_labels, _ = enc_data_generator.unroll_batches(sent_ids=sent_ids)
    
    du_data, du_labels, _ = dec_data_generator.unroll_batches(sent_ids=sent_ids)
    
    feed_dict = {}
    for ui,(dat,lbl) in enumerate(zip(eu_data,eu_labels)):            
        feed_dict[enc_train_inputs[ui]] = dat                
    
    for ui,(dat,lbl) in enumerate(zip(du_data,du_labels)):            
        feed_dict[dec_train_inputs[ui]] = dat
        feed_dict[dec_train_labels[ui]] = lbl

    
    _,l,tr_pred = sess.run([optimize,loss,train_prediction], feed_dict=feed_dict)
    tr_pred = tr_pred.flatten()
    
    if (step+1)%100==0:  
        
        print('Step ',step+1)

        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '                    
            if rev_hin_dict[w] == '<eos>':
                break
                      
        print(print_str)
        print()
        
        print_str = 'Predicted: '
        for w in tr_pred[::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<eos>':
                break
        print(print_str)
       
        print('\n')
        
        rand_idx = np.random.randint(low=1,high=batch_size)
        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[rand_idx::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<eos>':
                break
        print(print_str)

            
        print()
        print_str = 'Predicted: '
        for w in tr_pred[rand_idx::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<eos>':
                break
        print(print_str)
        print() 
    
    avg_loss += l 
    
    if (step+1)%500==0:
        print('============= Step ', str(step+1), ' =============')
        print('\t Loss: ',avg_loss/500.0)
        
        loss_over_time.append(avg_loss/500.0)
             
        avg_loss = 0.0

.......................................................

InvalidArgumentError: Received a label value of 3158 which is outside the valid range of [0, 3158).  Label values: 15 2253 483 1581 2638 2759 2337 754 1549 3158 1050 799 2367 153 838 2 1568 2 863 2 3098 2 1895 2 491 2 1821 2 2 2 286 2 2 2 2454 2 2 2 954 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	 [[Node: SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SparseSoftmaxCrossEntropyWithLogits/Reshape, SparseSoftmaxCrossEntropyWithLogits/Reshape_1)]]

Caused by op 'SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits', defined at:
  File "/home/monu/anaconda3/envs/tensorflow/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/monu/anaconda3/envs/tensorflow/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/monu/.local/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/monu/.local/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/monu/.local/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/monu/.local/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/monu/.local/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/monu/.local/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/monu/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/monu/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/monu/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/monu/.local/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/monu/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/monu/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/monu/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/monu/.local/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/monu/.local/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/monu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/monu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/monu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-58-b486a1ce8578>", line 21, in <module>
    labels=dec_train_labels, logits=logits)
  File "/home/monu/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 2072, in sparse_softmax_cross_entropy_with_logits
    precise_logits, labels, name=name)
  File "/home/monu/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 7479, in sparse_softmax_cross_entropy_with_logits
    labels=labels, name=name)
  File "/home/monu/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/monu/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/monu/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Received a label value of 3158 which is outside the valid range of [0, 3158).  Label values: 15 2253 483 1581 2638 2759 2337 754 1549 3158 1050 799 2367 153 838 2 1568 2 863 2 3098 2 1895 2 491 2 1821 2 2 2 286 2 2 2 2454 2 2 2 954 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	 [[Node: SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SparseSoftmaxCrossEntropyWithLogits/Reshape, SparseSoftmaxCrossEntropyWithLogits/Reshape_1)]]
