## NMT for English to Hindi

In [1]:
import os
import tensorflow as tf
import random
import numpy as np
import math
from tensorflow.python.layers.core import Dense

  from ._conv import register_converters as _register_converters


### Loading data

In [2]:
english = []
hindi = []
with open(os.path.join('hin-eng', 'hin.txt')) as f:
    for line in f:
        eng, hin = line.split('\t')
        # hindi is having a newline character '\n' at the end so excluding it
        hin = hin[:-1]
        english.append(eng.lower())
        hindi.append(hin)
        
    assert len(english) == len(hindi)
    
    print(english[:10])
    print(hindi[:10])
    
    print('\nlength of english sentences:', len(english))
    print('length of hindi sentences:', len(hindi))

['wow!', 'help!', 'jump.', 'jump.', 'jump.', 'hello!', 'hello!', 'cheers!', 'cheers!', 'got it?']
['वाह!', 'बचाओ!', 'उछलो.', 'कूदो.', 'छलांग.', 'नमस्ते।', 'नमस्कार।', 'वाह-वाह!', 'चियर्स!', 'समझे कि नहीं?']

length of english sentences: 2867
length of hindi sentences: 2867


### Getting the length of unique sentences

In [3]:
unique_eng_sentences = set(english)
unique_hin_sentences = set(hindi)

print(len(unique_eng_sentences))
print(len(unique_hin_sentences))

2660
2788


### Creating the dictionary

In [4]:
eng_words = []
hin_words = []

for sent in english:
    sentence = sent.split(' ')
    eng_words.extend(sentence)

unq_eng_words = set(eng_words)
print('unique english words :', len(unq_eng_words))

for sent in hindi:
    sentence = sent.split(' ')
    hin_words.extend(sentence)

unq_hin_words = set(hin_words)
print('unique hindi words:', len(unq_hin_words))

eng_dict = {'<sos>':0, '<eos>':1, '<PAD>':2}

for wrd in unq_eng_words:
    eng_dict[wrd] = len(eng_dict)
rev_eng_dict = dict(zip(eng_dict.values(), eng_dict.keys()))

print('\nenglish dict:\n', list(eng_dict.items())[0:10])
print('\nreverse english dict:\n', list(rev_eng_dict.items())[0:10])

hin_dict = {'<sos>':0, '<eos>':1, '<PAD>':2}

for wrd in unq_hin_words:
    hin_dict[wrd] = len(hin_dict)
    
rev_hin_dict = dict(zip(hin_dict.values(), hin_dict.keys()))

print('\nhindi dict:\n', list(hin_dict.items())[0:10])
print('\nreverse hindi dict:\n', list(rev_hin_dict.items())[0:10])

eng_vocab_size = len(eng_dict)
hin_vocab_size = len(hin_dict) 

print('\neng vocab size:', eng_vocab_size)
print('hin vocab size:', hin_vocab_size)


unique english words : 3307
unique hindi words: 3156

english dict:
 [('walk.', 3), ('taste?', 548), ('glasses', 5), ('affected', 4), ('page.', 7), ('sentenced', 8), ('next', 9), ('dog', 12), ('middle', 1629), ('drive', 13)]

reverse english dict:
 [(0, '<sos>'), (1, '<eos>'), (2, '<PAD>'), (3, 'walk.'), (4, 'affected'), (5, 'glasses'), (6, 'black?'), (7, 'page.'), (8, 'sentenced'), (9, 'next')]

hindi dict:
 [('मरना', 2988), ('अपौइंटमेंट', 3), ('सीड़ियों', 4), ('करूँ', 1573), ('गर्व', 5), ('चीनी?', 6), ('पैरों', 7), ('होती', 8), ('ताकतवर', 9), ('मील', 11)]

reverse hindi dict:
 [(0, '<sos>'), (1, '<eos>'), (2, '<PAD>'), (3, 'अपौइंटमेंट'), (4, 'सीड़ियों'), (5, 'गर्व'), (6, 'चीनी?'), (7, 'पैरों'), (8, 'होती'), (9, 'ताकतवर')]

eng vocab size: 3310
hin vocab size: 3159


### Creating train and test set

In [5]:
test_ind = [random.randint(0, len(english)) for i in range(50)]

english_train = [english[i] for i in range(len(english)) if i not in test_ind]
hindi_train = [hindi[i] for i in range(len(hindi)) if i not in test_ind]

english_test = [english[i] for i in test_ind]
hindi_test = [hindi[i] for i in test_ind]

print('length of english train:', len(english_train))
print('length of hindi train:', len(hindi_train))

print('length of english test:', len(english_test))
print('length of hindi test:', len(hindi_test))

for i in range(10):
    print('\neng:', english_test[i])
    print('hin:', hindi_test[i])

length of english train: 2817
length of hindi train: 2817
length of english test: 50
length of hindi test: 50

eng: i am tired of my work.
hin: मैं अपने काम से थक चुका हूँ।

eng: don't force the child to eat.
hin: बच्चे को ज़बरदस्ती मत खिलाओ।

eng: my hands are dirty. i have been repairing my bicycle.
hin: मेरे हाथ गंदे हैं। मैं अपनी साईकल ठीक कर रहा था।

eng: they were afraid of you.
hin: उन्हें तुमसे डर लगता था।

eng: go tell him yourself.
hin: उसको अपने-आप जाकर बताओ।

eng: if it had not been for her help, you would never have done it.
hin: उसकी मदद के बिना तुम नहीं कर पाते।

eng: they were scolded by the teacher.
hin: उन्हें अपनी टीचर से डाँट पड़ी।

eng: he was more than a king.
hin: वह एक राजा से बहुत ज़्यादा था।

eng: i have no idea to what extent i can trust them.
hin: मुझे नहीं पता मैं उनपर कितना भरोसा कर सकता हूँ।

eng: i love you.
hin: मैं आपसे प्यार करती हूँ।


### Max no. of words in a sentence

In [6]:
en = []
hi = []

for sent in english:
    sentence = sent.split(' ')
    en.append(sentence)

for sent in hindi:
    sentence = sent.split(' ')
    hi.append(sentence)
    
print('max length of english sentence:', len(max(en, key = len)))
print('max length of hindi sentence:', len(max(hi, key = len)))


max length of english sentence: 22
max length of hindi sentence: 25


### Converting from words to numbers

In [7]:
# +2 for <sos> and <eos>
eng_max_len = len(max(en, key = len)) + 2
hin_max_len = len(max(hi, key = len)) + 2

train_input = []
train_output = []

for eng_sent, hin_sent in zip(english_train, hindi_train):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    train_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    train_output.append(numeric_hin_sent)


test_input = []
test_output = []

for eng_sent, hin_sent in zip(english_test, hindi_test):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    test_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    test_output.append(numeric_hin_sent)



### Checking if every thing is working fine

In [8]:
wrd_sent = []
for i in range(10):
    num_sent = test_input[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(eng_max_len):
        wrd_sent.append(rev_eng_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []

wrd_sent = []    
for i in range(10):
    num_sent = test_input[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(eng_max_len):
        wrd_sent.append(rev_eng_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []


numeric and word sequence:
 [0, 3287, 1594, 159, 2294, 53, 2191, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'i', 'am', 'tired', 'of', 'my', 'work.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 1016, 831, 1648, 781, 1926, 3298, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', "don't", 'force', 'the', 'child', 'to', 'eat.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 53, 2526, 2622, 299, 3287, 3131, 2852, 2368, 53, 1782, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'my', 'hands', 'are', 'dirty.', 'i', 'have', 'been', 'repairing', 'my', 'bicycle.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and

In [9]:
wrd_sent = []
for i in range(10):
    num_sent = test_output[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(hin_max_len):
        wrd_sent.append(rev_hin_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []

wrd_sent = []    
for i in range(10):
    num_sent = test_output[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(hin_max_len):
        wrd_sent.append(rev_hin_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []


numeric and word sequence:
 [0, 1479, 2648, 2764, 1856, 2008, 1806, 1465, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'मैं', 'अपने', 'काम', 'से', 'थक', 'चुका', 'हूँ।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 632, 37, 374, 1165, 2620, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'बच्चे', 'को', 'ज़बरदस्ती', 'मत', 'खिलाओ।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [0, 873, 2536, 2038, 2994, 1479, 1822, 347, 997, 1160, 1288, 223, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
['<sos>', 'मेरे', 'हाथ', 'गंदे', 'हैं।', 'मैं', 'अपनी', 'साईकल', 'ठीक', 'कर', 'रहा', 'था।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PA

### Making train and test i/p and o/p numpy arrays

In [10]:
train_input = np.array(train_input, dtype=np.int32)
train_output = np.array(train_output, dtype=np.int32)

test_input = np.array(test_input,dtype=np.int32)
test_output = np.array(test_output,dtype=np.int32)

### Word embeddings using skip-gram

### Preparing data for skip-gram

In [11]:
batch_size = 64
embedding_size = 128

sentences = []
window_size = 1
data = []

num_sampled = 20

for sent in english:
    sentence = sent.split(' ')
    sentences.append(sentence)
    


for sent in sentences:
    for ind, wrd in enumerate(sent):
        for cont_wrd in sent[max(ind - window_size, 0) : min(ind + window_size, len(sent)) + 1]:
            if wrd not in cont_wrd:
                data.append([wrd, cont_wrd])
                
data[0:10]

[['got', 'it?'],
 ['it?', 'got'],
 ["i'm", 'ok.'],
 ['ok.', "i'm"],
 ['come', 'in.'],
 ['in.', 'come'],
 ['get', 'out!'],
 ['out!', 'get'],
 ['go', 'away!'],
 ['away!', 'go']]

In [12]:
data_train = []
data_label = []

for inp in data:
    
    data_train.append(eng_dict[inp[0]])
    data_label.append(eng_dict[inp[1]])

print('data train: {}' .format(data_train[0:10]))
print('data label: {}' .format(data_label[0:10]))

data train: [3074, 62, 1365, 1491, 1278, 3159, 2686, 171, 935, 1911]
data label: [62, 3074, 1491, 1365, 3159, 1278, 171, 2686, 1911, 935]


### Define i/p and o/p

In [13]:
tf.reset_default_graph()
train_dataset = tf.placeholder(tf.int32, shape = [batch_size])
train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

### Model Parameters and Other Variables

In [20]:
embeddings = tf.Variable(tf.random_uniform([eng_vocab_size, embedding_size], -1.0, 1.0))

softmax_weights = tf.Variable(tf.truncated_normal([eng_vocab_size, embedding_size],
                                stddev=0.5 / math.sqrt(embedding_size))
                                )
softmax_biases = tf.Variable(tf.random_uniform([eng_vocab_size],0.0,0.01))


### Computations

In [22]:
embed = tf.nn.embedding_lookup(embeddings, train_dataset)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights=softmax_weights, biases=softmax_biases, inputs=embed,
                    labels=train_labels, num_sampled=num_sampled, num_classes=eng_vocab_size)
                    )

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



### Optimizer

In [23]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

### Running the skip gram algorithm

In [16]:

num_steps = 10000
skip_losses = []

with tf.Session() as session:
    tf.global_variables_initializer().run()
    average_loss = 0
    
    for step in range(num_steps):
        for batch_idx in range(len(data_train) // batch_size):
            
            batch_data = data_train[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_data = np.asarray(batch_data)
            
            batch_label = data_label[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_label = np.asarray(batch_label)
            batch_label = batch_label.reshape((batch_size, 1))
            
            
            feed_dict = {train_dataset: batch_data, train_labels: batch_label}
            _, l = session.run([optimizer, loss], feed_dict = feed_dict)
            
            average_loss += l
            
            if (step + 1) % 200 == 0:
                if step > 0:
                    average_loss / 2000
                skip_losses.append(average_loss)
        if (step + 1) % 200 == 0:        
            print('Average loss at step %d: %f' % (step+1, average_loss))
            average_loss = 0
            
    embeddings_eng = embeddings.eval()

np.save('eng-embeddings.npy', embeddings_eng)


Average loss at step 200: 19205.195217
Average loss at step 400: 15323.026056
Average loss at step 600: 14694.902463
Average loss at step 800: 14464.209200
Average loss at step 1000: 14124.243824
Average loss at step 1200: 13918.376618
Average loss at step 1400: 13726.682820
Average loss at step 1600: 13738.026134
Average loss at step 1800: 13619.683788
Average loss at step 2000: 13598.833608
Average loss at step 2200: 13503.690409
Average loss at step 2400: 13394.978613
Average loss at step 2600: 13423.626555
Average loss at step 2800: 13325.297071
Average loss at step 3000: 13338.561650
Average loss at step 3200: 13290.745211
Average loss at step 3400: 13216.879252
Average loss at step 3600: 13187.814978
Average loss at step 3800: 13183.623336
Average loss at step 4000: 13103.349944
Average loss at step 4200: 13151.529444
Average loss at step 4400: 13082.631723
Average loss at step 4600: 13068.537695
Average loss at step 4800: 13083.604540
Average loss at step 5000: 13072.776033
Aver

### Word embeddings using skip-gram

In [24]:
batch_size = 64
embedding_size = 128

sentences = []
window_size = 1
data_hindi = []

num_sampled = 20

for sent in hindi:
    sentence = sent.split(' ')
    sentences.append(sentence)
    


for sent in sentences:
    for ind, wrd in enumerate(sent):
        for cont_wrd in sent[max(ind - window_size, 0) : min(ind + window_size, len(sent)) + 1]:
            if wrd not in cont_wrd:
                data_hindi.append([wrd, cont_wrd])
                
data_hindi[0:10]

[['समझे', 'कि'],
 ['कि', 'समझे'],
 ['कि', 'नहीं?'],
 ['नहीं?', 'कि'],
 ['मैं', 'ठीक'],
 ['ठीक', 'मैं'],
 ['ठीक', 'हूँ।'],
 ['हूँ।', 'ठीक'],
 ['बहुत', 'बढ़िया!'],
 ['बढ़िया!', 'बहुत']]

In [25]:
data_hindi_train = []
data_hindi_label = []

for inp in data_hindi:
    
    data_hindi_train.append(hin_dict[inp[0]])
    data_hindi_label.append(hin_dict[inp[1]])

print('data train: {}' .format(data_hindi_train[0:10]))
print('data label: {}' .format(data_hindi_label[0:10]))

data train: [1368, 595, 595, 1387, 1479, 997, 997, 1465, 472, 2924]
data label: [595, 1368, 1387, 595, 997, 1479, 1465, 997, 2924, 472]


### Define i/p and o/p

In [26]:
tf.reset_default_graph()
train_hindi_dataset = tf.placeholder(tf.int32, shape = [batch_size])
train_hindi_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

### Model parameters and other variables

In [51]:
embeddings_hindi = tf.Variable(tf.random_uniform([hin_vocab_size, embedding_size], -1.0, 1.0))

softmax_weights_hindi = tf.Variable(tf.truncated_normal([hin_vocab_size, embedding_size],
                                stddev=0.5 / math.sqrt(embedding_size))
                                )
softmax_biases_hindi = tf.Variable(tf.random_uniform([hin_vocab_size],0.0,0.01))


### Computations

In [52]:
embed_hindi = tf.nn.embedding_lookup(embeddings_hindi, train_hindi_dataset)

loss_hindi = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights=softmax_weights_hindi, biases=softmax_biases_hindi, inputs=embed_hindi,
                    labels=train_hindi_labels, num_sampled=num_sampled, num_classes=hin_vocab_size)
                    )

### Optimizer

In [53]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss_hindi)

### Running the skip gram loss algorithm for hindi

In [23]:

num_steps = 10000
skip_losses_hindi = []

with tf.Session() as session:
    tf.global_variables_initializer().run()
    average_loss_hindi = 0
    
    for step in range(num_steps):
        for batch_idx in range(len(data_hindi_train) // batch_size):
            
            batch_data_hindi = data_hindi_train[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_data_hindi = np.asarray(batch_data_hindi)
            
            batch_label_hindi = data_hindi_label[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_label_hindi = np.asarray(batch_label_hindi)
            batch_label_hindi = batch_label_hindi.reshape((batch_size, 1))
            
            
            feed_dict_hindi = {train_hindi_dataset: batch_data_hindi, train_hindi_labels: batch_label_hindi}
            _, l_hindi = session.run([optimizer, loss_hindi], feed_dict = feed_dict_hindi)
            
            average_loss_hindi += l_hindi
            
            if (step + 1) % 200 == 0:
                if step > 0:
                    average_loss_hindi / 2000
                skip_losses_hindi.append(average_loss_hindi)
        if (step + 1) % 200 == 0:        
            print('Average loss at step %d: %f' % (step+1, average_loss_hindi))
            average_loss_hindi = 0
            
        embeddings_hin = embeddings_hindi.eval()

np.save('hin-embeddings.npy', embeddings_hin)


Average loss at step 200: 21199.986116
Average loss at step 400: 16691.466200
Average loss at step 600: 15741.420610
Average loss at step 800: 15352.804156
Average loss at step 1000: 15067.859991
Average loss at step 1200: 14828.038417
Average loss at step 1400: 14764.958345
Average loss at step 1600: 14553.244605
Average loss at step 1800: 14485.796032
Average loss at step 2000: 14287.486551
Average loss at step 2200: 14335.235766
Average loss at step 2400: 14216.886235
Average loss at step 2600: 14138.943719
Average loss at step 2800: 14170.188631
Average loss at step 3000: 14045.531303
Average loss at step 3200: 14050.835196
Average loss at step 3400: 13931.026932
Average loss at step 3600: 13968.309216
Average loss at step 3800: 13891.555790
Average loss at step 4000: 13844.941279
Average loss at step 4200: 13868.686502
Average loss at step 4400: 13820.967113
Average loss at step 4600: 13806.205545
Average loss at step 4800: 13781.171436
Average loss at step 5000: 13686.674879
Aver

### NMT using TF seq2seq library

### Data Generations for MT

In [28]:
emb_mat = np.load('eng-embeddings.npy')
input_size = emb_mat.shape[1]

class DataGeneratorMT(object):
    
    def __init__(self,batch_size,num_unroll,is_source, is_train):
        global input_size
        self.batch_size = batch_size
        self.num_unroll = num_unroll
        self.cursor = [0 for offset in range(self.batch_size)]
        
        
        self.sent_ids = None
        
        self.is_source = is_source
        self.is_train = is_train
                
    def next_batch(self, sent_ids):
        
        
        if self.is_source:
            max_sent_length = eng_max_len
        else:
            max_sent_length = hin_max_len
            
        batch_data = np.zeros((self.batch_size),dtype=np.float32)
        batch_labels = np.zeros((self.batch_size),dtype=np.float32)
        
        
        for b in range(self.batch_size):
            
            sent_id = sent_ids[b]
            
            
            if self.is_source:
                
                if self.is_train:
                    sent_text = train_input[sent_id]
                else:
                    sent_text = test_input[sent_id]
                             
                batch_data[b] = sent_text[self.cursor[b]]
                batch_labels[b] = sent_text[self.cursor[b]+1]
            
            else:
                
                if self.is_train:
                    sent_text = train_output[sent_id]
                else:
                    sent_text = test_output[sent_id]
                
                
                if sent_text[self.cursor[b]]!=hin_dict['<sos>']:
                    batch_data[b] = sent_text[self.cursor[b]]
                else:
                    batch_data[b] = sent_text[self.cursor[b]]
                
                batch_labels[b] = sent_text[self.cursor[b]+1]
            
            self.cursor[b] = (self.cursor[b]+1)%(max_sent_length-1)
             
        return batch_data,batch_labels
        
    def unroll_batches(self,sent_ids):
        
        
        if sent_ids is not None:
            
            self.sent_ids = sent_ids
            
            self.cursor = [0 for _ in range(self.batch_size)]
                
        unroll_data,unroll_labels = [],[]
        
        for ui in range(self.num_unroll):
            
            if self.is_source:
                data, labels = self.next_batch(self.sent_ids)
            else:
                data, labels = self.next_batch(self.sent_ids)
                    
            unroll_data.append(data)
            unroll_labels.append(labels)
        
        return unroll_data, unroll_labels, self.sent_ids
    
    def reset_indices(self):
        self.cursor = [0 for offset in range(self.batch_size)]
        
dg = DataGeneratorMT(batch_size=5,num_unroll=20,is_source=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Source data')
for _, lbl in zip(u_data,u_labels):
    print([rev_eng_dict[w] for w in lbl.tolist()])

dg = DataGeneratorMT(batch_size=5,num_unroll=20,is_source=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,2,3,4,5])
print('\nTarget data batch')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    print([rev_hin_dict[w] for w in lbl.tolist()])

Source data
['wow!', 'help!', 'jump.', 'jump.', 'jump.']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Target data batch
['वाह!', 'उछलो.', 'कूदो.', 'छलांग.', 'नमस्ते।']
['

### Hyperparameters

In [29]:
vocab_size = hin_vocab_size
num_units = 128
batch_size = 8
source_sequence_length = 20
target_sequence_length = 22
learning_rate = 0.01

### Defining the TensorFlow inputs and outputs

In [30]:
tf.reset_default_graph()

enc_train_inputs = []
dec_train_inputs = []
dec_train_labels = []

encoder_emb_layer = tf.convert_to_tensor(np.load('eng-embeddings.npy'))
decoder_emb_layer = tf.convert_to_tensor(np.load('hin-embeddings.npy'))

for ui in range(source_sequence_length):
    enc_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))

for ui in range(target_sequence_length):
    dec_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))
    dec_train_labels.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_outputs_%d'%ui))
    
encoder_emb_inp = [tf.nn.embedding_lookup(encoder_emb_layer, src) for src in enc_train_inputs]
encoder_emb_inp = tf.stack(encoder_emb_inp)

decoder_emb_inp = [tf.nn.embedding_lookup(decoder_emb_layer, src) for src in dec_train_inputs]
decoder_emb_inp = tf.stack(decoder_emb_inp)

### Encoder

In [31]:
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)

encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_emb_inp, initial_state=initial_state,
    sequence_length=[source_sequence_length for _ in range(batch_size)], 
    time_major=True, swap_memory=True)

### Decoder

In [32]:
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

projection_layer = Dense(units=vocab_size, use_bias=True)

helper = tf.contrib.seq2seq.TrainingHelper(
    decoder_emb_inp, [target_sequence_length for _ in range(batch_size)], time_major=True)

decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, encoder_state,
        output_layer=projection_layer)

outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, output_time_major=True,
                swap_memory=True
                )

logits = outputs.rnn_output
train_prediction = outputs.sample_id

crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=dec_train_labels, logits=logits)
loss = tf.reduce_mean(crossent)

### Optimizer

In [33]:
optimizer = tf.train.AdamOptimizer(learning_rate)

gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

sess = tf.InteractiveSession()

### Running the Seq2seq NMT

In [34]:
loss_over_time = []
tf.global_variables_initializer().run()

src_word_embeddings = np.load('eng-embeddings.npy')
tgt_word_embeddings = np.load('hin-embeddings.npy')

num_steps = 10001
avg_loss = 0

enc_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=source_sequence_length,is_train=True, is_source=True)
dec_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=target_sequence_length,is_train=True, is_source=False)

for step in range(num_steps):

    
    print('.',end='')
    if (step+1)%100==0:
        print('')
        
    sent_ids = np.random.randint(low=0,high=train_input.shape[0],size=(batch_size))

    eu_data, eu_labels, _ = enc_data_generator.unroll_batches(sent_ids=sent_ids)
    
    du_data, du_labels, _ = dec_data_generator.unroll_batches(sent_ids=sent_ids)
    
    feed_dict = {}
    for ui,(dat,lbl) in enumerate(zip(eu_data,eu_labels)):            
        feed_dict[enc_train_inputs[ui]] = dat                
    
    for ui,(dat,lbl) in enumerate(zip(du_data,du_labels)):            
        feed_dict[dec_train_inputs[ui]] = dat
        feed_dict[dec_train_labels[ui]] = lbl

    
    _,l,tr_pred = sess.run([optimize,loss,train_prediction], feed_dict=feed_dict)
    tr_pred = tr_pred.flatten()
    
    if (step+1)%100==0:  
        
        print('Step ',step+1)

        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '                    
            if rev_hin_dict[w] == '<eos>':
                break
                      
        print(print_str)
        print()
        
        print_str = 'Predicted: '
        for w in tr_pred[::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<eos>':
                break
        print(print_str)
       
        print('\n')
        
        rand_idx = np.random.randint(low=1,high=batch_size)
        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[rand_idx::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<eos>':
                break
        print(print_str)

            
        print()
        print_str = 'Predicted: '
        for w in tr_pred[rand_idx::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<eos>':
                break
        print(print_str)
        print() 
    
    avg_loss += l 
    
    if (step+1)%500==0:
        print('============= Step ', str(step+1), ' =============')
        print('\t Loss: ',avg_loss/500.0)
        
        loss_over_time.append(avg_loss/500.0)
             
        avg_loss = 0.0

....................................................................................................
Step  100
Actual: मुझे समझ में नहीं आ रहा कि क्या पढ़ूँ। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मैं में में नहीं है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 


Actual: मुझे एक दिन में काम खतम करना नामुमकिन लगा। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मैं में को <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  200
Actual: मैंने अपने कुत्ते को पालतू जानवरों के कब्रिस्तान में बरी कर दिया। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मैं पास की से से से <PAD> लिए है। है। <PAD> सकते। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  1400
Actual: कृपया धीमी आवाज़ में बात कीजिए। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: वह ध्यान से <PAD> <PAD> कर <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 


Actual: मेरी सोच तुम्हारी सोच से बिलकुल अलग है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: वह कल तुम्हारी सोच से बिलकुल अलग है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  1500
Actual: मेरी दो बहनें हैं और दोनो की शादी हो चुकी है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मैं एक नगरों के या मैं नहीं बात करना जाता है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  3900
Actual: मुझे रात को खाना खाने के बाद अपने भाई को उसके होमवर्क के साथ मदद करने की आदत है। <PAD> <PAD> <PAD> 

Predicted: मुझे दंत को खाना खाने के बाद अपने होमवर्क को देखते होमवर्क के लिए मदद करने की आदत है। <PAD> <PAD> <PAD> 


Actual: जंगल के जानवर मर गए। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: वह में लिए को गए। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  4000
Actual: जल्दी कीजिए। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मेरी घर <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 


....................................................................................................
Step  6400
Actual: मैं मेट्रो लेकर स्कूल जाती हूँ। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मैं रोज़ लेकर स्कूल चाहता हूँ। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 


Actual: कोईसा भी ले लो। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मुझे भी ले लो। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  6500
Actual: जहाँ भी रखना है रखदो। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: वह भी घुसाना है रखदो। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  7700
Actual: हमने बड़ी मौज करी। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: हम बच्चे मौज करी। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 


Actual: तुम टोक्यो में रहते हो क्या? <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: तुम रोज़ में रहते हो क्या? <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

....................................................................................................
Step  7800
Actual: तुम्हे उसने गप मारने की ज़रूरत नहीं है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: यह इस अपने मारने का ज़रूरत नहीं थी। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

....................................................................................................
Step  9000
Actual: हम सब उसके मज़ाक पर हँसे। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: हम अच्छे टॉम मज़ाक पर हँसे। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 


Actual: मुझे याद नहीं। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: मैं नहीं नहीं। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

	 Loss:  0.4157130886316299
....................................................................................................
Step  9100
Actual: इस साल बहुत बारिश पड़ी है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

Predicted: इस साल बहुत बारिश पड़ी है। <PAD> <PAD> <PAD> <PAD> <P