## NMT for English to Hindi

In [93]:
import os
import tensorflow as tf
import random
import numpy as np
import math
from tensorflow.python.layers.core import Dense

### Loading data

In [52]:
# To do: separate the punctuation from words
english = []
hindi = []
with open(os.path.join('hin-eng', 'hin.txt')) as f:
    for line in f:
        eng, hin = line.split('\t')
        # hindi is having a newline character '\n' at the end
        hin = hin[:-1]
        english.append(eng.lower())
        hindi.append(hin)
        
    assert len(english) == len(hindi)
    
    print(english[:10])
    print(hindi[:10])
    
    print('\nlength of english sentences:', len(english))
    print('length of hindi sentences:', len(hindi))

['wow!', 'help!', 'jump.', 'jump.', 'jump.', 'hello!', 'hello!', 'cheers!', 'cheers!', 'got it?']
['वाह!', 'बचाओ!', 'उछलो.', 'कूदो.', 'छलांग.', 'नमस्ते।', 'नमस्कार।', 'वाह-वाह!', 'चियर्स!', 'समझे कि नहीं?']

length of english sentences: 2867
length of hindi sentences: 2867


### Getting the length of unique sentences

In [53]:
unique_eng_sentences = set(english)
unique_hin_sentences = set(hindi)

print(len(unique_eng_sentences))
print(len(unique_hin_sentences))

2660
2788


### Creating the dictionary

In [54]:
eng_words = []
hin_words = []

for sent in english:
    sentence = sent.split(' ')
    eng_words.extend(sentence)

unq_eng_words = set(eng_words)
print('unique english words :', len(unq_eng_words))

for sent in hindi:
    sentence = sent.split(' ')
    hin_words.extend(sentence)

unq_hin_words = set(hin_words)
print('unique hindi words:', len(unq_hin_words))

eng_dict = {'<sos>':1, '<eos>':2, '<PAD>':3}

for wrd in unq_eng_words:
    eng_dict[wrd] = len(eng_dict) + 1
rev_eng_dict = dict(zip(eng_dict.values(), eng_dict.keys()))

print('\nenglish dict:\n', list(eng_dict.items())[0:10])
print('\nreverse english dict:\n', list(rev_eng_dict.items())[0:10])

hin_dict = {'<sos>':1, '<eos>':2, '<PAD>':3}

for wrd in unq_hin_words:
    hin_dict[wrd] = len(hin_dict) + 1
    
rev_hin_dict = dict(zip(hin_dict.values(), hin_dict.keys()))

print('\nhindi dict:\n', list(hin_dict.items())[0:10])
print('\nreverse hindi dict:\n', list(rev_hin_dict.items())[0:10])

eng_vocab_size = len(eng_dict) + 1
hin_vocab_size = len(hin_dict) + 1

unique english words : 3307
unique hindi words: 3156

english dict:
 [('shot?', 4), ('unites', 5), ('watching', 1069), ('caught', 6), ('books', 7), ('daily', 8), ('program', 9), ('buy', 10), ('8', 11), ('slept', 12)]

reverse english dict:
 [(1, '<sos>'), (2, '<eos>'), (3, '<PAD>'), (4, 'shot?'), (5, 'unites'), (6, 'caught'), (7, 'books'), (8, 'daily'), (9, 'program'), (10, 'buy')]

hindi dict:
 [('पकाया', 4), ('अठारह', 1547), ('आकर', 6), ('लाती', 2100), ('"अभी', 517), ('स्वेटर', 7), ('दीवालिया', 8), ('देतीं', 9), ('माँगता', 10), ('ताऊजी', 11)]

reverse hindi dict:
 [(1, '<sos>'), (2, '<eos>'), (3, '<PAD>'), (4, 'पकाया'), (5, 'खाना'), (6, 'आकर'), (7, 'स्वेटर'), (8, 'दीवालिया'), (9, 'देतीं'), (10, 'माँगता')]


### Creating train and test set

In [55]:
test_ind = [random.randint(0, len(english)) for i in range(50)]

english_train = [english[i] for i in range(len(english)) if i not in test_ind]
hindi_train = [hindi[i] for i in range(len(hindi)) if i not in test_ind]

english_test = [english[i] for i in test_ind]
hindi_test = [hindi[i] for i in test_ind]

print('length of english train:', len(english_train))
print('length of hindi train:', len(hindi_train))

print('length of english test:', len(english_test))
print('length of hindi test:', len(hindi_test))

for i in range(10):
    print('\neng:', english_test[i])
    print('hin:', hindi_test[i])

length of english train: 2817
length of hindi train: 2817
length of english test: 50
length of hindi test: 50

eng: the king abused his power.
hin: राजा ने अपनी ताकत का दुरुपयोग किया।

eng: only a few people listened to him.
hin: उसकी बात कुछ ही लोगों ने सुनी।

eng: i want to see the manager.
hin: मैं मैनेजर से मिलना चाह्ता हूँ।

eng: when does it begin?
hin: शुरू कब होता है?

eng: he likes playing soccer.
hin: उसको फ़ुटबॉल खेलना अच्छा लगता है।

eng: my father told me not to read books in bed.
hin: मेरे पिता ने मुझसे बिस्तर में किताबें पढ़ने से मना किया।

eng: he had to go without food for days.
hin: उसने दिनों तक खाना नहीं खाया।

eng: opinions vary from person to person.
hin: हर इनसान की सोच अलग होती है।

eng: tom is my friend.
hin: टॉम मेरा दोस्त है।

eng: i will explain it to her.
hin: मैं उसको यह बात समझाउँगा।


### Max no. of words in a sentence

In [56]:
en = []
hi = []

for sent in english:
    sentence = sent.split(' ')
    en.append(sentence)

for sent in hindi:
    sentence = sent.split(' ')
    hi.append(sentence)
    
print('max length of english sentence:', len(max(en, key = len)))
print('max length of hindi sentence:', len(max(hi, key = len)))


max length of english sentence: 22
max length of hindi sentence: 25


### Converting from words to numbers

In [57]:
# +2 for <sos> and <eos>
eng_max_len = len(max(en, key = len)) + 2
hin_max_len = len(max(hi, key = len)) + 2

train_input = []
train_output = []

for eng_sent, hin_sent in zip(english_train, hindi_train):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    train_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    train_output.append(numeric_hin_sent)


test_input = []
test_output = []

for eng_sent, hin_sent in zip(english_test, hindi_test):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    test_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    test_output.append(numeric_hin_sent)



### Checking if every thing is working fine

In [58]:
wrd_sent = []
for i in range(10):
    num_sent = test_input[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(24):
        wrd_sent.append(rev_eng_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []


numeric and word sequence:
 [1, 81, 1098, 994, 1901, 648, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
['<sos>', 'the', 'king', 'abused', 'his', 'power.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [1, 3252, 1462, 1656, 668, 2517, 2680, 286, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
['<sos>', 'only', 'a', 'few', 'people', 'listened', 'to', 'him.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [1, 355, 3153, 2680, 263, 81, 670, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
['<sos>', 'i', 'want', 'to', 'see', 'the', 'manager.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word s

### Making train and test i/p and o/p numpy arrays

In [59]:
train_input = np.array(train_input, dtype=np.int32)
train_output = np.array(train_output, dtype=np.int32)

test_input = np.array(test_input,dtype=np.int32)
test_output = np.array(test_output,dtype=np.int32)

### Word embeddings using skip-gram

In [60]:
batch_size = 64
embedding_size = 128

sentences = []
window_size = 1
data = []

num_sampled = 20

for sent in english:
    sentence = sent.split(' ')
    sentences.append(sentence)
    


for sent in sentences:
    for ind, wrd in enumerate(sent):
        for cont_wrd in sent[max(ind - window_size, 0) : min(ind + window_size, len(sent)) + 1]:
            if wrd not in cont_wrd:
                data.append([wrd, cont_wrd])
                
data[0:10]

[['got', 'it?'],
 ['it?', 'got'],
 ["i'm", 'ok.'],
 ['ok.', "i'm"],
 ['come', 'in.'],
 ['in.', 'come'],
 ['get', 'out!'],
 ['out!', 'get'],
 ['go', 'away!'],
 ['away!', 'go']]

In [61]:
data_train = []
data_label = []

for inp in data:
    
    data_train.append(eng_dict[inp[0]])
    data_label.append(eng_dict[inp[1]])

print('data train: {}' .format(data_train[0:10]))
print('data label: {}' .format(data_label[0:10]))

data train: [343, 77, 37, 1412, 200, 119, 2388, 1255, 3233, 3034]
data label: [77, 343, 1412, 37, 119, 200, 1255, 2388, 3034, 3233]


### Define i/p and o/p

In [62]:
tf.reset_default_graph()
train_dataset = tf.placeholder(tf.int32, shape = [batch_size])
train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

### Model Parameters and Other Variables

In [63]:
embeddings = tf.Variable(tf.random_uniform([eng_vocab_size, embedding_size], -1.0, 1.0))

softmax_weights = tf.Variable(tf.truncated_normal([eng_vocab_size, embedding_size],
                                stddev=0.5 / math.sqrt(embedding_size))
                                )
softmax_biases = tf.Variable(tf.random_uniform([eng_vocab_size],0.0,0.01))


### Computations

In [64]:
embed = tf.nn.embedding_lookup(embeddings, train_dataset)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights=softmax_weights, biases=softmax_biases, inputs=embed,
                    labels=train_labels, num_sampled=num_sampled, num_classes=eng_vocab_size)
                    )

### Optimizer

In [65]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

### Running the skip gram algorithm

In [66]:
num_steps = 10000
skip_losses = []

with tf.Session() as session:
    tf.global_variables_initializer().run()
    average_loss = 0
    
    for step in range(num_steps):
        for batch_idx in range(len(data_train) // batch_size):
            
            batch_data = data_train[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_data = np.asarray(batch_data)
            
            batch_label = data_label[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_label = np.asarray(batch_label)
            batch_label = batch_label.reshape((batch_size, 1))
            
            
            feed_dict = {train_dataset: batch_data, train_labels: batch_label}
            _, l = session.run([optimizer, loss], feed_dict = feed_dict)
            
            average_loss += l
            
            if (step + 1) % 200 == 0:
                if step > 0:
                    average_loss / 2000
                skip_losses.append(average_loss)
        if (step + 1) % 200 == 0:        
            print('Average loss at step %d: %f' % (step+1, average_loss))
            average_loss = 0
            
    embeddings_eng = embeddings.eval()

np.save('eng-embeddings.npy', embeddings_eng)

Average loss at step 200: 19104.206836
Average loss at step 400: 15129.596671
Average loss at step 600: 14458.384918
Average loss at step 800: 14067.148496
Average loss at step 1000: 13909.639348
Average loss at step 1200: 13686.131503
Average loss at step 1400: 13574.306089
Average loss at step 1600: 13446.376667
Average loss at step 1800: 13468.504927
Average loss at step 2000: 13313.956548
Average loss at step 2200: 13251.755186
Average loss at step 2400: 13217.137033
Average loss at step 2600: 13138.801445
Average loss at step 2800: 13112.302946
Average loss at step 3000: 13059.683472
Average loss at step 3200: 13027.275180
Average loss at step 3400: 13053.489153
Average loss at step 3600: 12961.175036
Average loss at step 3800: 12936.898778
Average loss at step 4000: 12924.095328
Average loss at step 4200: 12890.947228
Average loss at step 4400: 12848.428462
Average loss at step 4600: 12803.918754
Average loss at step 4800: 12828.399154
Average loss at step 5000: 12805.359848
Aver

### Word embeddings Hindi

In [74]:
batch_size = 64
embedding_size = 128

sentences = []
window_size = 1
data_hindi = []

num_sampled = 20

for sent in hindi:
    sentence = sent.split(' ')
    sentences.append(sentence)
    


for sent in sentences:
    for ind, wrd in enumerate(sent):
        for cont_wrd in sent[max(ind - window_size, 0) : min(ind + window_size, len(sent)) + 1]:
            if wrd not in cont_wrd:
                data_hindi.append([wrd, cont_wrd])
                
data_hindi[0:10]

[['समझे', 'कि'],
 ['कि', 'समझे'],
 ['कि', 'नहीं?'],
 ['नहीं?', 'कि'],
 ['मैं', 'ठीक'],
 ['ठीक', 'मैं'],
 ['ठीक', 'हूँ।'],
 ['हूँ।', 'ठीक'],
 ['बहुत', 'बढ़िया!'],
 ['बढ़िया!', 'बहुत']]

In [75]:
data_hindi_train = []
data_hindi_label = []

for inp in data_hindi:
    
    data_hindi_train.append(hin_dict[inp[0]])
    data_hindi_label.append(hin_dict[inp[1]])

print('data train: {}' .format(data_hindi_train[0:10]))
print('data label: {}' .format(data_hindi_label[0:10]))

data train: [2471, 2997, 2997, 1348, 2330, 2350, 2350, 1441, 1625, 1172]
data label: [2997, 2471, 1348, 2997, 2350, 2330, 1441, 2350, 1172, 1625]


### Define i/p and o/p

In [76]:
tf.reset_default_graph()
train_hindi_dataset = tf.placeholder(tf.int32, shape = [batch_size])
train_hindi_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

### Model parameters and other variables

In [77]:
embeddings_hindi = tf.Variable(tf.random_uniform([hin_vocab_size, embedding_size], -1.0, 1.0))

softmax_weights_hindi = tf.Variable(tf.truncated_normal([hin_vocab_size, embedding_size],
                                stddev=0.5 / math.sqrt(embedding_size))
                                )
softmax_biases_hindi = tf.Variable(tf.random_uniform([hin_vocab_size],0.0,0.01))


### Computations

In [78]:
embed_hindi = tf.nn.embedding_lookup(embeddings_hindi, train_hindi_dataset)

loss_hindi = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights=softmax_weights_hindi, biases=softmax_biases_hindi, inputs=embed_hindi,
                    labels=train_hindi_labels, num_sampled=num_sampled, num_classes=hin_vocab_size)
                    )

### Optimizer

In [79]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss_hindi)

### Running the skip gram loss algorithm for hindi

In [80]:
num_steps = 10000
skip_losses_hindi = []

with tf.Session() as session:
    tf.global_variables_initializer().run()
    average_loss_hindi = 0
    
    for step in range(num_steps):
        for batch_idx in range(len(data_hindi_train) // batch_size):
            
            batch_data_hindi = data_hindi_train[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_data_hindi = np.asarray(batch_data_hindi)
            
            batch_label_hindi = data_hindi_label[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_label_hindi = np.asarray(batch_label_hindi)
            batch_label_hindi = batch_label_hindi.reshape((batch_size, 1))
            
            
            feed_dict_hindi = {train_hindi_dataset: batch_data_hindi, train_hindi_labels: batch_label_hindi}
            _, l_hindi = session.run([optimizer, loss_hindi], feed_dict = feed_dict_hindi)
            
            average_loss_hindi += l_hindi
            
            if (step + 1) % 200 == 0:
                if step > 0:
                    average_loss_hindi / 2000
                skip_losses_hindi.append(average_loss_hindi)
        if (step + 1) % 200 == 0:        
            print('Average loss at step %d: %f' % (step+1, average_loss_hindi))
            average_loss_hindi = 0
            
        embeddings_hin = embeddings_hindi.eval()

np.save('hin-embeddings.npy', embeddings_hin)

Average loss at step 200: 20764.698177
Average loss at step 400: 16067.201322
Average loss at step 600: 15385.223276
Average loss at step 800: 14916.733274
Average loss at step 1000: 14647.311958
Average loss at step 1200: 14481.542229
Average loss at step 1400: 14240.352908
Average loss at step 1600: 14187.222962
Average loss at step 1800: 14067.767327
Average loss at step 2000: 13935.622694
Average loss at step 2200: 13844.899747
Average loss at step 2400: 13751.748264
Average loss at step 2600: 13774.257029
Average loss at step 2800: 13712.555826
Average loss at step 3000: 13670.060449
Average loss at step 3200: 13636.004656
Average loss at step 3400: 13600.316429
Average loss at step 3600: 13515.999555
Average loss at step 3800: 13510.712746
Average loss at step 4000: 13386.726994
Average loss at step 4200: 13402.215762
Average loss at step 4400: 13386.307040
Average loss at step 4600: 13364.119302
Average loss at step 4800: 13314.938027
Average loss at step 5000: 13285.671227
Aver

### NMT using TF seq2seq library

### Data Generations for MT

In [88]:
emb_mat = np.load('eng-embeddings.npy')
input_size = emb_mat.shape[1]

class DataGeneratorMT(object):
    
    def __init__(self,batch_size,num_unroll,is_source, is_train):
        global input_size
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._cursor = [0 for offset in range(self._batch_size)]
        
        
        self._sent_ids = None
        
        self._is_source = is_source
        self._is_train = is_train
                
    def next_batch(self, sent_ids):
        
        
        if self._is_source:
            max_sent_length = eng_max_len
        else:
            max_sent_length = hin_max_len
            
        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size),dtype=np.float32)
        
        
        for b in range(self._batch_size):
            
            sent_id = sent_ids[b]
            
            
            if self._is_source:
                
                if self._is_train:
                    sent_text = train_input[sent_id]
                else:
                    sent_text = test_input[sent_id]
                             
                batch_data[b] = sent_text[self._cursor[b]]
                batch_labels[b] = sent_text[self._cursor[b]+1]
            
            else:
                
                if self._is_train:
                    sent_text = train_output[sent_id]
                else:
                    sent_text = test_output[sent_id]
                
                
                if sent_text[self._cursor[b]]!=hin_dict['<sos>']:
                    batch_data[b] = sent_text[self._cursor[b]]
                else:
                    batch_data[b] = sent_text[self._cursor[b]]
                
                batch_labels[b] = sent_text[self._cursor[b]+1]
            
            self._cursor[b] = (self._cursor[b]+1)%(max_sent_length-1)
             
        return batch_data,batch_labels
        
    def unroll_batches(self,sent_ids):
        
        
        if sent_ids is not None:
            
            self._sent_ids = sent_ids
            
            self._cursor = [0 for _ in range(self._batch_size)]
                
        unroll_data,unroll_labels = [],[]
        
        for ui in range(self._num_unroll):
            
            if self._is_source:
                data, labels = self.next_batch(self._sent_ids)
            else:
                data, labels = self.next_batch(self._sent_ids)
                    
            unroll_data.append(data)
            unroll_labels.append(labels)
        
        return unroll_data, unroll_labels, self._sent_ids
    
    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]
        
dg = DataGeneratorMT(batch_size=5,num_unroll=20,is_source=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Source data')
for _, lbl in zip(u_data,u_labels):
    print([rev_eng_dict[w] for w in lbl.tolist()])

dg = DataGeneratorMT(batch_size=5,num_unroll=30,is_source=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,2,3,4,5])
print('\nTarget data batch')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    print([rev_hin_dict[w] for w in lbl.tolist()])

Source data
['wow!', 'help!', 'jump.', 'jump.', 'jump.']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Target data batch
['वाह!', 'उछलो.', 'कूदो.', 'छलांग.', 'नमस्ते।']
['

### Hyperparameters

In [89]:
vocab_size = 3158
num_units = 128
batch_size = 4
source_sequence_length = 24
target_sequence_length = 27
learning_rate = 0.01

### Defining the TensorFlow inputs and outputs

In [90]:
tf.reset_default_graph()

enc_train_inputs = []
dec_train_inputs = []
dec_train_labels = []

encoder_emb_layer = tf.convert_to_tensor(np.load('eng-embeddings.npy'))
decoder_emb_layer = tf.convert_to_tensor(np.load('hin-embeddings.npy'))

for ui in range(source_sequence_length):
    enc_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))

for ui in range(target_sequence_length):
    dec_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))
    dec_train_labels.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_outputs_%d'%ui))
    
encoder_emb_inp = [tf.nn.embedding_lookup(encoder_emb_layer, src) for src in enc_train_inputs]
encoder_emb_inp = tf.stack(encoder_emb_inp)

decoder_emb_inp = [tf.nn.embedding_lookup(decoder_emb_layer, src) for src in dec_train_inputs]
decoder_emb_inp = tf.stack(decoder_emb_inp)

### Encoder

In [91]:
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)

encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_emb_inp, initial_state=initial_state,
    sequence_length=[source_sequence_length for _ in range(batch_size)], 
    time_major=True, swap_memory=True)

### Decoder

In [96]:
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

projection_layer = Dense(units=vocab_size, use_bias=True)

helper = tf.contrib.seq2seq.TrainingHelper(
    decoder_emb_inp, [target_sequence_length for _ in range(batch_size)], time_major=True)

decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, encoder_state,
        output_layer=projection_layer)

outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, output_time_major=True,
                swap_memory=True
                )

logits = outputs.rnn_output
train_prediction = outputs.sample_id

# Loss computation
# To do: Mask the <PAD> 
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=dec_train_labels, logits=logits)
loss = tf.reduce_mean(crossent)

### Optimizer

In [97]:
optimizer = tf.train.AdamOptimizer(learning_rate)

gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

sess = tf.InteractiveSession()

### Running the Seq2seq NMT

In [None]:
loss_over_time = []
tf.global_variables_initializer().run()

src_word_embeddings = np.load('eng-embeddings.npy')
tgt_word_embeddings = np.load('hin-embeddings.npy')

num_steps = 10001
avg_loss = 0

enc_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=source_sequence_length,is_train=True, is_source=True)
dec_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=target_sequence_length,is_train=True, is_source=False)

for step in range(num_steps):

    # num_enc_unrollings: 40
    # num_dec_unrollings: 60
    print('.',end='')
    if (step+1)%100==0:
        print('')
        
    # Pick a random batch of sentences to train the algorithm
    sent_ids = np.random.randint(low=0,high=train_inputs.shape[0],size=(batch_size))

    # Create a batch of data for the encoder
    eu_data, eu_labels, _ = enc_data_generator.unroll_batches(sent_ids=sent_ids)
    
    # Create a batch of data for the decoder
    du_data, du_labels, _ = dec_data_generator.unroll_batches(sent_ids=sent_ids)
    
    feed_dict = {}
    for ui,(dat,lbl) in enumerate(zip(eu_data,eu_labels)):            
        feed_dict[enc_train_inputs[ui]] = dat                
    
    for ui,(dat,lbl) in enumerate(zip(du_data,du_labels)):            
        feed_dict[dec_train_inputs[ui]] = dat
        feed_dict[dec_train_labels[ui]] = lbl

    # Optimize the NMT with either Adam (first 10000 iterations)
    # or stochastic gradient descent (after 10000 iterations)
    
    _,l,tr_pred = sess.run([optimize,loss,train_prediction], feed_dict=feed_dict)
    tr_pred = tr_pred.flatten()
    
    # Print some training predictions
    if (step+1)%100==0:  
        
        print('Step ',step+1)

        # Print the train results (actual and predicted)
        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '                    
            if rev_hin_dict[w] == '<EOS>':
                break
                      
        print(print_str)
        print()
        
        print_str = 'Predicted: '
        for w in tr_pred[::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<EOS>':
                break
        print(print_str)
       
        print('\n')
        
        rand_idx = np.random.randint(low=1,high=batch_size)
        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[rand_idx::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<EOS>':
                break
        print(print_str)

            
        print()
        print_str = 'Predicted: '
        for w in tr_pred[rand_idx::batch_size].tolist():
            print_str += rev_hin_dict[w] + ' '
            if rev_hin_dict[w] == '<EOS>':
                break
        print(print_str)
        print() 
    
    avg_loss += l # Update average loss
    
    # Print the loss
    if (step+1)%500==0:
        print('============= Step ', str(step+1), ' =============')
        print('\t Loss: ',avg_loss/500.0)
        
        loss_over_time.append(avg_loss/500.0)
             
        avg_loss = 0.0