## NMT for English to Hindi

In [91]:
import os
import tensorflow as tf
import random
import numpy as np
import math

### Loading data

In [62]:
english = []
hindi = []
with open(os.path.join('hin-eng', 'hin.txt')) as f:
    for line in f:
        eng, hin = line.split('\t')
        # hindi is having a newline character '\n' at the end
        hin = hin[:-1]
        english.append(eng.lower())
        hindi.append(hin)
        
    assert len(english) == len(hindi)
    
    print(english[:10])
    print(hindi[:10])
    
    print('\nlength of english sentences:', len(english))
    print('length of hindi sentences:', len(hindi))

['wow!', 'help!', 'jump.', 'jump.', 'jump.', 'hello!', 'hello!', 'cheers!', 'cheers!', 'got it?']
['वाह!', 'बचाओ!', 'उछलो.', 'कूदो.', 'छलांग.', 'नमस्ते।', 'नमस्कार।', 'वाह-वाह!', 'चियर्स!', 'समझे कि नहीं?']

length of english sentences: 2867
length of hindi sentences: 2867


### Getting the length of unique sentences

In [63]:
unique_eng_sentences = set(english)
unique_hin_sentences = set(hindi)

print(len(unique_eng_sentences))
print(len(unique_hin_sentences))

2660
2788


### Creating the dictionary

In [64]:
eng_words = []
hin_words = []

for sent in english:
    sentence = sent.split(' ')
    eng_words.extend(sentence)

unq_eng_words = set(eng_words)
print('unique english words :', len(unq_eng_words))

for sent in hindi:
    sentence = sent.split(' ')
    hin_words.extend(sentence)

unq_hin_words = set(hin_words)
print('unique hindi words:', len(unq_hin_words))

eng_dict = {'<sos>':1, '<eos>':2, '<PAD>':3}

for wrd in unq_eng_words:
    eng_dict[wrd] = len(eng_dict)
rev_eng_dict = dict(zip(eng_dict.values(), eng_dict.keys()))

print('\nenglish dict:\n', list(eng_dict.items())[0:10])
print('\nreverse english dict:\n', list(rev_eng_dict.items())[0:10])

hin_dict = {'<sos>':1, '<eos>':2, '<PAD>':3}

for wrd in unq_hin_words:
    hin_dict[wrd] = len(hin_dict)
    
rev_hin_dict = dict(zip(hin_dict.values(), hin_dict.keys()))

print('\nhindi dict:\n', list(hin_dict.items())[0:10])
print('\nreverse hindi dict:\n', list(rev_hin_dict.items())[0:10])

eng_vocab_size = len(unq_eng_words)
hin_vocab_size = len(unq_hin_words)

unique english words : 3307
unique hindi words: 3156

english dict:
 [('old', 3), ('map.', 4), ('is!', 6), ('beast.', 2209), ('meeting', 1677), ('abroad.', 5), ('heat.', 2373), ('loyalty.', 7), ('me.', 8), ('last,', 9)]

reverse english dict:
 [(1, '<sos>'), (2, '<eos>'), (3, '<PAD>'), (4, 'map.'), (5, 'abroad.'), (6, 'is!'), (7, 'loyalty.'), (8, 'me.'), (9, 'last,'), (10, 'crash')]

hindi dict:
 [('ब्रॅड', 3), ('बोर', 4), ('मुट्ठीभर', 8), ('बाँटा।', 7), ('दिल', 5), ('हाथ-पैर', 9), ('नाश्ता', 10), ('चली', 11), ('उपयोगी', 12), ('निधन', 13)]

reverse hindi dict:
 [(1, '<sos>'), (2, '<eos>'), (3, '<PAD>'), (4, 'बोर'), (5, 'दिल'), (6, 'किसान'), (7, 'बाँटा।'), (8, 'मुट्ठीभर'), (9, 'हाथ-पैर'), (10, 'नाश्ता')]


### Creating train and test set

In [65]:
test_ind = [random.randint(0, len(english)) for i in range(50)]

english_train = [english[i] for i in range(len(english)) if i not in test_ind]
hindi_train = [hindi[i] for i in range(len(hindi)) if i not in test_ind]

english_test = [english[i] for i in test_ind]
hindi_test = [hindi[i] for i in test_ind]

print('length of english train:', len(english_train))
print('length of hindi train:', len(hindi_train))

print('length of english test:', len(english_test))
print('length of hindi test:', len(hindi_test))

for i in range(10):
    print('\neng:', english_test[i])
    print('hin:', hindi_test[i])

length of english train: 2817
length of hindi train: 2817
length of english test: 50
length of hindi test: 50

eng: is it necessary for me to attend the party?
hin: मेरा पार्टी में आना ज़रूरी है क्या?

eng: tokyo is a very big city.
hin: टोक्यो बहुत बड़ा शहर है।

eng: she has long arms and legs.
hin: उसके हाथ-पैर लम्बे हैं।

eng: they have demanded that all copies of the book be destroyed.
hin: उन्होंने मांग करी है कि इस किताब की सारी कॉपियाँ नष्ट कर दीं जाएं।

eng: he was my dear friend.
hin: वह मेरा अच्छा दोस्त था।

eng: that's my fault.
hin: वह मेरी गलती है।

eng: i was able to answer all the questions.
hin: मैं सारे सवालों का जवाब दे पाया था।

eng: he did the work on his own.
hin: उसने काम अपने-आप किया।

eng: perfect!
hin: उत्तम!

eng: i hope that it rains tomorrow.
hin: काश कल बारिश हो जाए।


### Max no. of words in a sentence

In [66]:
en = []
hi = []

for sent in english:
    sentence = sent.split(' ')
    en.append(sentence)

for sent in hindi:
    sentence = sent.split(' ')
    hi.append(sentence)
    
print('max length of english sentence:', len(max(en, key = len)))
print('max length of hindi sentence:', len(max(hi, key = len)))

max length of english sentence: 22
max length of hindi sentence: 25


### Converting from words to numbers

In [67]:
# +2 for <sos> and <eos>
eng_max_len = len(max(en, key = len)) + 2
hin_max_len = len(max(hi, key = len)) + 2

train_input = []
train_output = []

for eng_sent, hin_sent in zip(english_train, hindi_train):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    #print(numeric_eng_sent)
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    train_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    train_output.append(numeric_hin_sent)


test_input = []
test_output = []

for eng_sent, hin_sent in zip(english_test, hindi_test):
    
    numeric_eng_sent = [eng_dict['<sos>']]
    numeric_hin_sent = [hin_dict['<sos>']]
    #print(numeric_eng_sent)
    
    for wrds in eng_sent.split():
        numeric_eng_sent.append(eng_dict[wrds])
    
    for wrds in hin_sent.split():
        numeric_hin_sent.append(hin_dict[wrds])
    
    if len(numeric_eng_sent) < eng_max_len:
        
        [numeric_eng_sent.append(eng_dict['<PAD>']) 
                            for i in range(eng_max_len - len(numeric_eng_sent) - 1)]
        
        numeric_eng_sent.append(eng_dict['<eos>'])
    
    test_input.append(numeric_eng_sent)
    
    if len(numeric_hin_sent) < hin_max_len:
        
        [numeric_hin_sent.append(hin_dict['<PAD>']) 
                            for i in range(hin_max_len - len(numeric_hin_sent) - 1)]
        numeric_hin_sent.append(hin_dict['<eos>'])
    
    test_output.append(numeric_hin_sent)



### Checking if every thing is working fine

In [68]:
wrd_sent = []
for i in range(10):
    num_sent = test_input[i]
    print('\nnumeric and word sequence:\n', num_sent)
    for j in range(24):
        wrd_sent.append(rev_eng_dict[num_sent[j]])
        
    print(wrd_sent)
    
    wrd_sent = []


numeric and word sequence:
 [1, 2672, 2416, 3020, 220, 2299, 2192, 587, 2586, 1461, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
['<sos>', 'is', 'it', 'necessary', 'for', 'me', 'to', 'attend', 'the', 'party?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [1, 1971, 2672, 1820, 2300, 1262, 25, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
['<sos>', 'tokyo', 'is', 'a', 'very', 'big', 'city.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and word sequence:
 [1, 862, 1914, 2622, 2135, 348, 2330, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
['<sos>', 'she', 'has', 'long', 'arms', 'and', 'legs.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<eos>']

numeric and w

### Word embeddings using skip-gram

In [94]:
batch_size = 64
embedding_size = 128

sentences = []
window_size = 1
data = []

num_sampled = 20

for sent in english:
    sentence = sent.split(' ')
    sentences.append(sentence)
    


for sent in sentences:
    for ind, wrd in enumerate(sent):
        for cont_wrd in sent[max(ind - window_size, 0) : min(ind + window_size, len(sent)) + 1]:
            if wrd not in cont_wrd:
                data.append([wrd, cont_wrd])
                
data[0:10]

[['got', 'it?'],
 ['it?', 'got'],
 ["i'm", 'ok.'],
 ['ok.', "i'm"],
 ['come', 'in.'],
 ['in.', 'come'],
 ['get', 'out!'],
 ['out!', 'get'],
 ['go', 'away!'],
 ['away!', 'go']]

In [86]:
data_train = []
data_label = []

for inp in data:
    
    data_train.append(eng_dict[inp[0]])
    data_label.append(eng_dict[inp[1]])

print('data train: {}' .format(data_train[0:10]))
print('data label: {}' .format(data_label[0:10]))

data train: [1318, 642, 2062, 869, 3283, 1139, 1772, 1830, 1316, 2399]
data label: [642, 1318, 869, 2062, 1139, 3283, 1830, 1772, 2399, 1316]


### Define i/p and o/p

In [87]:
tf.reset_default_graph()
train_dataset = tf.placeholder(tf.int32, shape = [batch_size])
train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

### Model Parameters and Other Variables

In [92]:
embeddings = tf.Variable(tf.random_uniform([eng_vocab_size, embedding_size], -1.0, 1.0))

softmax_weights = tf.Variable(tf.truncated_normal([eng_vocab_size, embedding_size],
                                stddev=0.5 / math.sqrt(embedding_size))
                                )
softmax_biases = tf.Variable(tf.random_uniform([eng_vocab_size],0.0,0.01))


### Computations

In [96]:
embed = tf.nn.embedding_lookup(embeddings, train_dataset)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights=softmax_weights, biases=softmax_biases, inputs=embed,
                    labels=train_labels, num_sampled=num_sampled, num_classes=eng_vocab_size)
                    )

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



### Optimizer

In [97]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

### Running the skip gram algorithm

In [102]:
num_steps = 10000
skip_losses = []

with tf.Session() as session:
    tf.global_variables_initializer()
    average_loss = 0
    
    for step in range(num_steps):
        for batch_idx in range(len(data_train) // batch_size):
            
            batch_data = data_train[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            batch_label = data_label[batch_size*batch_idx : batch_size*batch_idx + batch_size]
            
            feed_dict = {train_dataset: batch_data, train_labels: batch_label}
            _, l = session.run([optimizer, loss], feed_dict = feed_dict)
            
            average_loss += l
            
            if (step + 1) % 200 == 0:
                if step > 0:
                    average_loss / 2000
                skip_losses.append(average_loss)
                
                print('Average loss at step %d: %f' % (step+1, average_loss))
                average_loss = 0

ValueError: Cannot feed value of shape (64,) for Tensor 'Placeholder_1:0', which has shape '(64, 1)'

In [99]:
data_train[0:64]

[1318,
 642,
 2062,
 869,
 3283,
 1139,
 1772,
 1830,
 1316,
 2399,
 1961,
 2563,
 1961,
 2563,
 1961,
 2563,
 1594,
 3198,
 1594,
 3198,
 360,
 2979,
 2062,
 738,
 2062,
 260,
 828,
 538,
 2506,
 8,
 3171,
 2487,
 3094,
 8,
 909,
 1594,
 3184,
 3184,
 426,
 1594,
 2698,
 2062,
 1640,
 2062,
 1908,
 2062,
 34,
 1921,
 835,
 581,
 2379,
 581,
 2379,
 581,
 2379,
 581,
 2379,
 3171,
 883,
 3283,
 119,
 119,
 1139,
 1326]

In [None]:
def one_hot_conv(data_point, vocab_size):
    vec = np.zeros(vocab_size)
    vec[data_point] = 1
    return vec

train_data = []
train_lables = []

for dat in data:
    
    train_data.append(one_hot_conv(eng_dict[dat[0]], eng_vocab_size + 3))
    train_labels.append(one_hot_conv(eng_dict[dat[1]], eng_vocab_size + 3))
    
train_data = np.asarray(train_data)
train_labels = np.asarray(train_labels)

In [None]:
tf.reset_default_graph()

train_data = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])