In [None]:
import tensorflow as tf
from keras.layers import Embedding,LSTM,Dropout,Dense,Layer
from keras import Model,Input
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import keras.backend as K
import collections
import numpy as np
import time
from nltk.translate.bleu_score import corpus_bleu

In [None]:
class LanguageDict():
  """Vocabulary built from tokenised sentences, with token <-> id lookups.

  Attributes:
    vocab:    list of tokens; index in the list is the token id.
    word2ids: dict mapping token -> id.
    ids2word: dict mapping id -> token.
    PAD:      id of '<pad>' (always 0).
    UNK:      id of '<unk>' (always 1).
  """

  def __init__(self, sents, min_count=10):
    """Count lower-cased tokens and keep the frequent ones.

    Args:
      sents:     iterable of sentences, each an iterable of string tokens.
      min_count: a token is kept only if it occurs strictly more than
                 this many times (default 10, the original threshold).
    """
    word_counter = collections.Counter(tok.lower() for sent in sents for tok in sent)

    # Reserved tokens come first so '<pad>' is id 0 and '<unk>' is id 1.
    self.vocab = ['<pad>', '<unk>']
    reserved = set(self.vocab)
    # Skip reserved tokens that also occur in the corpus: appending them a
    # second time would move PAD/UNK to a later id and inflate the vocab.
    self.vocab.extend(t for t, c in word_counter.items()
                      if c > min_count and t not in reserved)

    self.word2ids = {w: idx for idx, w in enumerate(self.vocab)}
    self.ids2word = {idx: w for w, idx in self.word2ids.items()}
    self.UNK = self.word2ids['<unk>']
    self.PAD = self.word2ids['<pad>']

In [None]:
def load_dataset(source_path,target_path, max_num_examples=10000):
  """Read a line-aligned parallel corpus and turn it into padded id arrays.

  Args:
    source_path: path of the source-language file, one sentence per line.
    target_path: path of the target-language file, line-aligned with the source.
    max_num_examples: keep at most this many leading line pairs; a value
      <= 0 keeps everything.

  Returns:
    (train_data, dev_data, test_data, source_lang_dict, target_lang_dict) where
      train_data = [source ids, decoder input ids, decoder label ids]
                   (labels carry an extra trailing axis of size 1),
      dev_data   = [source ids, label ids],
      test_data  = [source ids, label ids].
    Every array is post-padded to the longest sentence of its own split.

  Raises:
    AssertionError: if the two files have a different number of lines.
  """
  # Context managers close the handles; UTF-8 is set explicitly so reading the
  # corpus does not depend on the platform's default encoding.
  with open(source_path, encoding='utf-8') as source_file:
    source_lines = source_file.readlines()
  with open(target_path, encoding='utf-8') as target_file:
    target_lines = target_file.readlines()
  assert len(source_lines) == len(target_lines)
  if max_num_examples > 0:
    max_num_examples = min(len(source_lines), max_num_examples)
    source_lines = source_lines[:max_num_examples]
    target_lines = target_lines[:max_num_examples]

  source_sents = [[tok.lower() for tok in sent.strip().split(' ')] for sent in source_lines]
  target_sents = [[tok.lower() for tok in sent.strip().split(' ')] for sent in target_lines]

  # Wrap every target sentence as: <start> w1 ... wn <end>
  for sent in target_sents:
    sent.append('<end>')
    sent.insert(0,'<start>')

  source_lang_dict = LanguageDict(source_sents)
  target_lang_dict = LanguageDict(target_sents)

  # Split 8/1/1 in tenths; the test split also takes any remainder.
  unit = len(source_sents)//10
  source_words = [[source_lang_dict.word2ids.get(tok,source_lang_dict.UNK) for tok in sent] for sent in source_sents]
  source_words_train = pad_sequences(source_words[:8*unit],padding='post')
  source_words_dev = pad_sequences(source_words[8*unit:9*unit],padding='post')
  source_words_test = pad_sequences(source_words[9*unit:],padding='post')


  eos = target_lang_dict.word2ids['<end>']
  # Decoder inputs drop the final '<end>':  <start> w1 ... wn
  target_words = [[target_lang_dict.word2ids.get(tok,target_lang_dict.UNK) for tok in sent[:-1]] for sent in target_sents]
  target_words_train = pad_sequences(target_words[:8*unit],padding='post')
  # Labels are the inputs shifted left by one with '<end>' appended:  w1 ... wn <end>
  target_words_train_labels = [sent[1:]+[eos] for sent in target_words[:8*unit]]
  target_words_train_labels = pad_sequences(target_words_train_labels,padding='post')
  # Trailing axis of size 1 added for the training labels only.
  target_words_train_labels = np.expand_dims(target_words_train_labels,axis=2)

  target_words_dev_labels = pad_sequences([sent[1:] + [eos] for sent in target_words[8 * unit:9 * unit]], padding='post')
  target_words_test_labels = pad_sequences([sent[1:] + [eos] for sent in target_words[9 * unit:]], padding='post')

  train_data = [source_words_train,target_words_train,target_words_train_labels]
  dev_data = [source_words_dev,target_words_dev_labels]
  test_data = [source_words_test,target_words_test_labels]

  return train_data,dev_data,test_data,source_lang_dict,target_lang_dict

In [None]:
# Colab-only step: mount Google Drive at /drive so the corpus files below are readable.
from google.colab import drive
drive.mount('/drive')
# Corpus files passed to load_dataset as source (en.txt) and target (vi.txt).
source_path = "/drive/My Drive/CK Bigdata/Model 1/en.txt"
target_path = "/drive/My Drive/CK Bigdata/Model 1/vi.txt"

# Build the train/dev/test splits and both vocabularies from at most the first 10000 line pairs.
train_data,dev_data,test_data,source_lang_dict,target_lang_dict = load_dataset(source_path,target_path, max_num_examples=10000)

Mounted at /drive


In [None]:
# train_data is the list [source ids, decoder input ids, decoder label ids],
# so the value printed here is the number of arrays in that list.
print(f"Shape of training set: {len(train_data)}")

# Show the FIRST training example from each of the three arrays.
print("source_words")
print(train_data[0][0])
print([source_lang_dict.ids2word[word] for word in train_data[0][0]])
print("target words")
# Decoder inputs live in train_data[1]; train_data[0][1] would be the second
# *source* sentence, which must not be decoded with the target dictionary.
print(train_data[1][0])
print([target_lang_dict.ids2word[word] for word in train_data[1][0]])
print("target word labels")
# Labels live in train_data[2] and carry a trailing axis of size 1;
# take column 0 to get plain ids usable as dictionary keys.
print([target_lang_dict.ids2word[word] for word in train_data[2][0][:, 0]])

Shape of training set: 3
source_words
[2 3 4 5 6 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
['the', 'science', 'behind', 'a', 'climate', '<unk>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '

In [None]:
# Display the padded source-id array of the dev split (first element of dev_data).
dev_data[0]

array([[ 55,   1,  48, ...,   0,   0,   0],
       [139, 246, 494, ...,   0,   0,   0],
       [268,   9,  81, ...,   0,   0,   0],
       ...,
       [ 81,  82,   5, ...,   0,   0,   0],
       [ 81, 204, 205, ...,   0,   0,   0],
       [114, 204,  48, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Display the test split: [padded source ids, padded target label ids].
test_data

[array([[  15, 1332,  263, ...,    0,    0,    0],
        [  50,   44,   99, ...,    0,    0,    0],
        [ 565,   46,   45, ...,    0,    0,    0],
        ...,
        [ 109,   82,    5, ...,    0,    0,    0],
        [  50,   81,    1, ...,    0,    0,    0],
        [   7,  130,    1, ...,    0,    0,    0]], dtype=int32),
 array([[ 33, 248, 393, ...,   0,   0,   0],
        [ 88, 474, 291, ...,   0,   0,   0],
        [474, 815,  10, ...,   0,   0,   0],
        ...,
        [ 82,   7, 342, ...,   0,   0,   0],
        [ 88, 145, 380, ...,   0,   0,   0],
        [ 14,   1, 139, ...,   0,   0,   0]], dtype=int32)]

In [None]:
class AttentionLayer(Layer):
  """Dot-product (Luong-style) attention of decoder steps over encoder steps.

  The layer is called on ``[encoder_outputs, decoder_outputs]``, both rank-3
  tensors laid out as (batch, steps, units), and returns the decoder outputs
  concatenated on the last axis with one context vector per decoder step.
  """

  def compute_mask(self, inputs, mask=None):
    # The output follows the decoder time axis, so only the decoder mask
    # (second entry) is propagated. Identity test: `== None` is not a
    # reliable check for tensor-like objects.
    if mask is None:
      return None
    return mask[1]

  def compute_output_shape(self, input_shape):
    # Same batch/time dims as the decoder input; the feature dim doubles
    # because the context vector is concatenated to the decoder output.
    return (input_shape[1][0],input_shape[1][1],input_shape[1][2]*2)


  def call(self, inputs, mask=None):
    encoder_outputs, decoder_outputs = inputs

    # score[b, s, t] = dot(encoder_outputs[b, s, :], decoder_outputs[b, t, :])
    decoder_outputs_T =  K.permute_dimensions(decoder_outputs,(0,2,1))
    luong_score = K.batch_dot(encoder_outputs,
                        decoder_outputs_T,
                        axes =[2,1])

    # Normalise over the encoder-step axis so that, for every decoder step,
    # the weights over source positions sum to 1.
    # NOTE(review): `mask` is not applied here, so padded encoder positions
    # also receive attention weight.
    luong_score_softmax = K.softmax(luong_score, axis=1)

    # context[b, t, :] = sum_s weight[b, s, t] * encoder_outputs[b, s, :]
    # A single batched contraction over the encoder-step axis gives the same
    # result as broadcasting to (batch, enc_steps, dec_steps, units) and
    # summing, without materialising that 4-D intermediate.
    encoder_vector = K.batch_dot(luong_score_softmax,
                                 encoder_outputs,
                                 axes=[1,1])

    new_decoder_outputs = K.concatenate([decoder_outputs, encoder_vector])

    return new_decoder_outputs

In [None]:
class NmtModel(object):
  def __init__(self,source_dict,target_dict,use_attention):

    self.hidden_size = 200
    # the size of the word embeddings being used
    self.embedding_size = 100
    # the dropout rate for the hidden layers
    self.hidden_dropout_rate=0.2
    # the dropout rate for the word embeddings
    self.embedding_dropout_rate = 0.2
    # batch size
    self.batch_size = 100

    self.max_target_step = 30

    # vocab size for source and target; we'll use everything we receive
    self.vocab_target_size = len(target_dict.vocab)
    self.vocab_source_size = len(source_dict.vocab)

    # instances of the dictionaries
    self.target_dict = target_dict
    self.source_dict = source_dict

    # special tokens to indicate sentence starts and ends.
    self.SOS = target_dict.word2ids['<start>']
    self.EOS = target_dict.word2ids['<end>']

    # Boolean to use attention or not
    # use attention or no
    self.use_attention = use_attention

    print("number of tokens in source: %d, number of tokens in target:%d" % (self.vocab_source_size,self.vocab_target_size))



  def build(self):
    """Create and compile the training model and the two inference models.

    Sets three attributes that share the same layer objects:

    * ``self.train_model``   - [source ids, target ids] -> per-step softmax
      over the target vocabulary (teacher forcing); compiled with Adam and
      sparse categorical cross-entropy.
    * ``self.encoder_model`` - source ids -> [encoder outputs, h, c].
    * ``self.decoder_model`` - [target ids, h, c, encoder outputs]
      -> [softmax, new h, new c].
    """

    #-------------------------Train Models------------------------------
    source_words = Input(shape=(None,),dtype='int32')
    target_words = Input(shape=(None,), dtype='int32')

    print('Task 1(a): Creating the embedding lookups...')
    # The first argument is the vocabulary size; mask_zero=True makes id 0 a masked value.
    embeddings_source = Embedding(self.vocab_source_size, self.embedding_size, name='embedding_source',
                                  embeddings_initializer='glorot_uniform', mask_zero=True, trainable=True)
    embeddings_target = Embedding(self.vocab_target_size, self.embedding_size, name='embedding_target',
                                  embeddings_initializer='glorot_uniform', mask_zero=True, trainable=True)

    # (b.) Look up the embeddings for source words and for target words. Apply dropout to each encoded input
    print('\nTask 1(b): Looking up source and target words...')
    source_word_embeddings = embeddings_source(source_words)
    target_words_embeddings = embeddings_target(target_words)

    # No `input_shape` is passed: the layers are applied directly to tensors,
    # which already carry their shape.
    source_word_embeddings = Dropout(self.embedding_dropout_rate,
                                     name = "dropout_source_embedding",seed=1010)(source_word_embeddings)

    target_words_embeddings = Dropout(self.embedding_dropout_rate,
                                      name = "dropout_target_embedding",seed=1010)(target_words_embeddings)

    # (c.) An encoder LSTM() with return sequences set to True
    print('\nTask 1(c): Creating an encoder')
    encoder_lstm = LSTM(self.hidden_size, return_sequences = True, return_state = True, name = "encoder_LSTM")

    encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(source_word_embeddings)
    # End Task 1
    encoder_states = [encoder_state_h,encoder_state_c]

    # The train decoder, initialised with the encoder's final states.
    decoder_lstm = LSTM(self.hidden_size, recurrent_dropout=self.hidden_dropout_rate,
                        return_sequences=True, return_state=True, name = "decoder_LSTM")
    decoder_outputs_train,_,_ = decoder_lstm(target_words_embeddings,initial_state=encoder_states)

    if self.use_attention:
      decoder_attention = AttentionLayer()
      decoder_outputs_train = decoder_attention([encoder_outputs,decoder_outputs_train])

    decoder_dense = Dense(self.vocab_target_size,activation='softmax')
    decoder_outputs_train = decoder_dense(decoder_outputs_train)

    # compiling the train model.
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    adam = Adam(learning_rate=0.01,clipnorm=5.0)
    self.train_model = Model([source_words,target_words], decoder_outputs_train)
    self.train_model.compile(optimizer=adam,loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # at this point you can print model summary for the train model
    print('\t\t\t\t\t\t Train Model Summary.')
    self.train_model.summary()

    self.encoder_model = Model(source_words,[encoder_outputs,encoder_state_h,encoder_state_c])
    # at this point you can print the summary for the encoder model.
    print('\t\t\t\t\t\t Inference Time Encoder Model Summary.')
    self.encoder_model.summary()

    # The decoder model
    # specifying the inputs to the decoder
    decoder_state_input_h = Input(shape=(self.hidden_size,)) # last hidden State
    decoder_state_input_c = Input(shape=(self.hidden_size,)) # cell state
    encoder_outputs_input = Input(shape=(None,self.hidden_size,)) # encoder outputs

    print('\n Putting together the decoder states')
    decoder_states = [decoder_state_input_h, decoder_state_input_c]

    # use decoder states as input to the decoder lstm to get the decoder outputs, h, and c for test time inference
    decoder_outputs_test,decoder_state_output_h, decoder_state_output_c = decoder_lstm(target_words_embeddings,
                                                                                       initial_state = decoder_states)

    # Task 2 (b.) Add attention if attention
    if self.use_attention:
      decoder_outputs_test = decoder_attention([encoder_outputs_input,
                                                decoder_outputs_test])

    decoder_outputs_test = decoder_dense(decoder_outputs_test)

    self.decoder_model = Model([target_words,decoder_state_input_h,decoder_state_input_c,encoder_outputs_input],
                               [decoder_outputs_test,decoder_state_output_h,decoder_state_output_c])
    # you can now view the model summary
    print('\t\t\t\t\t\t Decoder Inference Model summary')
    # summary() prints by itself and returns None, so it is not wrapped in print().
    self.decoder_model.summary()



  def time_used(self, start_time):
    """Return the wall-clock time elapsed since ``start_time`` as "<m> m <s> s".

    Args:
      start_time: a timestamp previously obtained from ``time.time()``.
    """
    elapsed = time.time() - start_time
    # Split into whole minutes and the seconds left over; %d drops the fraction.
    minutes, seconds = divmod(elapsed, 60)
    return "%d m %d s" % (minutes, seconds)



  def train(self,train_data,dev_data,test_data, epochs):
    start_time = time.time()
    for epoch in range(epochs):
      print("Starting training epoch {}/{}".format(epoch + 1, epochs))
      epoch_time = time.time()
      source_words_train, target_words_train, target_words_train_labels = train_data

      self.train_model.fit([source_words_train,target_words_train],target_words_train_labels,batch_size=self.batch_size)

      print("Time used for epoch {}: {}".format(epoch + 1, self.time_used(epoch_time)))
      dev_time = time.time()
      print("Evaluating on dev set after epoch {}/{}:".format(epoch + 1, epochs))
      self.eval(dev_data)
      print("Time used for evaluate on dev set: {}".format(self.time_used(dev_time)))

    print("Training finished!")
    print("Time used for training: {}".format(self.time_used(start_time)))

    print("Evaluating on test set:")
    test_time = time.time()
    self.eval(test_data)
    print("Time used for evaluate on test set: {}".format(self.time_used(test_time)))



  def get_target_sentences(self, sents,vocab,reference=False):
    str_sents = []
    num_sent, max_len = sents.shape
    for i in range(num_sent):
      str_sent = []
      for j in range(max_len):
        t = sents[i,j].item()
        if t == self.SOS:
          continue
        if t == self.EOS:
          break

        str_sent.append(vocab[t])
      if reference:
        str_sents.append([str_sent])
      else:
        str_sents.append(str_sent)
    return str_sents



  def eval(self, dataset,print_outputs = False):
    """Greedy-decode ``dataset`` and print the corpus BLEU score.

    Args:
      dataset: [source ids, target label ids] as 2-D arrays.
      print_outputs: when True, also return the decoded sentences.

    Returns:
      ``(sources, candidates, references)`` when ``print_outputs`` is True,
      otherwise None. ``sources`` and ``candidates`` are lists of token lists;
      ``references`` holds one single-reference list per sentence.
    """
    # get the source words and target_word_labels for the eval dataset
    source_words, target_words_labels = dataset
    vocab = self.target_dict.vocab

    # encode the source sentences with the encoder shared with the training model
    encoder_outputs, state_h,state_c = self.encoder_model.predict(source_words,batch_size=self.batch_size)
    # for max_target_step steps, feed the step target words into the decoder.
    predictions = []
    # start every sentence with the <start> id, as integers to match the int32 model input
    step_target_words = np.full([source_words.shape[0],1], self.SOS, dtype='int32')
    for _ in range(self.max_target_step):

      step_decoder_outputs, state_h,state_c = self.decoder_model.predict([step_target_words,state_h,state_c,encoder_outputs],batch_size=self.batch_size)
      # greedy choice: the most probable token becomes the next decoder input
      step_target_words = np.argmax(step_decoder_outputs,axis=2)
      predictions.append(step_target_words)

    # predictions is a list of per-step id arrays; joining them on axis 1
    # gives one row of predicted ids per sentence.
    candidates = self.get_target_sentences(np.concatenate(predictions,axis=1),vocab)
    references = self.get_target_sentences(target_words_labels,vocab,reference=True)

    # score using nltk bleu scorer
    score = corpus_bleu(references,candidates)
    print("Model BLEU score: %.2f" % (score*100.0))

    #Modification
    if print_outputs:
      # Source ids index the SOURCE vocabulary. They must not go through
      # get_target_sentences, which skips/stops on the target-side SOS/EOS
      # ids and would drop or truncate unrelated source words that share
      # those ids. Only padding is removed here.
      source_vocab = self.source_dict.vocab
      source_pad = self.source_dict.PAD
      sources = [[source_vocab[t] for t in row if t != source_pad]
                 for row in np.asarray(source_words).tolist()]
      return sources,  candidates, references

In [None]:
def print_examples(model, example_no = 10, dataset = None):
  """Print source, predicted and reference sentences for the first examples.

  Args:
    model: object whose ``eval(dataset, print_outputs=True)`` returns
      ``(sources, candidates, references)``.
    example_no: number of examples to print; capped at the number available.
    dataset: data to evaluate; defaults to the notebook-level ``test_data``.
  """
  if dataset is None:
    dataset = test_data

  sources,  candidates, references = model.eval(dataset,print_outputs=True)

  # range(example_no), not example_no-1, so exactly `example_no` examples are
  # shown; min() avoids an IndexError on small datasets.
  for i in range(min(example_no, len(sources))):

    print(f"example:{i+1}")
    print(f"Source sentence: {' '.join(sources[i]).replace('<pad>', '')}")
    print(f"Predicted translation: {' '.join(candidates[i]).replace('<pad>', '')}")
    # each reference entry is a one-element list holding the reference tokens
    print(f"Actual translation: {' '.join(references[i][0]).replace('<pad>', '')}")

In [None]:
def main(source_path, target_path, use_attention, epochs=10, max_example=10000):
  """Load the corpus, then build and train a translation model end to end.

  Args:
    source_path: path of the source-language file.
    target_path: path of the target-language file.
    use_attention: whether the model includes the attention layer.
    epochs: number of training epochs (default 10).
    max_example: maximum number of line pairs to load (default 10000).

  Returns:
    The trained ``NmtModel``, so the caller can evaluate or inspect it.
  """
  print('loading dictionaries')
  train_data, dev_data, test_data, source_dict, target_dict = load_dataset(source_path,target_path,max_num_examples=max_example)
  # the three numbers are the row counts of the source arrays of each split
  print("read %d/%d/%d train/dev/test batches" % (len(train_data[0]),len(dev_data[0]), len(test_data[0])))

  model = NmtModel(source_dict,target_dict,use_attention)
  model.build()
  model.train(train_data,dev_data,test_data,epochs)
  return model

In [None]:
#Clear session prior to creating the architecture
tf.keras.backend.clear_session()
# Baseline model: the third argument (use_attention) is False, so build() adds no attention layer.
model = NmtModel(source_lang_dict, target_lang_dict,False)
model.build()

number of tokens in source: 1514, number of tokens in target:1525
Task 1(a): Creating the embedding lookups...

Task 1(b): Looking up source and target words...

Task 1(c): Creating an encoder
						 Train Model Summary.
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_source (Embedding)   (None, None, 100)    151400      ['input_1[0][0]']                
                                                                       

  super(Adam, self).__init__(name, **kwargs)


Non-trainable params: 0
__________________________________________________________________________________________________
						 Inference Time Encoder Model Summary.
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_source (Embedding  (None, None, 100)        151400    
 )                                                               
                                                                 
 dropout_source_embedding (D  (None, None, 100)        0         
 ropout)                                                         
                                                                 
 encoder_LSTM (LSTM)         [(None, None, 200),       240800    
                              (None, 200),                       
                       

In [None]:
# Train for 20 epochs; train() scores the dev set after every epoch and the test set once at the end.
model.train(train_data,dev_data,test_data,20)

Starting training epoch 1/20
Time used for epoch 1: 9 m 55 s
Evaluating on dev set after epoch 1/20:


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Model BLEU score: 4.27
Time used for evaluate on dev set: 0 m 16 s
Starting training epoch 2/20
Time used for epoch 2: 10 m 21 s
Evaluating on dev set after epoch 2/20:
Model BLEU score: 0.55
Time used for evaluate on dev set: 0 m 15 s
Starting training epoch 3/20
Time used for epoch 3: 10 m 21 s
Evaluating on dev set after epoch 3/20:
Model BLEU score: 0.71
Time used for evaluate on dev set: 0 m 17 s
Starting training epoch 4/20
Time used for epoch 4: 10 m 21 s
Evaluating on dev set after epoch 4/20:
Model BLEU score: 1.18
Time used for evaluate on dev set: 0 m 13 s
Starting training epoch 5/20
Time used for epoch 5: 9 m 21 s
Evaluating on dev set after epoch 5/20:
Model BLEU score: 0.56
Time used for evaluate on dev set: 0 m 17 s
Starting training epoch 6/20
Time used for epoch 6: 9 m 13 s
Evaluating on dev set after epoch 6/20:
Model BLEU score: 1.33
Time used for evaluate on dev set: 0 m 16 s
Starting training epoch 7/20
Time used for epoch 7: 9 m 21 s
Evaluating on dev set after e

In [None]:
# Print example translations produced by the model built without attention.
print_examples(model)

Model BLEU score: 2.24
example:1
Source sentence: with green technology and with <unk> to <unk> poverty , and global <unk> , world can become like this .                                                                                                           
Predicted translation: <unk> <unk> : <unk> <unk> : <unk> <unk> , <unk> <unk> , <unk> <unk> , <unk> <unk> , <unk> , <unk> .
Actual translation: với công nghệ xanh và với các khoản đầu tư để xoá bỏ đói nghèo , và với sự quản lý toàn cầu hiệu quả , thế giới có thể trở thành như thế này .
example:2
Source sentence: and look at position of old west .                                                                                                                      
Predicted translation: và dĩ nhiên , <unk> <unk> : <unk> .
Actual translation: và hãy nhìn vào vị trí của phương tây của ngày xưa .
example:3
Source sentence: remember when this blue box was all alone , leading world , living its own life .                                 

In [None]:
#Clear session prior to creating the architecture
tf.keras.backend.clear_session()
# Attention variant: the third argument (use_attention) is True, so build() inserts AttentionLayer.
model_attention = NmtModel(source_lang_dict, target_lang_dict,True)
model_attention.build()

number of tokens in source: 1514, number of tokens in target:1525
Task 1(a): Creating the embedding lookups...

Task 1(b): Looking up source and target words...

Task 1(c): Creating an encoder
						 Train Model Summary.
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_source (Embedding)   (None, None, 100)    151400      ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                       

  super(Adam, self).__init__(name, **kwargs)



 Putting together the decoder states
						 Decoder Inference Model summary
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_target (Embedding)   (None, None, 100)    152500      ['input_2[0][0]']                
                                                                                                  
 dropout_target_embedding (Drop  (None, None, 100)   0           ['embedding_target[0][0]']       
 out)                                                                                             
                                                                                                  
 input_3 (Input

In [None]:
# Train the attention model for 10 epochs, with the same dev/test evaluation as train() always performs.
model_attention.train(train_data,dev_data,test_data,10)

Starting training epoch 1/10


In [None]:
# Print example translations produced by the attention model.
print_examples(model_attention)