# Practical 5.2

## Word-level Sequence-to-Sequence (Seq2Seq) Model For Machine Translation

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#cd drive/My Drive/Recsys-2019/Seq2Seq

/content/drive/My Drive/Recsys-2019/Seq2Seq


In [0]:
from __future__ import print_function

import os
import sys
import numpy as np
import nltk
import string
from string import punctuation
import re

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, GRU, Lambda, Bidirectional, concatenate
from keras.layers import Reshape

Using TensorFlow backend.


In [0]:
import gensim
from gensim.models import Word2Vec

## 1. Data preprocessing

In [0]:
file_to_read = 'data/nld.txt'

Function to tokenize text into words (array list of words). Notice that we discard all punctuation in original text.

In [0]:
def tokenizeWords(text):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    clean_text = regex.sub('', text)
    tokens = clean_text.split()
   
       
    return [t.lower() for t in tokens]

In [0]:
def indexingVocabulary(array_of_words):
    
    # frequency of word across document corpus
    tf = nltk.FreqDist(array_of_words)
    wordIndex = list(tf.keys())
    
    wordIndex.insert(0,'<pad>')
    wordIndex.append('<start>')
    wordIndex.append('<end>')
    wordIndex.append('<unk>')
    # indexing word vocabulary : pairs of (index,word)
    vocab=dict([(i,wordIndex[i]) for i in range(len(wordIndex))])
    
    return vocab

In [0]:
#reading text line by line
lines = open(os.path.join('./',file_to_read)).read().split('\n')

### Tokenization and vocabulary indexing

Notice that we only use 10.000 samples from data. Training a Seq2Seq model is computationally expensive (memory!)

In [0]:
num_samples = 10000  # Number of samples to train on.

input_str_tokens = []
target_str_tokens = []

ind_start = 10000
ind_end = 10000 + min(num_samples, len(lines) - 1)

for line in lines[ind_start : ind_end]:
    input_text, target_text = line.split('\t')
    # tokenize text from source language (english text)
    input_str_tokens.append(tokenizeWords(input_text))
    # tokenize text from target language (dutch text)
    target_str_tokens.append(tokenizeWords(target_text))

* `en_vocab` stores word index for encoder input sequences (english dictionary)
* `nl_vocab` stores word index for decoder target sequences (dutch dictionary)

In [0]:
# build vocabulary index
input_words = []
target_words = []

for i, tokens in enumerate(input_str_tokens):
    input_words.extend(tokens)
# vocabulary index for english text (input)    
en_vocab = indexingVocabulary(input_words)

for i, tokens in enumerate(target_str_tokens):
    target_words.extend(tokens)
# vocabulary index for dutch text (output)
nl_vocab = indexingVocabulary(target_words)

We also need to create reverse version of look up index to map text sequences into integer format.

In [0]:
en_reversedvocab = dict((v,k) for (k,v) in en_vocab.items())
nl_reversedvocab = dict((v,k) for (k,v) in nl_vocab.items())

### Preparing training sequences


* `seq_int_input`: input sequences for encoder model 
* `seq_int_target`: input sequences for decoder model

In [0]:
# integer format of sequence input 
seq_int_input = []
for i, text in enumerate(input_str_tokens):
    int_tokens = [en_reversedvocab[i] for i in text]
    seq_int_input.append(int_tokens)

For input and output sequences of decoder model, we add `starting` sign (`'<start>'`) and `ending` sign (`'<end>'`) at the beginning and last part of sequence. 

In [0]:
seq_int_target = []
for i, text in enumerate(target_str_tokens):
    targettext = list(text)
    targettext.insert(0,'<start>')
    targettext.append('<end>')
  
    int_tokens = [nl_reversedvocab[i] for i in targettext]
    seq_int_target.append(int_tokens)

## 2. Word embedding

* In character level, we define input and output sequences as one-hot vector in 3D numpy arrays (number of samples, sequence length, vocabulary size). 
* For word-level, we have integer input sequences with 2D shape (number of samples, sequence length). Instead of one-hot encoding words, we will use embedding layer to project each word sequence to its embedding.
* We will train our text with Word2Vec - Skipgram to provide initial weight for our embedding layer (may also use pretrained word embedding).

In [0]:
# for english text
# skipgram model with hierarchical softmax and negative sampling
word2vec_model_en = Word2Vec(size=256, min_count=0, window=5, sg=1, 
                          hs=1, negative=5, iter=100)

In [0]:
word2vec_model_en.build_vocab(input_str_tokens)
word2vec_vocab_en = dict([(v.index,k) for k, v in word2vec_model_en.wv.vocab.items()]) 
revert_w2v_vocab_en = dict((v,k) for (k,v) in word2vec_vocab_en.items())

In [0]:
# for dutch text
# skipgram model with hierarchical softmax and negative sampling
word2vec_model_nl = Word2Vec(size=256, min_count=0, window=5, sg=1, 
                          hs=1, negative=5, iter=100)

In [0]:
word2vec_model_nl.build_vocab(target_str_tokens)
word2vec_vocab_nl = dict([(v.index,k) for k, v in word2vec_model_nl.wv.vocab.items()]) 
revert_w2v_vocab_nl = dict((v,k) for (k,v) in word2vec_vocab_nl.items())

In [0]:
print('Training word2vec model...')

# for english text
# number of tokens
n_tokens = sum([len(seq) for seq in input_str_tokens])
# number of sentences/documents
n_examples = len(input_str_tokens)
word2vec_model_en.train(input_str_tokens, total_words=n_tokens, 
                        total_examples=n_examples, epochs=100)

Training word2vec model...


(4257876, 5946200)

In [0]:
# for dutch text
# number of tokens
n_tokens = sum([len(seq) for seq in target_str_tokens])
# number of sentences/documents
n_examples = len(target_str_tokens)
word2vec_model_nl.train(target_str_tokens, total_words=n_tokens, 
                        total_examples=n_examples, epochs=100)

(4297542, 6123900)

The following variables store our word embedding learnt from word2vec skipgram.

In [0]:
# the resulting learnt word embedding 
# for input text sequence (english language)
word2vec_we_en = word2vec_model_en.wv.syn0

# for target text sequence (dutch language)
word2vec_we_nl = word2vec_model_nl.wv.syn0

  """Entry point for launching an IPython kernel.
  after removing the cwd from sys.path.


Notice that vocabulary size of word embedding learnt by word2vec is less than our vocabulary size since we add additional word tokens: `'<pad>'`, `'<start>'`, `'<end>'`, `'<unk>'`

In [0]:
word2vec_we_en.shape

(4163, 256)

In [0]:
word2vec_we_nl.shape

(5280, 256)

In [0]:
embedding_en = np.zeros(shape=(len(en_vocab), 256), dtype='float32')
embedding_nl = np.zeros(shape=(len(nl_vocab), 256), dtype='float32')

In [0]:
embedding_en.shape

(4167, 256)

In [0]:
embedding_nl.shape

(5284, 256)

In [0]:
# for input sequences (text in english language)
for i, w in en_vocab.items():
    # this will assign default weight 0 for words: 'SOF', 'EOF', and 'UNK'
    if w not in word2vec_vocab_en.values():
        continue
    embedding_en[i, :] = word2vec_we_en[revert_w2v_vocab_en[w], :]

In [0]:
# for target output sequences (text in dutch language)
for i, w in nl_vocab.items():
    # this will assign default weight 0 for words: 'SOF', 'EOF', and 'UNK'
    if w not in word2vec_vocab_nl.values():
        continue
    embedding_nl[i, :] = word2vec_we_nl[revert_w2v_vocab_nl[w], :]

## 3. Word-based Translation model

Parameters initialization.

In [0]:
batch_size = 100  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
#num_samples = 1000  # Number of samples to train on.

EN_VOCAB_SIZE = len(en_vocab) # vocabulary size of english words
EN_EMBEDDING_DIM = embedding_en.shape[1] # embedding size of english words
NL_VOCAB_SIZE = len(nl_vocab) # vocabulary size of dutch words
NL_EMBEDDING_DIM = embedding_nl.shape[1] # embedding size of dutch words

## 3.1. Encoder model

Notice that in this example, we set parameters in our embedding layer to be trainable. 

In [0]:
# encoder model
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(EN_VOCAB_SIZE, EN_EMBEDDING_DIM, trainable = True, 
              weights=[embedding_en], name='embedding_encoder')(encoder_inputs)
gru_encoder = GRU(latent_dim, name='lstm_encoder')
encoder_states = gru_encoder(encoder_embedding)


Instructions for updating:
Colocations handled automatically by placer.


## 3.2. Decoder model

In [0]:
# decoder model
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
decoder_embedding = Embedding(NL_VOCAB_SIZE, NL_EMBEDDING_DIM, trainable = True, 
              weights=[embedding_nl], name='embedding_decoder')
embedding_output = decoder_embedding(decoder_inputs)

s0 = Input(shape=(latent_dim,), name='s0') 
s = [s0]
    
gru_decoder = GRU(latent_dim, return_state=True)


decoder_dense = Dense(NL_VOCAB_SIZE, activation='softmax')

probs = []

for t in range(16):
  
    x_dec = Lambda(lambda x: x[:,t,:], name='dec_embedding-%s'%t)(embedding_output)
    x_dec = Reshape((1, NL_EMBEDDING_DIM))(x_dec)
    
    enc_state_reshape = Reshape((1, 256))(encoder_states)
    
    context_concat = concatenate([x_dec, enc_state_reshape],axis=-1)
       
    
    if t==0:
        s = encoder_states
        
        
    s, _  = lstm_decoder(context_concat, initial_state=s)
    
    prob = decoder_dense(s)
    probs.append(prob)
    
    s = [s]
    


In [0]:
model = Model(inputs=[encoder_inputs, decoder_inputs, s0], outputs=probs)

In [0]:
# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [0]:
model.summary()

## Training translation model

In [0]:
max_encoder_seq_length = max([len(sequences) for sequences in seq_int_input])
max_decoder_seq_length = max([len(sequences) for sequences in seq_int_target])

In [0]:
seq_int_target = np.array(seq_int_target)

In [0]:
seq_int_target.shape

(10000,)

In [0]:
len(seq_int_target[0])

6

In [0]:
seq_int_target[0]

[5281, 1, 2, 3, 4, 5282]

In [0]:
seq_int_target[0][:-1]

[5281, 1, 2, 3, 4]

In [0]:
seq_int_target[0][1:]

[1, 2, 3, 4, 5282]

In [0]:
seq_int_target[1]

[5281, 5, 6, 7, 8, 9, 5282]

In [0]:
max_encoder_seq_length

10

In [0]:
max_decoder_seq_length

16

In [0]:
encoder_input_data = np.zeros((len(seq_int_input), max_encoder_seq_length), dtype='float32')

In [0]:
decoder_input_data = np.zeros((len(seq_int_input), max_decoder_seq_length), dtype='float32')

In [0]:
decoder_target = np.zeros((len(seq_int_input), max_decoder_seq_length), dtype='float32')

In [0]:
encoder_input_data.shape

(10000, 10)

In [0]:
decoder_input_data.shape

(10000, 16)

In [0]:
decoder_target.shape

(10000, 16)

### Padding input sequences for encoder and decoder

In [0]:
for i, seq_int in enumerate(seq_int_input):
    for j, word_index in enumerate(seq_int):
        encoder_input_data[i][j] = word_index

In [0]:
for i, seq_int in enumerate(seq_int_target):
  
  seq_in = seq_int[:-1]
  seq_out = seq_int[1:]
  
  for j, word_index in enumerate(seq_in):
    decoder_input_data[i][j] = word_index
    
  for j, word_index in enumerate(seq_out):
    decoder_target[i][j] = word_index

In [0]:
decoder_input_data.shape

(10000, 16)

In [0]:
decoder_target.shape

(10000, 16)

In [0]:
len(nl_vocab)

5284

### Fitting sequences into model

* Note: creating 3D numpy arrays of decoder output (one-hot-encoding) might cause Memory Error. 


In [0]:
import tensorflow as tf
from keras.utils import to_categorical
class Dataiterator():

    def __init__(self, X, y_in, y_out, vocab_size=5284, seq_length=10, decoder_dim=256, batch_size=32):
        
        self.X = X
        self.y_in = y_in 
        self.y_out = y_out
        self.states = np.zeros((len(X), decoder_dim)) 
        self.num_data = len(X) 
        self.vocab_size = vocab_size
        self.batch_size = batch_size 
        self.seq_length = seq_length
        self.reset() # initial: shuffling examples and set index to 0
        
    
    def onehotencoding(self, data):
      
            
        return to_categorical(data, num_classes=self.vocab_size, dtype='int32')
    
    def __iter__(self): # iterates data
        
        return self


    def reset(self): # initials
        
        self.idx = 0
        self.order = np.random.permutation(self.num_data) # shuffling examples by providing randomized ids 
        
    def __next__(self): # return model inputs - outputs per batch
        
        X_ids = [] # hold ids per batch 

        while len(X_ids) < self.batch_size:

            X_id = self.order[self.idx] # copy random id from initial shuffling
            X_ids.append(X_id)

            self.idx += 1 # 
            if self.idx >= self.num_data: # exception if all examples of data have been seen (iterated)
                self.reset()
                raise StopIteration()
    
        batch_X = self.X[np.array(X_ids)] # X values (encoder input) per batch
        batch_y_in = self.y_in[np.array(X_ids)] # y_in values (decoder input) per batch
        batch_y_out = self.y_out[np.array(X_ids)]
        batch_states = self.states[np.array(X_ids)] # state values (decoder state input) per batch
        batch_y = self.onehotencoding(batch_y_out)
        
      
        
        return batch_X, batch_y_in, batch_states, list(batch_y.swapaxes(0,1))

    # return all data examples 
    def all(self):
      
        y = self.onehotencoding(self.y_out)
        
        return self.X, self.y_in, self.states, list(y.swapaxes(0,1))



In [0]:
encoder_input_data.shape

(10000, 10)

In [0]:
decoder_input_data.shape

(10000, 16)

In [0]:
decoder_target.shape

(10000, 16)

In [0]:
x_train = encoder_input_data[:8000]
y_train_in = decoder_input_data[:8000]
y_train_out = decoder_target[:8000]

In [0]:
x_train = np.array(x_train)
y_train_in = np.array(y_train_in)
y_train_out = np.array(y_train_out)

In [0]:
x_dev = encoder_input_data[8000:len(encoder_input_data)]
y_dev_in = decoder_input_data[8000:len(encoder_input_data)]
y_dev_out = decoder_target[8000:len(encoder_input_data)]

In [0]:
x_dev = np.array(x_dev)
y_dev_in = np.array(y_dev_in)
y_dev_out = np.array(y_dev_out)

In [0]:
batch_size = 32

In [0]:
train_steps_epoch = len(x_train)/batch_size
print("train_steps_epoch : %s" %train_steps_epoch)
batch_train_iter = Dataiterator(x_train, y_train_in, y_train_out)

train_steps_epoch : 250.0


In [0]:
dev_steps_epoch = len(x_dev)/batch_size
print("dev_steps_epoch : %s" %dev_steps_epoch)
batch_dev_iter = Dataiterator(x_dev, y_dev_in, y_dev_out)

dev_steps_epoch : 62.5


In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def train_generator(model, batch_train_iter, batch_val_iter):
    
    earlystop_callbacks = [EarlyStopping(monitor='val_loss', patience=10),
                     ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'), \
                                     monitor='val_loss', save_best_only=False, \
                                     save_weights_only=True)
                     ]
    
    def train_gen():
        while True:
            train_batches = [[[X, y_in, state], y_out] for X, y_in, \
                             state, y_out in batch_train_iter]
            for train_batch in train_batches:
                yield train_batch
                
    def val_gen():
        while True:
            val_batches = [[[X, y_in, state], y_out] for X, y_in, \
                           state, y_out in batch_val_iter]
            for val_batch in val_batches:
                yield val_batch
                
    history = model.fit_generator(train_gen(), validation_data=val_gen(), \
                                  validation_steps=dev_steps_epoch, steps_per_epoch=train_steps_epoch, \
                                  epochs = 20, callbacks = earlystop_callbacks)
      

In [0]:
train_generator(model, batch_train_iter, batch_dev_iter)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [0]:
train_generator(model, batch_train_iter, batch_dev_iter)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [0]:
train_generator(model, batch_train_iter, batch_dev_iter)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [0]:
model.save_weights('./weights_mt.hdf5')

## 4. Inference mode

## 4.1. Re-define encoder model

In [0]:
encoder_model = Model(encoder_inputs, encoder_states)

In [0]:
encoder_model.save('encoder_word_lstm_translation.h5')

## 4.2. Re-define decoder model to do the inference

In [0]:
in_decoder = Input(shape=(1, ), name='decoder_input')
in_state_decoder = Input(shape=(256,), name='dec_state_in')
enc_out = Input(shape=(256,), name='enc_state')

in_dec_embedded =  decoder_embedding(in_decoder)

in_dec_embedded = Reshape((1,256))(in_dec_embedded)
enc_out_ = Reshape((1,256))(enc_out)

context_concat = concatenate([in_dec_embedded, enc_out_],axis=-1)


s, _ = gru_decoder(context_concat, initial_state=[in_state_decoder])

softmax_prob = decoder_dense(s)
dec_states = [s]

decoder_model = Model([in_decoder] + [enc_out] + [in_state_decoder] , [softmax_prob] + dec_states)