# Build a seq2seq model for machine translation.

### Name: Zhili Yu



## 1. Data preparation

### 1.1. Load and clean text


In [16]:
import re
import string
from unicodedata import normalize
import numpy

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

#### Fill the following blanks:

In [17]:
# e.g., filename = 'Data/deu.txt'
filename = 'D:\CS583\CS583-2020S\homework\HM5\Data\deu.txt'

# e.g., n_train = 20000
n_train = 20000 #and test

In [18]:
# load dataset
doc = load_doc(filename)

# split into Language1-Language2 pairs
pairs = to_pairs(doc)

# clean sentences
clean_pairs = clean_data(pairs)[0:n_train, :]

In [19]:
for i in range(3000, 3010):
    print('[' + clean_pairs[i, 0] + '] => [' + clean_pairs[i, 1] + ']')

[you must go] => [du musst hingehen]
[you need it] => [ihr braucht es]
[you need it] => [sie brauchen es]
[you need me] => [du brauchst mich]
[you need me] => [ihr braucht mich]
[you need me] => [sie brauchen mich]
[you need us] => [du brauchst uns]
[you need us] => [sie brauchen uns]
[you need us] => [ihr braucht uns]
[you said it] => [ja es ist wirklich so]


In [20]:
input_texts = clean_pairs[:, 0]
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]

print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(input_texts.shape))

Length of input_texts:  (20000,)
Length of target_texts: (20000,)


In [21]:
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 17
max length of target sentences: 73


**Remark:** To this end, you have two lists of sentences: input_texts and target_texts

## 2. Text processing

### 2.1. Convert texts to sequences

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.

In [22]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# encode and pad sequences
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index, tokenizer


encoder_input_seq, input_token_index, tokenizer = text2sequences(max_encoder_seq_length, 
                                                      input_texts)
decoder_input_seq, target_token_index, _ = text2sequences(max_decoder_seq_length, 
                                                       target_texts)

print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (20000, 17)
shape of input_token_index: 27
shape of decoder_input_seq: (20000, 73)
shape of target_token_index: 29


In [23]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


The followings print a sentence and its representation as a sequence.

In [24]:
target_texts[100]

'\tmach ne fliege\n'

In [25]:
decoder_input_seq[100, :]

array([ 8, 13, 10, 12,  7,  1,  5,  2,  1, 21, 18,  3,  2, 17,  2,  9,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0])

## 2.2. One-hot encode

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.
- It is represented by a $n\times t \times v$ tensor ($t$ is the number of unique chars) after the one-hot encoding.

In [26]:
from keras.utils import to_categorical

# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = numpy.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)

decoder_target_seq = numpy.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq, 
                                    max_decoder_seq_length, 
                                    num_decoder_tokens)

print(encoder_input_data.shape)
print(decoder_input_data.shape)

(20000, 17, 28)
(20000, 73, 30)


## 3. Build the networks (for training)

- Build encoder, decoder, and connect the two modules to get "model". 

- Fit the model on the bilingual data to train the parameters in the encoder and decoder.

### 3.1. Encoder network

- Input:  one-hot encode of the input language

- Return: 

    -- output (all the hidden states   $h_1, \cdots , h_t$) are always discarded
    
    -- the final hidden state  $h_t$
    
    -- the final conveyor belt $c_t$

In [27]:
from keras.layers import Input, LSTM, Bidirectional, Concatenate
from keras.models import Model

latent_dim = 256

# inputs of the encoder network
encoder_inputs = Input(shape=(None, num_encoder_tokens), 
                       name='encoder_inputs')

# set the LSTM layer
encoder_lstm = Bidirectional(LSTM(latent_dim, return_state=True, return_sequences=True,
                    dropout=0.5, name='encoder_lstm'))
encoder_sequence, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_inputs)

state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# build the encoder network model
encoder_model = Model(inputs=encoder_inputs, 
                      outputs=[encoder_sequence,state_h, state_c],
                      name='encoder')

Print a summary and save the encoder network structure to "./encoder.pdf"

In [28]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(encoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=encoder_model, show_shapes=False,
    to_file='encoder.pdf'
)

encoder_model.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 28)     0                                            
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) [(None, None, 512),  583680      encoder_inputs[0][0]             
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 512)          0           bidirectional_2[0][1]            
                                                                 bidirectional_2[0][3]            
__________________________________________________________________________________________________
concatenate_4 (Concatenate)     (None, 512)          0           bidirectional_2[0][2]      

### 3.2. Decoder network

- Inputs:  

    -- one-hot encode of the target language
    
    -- The initial hidden state $h_t$ 
    
    -- The initial conveyor belt $c_t$ 

- Return: 

    -- output (all the hidden states) $h_1, \cdots , h_t$

    -- the final hidden state  $h_t$ (discarded in the training and used in the prediction)
    
    -- the final conveyor belt $c_t$ (discarded in the training and used in the prediction)

In [29]:
from keras.layers import Input, LSTM, Dense
from keras.models import Model

# inputs of the decoder network

decoder_input_h = Input(shape=(latent_dim*4,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim*4,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')
# set the LSTM layer
decoder_lstm = LSTM(latent_dim*4, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                                                      initial_state=[decoder_input_h, decoder_input_c])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

Print a summary and save the encoder network structure to "./decoder.pdf"

In [30]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(decoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=decoder_model, show_shapes=False,
    to_file='decoder.pdf'
)

decoder_model.summary()


Model: "decoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input_x (InputLayer)    (None, None, 30)     0                                            
__________________________________________________________________________________________________
decoder_input_h (InputLayer)    (None, 1024)         0                                            
__________________________________________________________________________________________________
decoder_input_c (InputLayer)    (None, 1024)         0                                            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 1024), 4321280     decoder_input_x[0][0]            
                                                                 decoder_input_h[0][0]      

### 3.3. Connect the encoder and decoder

define attention layer

In [35]:
from keras.engine.topology import Layer

class Attention(Layer):

    def __init__(self, input_dim, w_dim, **kwargs):
        self.input_dim = input_dim
        self.w_dim = w_dim
        super(Attention, self).__init__(**kwargs)

    def build(self):
        # Create a trainable weight variable for this layer.
        self.w_h = self.add_weight(name='w_h', 
                                      shape=(self.w_dim, self.input_dim),
                                      initializer='uniform',
                                      trainable=True)
        self.w_s = self.add_weight(name='w_s', 
                                      shape=(self.w_dim, self.input_dim),
                                      initializer='uniform',
                                      trainable=True)
        super(Attention, self).build()  # Be sure to call this at the end

    def call(self, inputs):
        h = inputs[0]
        s = inputs[1]
        k = numpy.zeros(shape=(h.shape[0],self.w_dim))
        for i in range(h.shape[0]):
            k[i] = numpy.multiply(self.w_h,h[i])
        q = numpy.multiply(self.w_s,s)
        alpha = numpy.multiply(k,q)
        alpha = numpy.exp(alpha)/numpy.sum(numpy.exp(alpha))
        return numpy.dot(h,alpha)

In [37]:
# input layers
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder
attention_layer = Attention(num_decoder_tokens+latent_dim*2,32)
encoder_final_states = encoder_model([encoder_input_x]) 
#return [encoder_sequence, state_h, state_c]
s = encoder_final_states[1]
c = encoder_final_states[2]
decoder_lstm_output = numpy.zeros(shape=(decoder_input_data.shape[1],latent_dim*2))
for i in range(decoder_input_data.shape[1]):
    r = attention_layer([encoder_final_states[0],s])
    s = numpy.concatenate(s,r)
    c = numpy.concatenate(c,r)
    decoder_lstm_output[i], s, c = decoder_lstm(decoder_input_x[i], initial_state=[s,c])
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=decoder_pred, 
              name='model_training')

TypeError: build() takes 1 positional argument but 2 were given

In [None]:
print(state_h)
print(decoder_input_h)

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=model, show_shapes=False,
    to_file='model_training.pdf'
)

model.summary()

### 3.5. Fit the model on the bilingual dataset

- encoder_input_data: one-hot encode of the input language

- decoder_input_data: one-hot encode of the input language

- decoder_target_data: labels (left shift of decoder_input_data)

- tune the hyper-parameters

- stop when the validation loss stop decreasing.

In [None]:
print('shape of encoder_input_data' + str(encoder_input_data.shape))
print('shape of decoder_input_data' + str(decoder_input_data.shape))
print('shape of decoder_target_data' + str(decoder_target_data.shape))

In [None]:

encoder_train_data = encoder_input_data[0:n_train-4000,:,:]
decoder_train_data = decoder_input_data[0:n_train-4000,:,:]
decoder_train_target_data = decoder_target_data[0:n_train-4000,:,:]



model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.fit([encoder_train_data, decoder_train_data],  # training data
          decoder_train_target_data,                       # labels (left shift of the target sequences)
          batch_size=64, epochs=50, validation_split=0.2, verbose=2)

model.save('seq2seq.h5')

## 4. Make predictions


### 4.1. Translate English to XXX

1. Encoder read a sentence (source language) and output its final states, $h_t$ and $c_t$.
2. Take the [star] sign "\t" and the final state $h_t$ and $c_t$ as input and run the decoder.
3. Get the new states and predicted probability distribution.
4. sample a char from the predicted probability distribution
5. take the sampled char and the new states as input and repeat the process (stop if reach the [stop] sign "\n").

In [None]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [None]:
import math
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = numpy.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    temperature = 0.5
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        output_tokens = numpy.power(output_tokens,temperature)
        output_tokens = output_tokens/numpy.sum(output_tokens[0,-1,:])
        sampled_token_index = numpy.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            sampled_token_index = 1
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = numpy.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence


In [None]:
for seq_index in range(2100, 2120):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('German (true): ', target_texts[seq_index][1:-1])
    print('German (pred): ', decoded_sentence[0:-1])


### 4.2. Translate an English sentence to the target language

1. Tokenization
2. One-hot encode
3. Translate

In [None]:
def text2sequences_translate(max_len, lines, tokenizer):
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index

In [None]:
input_sentence = ["i know them"]

input_sequence, _ = text2sequences_translate(max_encoder_seq_length, input_sentence,tokenizer)

input_x = onehot_encode(input_sequence, max_encoder_seq_length, num_encoder_tokens)
translated_sentence = decode_sequence(input_x)

print('source sentence is: ' + input_sentence[0])
print('translated sentence is: ' + translated_sentence)

## 5. Evaluate the translation using BLEU score

Reference: 
- https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
- https://en.wikipedia.org/wiki/BLEU

In [None]:
decoder_test_target_text = target_texts[n_train-2000:n_train]

seqs_target = [lines.split() for lines in decoder_test_target_text]
seqs_pred = []
for i in range(n_train-2000,n_train):
    input_seq = encoder_input_data[i: i + 1]
    decoded_sentence = decode_sequence(input_seq)
    seqs_pred.append(decoded_sentence.split())


In [None]:
from nltk.translate.bleu_score import sentence_bleu
bleu_score = 0
for i in range(2000):
    bleu_score += sentence_bleu([seqs_target[i]],seqs_pred[i],weights=(0.5, 0.5, 0, 0))
bleu_score = bleu_score/2000
print("BLEU score:"+str(bleu_score))