# Machine Translation Using a Sequence-to-Sequence Model

## Setup

In [None]:
# Select the TensorFlow 2.x runtime.
# NOTE(review): this line magic appears to be Colab-specific -- confirm the target runtime still supports it.
%tensorflow_version 2.x

In [None]:
!pip install janome nltk



In [None]:
!mkdir data
!mkdir models
!wget http://www.manythings.org/anki/jpn-eng.zip -P data/
!unzip data/jpn-eng.zip -d data/

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘models’: File exists
--2020-11-16 12:50:42--  http://www.manythings.org/anki/jpn-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 172.67.173.198, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2387505 (2.3M) [application/zip]
Saving to: ‘data/jpn-eng.zip.3’


2020-11-16 12:50:43 (8.76 MB/s) - ‘data/jpn-eng.zip.3’ saved [2387505/2387505]

Archive:  data/jpn-eng.zip
replace data/jpn.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace data/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


### Imports

In [None]:
from collections import defaultdict

import numpy as np
import tensorflow as tf
from janome.tokenizer import Tokenizer
from nltk.translate.bleu_score import corpus_bleu
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Embedding, GRU, Dot, Activation, Concatenate
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Hyper-parameters

In [None]:
# Mini-batch size passed to model.fit.
batch_size = 32
# Upper bound on training epochs; early stopping may end training sooner.
epochs = 100
# Checkpoint file for the model weights (written during training, read back for inference).
# NOTE(review): 'mode.h5' looks like a typo for 'model.h5'; harmless because the same
# variable is used for both saving and loading.
model_path = 'models/mode.h5'
# JSON files holding the inference-time encoder / decoder architectures.
enc_arch = 'models/encoder.json'
dec_arch = 'models/decoder.json'
# Tab-separated English/Japanese sentence pairs extracted from jpn-eng.zip.
data_path = 'data/jpn.txt'
# Vocabulary size limit, also used as the embedding input size and output layer size.
num_words = 10000
# Number of sentence pairs (taken from the start of the file) used in this notebook.
num_data = 20000

## The dataset

### Load the Bilingual Sentence Pairs

In [None]:
def load_dataset(filename):
    """Read tab-separated English/Japanese sentence pairs from a text file.

    Each non-blank line must hold at least two tab-separated fields: the
    English sentence followed by the Japanese sentence. Any further fields
    on the line are ignored. Blank lines are skipped.

    Args:
        filename: path to the corpus file
            (e.g. extracted from http://www.manythings.org/anki/jpn-eng.zip).

    Returns:
        A tuple ``(en_texts, ja_texts)`` of two equally long lists of strings.

    Raises:
        ValueError: if a non-blank line has fewer than two fields.
    """
    en_texts = []
    ja_texts = []
    # The corpus contains Japanese text, so decode explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: expected at least 2 tab-separated fields, got {}'.format(
                        filename, line_no, len(fields)))
            en_text, ja_text = fields[:2]
            en_texts.append(en_text)
            ja_texts.append(ja_text)
    return en_texts, ja_texts

In [None]:
# Read the whole corpus, then keep only the first `num_data` pairs of each language.
en_texts, ja_texts = (texts[:num_data] for texts in load_dataset(data_path))

### Preprocess the dataset

In [None]:
# Single janome tokenizer instance shared by every call to `tokenize`,
# constructed in wakati mode.
t = Tokenizer(wakati=True)


def tokenize(text):
    """Segment ``text`` with the shared janome tokenizer and return its tokens.

    Args:
        text: the Japanese sentence to segment.

    Returns:
        Whatever ``t.tokenize`` yields for ``text`` (an iterable of tokens).
    """
    tokens = t.tokenize(text)
    return tokens


def build_vocabulary(texts, num_words=None):
    """Fit a Keras text tokenizer on ``texts`` and return it.

    Args:
        texts: iterable of whitespace-separated sentences.
        num_words: optional cap on the vocabulary size, forwarded to Keras.

    Returns:
        The fitted ``tf.keras.preprocessing.text.Tokenizer``.
    """
    # An empty filter string keeps every character, so markers such as
    # '<start>' / '<end>' are preserved as tokens; unknown words map to '<UNK>'.
    options = dict(num_words=num_words, oov_token='<UNK>', filters='')
    vocab = tf.keras.preprocessing.text.Tokenizer(**options)
    vocab.fit_on_texts(texts)
    return vocab


def preprocess_dataset(texts):
    """Surround every text with ``<start>`` and ``<end>`` sentence markers.

    Args:
        texts: iterable of sentences.

    Returns:
        A new list with one wrapped sentence per input sentence.
    """
    wrap = '<start> {} <end>'.format
    return list(map(wrap, texts))


def preprocess_ja(texts):
    """Tokenize each Japanese sentence and re-join its tokens with single spaces.

    Args:
        texts: iterable of raw Japanese sentences.

    Returns:
        A list of space-separated token strings, one per input sentence.
    """
    segmented = []
    for sentence in texts:
        segmented.append(' '.join(tokenize(sentence)))
    return segmented


def create_dataset(en_texts, ja_texts, en_vocab, ja_vocab):
    """Convert sentence pairs into padded id arrays for teacher-forced training.

    Args:
        en_texts: source sentences.
        ja_texts: target sentences.
        en_vocab: fitted tokenizer for the source language.
        ja_vocab: fitted tokenizer for the target language.

    Returns:
        ``([encoder_input, decoder_input], decoder_target)`` where the decoder
        input is the target sequence without its last position and the decoder
        target is the same sequence without its first position.
    """
    source = pad_sequences(en_vocab.texts_to_sequences(en_texts), padding='post')
    target = pad_sequences(ja_vocab.texts_to_sequences(ja_texts), padding='post')
    # Shift by one step: the decoder sees position t and must predict t + 1.
    decoder_input = target[:, :-1]
    decoder_target = target[:, 1:]
    return [source, decoder_input], decoder_target

In [None]:
# Segment the Japanese sentences into space-separated tokens, then wrap both
# languages in <start>/<end> markers.
# NOTE(review): this cell rebinds its own inputs (ja_texts, en_texts), so running it
# a second time without re-running the loading cell applies the preprocessing twice.
ja_texts = preprocess_ja(ja_texts)
ja_texts = preprocess_dataset(ja_texts)
en_texts = preprocess_dataset(en_texts)
# Hold out 20% of the pairs for evaluation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts, test_size=0.2, random_state=42)
# Fit the vocabularies on the training split only.
en_vocab = build_vocabulary(x_train, num_words)
ja_vocab = build_vocabulary(y_train, num_words)
# Turn the training texts into padded id arrays: x_train becomes
# [encoder_input, decoder_input] and y_train the decoder targets.
# x_test / y_test remain plain text at this point.
x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

## The models

### Build a simple model

In [None]:
class BaseModel:
    """Base class providing architecture export and model loading helpers."""

    def build(self):
        """Return the model object; concrete subclasses must override this."""
        raise NotImplementedError()

    def save_as_json(self, filepath):
        """Build the model and write its JSON architecture to ``filepath``.

        Args:
            filepath: destination path; an existing file is overwritten.
        """
        built = self.build()
        with open(filepath, 'w') as out:
            out.write(built.to_json())

    @classmethod
    def load(cls, architecture_file, weight_file, by_name=True):
        """Recreate a model from a JSON architecture and load its weights.

        Args:
            architecture_file: path to a file written by ``save_as_json``.
            weight_file: path to the saved weights.
            by_name: forwarded to ``load_weights``.

        Returns:
            The model returned by ``model_from_json`` with weights loaded.
        """
        with open(architecture_file) as src:
            architecture = src.read()
        restored = model_from_json(architecture)
        restored.load_weights(weight_file, by_name=by_name)
        return restored


class Encoder(BaseModel):
    """GRU encoder: token ids -> masked embedding -> GRU.

    Args:
        input_dim: size of the source vocabulary (embedding input size).
        emb_dim: embedding dimension.
        hid_dim: number of GRU units.
        return_sequences: forwarded to the GRU; set it to True when the
            per-step outputs are needed (e.g. for attention).
    """

    def __init__(self, input_dim, emb_dim=300, hid_dim=256, return_sequences=False):
        self.input = Input(shape=(None,), name='encoder_input')
        self.embedding = Embedding(
            input_dim=input_dim,
            output_dim=emb_dim,
            mask_zero=True,  # id 0 is treated as padding
            name='encoder_embedding',
        )
        self.gru = GRU(
            hid_dim,
            return_sequences=return_sequences,
            return_state=True,
            name='encoder_gru',
        )

    def __call__(self):
        """Wire the layers onto the encoder input and return ``(output, state)``."""
        embedded = self.embedding(self.input)
        output, state = self.gru(embedded)
        return output, state

    def build(self):
        """Return a Model mapping the encoder input to ``[output, state]``."""
        return Model(inputs=self.input, outputs=list(self()))


class Decoder(BaseModel):
    """GRU decoder: token ids -> masked embedding -> GRU -> softmax over the vocabulary.

    Args:
        output_dim: size of the target vocabulary (embedding input size and
            width of the softmax output).
        emb_dim: embedding dimension.
        hid_dim: number of GRU units.
    """

    def __init__(self, output_dim, emb_dim=300, hid_dim=256):
        self.input = Input(shape=(None,), name='decoder_input')
        self.embedding = Embedding(
            input_dim=output_dim,
            output_dim=emb_dim,
            mask_zero=True,  # id 0 is treated as padding
            name='decoder_embedding',
        )
        self.gru = GRU(
            hid_dim,
            return_sequences=True,
            return_state=True,
            name='decoder_gru',
        )
        self.dense = Dense(output_dim, activation='softmax', name='decoder_output')

        # Extra input used only by the stand-alone inference model (see build()).
        self.state_input = Input(shape=(hid_dim,), name='decoder_state_in')

    def __call__(self, states, enc_output=None):
        """Wire the decoder layers and return ``(probabilities, state)``.

        Args:
            states: initial state for the GRU.
            enc_output: accepted for interface compatibility; not used here.
        """
        embedded = self.embedding(self.input)
        hidden, state = self.gru(embedded, initial_state=states)
        probabilities = self.dense(hidden)
        return probabilities, state

    def build(self):
        """Return the inference Model: ``[tokens, state] -> [probabilities, state]``."""
        probabilities, next_state = self(states=self.state_input)
        model_inputs = [self.input, self.state_input]
        model_outputs = [probabilities, next_state]
        return Model(inputs=model_inputs, outputs=model_outputs)


class LuongAttention:
    """Dot-product attention followed by a tanh projection.

    Args:
        units: width of the final tanh projection.
    """

    def __init__(self, units=300):
        self.dot = Dot(axes=[2, 2], name='dot')
        self.attention = Activation(activation='softmax', name='attention')
        self.context = Dot(axes=[2, 1], name='context')
        self.concat = Concatenate(name='concat')
        self.fc = Dense(units, activation='tanh', name='attn_out')

    def __call__(self, enc_output, dec_output):
        """Combine decoder outputs with an attention-weighted encoder context.

        Args:
            enc_output: per-step encoder outputs.
            dec_output: per-step decoder outputs.

        Returns:
            The projected concatenation of the context vectors and ``dec_output``.
        """
        # Score every decoder step against every encoder step (dot over axis 2).
        scores = self.dot([dec_output, enc_output])
        weights = self.attention(scores)
        # Weighted sum of the encoder outputs for each decoder step.
        context_vector = self.context([weights, enc_output])
        merged = self.concat([context_vector, dec_output])
        return self.fc(merged)


class AttentionDecoder(Decoder):
    """Decoder that applies ``LuongAttention`` over the encoder outputs before the softmax.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimension.
        hid_dim: number of GRU units; also the feature size expected of the
            encoder outputs fed to the inference model.
    """

    def __init__(self, output_dim, emb_dim=300, hid_dim=256):
        super().__init__(output_dim, emb_dim, hid_dim)
        self.attention = LuongAttention()
        # Extra input used only by the stand-alone inference model (see build()).
        self.enc_output = Input(shape=(None, hid_dim), name='encoder_output')

    def __call__(self, states, enc_output=None):
        """Wire the decoder layers with attention and return ``(probabilities, state)``.

        Args:
            states: initial state for the GRU.
            enc_output: per-step encoder outputs that the attention attends over.
        """
        embedded = self.embedding(self.input)
        hidden, state = self.gru(embedded, initial_state=states)
        attended = self.attention(enc_output, hidden)
        probabilities = self.dense(attended)
        return probabilities, state

    def build(self):
        """Return the inference Model: ``[tokens, encoder outputs, state] -> [probabilities, state]``."""
        probabilities, next_state = self(
            states=self.state_input,
            enc_output=self.enc_output,
        )
        model_inputs = [self.input, self.enc_output, self.state_input]
        model_outputs = [probabilities, next_state]
        return Model(inputs=model_inputs, outputs=model_outputs)


class Seq2seq(BaseModel):
    """Training-time model that chains an encoder and a decoder.

    Args:
        encoder: object whose call returns ``(output, state)`` and that exposes ``input``.
        decoder: object called with ``states`` / ``enc_output`` that exposes ``input``.
    """

    def __init__(self, encoder, decoder):
        self.encoder = encoder
        self.decoder = decoder

    def build(self):
        """Return a Model mapping ``[encoder input, decoder input]`` to the decoder output."""
        enc_output, enc_state = self.encoder()
        # The decoder starts from the encoder's final state; its own final
        # state is not needed during training.
        predictions, _ = self.decoder(states=enc_state, enc_output=enc_output)
        model_inputs = [self.encoder.input, self.decoder.input]
        return Model(model_inputs, predictions)

In [None]:
# Assemble the plain (attention-free) encoder/decoder pair into one training model.
encoder = Encoder(num_words)
decoder = Decoder(num_words)
seq2seq = Seq2seq(encoder, decoder)
model = seq2seq.build()
# The targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

### Train the model

In [None]:
# Stop training after 3 epochs without improvement and keep only the best weights on disk.
callbacks = [
    EarlyStopping(patience=3),
    ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
]
# 10% of the training data is set aside as the validation set.
model.fit(x=x_train,
          y=y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_split=0.1)

# Export the stand-alone encoder/decoder architectures used for inference;
# their weights are later loaded by layer name from the checkpoint at model_path.
encoder.save_as_json(enc_arch)
decoder.save_as_json(dec_arch)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


### Evaluate the model

#### Define Inference classes

In [None]:
class InferenceAPI:
    """A model API that generates an output sequence by greedy decoding.

    Attributes:
        encoder_model: Model whose ``predict`` returns ``(output, state)``.
        decoder_model: Model whose ``predict`` returns ``(probabilities, state)``.
        en_vocab: source language's vocabulary.
        ja_vocab: target language's vocabulary.
    """

    def __init__(self, encoder_model, decoder_model, en_vocab, ja_vocab):
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        self.en_vocab = en_vocab
        self.ja_vocab = ja_vocab

    def predict(self, text):
        """Translate ``text`` and return the generated target tokens.

        Args:
            text: string, the source sentence.

        Returns:
            decoded: list of target-language tokens (empty when nothing was generated).
        """
        output, state = self._compute_encoder_output(text)
        sequence = self._generate_sequence(output, state)
        decoded = self._decode(sequence)
        return decoded

    def _compute_encoder_output(self, text):
        """Compute encoder output.

        Args:
            text : string, the input text.

        Returns:
            output: encoder's output.
            state : encoder's final state.
        """
        assert isinstance(text, str)
        # Pass an explicit (1, seq_len) array rather than a nested Python list
        # so the batch dimension does not depend on how Keras interprets lists.
        x = np.asarray(self.en_vocab.texts_to_sequences([text]))
        output, state = self.encoder_model.predict(x)
        return output, state

    def _compute_decoder_output(self, target_seq, state, enc_output=None):
        """Compute decoder output for one decoding step.

        Args:
            target_seq: target sequence fed to the decoder.
            state: hidden state.
            enc_output: encoder's output; unused by this (attention-free) decoder.

        Returns:
            output: decoder's output.
            state: decoder's state.
        """
        output, state = self.decoder_model.predict([target_seq, state])
        return output, state

    def _generate_sequence(self, enc_output, state, max_seq_len=50):
        """Generate a sequence greedily, one token per decoder step.

        Args:
            enc_output: encoder's output, forwarded to every decoder step.
            state: initial state of the decoder.
            max_seq_len: maximum number of tokens to generate.

        Returns:
            sequence: list of generated token ids, without the end marker.
        """
        # Look the marker ids up once instead of on every iteration.
        start_index = self.ja_vocab.word_index['<start>']
        end_index = self.ja_vocab.word_index['<end>']
        # Explicit (batch=1, time=1) shape instead of relying on Keras to
        # expand a 1-D array.
        target_seq = np.array([[start_index]])
        sequence = []
        for _ in range(max_seq_len):
            output, state = self._compute_decoder_output(target_seq, state, enc_output)
            sampled_token_index = int(np.argmax(output[0, 0]))
            if sampled_token_index == end_index:
                break
            sequence.append(sampled_token_index)
            # Feed the token just generated back in as the next decoder input.
            target_seq = np.array([[sampled_token_index]])
        return sequence

    def _decode(self, sequence):
        """Decode a sequence of token ids into tokens.

        Args:
            sequence: a generated sequence.

        Returns:
            decoded: list of tokens; an empty list for an empty sequence.
        """
        text = self.ja_vocab.sequences_to_texts([sequence])[0]
        # ''.split(' ') yields [''], which would hand BLEU a bogus one-token
        # hypothesis, so map empty text to an empty token list.
        decoded = text.split(' ') if text else []
        return decoded


class InferenceAPIforAttention(InferenceAPI):
    """Inference API for a decoder that also takes the encoder output as input."""

    def _compute_decoder_output(self, target_seq, state, enc_output=None):
        """Run one decoder step, feeding the encoder output alongside the state.

        Args:
            target_seq: target sequence fed to the decoder.
            state: hidden state.
            enc_output: encoder's output.

        Returns:
            The decoder's output and its new state.
        """
        decoder_inputs = [target_seq, enc_output, state]
        probabilities, next_state = self.decoder_model.predict(decoder_inputs)
        return probabilities, next_state

#### Define evaluation function

In [None]:
def evaluate_bleu(X, y, api):
    """Compute the corpus-level BLEU score of ``api`` on a test set.

    Targets that share the same source sentence are grouped so that each
    distinct source is translated once and scored against all its references.

    Args:
        X: source sentences.
        y: tokenized reference translations, aligned with ``X``.
        api: object whose ``predict(source)`` returns a list of tokens.

    Returns:
        The score returned by ``corpus_bleu``.
    """
    references_by_source = defaultdict(list)
    for source, target in zip(X, y):
        references_by_source[source].append(target)

    # Keep hypotheses and references aligned by iterating the sources once.
    sources = list(references_by_source)
    references = [references_by_source[source] for source in sources]
    hypothesis = [api.predict(source) for source in sources]
    return corpus_bleu(references, hypothesis)

#### Perform evaluation

In [None]:
# Rebuild the inference-time encoder/decoder from their JSON architectures and
# load the checkpointed weights by layer name.
# Note: `encoder` / `decoder` are rebound here from the builder objects to Keras models.
encoder = Encoder.load(enc_arch, model_path)
decoder = Decoder.load(dec_arch, model_path)
api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)

# Spot-check translations of the distinct sentences among the first 50, shortest first.
texts = sorted(set(en_texts[:50]), key=len)
for text in texts:
    decoded = api.predict(text=text)
    print('English : {}'.format(text))
    print('Japanese: {}'.format(decoded))

# Turn each reference into a token list without its <start>/<end> markers.
# The isinstance guard leaves already-converted entries untouched, so this
# cell can be re-run without raising AttributeError on the token lists.
y_test = [y.split(' ')[1:-1] if isinstance(y, str) else y for y in y_test]
bleu_score = evaluate_bleu(x_test, y_test, api)
print('BLEU: {}'.format(bleu_score))

English : <start> Go. <end>
Japanese: ['何', 'か', '。']
English : <start> Hi. <end>
Japanese: ['おめでとう', '。']
English : <start> Who? <end>
Japanese: ['誰', '？']
English : <start> Wow! <end>
Japanese: ['ワォ', '！']
English : <start> Run. <end>
Japanese: ['くつろい', 'だ', '。']
English : <start> Wait! <end>
Japanese: ['待っ', 'て', '！']
English : <start> Fire! <end>
Japanese: ['撃て', '！']
English : <start> Jump. <end>
Japanese: ['飛び降りろ', '！']
English : <start> Help! <end>
Japanese: ['お', '入り', 'ください', '。']
English : <start> Jump! <end>
Japanese: ['飛び降りろ', '！']
English : <start> Stop! <end>
Japanese: ['おめでとう', '！']
English : <start> Hello! <end>
Japanese: ['こんにちは', '。']
English : <start> Go on. <end>
Japanese: ['出', 'て', 'いけ', '。']
English : <start> I try. <end>
Japanese: ['やっ', 'て', 'みる', '。']
English : <start> Hurry! <end>
Japanese: ['急げ', '！']
English : <start> I won! <end>
Japanese: ['私', 'は', '勝ち', '！']
English : <start> I see. <end>
Japanese: ['わかり', 'まし', 'た', '。']
BLEU: 0.19890089691474233


### Build an attention model

In [None]:
# The attention decoder needs the encoder output at every time step,
# so the encoder is built with return_sequences=True.
encoder = Encoder(num_words, return_sequences=True)
decoder = AttentionDecoder(num_words)
seq2seq = Seq2seq(encoder, decoder)
model = seq2seq.build()
# The targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

### Train the model

In [None]:
# Same training setup as for the simple model.
# NOTE: model_path, enc_arch and dec_arch are reused, so the files written for
# the simple model are overwritten by this run.
callbacks = [
    EarlyStopping(patience=3),
    ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
]
# 10% of the training data is set aside as the validation set.
model.fit(x=x_train,
          y=y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_split=0.1)

# Export the stand-alone encoder/decoder architectures used for inference.
encoder.save_as_json(enc_arch)
decoder.save_as_json(dec_arch)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


### Evaluate the model

In [None]:
# `load` is a classmethod that rebuilds whatever architecture the JSON file
# describes, so Decoder.load also restores the attention decoder saved above.
encoder = Encoder.load(enc_arch, model_path)
decoder = Decoder.load(dec_arch, model_path)
api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
# y_test must already hold token lists here, i.e. the evaluation cell of the
# simple model has to have been run before this one.
bleu_score = evaluate_bleu(x_test, y_test, api)
print('BLEU: {}'.format(bleu_score))

BLEU: 0.21558149696078754
