In [1]:
from tensorflow.keras.utils import get_file

url = 'https://github.com/odashi/small_parallel_enja/archive/master.zip'

zip_file_path = get_file('small_parallel_enja.zip', url, cache_subdir='small_parallel_enja', extract=True) 

In [2]:
import os

data_dir = os.path.join(os.path.dirname(zip_file_path), 'small_parallel_enja-master')
!ls -l $data_dir

total 9076
-rw-r--r-- 1 tensorflow tensorflow    1946 Feb 11 09:59 README.md
-rw-r--r-- 1 tensorflow tensorflow   17054 Feb 11 09:59 dev.en
-rw-r--r-- 1 tensorflow tensorflow   27781 Feb 11 09:59 dev.ja
-rw-r--r-- 1 tensorflow tensorflow   17301 Feb 11 09:59 test.en
-rw-r--r-- 1 tensorflow tensorflow   27793 Feb 11 09:59 test.ja
-rw-r--r-- 1 tensorflow tensorflow 1701356 Feb 11 09:59 train.en
-rw-r--r-- 1 tensorflow tensorflow  339768 Feb 11 09:59 train.en.000
-rw-r--r-- 1 tensorflow tensorflow  340186 Feb 11 09:59 train.en.001
-rw-r--r-- 1 tensorflow tensorflow  341174 Feb 11 09:59 train.en.002
-rw-r--r-- 1 tensorflow tensorflow  339953 Feb 11 09:59 train.en.003
-rw-r--r-- 1 tensorflow tensorflow  340275 Feb 11 09:59 train.en.004
-rw-r--r-- 1 tensorflow tensorflow   30025 Feb 11 09:59 train.en.vocab.4k
-rw-r--r-- 1 tensorflow tensorflow   51162 Feb 11 09:59 train.en.vocab.all
-rw-r--r-- 1 tensorflow tensorflow 2784447 Feb 11 09:59 train.ja
-rw-r--r-- 1 tensorflow tensorflow  556444 Fe

In [3]:
import os
import unicodedata
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

TRAIN_SIZE_LIMIT = 30000
TEST_SIZE_LIMIT = 300

def load_data(path):
    texts = []
    for line in open(path, 'r'):
        texts.append(line.strip())
    return texts
  
def preprocess(text):
    text = unicodedata.normalize('NFKC', text)
    text = '<start> ' + text + ' <end>'
    return text

train_en = load_data(os.path.join(data_dir, 'train.en'))
train_ja = load_data(os.path.join(data_dir, 'train.ja'))

train_en = train_en[:TRAIN_SIZE_LIMIT]
train_ja = train_ja[:TRAIN_SIZE_LIMIT]

train_input = [preprocess(s) for s in train_en]
train_target = [preprocess(s) for s in train_ja]

In [4]:
en_vectorizer = TextVectorization(max_tokens=20000,
                                  standardize=None,
                                  output_mode='int',
                                  output_sequence_length=18)

en_vectorizer.adapt(train_input)

ja_vectorizer = TextVectorization(max_tokens=20000,
                                  standardize=None,
                                  output_mode='int',
                                  output_sequence_length=18)

ja_vectorizer.adapt(train_target)

In [5]:
train_input_ds = tf.data.Dataset.from_tensor_slices(en_vectorizer(train_input))
train_target_ds = tf.data.Dataset.from_tensor_slices(ja_vectorizer(train_target))

In [6]:
train_ds = tf.data.Dataset.zip((train_input_ds, train_target_ds))

In [7]:
en_vocab = en_vectorizer.get_vocabulary()
ja_vocab = ja_vectorizer.get_vocabulary()
vocab_inp_size = len(en_vocab) + 1
vocab_tar_size = len(ja_vocab) + 1

In [8]:
max_length_inp = 18
max_length_targ = 18

In [9]:
def convert(vocab, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, vocab[t]))

In [10]:
en, ja = next(iter(train_ds))

In [11]:
print ("Input Language; index to word mapping")
convert(en_vocab, en.numpy())
print ()
print ("Target Language; index to word mapping")
convert(ja_vocab, ja.numpy())

Input Language; index to word mapping
2 ----> <start>
6 ----> i
41 ----> can
22 ----> 't
149 ----> tell
136 ----> who
29 ----> will
709 ----> arrive
231 ----> first
4 ----> .
3 ----> <end>

Target Language; index to word mapping
2 ----> <start>
92 ----> 誰
14 ----> が
239 ----> 一番
7 ----> に
161 ----> 着
29 ----> く
22 ----> か
18 ----> 私
7 ----> に
5 ----> は
290 ----> 分か
39 ----> り
21 ----> ま
40 ----> せ
30 ----> ん
4 ----> 。
3 ----> <end>


In [12]:
BUFFER_SIZE = len(train_input)
BATCH_SIZE = 64
steps_per_epoch = len(train_input) // BATCH_SIZE
embedding_dim = 256
units = 1024

dataset = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [13]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 18]), TensorShape([64, 18]))

In [14]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=False,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [15]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# サンプル入力
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [16]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        # 埋め込み層を通過したあとの x の shape  == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # Embeddingの出力と、エンコーダ出力を GRU 層に渡す
        output, state = self.gru(x, initial_state=hidden)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state

In [17]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 6952)


In [18]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                             from_logits=True, reduction='none')
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [19]:
import os

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [20]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([ja_vocab.index('<start>')] * BATCH_SIZE, 1)

        # Teacher Forcing - 正解値を次の入力として供給
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden = decoder(dec_input, dec_hidden)

            loss += loss_function(targ[:, t], predictions)

            # Teacher Forcing を使用
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [21]:
import time

EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
              print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
    # 2 エポックごとにモデル（のチェックポイント）を保存
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 5.8900
Epoch 1 Batch 100 Loss 2.6323
Epoch 1 Batch 200 Loss 2.1955
Epoch 1 Batch 300 Loss 2.0682
Epoch 1 Batch 400 Loss 2.1204
Epoch 1 Loss 2.3380
Time taken for 1 epoch 41.62293243408203 sec

Epoch 2 Batch 0 Loss 1.7043
Epoch 2 Batch 100 Loss 1.6932
Epoch 2 Batch 200 Loss 1.6036
Epoch 2 Batch 300 Loss 1.8222
Epoch 2 Batch 400 Loss 1.6237
Epoch 2 Loss 1.7050
Time taken for 1 epoch 25.299636125564575 sec

Epoch 3 Batch 0 Loss 1.4698
Epoch 3 Batch 100 Loss 1.4758
Epoch 3 Batch 200 Loss 1.4239
Epoch 3 Batch 300 Loss 1.4386
Epoch 3 Batch 400 Loss 1.4671
Epoch 3 Loss 1.4174
Time taken for 1 epoch 24.931610822677612 sec

Epoch 4 Batch 0 Loss 1.2043
Epoch 4 Batch 100 Loss 1.1332
Epoch 4 Batch 200 Loss 1.1463
Epoch 4 Batch 300 Loss 1.1626
Epoch 4 Batch 400 Loss 1.0412
Epoch 4 Loss 1.1670
Time taken for 1 epoch 25.572669744491577 sec

Epoch 5 Batch 0 Loss 0.8723
Epoch 5 Batch 100 Loss 0.8590
Epoch 5 Batch 200 Loss 0.9150
Epoch 5 Batch 300 Loss 0.8782
Epoch 5 Batch 400 Loss 

In [22]:
import numpy as np

en_word_index = {w:i for i,w in enumerate(en_vocab)}

def evaluate(sentence):
    sentence = preprocess(sentence)
    
    inputs = [en_word_index.get(w, 1) for w in sentence.split(' ')] # index 1 for [UNK]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([ja_vocab.index('<start>')], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden = decoder(dec_input,
                                          dec_hidden)

        predicted_id = tf.argmax(predictions[0]).numpy()

        if ja_vocab[predicted_id] == '<end>':
            return result, sentence

        result += ja_vocab[predicted_id] + ' '

        # 予測された ID がモデルに戻される
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [23]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [24]:
# checkpoint_dir の中の最後のチェックポイントを復元
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9838145ac8>

In [25]:
translate("i am a little out of sorts today .")

Input: <start> i am a little out of sorts today . <end>
Predicted translation: 今日 は 少し 気分 が 悪 い 。 


In [26]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd

input_sentences = load_data(os.path.join(data_dir, 'test.en'))
target_sentences = load_data(os.path.join(data_dir, 'test.ja'))

input_sentences = input_sentences[:TEST_SIZE_LIMIT]
target_sentences = target_sentences[:TEST_SIZE_LIMIT]
predicted_sentences = []

for input_en in input_sentences:
    predicted_ja, _ = evaluate(preprocess(input_en))
    predicted_sentences.append(''.join(predicted_ja[:-1]))

result_df = pd.DataFrame({'input_sentence': input_sentences,
                          'target_sentence': target_sentences,
                          'predicted_sentence': predicted_sentences})  


In [27]:
bleu_scores = []
for row in result_df.itertuples():
    bleu_scores.append(
        sentence_bleu(row.target_sentence, row.predicted_sentence,
                      smoothing_function=SmoothingFunction().method4)
    )
result_df['bleu_score'] = bleu_scores

In [28]:
result_df

Unnamed: 0,input_sentence,target_sentence,predicted_sentence,bleu_score
0,they finally acknowledged it as true .,彼 ら は つい に それ が 真実 だ と 認め た 。,彼 ら は 彼 が 話 を する の を 許 し た 。,0.153911
1,he didn 't care for swimming .,彼 は 水泳 が 得意 で は な かっ た 。,彼 は 、 以前 、 彼 の こと に 賛成 し な かっ た 。,0.131693
2,he is no less kind than his sister .,彼 は お 姉 さん に 劣 ら ず 親切 だ 。,彼 は お 姉 さん ほど 背 が 高 く な い 。,0.166010
3,you must be back before ten .,１０ 時 前 に 戻 ら な けれ ば な ら な い 。,10 分 も か ら ず に つ か な けれ ば な ら な い 。,0.126820
4,break a leg .,成功 を 祈 る わ 。,時 がいつ し か 過ぎ て 。,0.220057
...,...,...,...,...
295,i 'm very sleepy now .,今 とても 眠 い 。,今 とても 忙し そう だ 。,0.313971
296,"above the music , i could hear her crying .",音楽 が な っ て い る の に 彼女 の 鳴き声 が 聞こえ た 。,私 は 音楽 を 聴 く ため に 彼女 の 夢 を 見 ま せ ん 。,0.122821
297,we had the meeting in this room last friday .,先週 の 金曜 日 この 部屋 で 会合 が あ っ た 。,この バス で 私 たち は 昨日 、 交通 事故 に あ っ た 。,0.126820
298,who do you want to speak to ?,お 話 に な る 方 の お 名前 は 。,あなた は 話 す べ き もの で は な い の か 。,0.136731


In [29]:
result_df.bleu_score.mean()

0.17331228745184793