In [1]:
from tensorflow.keras.utils import get_file

url = 'https://github.com/odashi/small_parallel_enja/archive/master.zip'

zip_file_path = get_file('small_parallel_enja.zip', url, cache_subdir='small_parallel_enja', extract=True) 

Downloading data from https://github.com/odashi/small_parallel_enja/archive/master.zip
2547712/Unknown - 2s 1us/step

In [2]:
import os

data_dir = os.path.join(os.path.dirname(zip_file_path), 'small_parallel_enja-master')
!ls -l $data_dir

total 9076
-rw-r--r-- 1 root root   17054 Feb  8 05:49 dev.en
-rw-r--r-- 1 root root   27781 Feb  8 05:49 dev.ja
-rw-r--r-- 1 root root    1946 Feb  8 05:49 README.md
-rw-r--r-- 1 root root   17301 Feb  8 05:49 test.en
-rw-r--r-- 1 root root   27793 Feb  8 05:49 test.ja
-rw-r--r-- 1 root root 1701356 Feb  8 05:49 train.en
-rw-r--r-- 1 root root  339768 Feb  8 05:49 train.en.000
-rw-r--r-- 1 root root  340186 Feb  8 05:49 train.en.001
-rw-r--r-- 1 root root  341174 Feb  8 05:49 train.en.002
-rw-r--r-- 1 root root  339953 Feb  8 05:49 train.en.003
-rw-r--r-- 1 root root  340275 Feb  8 05:49 train.en.004
-rw-r--r-- 1 root root   30025 Feb  8 05:49 train.en.vocab.4k
-rw-r--r-- 1 root root   51162 Feb  8 05:49 train.en.vocab.all
-rw-r--r-- 1 root root 2784447 Feb  8 05:49 train.ja
-rw-r--r-- 1 root root  556444 Feb  8 05:49 train.ja.000
-rw-r--r-- 1 root root  555732 Feb  8 05:49 train.ja.001
-rw-r--r-- 1 root root  557218 Feb  8 05:49 train.ja.002
-rw-r--r-- 1 root root  557538 Feb  8 05:4

In [35]:
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_data(path):
      tokenizer = Tokenizer(filters='')
      texts = []
      for line in open(path, 'r'):
          texts.append('<start> ' + line.strip() + ' <end>')
    
      tokenizer.fit_on_texts(texts)

      return tokenizer.texts_to_sequences(texts), tokenizer

en, inp_lang = load_data(os.path.join(data_dir, 'train.en'))
ja, targ_lang = load_data(os.path.join(data_dir, 'train.ja'))

vocab_inp_size = len(tokenizer_en.word_index) + 1
vocab_tar_size = len(tokenizer_ja.word_index) + 1

train_en, test_en, train_ja, test_ja = train_test_split(en, ja, test_size=0.1, random_state=36)

input_tensor_train = pad_sequences(train_en, padding='post')
target_tensor_train = pad_sequences(train_ja, padding='post')
input_tensor_val = pad_sequences(test_en, padding='post')
target_tensor_val = pad_sequences(test_ja, padding='post')

max_length_inp = len(input_tensor_train[0])
max_length_targ = len(target_tensor_train[0])

In [14]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [15]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
13 ----> it
33 ----> 's
918 ----> dangerous
6 ----> to
182 ----> play
387 ----> around
4 ----> the
459 ----> fire
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
718 ----> 火
9 ----> の
459 ----> 近く
11 ----> で
382 ----> 遊
344 ----> ぶ
9 ----> の
4 ----> は
457 ----> 危険
22 ----> だ
3 ----> 。
2 ----> <end>


In [16]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [17]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 18]), TensorShape([64, 18]))

In [18]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.lstm = tf.keras.layers.LSTM(self.enc_units,
                                       return_sequences=False,
                                       return_state=True)
 
    def call(self, x):
        x = self.embedding(x)
        output, *states = self.lstm(x)
        return output, states


In [19]:
encoder = Encoder(vocab_inp_size, embedding_dim, units)

# サンプル入力
sample_output, sample_hidden = encoder(example_input_batch)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden[0].shape))
print ('Encoder Carry state shape: (batch size, units) {}'.format(sample_hidden[1].shape))

Encoder output shape: (batch size, sequence length, units) (64, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)
Encoder Carry state shape: (batch size, units) (64, 1024)


In [20]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        # 埋め込み層を通過したあとの x の shape  == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # Embeddingの出力と、エンコーダ出力を LSTM 層に渡す
        output, *states = self.lstm(x, initial_state=hidden)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, states

In [21]:
decoder = Decoder(vocab_tar_size, embedding_dim, units)

sample_decoder_output, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 8777)


In [22]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                             from_logits=True, reduction='none')
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [23]:
import os

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [24]:
@tf.function
def train_step(inp, targ):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher Forcing - 正解値を次の入力として供給
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden = decoder(dec_input, dec_hidden)

            loss += loss_function(targ[:, t], predictions)

            # Teacher Forcing を使用
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [25]:
import time

EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()

    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ)
        total_loss += batch_loss

        if batch % 100 == 0:
              print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
    # 2 エポックごとにモデル（のチェックポイント）を保存
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 6.0849
Epoch 1 Batch 100 Loss 2.7703
Epoch 1 Batch 200 Loss 2.3135
Epoch 1 Batch 300 Loss 2.1015
Epoch 1 Batch 400 Loss 2.1106
Epoch 1 Batch 500 Loss 2.0142
Epoch 1 Batch 600 Loss 2.2089
Epoch 1 Loss 2.3625
Time taken for 1 epoch 80.29868793487549 sec

Epoch 2 Batch 0 Loss 1.8620
Epoch 2 Batch 100 Loss 1.9529
Epoch 2 Batch 200 Loss 1.8381
Epoch 2 Batch 300 Loss 1.7275
Epoch 2 Batch 400 Loss 1.7937
Epoch 2 Batch 500 Loss 1.6890
Epoch 2 Batch 600 Loss 1.7004
Epoch 2 Loss 1.7626
Time taken for 1 epoch 63.54192876815796 sec

Epoch 3 Batch 0 Loss 1.4768
Epoch 3 Batch 100 Loss 1.5853
Epoch 3 Batch 200 Loss 1.5798
Epoch 3 Batch 300 Loss 1.5546
Epoch 3 Batch 400 Loss 1.5519
Epoch 3 Batch 500 Loss 1.4247
Epoch 3 Batch 600 Loss 1.4377
Epoch 3 Loss 1.4843
Time taken for 1 epoch 62.92848563194275 sec

Epoch 4 Batch 0 Loss 1.3223
Epoch 4 Batch 100 Loss 1.2289
Epoch 4 Batch 200 Loss 1.3007
Epoch 4 Batch 300 Loss 1.3095
Epoch 4 Batch 400 Loss 1.1648
Epoch 4 Batch 500 Loss 1.1917


In [None]:
def predict(inputs):
    inputs = tf.convert_to_tensor([inputs])
    predicted_seq = []
    
    enc_out, enc_hidden = encoder(inputs)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden = decoder(dec_input,
                                          dec_hidden)

        predicted_id = tf.argmax(predictions[0]).numpy()

        predicted_seq.append(predicted_id)

        if targ_lang.index_word[predicted_id] == '<end>':
            return predicted_seq

        # 予測された ID がモデルに戻される
        dec_input = tf.expand_dims([predicted_id], 0)

    return predicted_seq

In [36]:
import numpy as np

def evaluate(sentence):
    inputs = [tokenizer_en.word_index[i.lower()] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=len_en,
                                                           padding='post')
    
    predicted_seq = predict(inputs)

    result = ' '.join([tokenizer_ja.index_word[i] for i in predicted_seq])

    return result, sentence

In [37]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))


In [38]:
# checkpoint_dir の中の最後のチェックポイントを復元
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f12c91e8e80>

In [39]:
translate('it is necessary that the bill pass the diet .')

Input: it is necessary that the bill pass the diet .
Predicted translation: その 計画 は うま く い く か も しれ な い 。 <end> 


In [40]:
def predict(inputs):
    inputs = tf.convert_to_tensor([inputs])
    predicted_seq = []
    
    enc_out, enc_hidden = encoder(inputs)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden = decoder(dec_input,
                                          dec_hidden)

        predicted_id = tf.argmax(predictions[0]).numpy()

        predicted_seq.append(predicted_id)

        if targ_lang.index_word[predicted_id] == '<end>':
            return predicted_seq

        # 予測された ID がモデルに戻される
        dec_input = tf.expand_dims([predicted_id], 0)

    return predicted_seq
    

In [44]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd

input_sentences = []
target_sentences = []
predicted_sentences = []

for i, input_en in enumerate(input_tensor_val):
    predicted_ja = predict(input_en)
    tokens_input = [inp_lang.index_word[id] for id in input_en if id > 2]
    tokens_target = [targ_lang.index_word[id] for id in target_tensor_val[i] if id > 2]
    tokens_predicted = [targ_lang.index_word[id] for id in predicted_ja if id > 2]
    input_sentences.append(' '.join(tokens_input))
    target_sentences.append(''.join(tokens_target))
    predicted_sentences.append(''.join(tokens_predicted))

result_df = pd.DataFrame({'input_sentence': input_sentences,
                          'target_sentence': target_sentences,
                          'predicted_sentence': predicted_sentences})  


In [71]:
bleu_scores = []
for row in result_df.itertuples():
    bleu_scores.append(
        sentence_bleu(row.target_sentence, row.predicted_sentence,
                      smoothing_function=SmoothingFunction().method4)
    )
result_df['bleu_score'] = bleu_scores

In [72]:
result_df

Unnamed: 0,input_sentence,target_sentence,predicted_sentence,bleu_score
0,i was wrong .,私が間違っていました。,私は間違っていた。,0.331186
1,she is going to learn how to drive .,彼女は近く、運転を習うつもりでいます。,彼女は車のために勉強する予定だ。,0.279488
2,i am fond of skiing .,私はスキーが好きだ。,私はスキーが好きです。,0.336136
3,i will start working on july the first .,７月１日から仕事を始めます。,私は二日間の日に発つつもりです。,0.245981
4,ken took the examination with confidence .,ケンは自信をもって試験を受けた。,ケンは試験を尽くしたが無駄だった。,0.321431
...,...,...,...,...
9995,what does the company produce ?,その会社は何を作っているのですか。,その会社は何をしているのか。,0.353037
9996,he was absent from school because he was sick .,彼は病気だったので学校を休んだ。,彼は病気で学校を欠席した。,0.339871
9997,won 't you take a chair ?,座りませんか。,煙草を吸ってくれない？,0.000000
9998,i had nothing to do with that incident .,私はその事件とは何の関係も無かった。,私はその出来事とは何の関係もなかった。,0.354081


In [73]:
result_df.bleu_score.mean()

0.3100502226469779