In [1]:
import re
import unicodedata
import tensorflow as tf
from sklearn.model_selection import train_test_split
import time
import io
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(word):
    word = unicode_to_ascii(word.lower().strip())
    word = re.sub(r"([?.!,])", r" \1 ", word)
    word = re.sub(r'[" "]+', " ", word)
    word = re.sub(r"[^a-zA-Z?.!,]+", " ", word)
    word = word.rstrip().strip()
    word = '<start> ' + word + ' <end>'
    return word

def create_dataset(filename, nums):
    with open(filename, 'r') as f:
        lines = f.readlines()
    line = [line.strip().split('\n') for line in lines]
    word_pairs = [[preprocess_sentence(s) for s in l[0].split('\t')] for l in line[:nums]]
    return word_pairs

In [3]:
def max_length(ts):
    return max(len(t) for t in ts)

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

def load_data(numbers_example):
    dataset = create_dataset('fra.txt', numbers_example)
    input_lang = [token[0] for token in dataset]
    target_lang = [token[1] for token in dataset]
    input_tensor, inp_lang_tokenizer = tokenize(input_lang)
    target_tensor, targ_lang_tokenizer = tokenize(target_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [4]:
numbers_example = 35000
input_tensor, target_tensor, inp_lang, targ_lang = load_data(numbers_example)

### Divide the data into training and testing or use cross-validation.

In [5]:
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)
input_tensor_train, input_tensor_test, target_tensor_train, target_tensor_test = train_test_split(input_tensor, target_tensor, test_size=0.2)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_test), len(target_tensor_test))

28000 28000 7000 7000


In [6]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print("%d ----> %s" % (t, lang.index_word[t]))
# convert(inp_lang, input_tensor_train[0])
# convert(targ_lang, target_tensor_train[0])
# print ("Input Language; index to word mapping")
# print ("Target Language; index to word mapping")

In [7]:
buffer_size = len(input_tensor_train)
batch_size = 64
step_per_epoch = buffer_size//batch_size
embedding_dim = 256
units = 1024
vocab_input_size = len(inp_lang.index_word)+1
vocab_target_size = len(targ_lang.index_word)+1
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder=True)

In [8]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape
# dataset = tf.data.Dataset.from_tensor_slices((np.random.uniform(size=(10, 6)), np.random.uniform(size=(10, 4)))).shuffle(2)
# dataset = dataset.batch(3, drop_remainder=True)
# a = next(iter(dataset))
# print(a)
# # print(b)

(TensorShape([64, 10]), TensorShape([64, 17]))

### Encode Model

In [9]:
class encode_model(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sze):
        super(encode_model, self).__init__()
        self.batch_sze = batch_sze
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                      return_sequences=True,
                                      return_state=True,
                                      recurrent_initializer='glorot_uniform')
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sze, self.enc_units))

In [10]:
encoder = encode_model(vocab_input_size, embedding_dim, units, batch_size)
# sample test
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder.call(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 10, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


### Attention layer

In [11]:
class attention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        score = self.V(tf.nn.tanh(self.W1(values)+ self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights*values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [12]:
attention_layer = attention(10)
attention_context_vector, attention_weights = attention_layer(sample_hidden, sample_output)
print('shape of attention output context vectors: (batch size, units) {}'.format(attention_context_vector.shape))
print('shape of attention weights: (batch size, sequence length) {}'.format(attention_weights.shape))


shape of attention output context vectors: (batch size, units) (64, 1024)
shape of attention weights: (batch size, sequence length) (64, 10, 1)


### Decode Model

In [14]:
class decode_model(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sze):
        super(decode_model, self).__init__()
        self.batch_sze = batch_sze
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                      return_sequences=True,
                                      return_state=True,
                                      recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = attention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

In [15]:
decoder = decode_model(vocab_target_size, embedding_dim, units, batch_size)
sample_decode_output, _, _ = decoder(tf.random.uniform((64,1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decode_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 8292)


### Optimizer and Loss Function

In [16]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(truth, pred):
    mask = tf.math.logical_not(tf.math.equal(truth, 0))
    loss_ = loss_object(truth, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [17]:
import os
checkpoint_dir = './training_checkpoint_final'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

### Training Step

In [18]:
def train_step(Input, target, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(Input, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * batch_size, 1)
    
        # This is the step of teacher forcing
        for t in range(1, target.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(target[:, t], predictions)
            dec_input = tf.expand_dims(target[: ,t], 1)
        
    batch_loss = loss / int(target.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [18]:
epochs = 10
for epoch in range(epochs):
    start = time.time()
    total_loss = 0
    enc_hidden = encoder.initialize_hidden_state()
    for (batch_index, (Input, target)) in enumerate(dataset.take(step_per_epoch)):
        batch_loss = train_step(Input, target, enc_hidden)
        total_loss += batch_loss
        
        if batch_index % 200 == 0:
            print('Epoch{} Batch{} Loss{:.4f}'.format(epoch+1, batch_index, batch_loss.numpy()))
    
    if (epoch+1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
    print('Epoch{} Loss{:.4f}'.format(epoch+1, total_loss/step_per_epoch))
    print('Time{} sec\n'.format(time.time()-start))

Epoch1 Batch0 Loss3.4833
Epoch1 Batch200 Loss1.4531
Epoch1 Batch400 Loss1.2163
Epoch1 Loss1.4585
Time1495.8422219753265 sec

Epoch2 Batch0 Loss1.0732
Epoch2 Batch200 Loss1.0037
Epoch2 Batch400 Loss0.8849
Epoch2 Loss0.9494
Time1493.738127231598 sec

Epoch3 Batch0 Loss0.7647
Epoch3 Batch200 Loss0.7326
Epoch3 Batch400 Loss0.6728
Epoch3 Loss0.6925
Time1494.3516147136688 sec

Epoch4 Batch0 Loss0.4185
Epoch4 Batch200 Loss0.5127
Epoch4 Batch400 Loss0.5343
Epoch4 Loss0.4907
Time1493.9165074825287 sec

Epoch5 Batch0 Loss0.3406
Epoch5 Batch200 Loss0.3634
Epoch5 Batch400 Loss0.3991
Epoch5 Loss0.3516
Time1492.1430087089539 sec

Epoch6 Batch0 Loss0.2194
Epoch6 Batch200 Loss0.2284
Epoch6 Batch400 Loss0.2983
Epoch6 Loss0.2625
Time1499.0714755058289 sec

Epoch7 Batch0 Loss0.1594
Epoch7 Batch200 Loss0.1786
Epoch7 Batch400 Loss0.2510
Epoch7 Loss0.2071
Time1454.8906807899475 sec

Epoch8 Batch0 Loss0.1273
Epoch8 Batch200 Loss0.1500
Epoch8 Batch400 Loss0.2096
Epoch8 Loss0.1712
Time1453.1325697898865 sec

E

In [19]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [45]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5fb2c6e710>

### Select a set of sentences from the testing data, and print the translation results from the model

In [50]:
def tensor2word(lang, tensor):
    sentence_list = ''
    for t in tensor:
        if t!=0:
            sentence_list += lang.index_word[t] + " "
    return sentence_list

def get_sentence(indexs):
    Inpout = tensor2word(inp_lang, input_tensor_test[indexs])
    true_result = tensor2word(targ_lang, target_tensor_test[indexs])
    sentence = Inpout[7:-6]
    return sentence, true_result

In [51]:
sentences, _ = get_sentence(12)
translate(sentences)

Input: <start> i m surviving . <end>
Predicted translation: je survis . <end> 


In [54]:
sentences, _ = get_sentence(13)
translate(sentences)

Input: <start> they re cool . <end>
Predicted translation: ils sont sympa . <end> 


In [55]:
sentences, _ = get_sentence(14)
translate(sentences)

Input: <start> you ve worked hard . <end>
Predicted translation: vous avez travaille d arrache pied . <end> 


### Compute the BLEU score for the testing data set.

In [62]:
from nltk.translate.bleu_score import sentence_bleu
def get_bleu(index):
    sentences, true_result = get_sentence(index)
    prediction, _, _ = evaluate(sentences)
    
    prediction = prediction.split(' ')[:-3]
    true_result = [true_result.split(' ')[1:-3]]
    bleu_score = sentence_bleu(true_result, prediction, weights=(1, 0, 0, 0))
    print('Prediction sentence: %s' % (prediction))
    print('Truth ground sentence: %s' % (true_result))
    print('BLEU score: %f' % bleu_score)
    return bleu_score

In [63]:
get_bleu(4)

Prediction sentence: ['c', 'est', 'un', 'mauvais', 'garcon']
Truth ground sentence: [['c', 'est', 'un', 'mauvais', 'garcon']]
BLEU score: 1.000000


1.0

In [64]:
def get_only_bleu(index):
    sentences, true_result = get_sentence(index)
    prediction, _, _ = evaluate(sentences)
    prediction = prediction.split(' ')[:-3]
    true_result = [true_result.split(' ')[1:-3]]
    bleu_score = sentence_bleu(true_result, prediction, weights=(1, 0, 0, 0))
    return bleu_score

bleu_score_list = []
length = len(input_tensor_test)

for i in range(1000):
    bleu_score_list.append(get_only_bleu(i))

In [67]:
average_mean = np.mean(bleu_score_list)
print('BLEU score for the testing dataset: %f' % average_mean)

BLEU score for the testing dataset: 0.761948
