In [1]:
import pathlib
import random
import string
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from pathlib import Path
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pathlib

from nltk.translate.bleu_score import sentence_bleu

In [2]:
import sys
import tensorflow as tf
import matplotlib.pyplot as plt

plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14) 
plt.rc('legend', fontsize=14) 
plt.rc('xtick', labelsize=10) 
plt.rc('ytick', labelsize=10)

import sklearn
from tensorflow import keras
# Common imports 
import numpy as np 
import os

# to make the output stable across runs 
np.random.seed(42) 
tf.random.set_seed(42)

# To plot pretty figures %matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt 
mpl.rc('axes', labelsize=14) 
mpl.rc('xtick', labelsize=12) 
mpl.rc('ytick', labelsize=12)

# Problem 1

Select and download a data set from the site http://www.manythings.org/anki Choose a language: French, German, Russian, Hebrew, Portugue, or any other language with a large data set. Do not use Spanish, since our class examples already use Spanish. 

a)	Identify the directory where the dataset is downloaded. Examine that file. Report a few initial lines. Write a preprocessing code that will clean downloaded data so that you could transform it into “clean” pairs of sentences in English and the selected language. Create datasets with some 50% of language pairs devoted to training, 45% to validation and 15% to testing.

b)	Work with a vocabulary of 2000 words and sentences of maximal length of 70. Train an encoder-decoder model with attention on your dataset. First test the model with 2 epochs and then run it with 20. On Google Colab with GPU run time, that will take approximately 30 minutes. Illustrate translation ability of your model with 10 sentences of random length from your test dataset.

c)	Find an API on the Web that would allow you to calculate the BLUE scores.  Use sentences in your test dataset. Use you trained model to produce “your” own translations of test sentences. Use the BLUE score API to assess the BLUE score of your model.  Instead of BLUE score you are welcome to use some other techniques for automated measurement of the quality of translations.


In [4]:
#locate downloaded french english text file
path_to_file = '/Users/mpellegrin008/Documents/VS Code/CSCI89_Deep Learning/fra-eng/fra.txt'

In [5]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [6]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, French]
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    
    np.random.seed(42)  # extra code
    np.random.shuffle(word_pairs)

    
    return zip(*word_pairs)

In [7]:
sentences_en, sentences_fr, src = create_dataset(path_to_file, 1000000)
print(sentences_en[-1])
print(sentences_fr[-1])
print(src[-1])

<start> this seems kind of expensive . <end>
<start> ca a l air plutot cher . <end>
<start> cc by . france attribution tatoeba . org ck sacredceltic <end>


In [8]:
print(len(sentences_en))

227815


b) Work with a vocabulary of 2000 words and sentences of maximal length of 70. Train an encoder-decoder model with attention on your dataset. First test the model with 2 epochs and then run it with 20. On Google Colab with GPU run time, that will take approximately 30 minutes. Illustrate translation ability of your model with 10 sentences of random length from your test dataset.

In [9]:
#vectorize data

vocab_size = 2000
max_length = 70
text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_fr = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_fr.adapt([f"startofseq {s} endofseq" for s in sentences_fr])

2023-11-18 10:55:55.586346: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2023-11-18 10:55:55.586384: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-11-18 10:55:55.586391: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-11-18 10:55:55.586437: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-18 10:55:55.586462: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-11-18 10:55:56.154085: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [10]:
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'start', 'end', 'i', 'you', 'to', 'the', 't', 'a']

In [11]:
text_vec_layer_fr.get_vocabulary()[:10]

['', '[UNK]', 'end', 'startofseq', 'start', 'endofseq', 'je', 'a', 'de', 'pas']

In [12]:
# Create datasets with some 50% of language pairs devoted to training, 35% to validation and 15% to testing
num_train = len(sentences_en)*.5
num_val = len(sentences_en)*.35
num_test = len(sentences_en) - num_train - num_val
print(num_train)
print(num_val+num_train)
print(num_test)

113907.5
193642.75
34172.25


In [13]:
X_train = tf.constant(sentences_en[:113907])
X_valid = tf.constant(sentences_en[113907:193642])
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_fr[:113907]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_fr[113907:193642]])
Y_train = text_vec_layer_fr([f"{s} endofseq" for s in sentences_fr[:113907]])
Y_valid = text_vec_layer_fr([f"{s} endofseq" for s in sentences_fr[113907:193642]])
X_test = tf.constant(sentences_en[193642:])
Y_test = text_vec_layer_fr([f"{s} endofseq" for s in sentences_fr[193642:]])

In [14]:
#build the encoder decoder model with attention

tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [15]:
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_fr(decoder_inputs)
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [None]:
#add attention

In [20]:
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences=True, return_state=True))

In [21]:
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [22]:
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

In [23]:
#compile the model and train

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

model.save('encoder_decoder.keras')

Epoch 1/10


2023-11-18 11:11:08.121216: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




2023-11-18 11:21:10.163868: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
#define function to translate test sentences

def translate(sentence_en):
    translation = ""
    for word_idx in range(max_length):
        X = np.array([sentence_en])  # encoder input 
        X_dec = np.array(["startofseq " + translation])  # decoder input
        y_proba = model.predict((X, X_dec))[0, word_idx]  # last token's probas
        predicted_word_id = np.argmax(y_proba)
        predicted_word = text_vec_layer_fr.get_vocabulary()[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

c) Find an API on the Web that would allow you to calculate the BLUE scores. Use sentences in your test dataset. Use you trained model to produce “your” own translations of test sentences. Use the BLUE score API to assess the BLUE score of your model. Instead of BLUE score you are welcome to use some other techniques for automated measurement of the quality of translations.

In [31]:
X_test[0].numpy()

b'<start> you could count to ten when you were two . <end>'

In [28]:
seq2seq_translation = []
for i in X_test[0:10]:
    if len(seq2seq_translation) < 10:
        try:
            seq2seq_translation.append(translate(i[7:-5])[:-6])
        except:
            continue

2023-11-18 13:27:27.764021: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at strided_slice_op.cc:117 : INVALID_ARGUMENT: Attempting to slice scalar input.
2023-11-18 13:27:27.765290: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at strided_slice_op.cc:117 : INVALID_ARGUMENT: Attempting to slice scalar input.
2023-11-18 13:27:27.766540: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at strided_slice_op.cc:117 : INVALID_ARGUMENT: Attempting to slice scalar input.
2023-11-18 13:27:27.767236: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at strided_slice_op.cc:117 : INVALID_ARGUMENT: Attempting to slice scalar input.
2023-11-18 13:27:27.767868: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at strided_slice_op.cc:117 : INVALID_ARGUMENT: Attempting to slice scalar input.
2023-11-18 13:27:27.768487: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at strided_slice_op.cc:117 : IN

In [32]:
#create list of translated sentences
translations = []
for i in range(10):
    translations.append(translate(X_test[i].numpy()))



In [36]:
translations

['start tu devrais [UNK] de heures lorsque tu [UNK] end',
 'start il est a l [UNK] a l [UNK] end',
 'start elle etait [UNK] toute tout end',
 'start il a [UNK] de pleuvoir de l [UNK] et les [UNK] end',
 'start la plupart de la temps [UNK] de la sante end',
 'start nous allons de une [UNK] de [UNK] de [UNK] de ici end',
 'start mon [UNK] est [UNK] end',
 'start tu es [UNK] end',
 'start tom a beaucoup de [UNK] end',
 'start elle a [UNK] de beaucoup de [UNK] de [UNK] end']

In [38]:
Y_labels = tf.constant([f"{s} endofseq" for s in sentences_fr[193642:]])

In [39]:
#define references
references = []
for i in range(10):
    references.append(Y_labels[i].numpy())

In [54]:
references

[b'<start> quand tu avais deux ans , tu pouvais compter jusqu a dix . <end> endofseq',
 b'<start> il vint en bus . <end> endofseq',
 b'<start> elle avait mal partout . <end> endofseq',
 b'<start> il s est mis a tomber des cordes . <end> endofseq',
 b'<start> le vin rouge convient bien a la viande . <end> endofseq',
 b'<start> nous aurons quelques visiteurs un de ces jours . <end> endofseq',
 b'<start> le football est mon sport prefere . <end> endofseq',
 b'<start> tu es une de ces menteuses ! <end> endofseq',
 b'<start> tom a beaucoup de problemes . <end> endofseq',
 b'<start> elle depense autant qu elle gagne . <end> endofseq']

In [68]:
#calculate bleu scores for test sentences
import nltk
from nltk.translate.bleu_score import sentence_bleu

for original, translation in zip(references, translations):
    print(sentence_bleu(str(original), str(translation))) # uses Bleu-4 for 4-grams by default

1.2821097245439388e-231
1.322612729825152e-231
1.37484330787042e-231
1.2128566900665309e-231
1.2183324802375697e-231
1.2481913216671088e-231
1.396075237825507e-231
1.4740564900137075e-231
1.4703199416115106e-231
1.2882297539194154e-231


# Problem 2. 

Repeat the analysis of Problem 1 with a transformer model similar to the one discussed in class. To increase the precision, again, like in problem 1, work with a slightly bigger vocabulary of 2000 words and somewhat longer sentences of up to 70 words.
Illustrate translation ability of your transformer model with the same sentences you used in Problem 1, section b. Compare with the results obtained with the encoder-decoder with attention model.
Compare the BLUE score (or some other measure for the quality of translation) between models in problem 1 and problem 2. Provide any comments, if you have them. 


In [55]:
# Create datasets with some 50% of language pairs devoted to training, 35% to validation and 15% to testing


train_pairs = tuple(zip(sentences_en[:113907], sentences_fr[:113907]))
val_pairs = tuple(zip(sentences_en[113907 : 193642], sentences_fr[113907 : 193642]))
test_pairs = tuple(zip(sentences_en[193642:], sentences_fr[193642:]))

print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

113907 training pairs
79735 validation pairs
34173 test pairs


In [56]:
#vecotrize data and adapt

strip_chars = string.punctuation + "¿" #should not be needed in french, but just in case
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 2000
sequence_length = 70
batch_size = 64

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")


eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

fra_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

train_eng_texts = [pair[0] for pair in train_pairs]
train_fra_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
fra_vectorization.adapt(train_fra_texts)

In [57]:
#format and create train and val datasets

def format_dataset(eng, fra):
    eng = eng_vectorization(eng)
    fra = fra_vectorization(fra)
    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": fra[:, :-1],
        },
        fra[:, 1:],
    )


def make_dataset(pairs):
    eng_texts, fra_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    fra_texts = list(fra_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, fra_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [58]:
#print shape of train dataset

for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 70)
inputs["decoder_inputs"].shape: (64, 70)
targets.shape: (64, 70)


2023-11-18 13:42:50.042144: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [59]:
#define transformer model

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        attention_output = self.attention(query=inputs, value=inputs, key=inputs)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "dense_dim": self.dense_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.add = layers.Add()  # instead of `+` to preserve mask
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, use_causal_mask=True
        )
        out_1 = self.layernorm_1(self.add([inputs, attention_output_1]))

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
        )
        out_2 = self.layernorm_2(self.add([out_1, attention_output_2]))

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(self.add([out_2, proj_output]))

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "latent_dim": self.latent_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


In [60]:
#assemble transformer model


embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [61]:
#train transformer model

epochs = 10 

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
history1 = transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)
transformer.save('transformer.keras')

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 positional_embedding (Posi  (None, None, 256)            529920    ['encoder_inputs[0][0]']      
 tionalEmbedding)                                                                                 
                                                                                                  
 decoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                      

In [74]:
#define decoder for test sentences

fra_vocab = fra_vectorization.get_vocabulary()
fra_index_lookup = dict(zip(range(len(fra_vocab)), fra_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = fra_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = fra_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence




In [73]:
test_eng_texts = [pair[0] for pair in test_pairs]

#append translated sentences to a list
translations1 = []
for i in range(10):
    input_sentence = test_eng_texts[i]
    translated = decode_sequence(input_sentence)
    translations1.append(translated)
    print(input_sentence)
    print(translated)
    print(references[i])
    print("   ")

<start> you could count to ten when you were two . <end>
[start] end  un on un [UNK] [UNK] [UNK] a [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] end a un [UNK] end
b'<start> quand tu avais deux ans , tu pouvais compter jusqu a dix . <end> endofseq'
   
<start> he came by bus . <end>
[start] end  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] le [UNK] [UNK]
b'<start> il vint en bus . <end> endofseq'
   
<start> she was aching all over . <end>
[start] end  [UNK] [UNK] [UNK] le [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] le [UNK] [UNK] end end [UNK] end
b'<start> elle avait mal partout . <end> endofseq'
   
<start> it began to rain cats and dogs . <end>
[start] end  un [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] le [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] le
b'<start> il s est mis a tomber des cordes . <end> endofseq'
   
<start> red wine goes well with meat . <end>
[start] end  [UNK] [UNK] un [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 

In [71]:
for original, translation in zip(references, translations1):
    print(sentence_bleu(str(original), str(translation))) # uses Bleu-4 for 4-grams by default

1.0350003944289303e-231
9.594503055152632e-232
9.788429383461836e-232
9.72161026064145e-232
9.958726787503715e-232
1.1200407237786664e-231
9.65701126654974e-232
9.65701126654974e-232
1.0244914152188952e-231
1.0216652050829071e-231


Contrary to our initial expectations, the transformer model does not perform better than the encoder-decoder with attention.  This is likely due to the limited number of epochs that we were able to train the model for, as well as the limited vocabulary size.  Additionally, we may need to improve the pre-processing of the sentences. 