## Resources

Corpus: Europarl v7 (http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz)

Using:
* Python 3.8.3
* TensorFlow 2.2.0
* SentencePiece 0.1.91

## Modules and variables

In [46]:
# General modules
import numpy as np
import pandas as pd
# Tokenizer
import sentencepiece as sp
# Data preprocessing tools
from keras.preprocessing.sequence import pad_sequences
# General Keras functionalities
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
# Neural network components from Keras
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import Dense

In [47]:
# Source text database
data_filename = "./fra.txt"

# Text preprocessing
subset_size = 10000 # total sentence pairs in dataset: 175,623
test_size = 1000 # sentence pairs for testing
train_size = subset_size - test_size # sentence pairs for training
clean_en_filename = "clean_en.txt" # file to save cleaned English text data
clean_fr_filename = "clean_fr.txt" # French
clean_train_filename = "clean_en_fr.txt" # both languages

# Let's define vocabulary size for out tokenizer
vocab_size = 2000
# We'll use the same size for both languages to simplify
en_vocab_size = vocab_size
fr_vocab_size = vocab_size
# max_sentence_length = 20 # Pad all sentences to 40 word pieces (tokens) max

# Defining all the parameters of our network
nb_cells = 256 # LSTM cells in encoder/decoder

# Training parameters
nb_epochs = 30
batch_size = 64

# File name to save our trained model weights
trained_model_filename = "fr_en_nmt_model_test.h5"

## Checking the raw dataset

In [48]:
# Opening the text dataset file
file = open(
    data_filename,
    mode = 'rt',
    encoding = 'utf-8')
# Getting the text content
raw_text = file.read()
# Closing the file handle
file.close()

In [49]:
# Checking the beginning of the text
print(raw_text[:256])

Go.	Va !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
Hi.	Salut !	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)
Hi.	Salut.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (g


## Data preprocessing: cleaning and reducing the dataset

In [50]:
# Importing the dataset in a Pandas dataframe
train_df = pd.read_csv(
    data_filename, # path to our dataset file
    sep='\t', # tab delimiter between columns in the csv
    usecols=[0, 1], # import only columns 0 and 1
    nrows=subset_size, # read only the first subset_size rows
    names=["en","fr"]) # label them 'en' and 'fr'

In [51]:
# Check our dataframe
print(train_df)

                    en                                fr
0                  Go.                              Va !
1                  Hi.                           Salut !
2                  Hi.                            Salut.
3                 Run!                           Cours !
4                 Run!                          Courez !
...                ...                               ...
9995  Be more precise.                 Soit plus précis.
9996  Be quiet, girls.  Restez tranquilles, les filles !
9997  Be very careful.               Sois très prudent !
9998  Be very careful.              Soyez très prudent !
9999  Be very careful.             Soyez très prudente !

[10000 rows x 2 columns]


In [52]:
# Additional cleanup here
# TODO: maybe lowercase all

In [53]:
# Saving our cleaned and reduced dataset
train_df.to_csv(
    clean_train_filename,
    sep='\t', # using tab separators
    index=False) # don't print the row index in the csv

# Saving the English part separately for SentencePiece
train_df.to_csv(
    clean_en_filename,
    columns=['en'], # print only the column 'en'
    index=False)

# And the French one
train_df.to_csv(
    clean_fr_filename,
    columns=['fr'], # print only the column 'fr'
    index=False)

## Tokenizer

In [54]:
# Training the model for English
sp.SentencePieceTrainer.train(
    input = clean_en_filename,
    model_prefix = 'en',
    vocab_size = en_vocab_size,
)

In [55]:
# Training the model for French
sp.SentencePieceTrainer.train(
    input = clean_fr_filename,
    model_prefix = 'fr',
    vocab_size = fr_vocab_size,
)

In [56]:
# Creating a tokenizer object for English
en_sp = sp.SentencePieceProcessor()
# Loading the English model
en_sp.Load("en.model")
# Creating a tokenizer object for French
fr_sp = sp.SentencePieceProcessor()
# Loading the French model
fr_sp.Load("fr.model")

True

In [57]:
# Testing the English tokenizer
en_test_sentence = "I like green apples."
# Encoding pieces
print(en_sp.EncodeAsPieces(en_test_sentence))
# Encoding pieces as IDs
print(en_sp.EncodeAsIds(en_test_sentence))
# Decoding encoded IDs
print(en_sp.DecodeIds(en_sp.EncodeAsIds(en_test_sentence)))

# Testing the French tokenizer
fr_test_sentence = "J'aime les pommes vertes."
# Encoding pieces
print(fr_sp.EncodeAsPieces(fr_test_sentence))
# Encoding pieces as IDs
print(fr_sp.EncodeAsIds(fr_test_sentence))
# Decoding encoded IDs
print(fr_sp.DecodeIds(fr_sp.EncodeAsIds(fr_test_sentence)))

['▁I', '▁like', '▁gree', 'n', '▁a', 'pples', '.']
[4, 41, 1033, 24, 10, 1235, 3]
I like green apples.
['▁J', "'", 'aime', '▁le', 's', '▁pommes', '▁vert', 'es', '.']
[11, 4, 79, 20, 5, 1449, 1155, 63, 3]
J'aime les pommes vertes.


## Tokenizing the data

In [58]:
# Load trained tokenizer for English and French
# Creating a tokenizer object for English
en_sp = sp.SentencePieceProcessor()
# Loading the English model
en_sp.Load("en.model")
# Creating a tokenizer object for French
fr_sp = sp.SentencePieceProcessor()
# Loading the French model
fr_sp.Load("fr.model")

True

In [59]:
# Load the cleaned up dataset
train_df = pd.read_csv(
    clean_train_filename,
    sep='\t')

In [60]:
# Checking the resulting data frame
print(train_df)

                    en                                fr
0                  Go.                              Va !
1                  Hi.                           Salut !
2                  Hi.                            Salut.
3                 Run!                           Cours !
4                 Run!                          Courez !
...                ...                               ...
9995  Be more precise.                 Soit plus précis.
9996  Be quiet, girls.  Restez tranquilles, les filles !
9997  Be very careful.               Sois très prudent !
9998  Be very careful.              Soyez très prudent !
9999  Be very careful.             Soyez très prudente !

[10000 rows x 2 columns]


In [61]:
# Function to tokenize our text (list of sentences) and
# add it to our data frame in the column 'label'
def tokenize_text(df, spm, txt_label, id_label):
    ids = []
    for line in df[txt_label].tolist():
        id_line = spm.EncodeAsIds(line)
        ids.append(id_line)
    df[id_label] = ids

# Let's run this function on the English text
tokenize_text(train_df, en_sp, 'en', 'en_ids')
# And on the French text
tokenize_text(train_df, fr_sp, 'fr', 'fr_ids')

In [62]:
# Checking the resulting data frame
print(train_df)

                    en                                fr  \
0                  Go.                              Va !   
1                  Hi.                           Salut !   
2                  Hi.                            Salut.   
3                 Run!                           Cours !   
4                 Run!                          Courez !   
...                ...                               ...   
9995  Be more precise.                 Soit plus précis.   
9996  Be quiet, girls.  Restez tranquilles, les filles !   
9997  Be very careful.               Sois très prudent !   
9998  Be very careful.              Soyez très prudent !   
9999  Be very careful.             Soyez très prudente !   

                      en_ids                               fr_ids  
0                    [81, 3]                             [199, 9]  
1                  [1004, 3]                             [992, 9]  
2                  [1004, 3]                             [992, 3]  
3      

In [63]:
# Check tokenized English sentence length
en_max_len = max(len(line) for line in train_df['en_ids'].tolist())
# Check tokenized French sentence length
fr_max_len = max(len(line) for line in train_df['fr_ids'].tolist())

In [64]:
print("English maximum sentence length:", en_max_len)
print("French maximum sentence length:", fr_max_len)

English maximum sentence length: 10
French maximum sentence length: 22


In [65]:
# Sentence padding
# Pad English tokens
padded_en_ids = pad_sequences(
    train_df['en_ids'].tolist(),
    maxlen = en_max_len,
    padding = 'post')
# Add them to our training data frame
train_df['pad_en_ids'] = padded_en_ids.tolist()

# Pad French tokens
padded_fr_ids = pad_sequences(
    train_df['fr_ids'].tolist(),
    maxlen = fr_max_len,
    padding = 'post')
# Add them to our training data frame
train_df['pad_fr_ids'] = padded_fr_ids.tolist()

In [66]:
print(train_df['pad_fr_ids'])

0       [199, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
1       [992, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
2       [992, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
3       [18, 812, 5, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [18, 812, 49, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
                              ...                        
9995    [118, 22, 203, 1143, 3, 0, 0, 0, 0, 0, 0, 0, 0...
9996    [6, 221, 265, 5, 66, 20, 5, 765, 9, 0, 0, 0, 0...
9997    [118, 5, 58, 169, 361, 9, 0, 0, 0, 0, 0, 0, 0,...
9998    [108, 49, 58, 169, 361, 9, 0, 0, 0, 0, 0, 0, 0...
9999    [108, 49, 58, 169, 375, 9, 0, 0, 0, 0, 0, 0, 0...
Name: pad_fr_ids, Length: 10000, dtype: object


In [67]:
# Shuffling our dataframe around
train_df = train_df.sample(frac=1).reset_index(drop=True)

In [68]:
print(train_df['pad_fr_ids'])

0       [14, 27, 58, 169, 1108, 3, 0, 0, 0, 0, 0, 0, 0...
1       [7, 51, 4, 79, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
2       [7, 133, 756, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
3       [14, 27, 400, 32, 201, 3, 0, 0, 0, 0, 0, 0, 0,...
4       [1281, 45, 8, 40, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0...
                              ...                        
9995    [18, 4, 15, 58, 211, 473, 3, 0, 0, 0, 0, 0, 0,...
9996    [520, 45, 34, 177, 9, 0, 0, 0, 0, 0, 0, 0, 0, ...
9997    [21, 167, 728, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
9998    [7, 1202, 5, 34, 870, 3, 0, 0, 0, 0, 0, 0, 0, ...
9999    [39, 22, 8, 41, 1951, 10, 0, 0, 0, 0, 0, 0, 0,...
Name: pad_fr_ids, Length: 10000, dtype: object


In [69]:
# Create our training input and target output numpy array to feed to our NMT model
# We'll take the first train_size lines for training (after random shuffle)
trainX = np.asarray(train_df['pad_fr_ids'][0:train_size].tolist())
trainY = np.asarray(train_df['pad_en_ids'][0:train_size].tolist())
# Reshape the output to match expected dimensionality
trainY = trainY.reshape(trainY.shape[0], trainY.shape[1], 1)

# The test dataset for checking on the last test_size lines (after random shuffle)
testX = np.asarray(train_df['pad_fr_ids'][train_size:].tolist())
testY = np.asarray(train_df['pad_en_ids'][train_size:].tolist())
# Reshape the output to match expected dimensionality
testY = testY.reshape(testY.shape[0], testY.shape[1], 1)

In [70]:
# Check dimensions
print(trainX.shape)
print(trainY.shape)
print(testX.shape)
print(testY.shape)

(9000, 22)
(9000, 10, 1)
(1000, 22)
(1000, 10, 1)


## Creating and training our NMT model

In [71]:
# Creating a Keras Sequential object for our NMT model
model = Sequential()

# Embedding layer to map our one-hot encoding to a small word space
model.add(Embedding(
    fr_vocab_size,
    nb_cells,
    input_length = fr_max_len,
    mask_zero = True))
# Adding an LSTM layer to act as the encoder
model.add(LSTM(
    units = nb_cells,
    return_sequences = False))
# Since we are not returning a sequence but just a vector, we need
# to repeat this vector multiple times to input it to our decoder LSTM
model.add(RepeatVector(en_max_len))
# Adding an LSTM layer to act as the decoder
model.add(LSTM(
    units = nb_cells,
    return_sequences = True))
# Adding a softmax
model.add((Dense(
    en_vocab_size,
    activation = 'softmax')))

# Compiling the model
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy')

print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 22, 256)           512000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 10, 256)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 10, 256)           525312    
_________________________________________________________________
dense_2 (Dense)              (None, 10, 2000)          514000    
Total params: 2,076,624
Trainable params: 2,076,624
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
# Training the model
model_filename = 'model.h5'
checkpoint = ModelCheckpoint(
    model_filename,
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = True,
    mode = 'min')
model.fit(
    trainX,
    trainY,
    epochs = nb_epochs,
    batch_size = batch_size,
    callbacks = [checkpoint],
    validation_data = (testX, testY))



Train on 9000 samples, validate on 1000 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 2.57256, saving model to model.h5
Epoch 2/30

Epoch 00002: val_loss improved from 2.57256 to 2.37557, saving model to model.h5
Epoch 3/30

Epoch 00003: val_loss improved from 2.37557 to 2.24880, saving model to model.h5
Epoch 4/30

Epoch 00004: val_loss improved from 2.24880 to 2.14819, saving model to model.h5
Epoch 5/30

Epoch 00005: val_loss improved from 2.14819 to 2.05727, saving model to model.h5
Epoch 6/30

Epoch 00006: val_loss improved from 2.05727 to 1.97027, saving model to model.h5
Epoch 7/30

Epoch 00007: val_loss improved from 1.97027 to 1.92219, saving model to model.h5
Epoch 8/30

Epoch 00008: val_loss improved from 1.92219 to 1.86327, saving model to model.h5
Epoch 9/30

Epoch 00009: val_loss improved from 1.86327 to 1.82075, saving model to model.h5
Epoch 10/30

Epoch 00010: val_loss improved from 1.82075 to 1.78247, saving model to model.h5
Epoch 11/30

Epoch 00011:

<keras.callbacks.callbacks.History at 0x7feb05b71d00>

## Testing the model

In [74]:
# Let's load the trained model
model = load_model(model_filename)

In [78]:
predictions = model.predict_classes(testX)

# Check the translation on a few sentences
decoded_predictions = []
for index in range(10):
    print("Original:")
    print(fr_sp.DecodeIds(testX[index, :].tolist()))
    print("Expected:")
    print(en_sp.DecodeIds(testY[index, :, 0].tolist()))
    print("Predicted:")
    print(en_sp.DecodeIds(predictions[index, :].tolist()))
    print("")

Original:
Je suis presque mort. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Expected:
I almost died. ⁇  ⁇  ⁇  ⁇  ⁇ 
Predicted:
I'm dead. ⁇  ⁇  ⁇  ⁇  ⁇ 

Original:
Es-tu sûre ? ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Expected:
Are you sure? ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Predicted:
Are you sure? ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 

Original:
Soyez de nouveau le bienvenu ! ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Expected:
Welcome back. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Predicted:
Go him. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 

Original:
Allez le chercher ! ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Expected:
Go get it. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Predicted:
Go take it it ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 

Original:
C'est à moi. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Expected:
That is mine. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Predicted:
It's mine. ⁇  ⁇  ⁇  ⁇  ⁇ 

Original:
Arrête de pleurer ! ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Expected:
Stop crying. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Predicted:
Stop crying. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 

Original:
Est-ce que Tom va 