## Resources

Corpus: Europarl v7 (http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz)

Using:
* Python 3.8.3
* TensorFlow 2.2.0
* SentencePiece 0.1.91

## Modules and variables

In [1]:
# General modules
import numpy as np
import pandas as pd
# Tokenizer
import sentencepiece as sp
# Data preprocessing tools
from keras.preprocessing.sequence import pad_sequences
# General Keras functionalities
from keras.models import load_model
# Neural network components from Keras
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
# --- Raw corpus: Europarl v7, French-English pair ---
en_filename = "./training/europarl-v7.fr-en.en"
fr_filename = "./training/europarl-v7.fr-en.fr"

# --- Dataset reduction ---
# Number of sentence pairs to keep (the full dataset has 2,007,723 pairs)
max_count = 100_000
# Files receiving the reduced text, one per language
clean_en_filename = "clean_en.txt"  # English
clean_fr_filename = "clean_fr.txt"  # French

# --- Tokenizer ---
# Vocabulary size for our tokenizer
# (author's note: max possible in sentencepiece is 27,535)
vocab_size = 10_000
# Both languages share the same size to keep things simple
en_vocab_size = fr_vocab_size = vocab_size
# Every sentence is padded to this many word pieces (tokens)
max_sentence_length = 100

# --- Network ---
# Number of LSTM cells in the encoder and in the decoder
nb_cells = 1024

# --- Training ---
nb_epochs = 15
batch_size = 64

# File name used to save the trained model
trained_model_filename = "fr_en_nmt_model_test.h5"

## Checking the raw dataset

In [3]:
# Reading the whole English transcript into memory.
# The context manager closes the file handle even if read() raises,
# which the previous manual open()/close() pair did not guarantee.
with open(en_filename, mode='rt', encoding='utf-8') as en_file:
    en_text = en_file.read()

In [4]:
# Reading the whole French transcript into memory.
# The context manager closes the file handle even if read() raises,
# which the previous manual open()/close() pair did not guarantee.
with open(fr_filename, mode='rt', encoding='utf-8') as fr_file:
    fr_text = fr_file.read()

In [5]:
# Peek at the first 128 characters of each transcript (English, then French)
for transcript in (en_text, fr_text):
    print(transcript[:128])

Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I w
Reprise de la session
Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dern


It looks like the data is all in order. However, I am going to use a small subset of the data to speed up testing.

## Data preprocessing: cleaning and reducing the dataset

In [6]:
# Example: extracting the first 3 lines
# The encoding is given explicitly (as for the full-text reads above):
# relying on the platform default can fail or garble non-ASCII characters
# on systems whose locale is not UTF-8.
en_subtext = []
with open(en_filename, mode='rt', encoding='utf-8') as en_file:
    for line in en_file:
        en_subtext.append(line)
        if len(en_subtext) >= 3:
            break
# Number of lines actually read
lines = len(en_subtext)

In [7]:
# Show the extracted lines as a raw list (trailing newlines are kept)
print(en_subtext)

['Resumption of the session\n', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.\n', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.\n"]


Now, let's check how many lines are in the whole text:

In [8]:
# Counting the lines of each transcript without loading it entirely.
# Files are opened explicitly as UTF-8 so the count does not depend on
# (or fail because of) the platform's default encoding.
with open(en_filename, mode='rt', encoding='utf-8') as en_file:
    lines = sum(1 for _ in en_file)

print(lines)

with open(fr_filename, mode='rt', encoding='utf-8') as fr_file:
    lines = sum(1 for _ in fr_file)

print(lines)

2007723
2007723


Both files have the same number of lines, which suggests the dataset is already well aligned. Let's extract the first 100,000 lines (the value of `max_count`) to make the training task shorter for testing during development.

In [9]:
# Extracting the first `max_count` lines of each transcript
# (100,000 with the settings above).
# Files are opened explicitly as UTF-8: the platform default encoding may
# differ and would fail on, or garble, accented characters.
# The size check comes before the append so that max_count <= 0 yields an
# empty subset instead of one line.

en_subtext = []
with open(en_filename, mode='rt', encoding='utf-8') as en_file:
    for line in en_file:
        if len(en_subtext) >= max_count:
            break
        en_subtext.append(line)

fr_subtext = []
with open(fr_filename, mode='rt', encoding='utf-8') as fr_file:
    for line in fr_file:
        if len(fr_subtext) >= max_count:
            break
        fr_subtext.append(line)

# Number of lines kept from the last file read (French)
lines = len(fr_subtext)

We need to create some text files based on this subtext, since our tokenizer directly takes a file path as input for the training data.

In [10]:
# Cleaning up!
#TODO: lowercase all

In [11]:
# Saving our reduced English dataset.
# The files are written explicitly as UTF-8 because they are read back as
# UTF-8 later in this notebook; writing with the platform default encoding
# could produce files that fail to decode or contain garbled characters.
# Lines already end with '\n', so writelines() reproduces them as-is.
with open(clean_en_filename, mode='w', encoding='utf-8') as en_file:
    en_file.writelines(en_subtext)

# And the French one
with open(clean_fr_filename, mode='w', encoding='utf-8') as fr_file:
    fr_file.writelines(fr_subtext)

## Tokenizer

In [12]:
# Train the English SentencePiece model on the reduced English text.
# The 'en' prefix names the output files ("en.model" is loaded afterwards).
sp.SentencePieceTrainer.train(
    input=clean_en_filename,
    model_prefix='en',
    vocab_size=en_vocab_size,
)

In [13]:
# Train the French SentencePiece model on the reduced French text.
# The 'fr' prefix names the output files ("fr.model" is loaded afterwards).
sp.SentencePieceTrainer.train(
    input=clean_fr_filename,
    model_prefix='fr',
    vocab_size=fr_vocab_size,
)

In [14]:
# One tokenizer object per language
en_sp, fr_sp = sp.SentencePieceProcessor(), sp.SentencePieceProcessor()
# Each tokenizer loads the model trained for its language
en_sp.Load("en.model")
fr_sp.Load("fr.model")

True

In [15]:
# One sample sentence per language to sanity-check the tokenizers
en_test_sentence = "I like green apples."
fr_test_sentence = "J'aime les pommes vertes."

# For each language (English first, then French) print, in order:
# the word pieces, their ids, and the sentence decoded back from the ids
for tokenizer, sentence in ((en_sp, en_test_sentence),
                            (fr_sp, fr_test_sentence)):
    print(tokenizer.EncodeAsPieces(sentence))
    sample_ids = tokenizer.EncodeAsIds(sentence)
    print(sample_ids)
    print(tokenizer.DecodeIds(sample_ids))

['▁I', '▁like', '▁green', '▁a', 'p', 'ple', 's', '.']
[13, 66, 3745, 10, 315, 6333, 15, 5]
I like green apples.
['▁J', "'", 'aime', '▁les', '▁p', 'omme', 's', '▁verte', 's', '.']
[141, 3, 7161, 14, 563, 5964, 11, 5926, 11, 6]
J'aime les pommes vertes.


## Tokenizing the data

Now that our tokenizer is ready, it is time to prepare our training data properly to feed it to our NMT model. This means creating a pandas data frame with the tokenized (machine readable) sentences for both the source language and the target language.

We will also add the human readable sentences, mainly for visually checking the result of the translation.

In [16]:
# Re-create both tokenizers from the model files saved by the training step
# English tokenizer
en_sp = sp.SentencePieceProcessor()
# French tokenizer
fr_sp = sp.SentencePieceProcessor()
# Load each language's trained model
en_sp.Load("en.model")
fr_sp.Load("fr.model")

True

In [17]:
# Load the cleaned-up dataset as lists of lines (one sentence per element,
# trailing newline included)

# English sentences
with open(clean_en_filename, mode='rt', encoding='utf-8') as file:
    en_text = file.readlines()

# French sentences
with open(clean_fr_filename, mode='rt', encoding='utf-8') as file:
    fr_text = file.readlines()

In [18]:
# Build the data frame in one step: a French text column ('fr') followed
# by an English text column ('en'), one sentence pair per row
training_df = pd.DataFrame({
    'fr': fr_text,
    'en': en_text,
})

In [19]:
# Checking the resulting data frame (columns 'fr' and 'en', one sentence pair per row)
print(training_df)

                                                      fr  \
0                                Reprise de la session\n   
1      Je déclare reprise la session du Parlement eur...   
2      Comme vous avez pu le constater, le grand "bog...   
3      Vous avez souhaité un débat à ce sujet dans le...   
4      En attendant, je souhaiterais, comme un certai...   
...                                                  ...   
99995  Nous ne devrions pas nous satisfaire d'une sim...   
99996                                Soyons courageux.\n   
99997  Légiférons, recherchons un niveau d'harmonisat...   
99998  Veillons aux impacts de la circulation sur l'e...   
99999  J'insiste également sur l'importance de légifé...   

                                                      en  
0                            Resumption of the session\n  
1      I declare resumed the session of the European ...  
2      Although, as you will have seen, the dreaded '...  
3      You have requested a debate on this 

In [20]:
def tokenize_text(df, spm, text, label):
    """Tokenize a list of sentences and store the ids in a data frame column.

    Args:
        df: data frame receiving the new column; it is modified in place.
        spm: tokenizer object exposing ``EncodeAsIds(sentence)``.
        text: iterable of sentences, one entry per row of ``df``.
        label: name of the column that receives the token id lists.

    Returns:
        None. The result is written to ``df[label]``.
    """
    # One list of token ids per sentence, in the same order as `text`
    df[label] = [spm.EncodeAsIds(sentence) for sentence in text]

# Tokenize the English sentences into the 'en_ids' column
tokenize_text(df=training_df, spm=en_sp, text=en_text, label='en_ids')
# Tokenize the French sentences into the 'fr_ids' column
tokenize_text(df=training_df, spm=fr_sp, text=fr_text, label='fr_ids')

# The data frame now also holds the two token id columns
print(training_df)

                                                      fr  \
0                                Reprise de la session\n   
1      Je déclare reprise la session du Parlement eur...   
2      Comme vous avez pu le constater, le grand "bog...   
3      Vous avez souhaité un débat à ce sujet dans le...   
4      En attendant, je souhaiterais, comme un certai...   
...                                                  ...   
99995  Nous ne devrions pas nous satisfaire d'une sim...   
99996                                Soyons courageux.\n   
99997  Légiférons, recherchons un niveau d'harmonisat...   
99998  Veillons aux impacts de la circulation sur l'e...   
99999  J'insiste également sur l'importance de légifé...   

                                                      en  \
0                            Resumption of the session\n   
1      I declare resumed the session of the European ...   
2      Although, as you will have seen, the dreaded '...   
3      You have requested a debate on t

In [21]:
# Sentence padding
# Each token id list is brought to max_sentence_length entries; the padding
# is appended after the sentence ('post').
# NOTE(review): sentences longer than max_sentence_length are cut by
# pad_sequences; with its default `truncating` setting this presumably drops
# tokens from the START of the sentence - confirm against the Keras docs.

# English: pad, then store the rows back in the data frame
en_id_lists = training_df['en_ids'].tolist()
padded_en_ids = pad_sequences(
    en_id_lists,
    maxlen=max_sentence_length,
    padding='post')
training_df['pad_en_ids'] = padded_en_ids.tolist()

# French: same treatment
fr_id_lists = training_df['fr_ids'].tolist()
padded_fr_ids = pad_sequences(
    fr_id_lists,
    maxlen=max_sentence_length,
    padding='post')
training_df['pad_fr_ids'] = padded_fr_ids.tolist()

In [22]:
# All rows except the last 10 (the slice used for training below)
print(training_df['pad_fr_ids'][0:-10])
# The last 10 rows (the slice kept for the visual test below)
print(training_df['pad_fr_ids'][-10:])

0        [983, 8909, 5, 7, 1184, 0, 0, 0, 0, 0, 0, 0, 0...
1        [43, 2323, 1993, 7, 1184, 19, 59, 95, 20, 499,...
2        [526, 72, 493, 490, 10, 941, 4, 10, 339, 128, ...
3        [658, 493, 2515, 27, 163, 12, 26, 242, 21, 14,...
4        [111, 5133, 4, 37, 1393, 4, 74, 27, 510, 230, ...
                               ...                        
99985    [56, 10, 71, 4, 197, 7, 276, 4, 37, 115, 75, 1...
99986    [114, 3034, 9, 10, 230, 537, 94, 1358, 5, 745,...
99987    [114, 624, 47, 611, 9, 1811, 9, 10, 61, 29, 24...
99988    [68, 551, 5, 7, 42, 32, 14, 1007, 5, 7, 179, 6...
99989    [43, 1498, 144, 2025, 32, 14, 807, 20, 1753, 7...
Name: pad_fr_ids, Length: 99990, dtype: object
99990    [533, 230, 1358, 5, 745, 15, 3, 5739, 47, 14, ...
99991    [519, 1232, 4, 36, 771, 81, 23, 13, 624, 6614,...
99992    [888, 2202, 4230, 67, 3216, 16, 7, 561, 13, 74...
99993    [442, 237, 249, 5, 64, 3034, 51, 4160, 12, 64,...
99994    [1772, 132, 15, 3, 463, 1669, 32, 7, 2272, 9, ...
99995    

In [23]:
# Build the numpy arrays fed to the NMT model: padded French ids as input,
# padded English ids as target output.
# Training set: every sentence pair except the last 10
trainX = np.asarray(training_df['pad_fr_ids'][:-10].tolist())
trainY = np.asarray(training_df['pad_en_ids'][:-10].tolist())
# Add a trailing axis of size 1 to the target: (samples, length, 1)
trainY = np.expand_dims(trainY, axis=-1)

# Test set: only the last 10 sentence pairs, kept for a visual check
testX = np.asarray(training_df['pad_fr_ids'][-10:].tolist())
testY = np.asarray(training_df['pad_en_ids'][-10:].tolist())

In [24]:
# Check dimensions: trainX is (samples, sentence length); trainY carries
# one extra trailing axis of size 1 added by the reshape above
print(trainX.shape)
print(trainY.shape)

(99990, 100)
(99990, 100, 1)


## Creating and training our NMT model

In [25]:
# Creating a Keras Sequential object for our NMT model (French -> English)
model = Sequential()

# Embedding of the source tokens. The model input (trainX) holds FRENCH
# token ids, so the embedding must be sized with the French vocabulary.
# (It was sized with en_vocab_size before: harmless while both sizes are
# equal to vocab_size, but wrong as soon as they differ.)
model.add(Embedding(
    fr_vocab_size,
    nb_cells,
    input_length=max_sentence_length))
# Adding an LSTM layer to act as the encoder
model.add(LSTM(
    units=nb_cells,
    return_sequences=False))
# Since we are not returning a sequence but just a vector, we need
# to repeat this vector multiple times to input it to our decoder LSTM
model.add(RepeatVector(max_sentence_length))
# Adding an LSTM layer to act as the decoder
model.add(LSTM(
    units=nb_cells,
    return_sequences=True))
# Softmax over the target vocabulary. The target (trainY) holds ENGLISH
# token ids, so the output layer is sized with the English vocabulary.
model.add(Dense(
    en_vocab_size,
    activation='softmax'))

# Compiling the model
# The sparse loss takes integer token ids as targets (no one-hot encoding)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy')

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which only added a stray "None" line to the output)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 1024)         10240000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 1024)              8392704   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 1024)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 1024)         8392704   
_________________________________________________________________
dense_1 (Dense)              (None, 100, 10000)        10250000  
Total params: 37,275,408
Trainable params: 37,275,408
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
# Fit the model on the French inputs / English targets for nb_epochs
# epochs, using mini-batches of batch_size sentence pairs
model.fit(
    x=trainX,
    y=trainY,
    epochs=nb_epochs,
    batch_size=batch_size)



Epoch 1/15

KeyboardInterrupt: 

In [None]:
# Saving our trained model to the file named by trained_model_filename
model.save(trained_model_filename)

## Testing the model

In [None]:
# Let's load the trained model back from trained_model_filename
# (this replaces the in-memory `model` object)
model = load_model(trained_model_filename)

In [None]:
# Predicted token id at each position = index of the highest softmax score.
# This is what Sequential.predict_classes() computed for a softmax output;
# that helper is deprecated and absent from later Keras releases, so the
# argmax is taken explicitly.
predictions = np.argmax(model.predict(testX), axis=-1)

# Decode every test sentence (instead of a hard-coded count of 10) and keep
# the decoded predictions so they can be inspected after the loop
decoded_predictions = []
for expected_ids, predicted_ids in zip(testY, predictions):
    decoded_prediction = en_sp.DecodeIds(predicted_ids.tolist())
    decoded_predictions.append(decoded_prediction)
    print("Expected:")
    print(en_sp.DecodeIds(expected_ids.tolist()))
    print("Predicted:")
    print(decoded_prediction)
    print("")