In [None]:
import collections


import string
import numpy as np
import pandas as pd
from random import randint
import nltk

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Input, TimeDistributed, Activation, RepeatVector, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

from pickle import dump, load

import os

In [None]:

def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data.split('\n')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv('data/eng-french.csv')
english_sentences = df['English'].to_list()
french_sentences = df['French'].to_list()


Each line in clean_en file contains an English sentence with the respective Freench translation in each line of clean_fr file.

In [6]:
for sample_i in range(3):
    print('English Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('French Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))
    print()

English Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
French Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .

English Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
French Line 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .

English Line 3:  california is usually quiet during march , and it is usually hot in june .
French Line 3:  california est généralement calme en mars , et il est généralement chaud en juin .



Vocabulary

The complexity of any machine translation problem (and NLP as a whole) is determined by the complexity of the vocabulary. A more complex vocabulary is a more complex problem.

Let’s look at the complexity of the data set we’ll be working with.

In [7]:
#  Let’s look at the complexity of the data set we’ll be working with.
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

In [8]:
print('Number of English words: {}'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('Unique words in English: {}'.format(len(english_words_counter)))
print('10 Most common English words:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('Number of French words: {}'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('Unique words in French: {}'.format(len(french_words_counter)))
print('10 Most common French words:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

Number of English words: 1823250
Unique words in English: 227
10 Most common English words:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

Number of French words: 1961295
Unique words in French: 355
10 Most common French words:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


### Pre-process
Dataset have been preprocessed already all we need is to do the following
#### 1. Tokenize
Turn each sentence into a sequence of words ids using Keras’s Tokenizer function.

#### 2. Padding
Make sure all the English sequences have the same length and all the French sequences have the same length by adding padding to the end of each sequence using Keras’s pad_sequences function.

In [9]:
def tokenize(x):
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)

    return x_tk.texts_to_sequences(x), x_tk


def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')


def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    
    return preprocess_x, preprocess_y, x_tk, y_tk

In [10]:
# text_sentences = english_sentences[:2]
# text_tokenized, text_tokenizer = tokenize(text_sentences)

# print(text_tokenizer.word_index)
# print()
# for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
#     print('Sequence {} in x'.format(sample_i + 1))
#     print('  Input:  {}'.format(sent))
#     print('  Output: {}'.format(token_sent))

In [11]:
# # Pad Tokenized output
# test_pad = pad(text_tokenized)
# for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
#     print('Sequence {} in x'.format(sample_i + 1))
#     print('  Input:  {}'.format(np.array(token_sent)))
#     print('  Output: {}'.format(pad_sent))

In [12]:
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)

In [13]:
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

In [14]:
print('Preprocessed sentences')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Preprocessed sentences
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


## Models
We will experiment with various neural network architectures.  
    - Model 1 is a simple RNN  
    - Model 2 is a RNN with Embedding  
    - Model 3 is a Bidirectional RNN  
    - Model 4 is an Encoder-Decoder RNN  

After experimenting with the four simple architectures, we will construct with a deeper model that designed to outperform all four models.

In [15]:
def logits_to_text(logits, tokenizer):
    """Convert NMT output to French text"""
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

#### Vanilla RNN
We are creating a basic RNN model which is a good baseline for sequence data that translate English to French.

In [16]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(french_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model

In [1]:
# tests.test_simple_model(simple_model)

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))# Train the neural network

simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
# print(simple_rnn_model.summary())
simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

In [18]:
# Print prediction(s)
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

est est est est est est est est <PAD> <PAD> est <PAD> est <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 2: Embedding
Vanilla RNN model using word embedding.

In [None]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    learning_rate = 1e-3
    rnn = GRU(64, return_sequences=True, activation="tanh")
    
    embedding = Embedding(french_vocab_size, 64, input_length=input_shape[1]) 
    logits = TimeDistributed(Dense(french_vocab_size, activation="softmax"))
    
    model = Sequential()
    #em can only be used in first layer --> Keras Documentation
    model.add(embedding)
    model.add(rnn)
    model.add(logits)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

# tests.test_embed_model(embed_model)

In [None]:
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))
embeded_model = embed_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
    
embeded_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2aaf08640>

In [None]:
print(logits_to_text(embeded_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois parfois en l' et et il il est en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


#### Model 3: Bidirectional RNNs

In [None]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
   
    learning_rate = 1e-3
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences = True, dropout = 0.1), 
                           input_shape = input_shape[1:]))
    model.add(TimeDistributed(Dense(french_vocab_size, activation = 'softmax')))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model

# tests.test_bd_model(bd_model)

In [None]:
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
bidi_model = bd_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)
bidi_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2a8abdfd0>

In [None]:
print(logits_to_text(bidi_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois froid au printemps mais il est agréable en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


#### Model 4: Encoder-Decoder
The encoder creates a matrix representation of the sentence. The decoder takes this matrix as input and predicts the translation as output.

In [None]:
def encoder_decoder_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  
    learning_rate = 1e-3
    model = Sequential()
    model.add(GRU(128, input_shape = input_shape[1:], return_sequences = False))
    model.add(RepeatVector(output_sequence_length))
    model.add(GRU(128, return_sequences = True))
    model.add(TimeDistributed(Dense(french_vocab_size, activation = 'softmax')))
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model

In [None]:
# tests.test_encdec_model(encdec_model)
tmp_x = pad(preproc_english_sentences)
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[1], 1))
encodeco_model = encoder_decoder_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)

encodeco_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=20, validation_split=0.2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2a892aeb0>

In [None]:
print(logits_to_text(encodeco_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est jamais en en mois et il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


#### Model 5: Stacked Model
Create a model_final that incorporates embedding and a bidirectional RNN into one model.

In [None]:
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size,output_dim=128,input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256,return_sequences=False)))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(french_vocab_size,activation='softmax')))
    learning_rate = 0.005
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model


In [None]:
tmp_X = pad(preproc_english_sentences)
model = model_final(tmp_X.shape,
                    preproc_french_sentences.shape[1],
                    len(english_tokenizer.word_index)+1,
                    len(french_tokenizer.word_index)+1)
    
model.fit(tmp_X, preproc_french_sentences, batch_size = 1024, epochs = 20, validation_split = 0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x311bbfc70>

In [None]:
def final_predictions(sentence, x, y, x_tk, y_tk):

    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, len(sentences))
    
    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))
    
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[1]]))
    

In [None]:
Sample 1:
il a conduit un vieux <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Sample 2:
new jersey est parfois calme pendant l'automne et il est en en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

In [None]:
sentence = 'he saw a old yellow truck'
print('Target: Il a vu un vieux camion jaune')

final_predictions(sentence, preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)

Target: Il a vu un vieux camion jaune
Sample 1:
il a conduit un vieux <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Sample 2:
new jersey est parfois calme pendant l'automne et il est en en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
