<a href="https://colab.research.google.com/github/myazzeh/NLP-Course/blob/main/Sequence_Learning/NLP_seq2seq_machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import collections
import os

import numpy as np
import pandas as pd
from keras.layers import GRU, LSTM, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Embedding
from keras.losses import sparse_categorical_crossentropy
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import pad_sequences

import helper

#**Download and Prepare the datasets**#

In [None]:
!wget https://raw.githubusercontent.com/myazzeh/NLP-Course/main/datasets/small_vocab_en
!wget https://raw.githubusercontent.com/myazzeh/NLP-Course/main/datasets/small_vocab_fr

In [None]:
# Read both corpora into lists with one sentence per line.
# `with` guarantees the file handles are closed, and the explicit UTF-8
# encoding keeps accented French characters intact regardless of the
# platform's default encoding.
eng_file = os.path.join('small_vocab_en')
with open(eng_file, "r", encoding="utf-8") as eng:
    english_sentences = eng.read().split('\n')

fre_file = os.path.join('small_vocab_fr')
with open(fre_file, "r", encoding="utf-8") as fre:
    french_sentences = fre.read().split('\n')


In [None]:
# Sanity check: show the first two aligned sentence pairs.
for line_no in (1, 2):
    print(f'English sentence at Line {line_no}: {english_sentences[line_no - 1]}')
    print(f'French sentence at Line {line_no}: {french_sentences[line_no - 1]}')
    print('-------------------------------------------------')

In [None]:
# Get Initial Vocab Size for both English and French dataset
#english_vocab = len(collections.Counter([word for sentence in english_sentences for word in sentence.split()]))
#french_vocab  = len(collections.Counter([word for sentence in french_sentences for word in sentence.split()]))
#print(f'English vocab size is {english_vocab}, and French vocab size is {french_vocab}')

#**Tokenize English and French Corpus and Align all sequences**#

In [None]:
# Find the maximum sequence length (in tokens) across both datasets.
# The sequences that get padded later are token-id sequences, so the length
# must be measured in tokens, not characters. `text_to_word_sequence` applies
# the same default lower-casing / punctuation filtering / whitespace split
# that a default `Tokenizer` uses, so the count matches what
# `texts_to_sequences` will produce.
length_eng = max(len(text_to_word_sequence(sentence)) for sentence in english_sentences)
length_fre = max(len(text_to_word_sequence(sentence)) for sentence in french_sentences)
max_length = max(length_eng, length_fre)
print(f'Maxmium sequence length in both dataset is {max_length}')

In [None]:
# English (source) side: word-level tokenizer with an explicit OOV token,
# then right-pad every id sequence to `max_length`.
etok = Tokenizer(char_level=False, oov_token='[UNK]')
etok.fit_on_texts(english_sentences)
english_seq = pad_sequences(
    etok.texts_to_sequences(english_sentences),
    maxlen=max_length,
    padding='post',
)

# French (target) side: same pipeline, but without an OOV token.
ftok = Tokenizer(char_level=False)
ftok.fit_on_texts(french_sentences)
french_seq = pad_sequences(
    ftok.texts_to_sequences(french_sentences),
    maxlen=max_length,
    padding='post',
)
# Optional trailing label axis, currently disabled:
#french_seq = french_seq.reshape(*french_seq.shape, 1)

In [None]:
# Display the first padded English id sequence.
english_seq[0]

In [None]:
# Display the first padded French id sequence.
french_seq[0]

In [None]:
# Print the dimensions of the padded English matrix, space-separated.
print(' '.join(str(dim) for dim in english_seq.shape))

In [None]:
#Get Vocab Size
# Tokenizer ids start at 1 and id 0 is used for padding, so the largest id
# equals len(word_index). Layers sized by vocabulary (Embedding input_dim,
# softmax width) therefore need len(word_index) + 1 slots; without the +1
# the highest token id is out of range.
english_vocab = len(etok.word_index) + 1
french_vocab = len(ftok.word_index) + 1
print(f'English vocab size is {english_vocab}, and French vocab size is {french_vocab}')

#**First Machine Translation Model using Two RNN (Encoder and Decoder)**#

In [None]:
# Decoding helper used when printing model predictions.
def logits_to_text(logits, tokenizer):
    """Convert per-timestep scores into a space-separated string of words.

    Args:
        logits: 2-D array-like; the argmax is taken along axis 1, giving one
            token id per row.
        tokenizer: object exposing a `word_index` mapping of word -> id.

    Returns:
        The decoded words joined by single spaces; id 0 renders as '<PAD>'.

    Raises:
        KeyError: if a predicted id is neither 0 nor present in `word_index`.
    """
    vocabulary = tokenizer.word_index
    id_to_word = dict(zip(vocabulary.values(), vocabulary.keys()))
    id_to_word[0] = '<PAD>'

    decoded = []
    for token_id in np.argmax(logits, axis=1):
        decoded.append(id_to_word[token_id])
    return ' '.join(decoded)

print('`logits_to_text` function loaded.')

In [None]:
# Model 1: embedding -> bidirectional LSTM encoder -> repeated context vector
# -> bidirectional LSTM decoder -> per-timestep softmax over French ids.
learning_rate = 0.005
model = Sequential([
    Embedding(input_dim=english_vocab, output_dim=128, input_length=max_length),
    # Encoder: collapse the whole source sentence into a single vector.
    Bidirectional(LSTM(256, return_sequences=False)),
    # Feed that vector to the decoder once per output timestep.
    RepeatVector(max_length),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(french_vocab, activation='softmax')),
])
model.compile(
    loss=sparse_categorical_crossentropy,
    optimizer=Adam(learning_rate),
    metrics=['accuracy'],
)
model.fit(english_seq, french_seq, batch_size=1024, epochs=2, validation_split=0.2)

In [None]:
# The model was fit with English sequences as input, so predict from the
# first English sequence (not the French target) and decode with the French
# tokenizer.
print(logits_to_text(model.predict(english_seq[:1])[0], ftok))

In [None]:
# Reverse lookup for French ids; id 0 is the padding slot.
y_id_to_word = {value: key for key, value in ftok.word_index.items()}
y_id_to_word[0] = '<PAD>'

sentence = 'he saw a old yellow truck'
# Encode with the tokenizer itself rather than indexing `word_index` by hand:
# the text gets the same preprocessing as the training data, and unknown words
# map to etok's '[UNK]' id instead of raising KeyError.
sentence = etok.texts_to_sequences([sentence])
sentence = pad_sequences(sentence, maxlen=max_length, padding='post')

# Batch of two inputs: the hand-written sentence and the first training sentence.
sentences = np.array([sentence[0], english_seq[0]])
predictions = model.predict(sentences, batch_size=len(sentences))

print('Sample 1:')
print(' '.join([y_id_to_word[np.argmax(probs)] for probs in predictions[0]]))
print('Il a vu un vieux camion jaune')
print('Sample 2:')
print(' '.join([y_id_to_word[np.argmax(probs)] for probs in predictions[1]]))
# Reference translation: french_seq[0] already holds token ids, so look them up directly.
print(' '.join([y_id_to_word[token_id] for token_id in french_seq[0]]))

#**Second Machine Translation Model using Two RNN (Encoder and Decoder)**#





In [None]:
#Model 2: Embedding

from keras.models import Sequential
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an Embedding -> GRU -> per-timestep softmax model.

    Args:
        input_shape: shape tuple of the input batch; `input_shape[1]` is used
            as the sequence length of the embedding layer.
        output_sequence_length: not used by this architecture (the output has
            one timestep per input timestep).
        english_vocab_size: number of distinct input (English) ids, including
            the padding id; sizes the embedding table.
        french_vocab_size: number of distinct output (French) ids; sizes the
            softmax layer.

    Returns:
        A compiled Keras `Sequential` model.
    """
    learning_rate = 1e-3
    # The embedding is indexed by *input* (English) token ids, so its table
    # must be sized by the English vocabulary, not the French one.
    embedding = Embedding(english_vocab_size, 64, input_length=input_shape[1])
    rnn = GRU(64, return_sequences=True, activation="tanh")
    logits = TimeDistributed(Dense(french_vocab_size, activation="softmax"))
    model = Sequential()
    # Embedding can only be used as the first layer --> Keras Documentation
    model.add(embedding)
    model.add(rnn)
    model.add(logits)
    model.compile(loss=sparse_categorical_crossentropy, optimizer=Adam(learning_rate), metrics=['accuracy'])
    return model
#tests.test_embed_model(embed_model)
# Train model 2 on the notebook's own data. english_seq and french_seq were
# both padded to max_length, so input and target timesteps already line up.
tmp_x = english_seq
embeded_model = embed_model(
    tmp_x.shape,
    max_length,
    len(etok.word_index) + 1,   # +1 for the padding id 0
    len(ftok.word_index) + 1)
embeded_model.fit(tmp_x, french_seq, batch_size=1024, epochs=10, validation_split=0.2)
print(logits_to_text(embeded_model.predict(tmp_x[:1])[0], ftok))

#**Third Machine Translation Model using Two RNN (Encoder and Decoder)**#

In [None]:
#Model 3: Bidirectional RNNs
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a bidirectional-GRU model with a per-timestep softmax.

    Args:
        input_shape: shape tuple of the input batch; everything after the
            batch axis (`input_shape[1:]`) is passed as the layer input shape.
        output_sequence_length: not used by this architecture.
        english_vocab_size: not used by this architecture (no embedding layer).
        french_vocab_size: width of the softmax output.

    Returns:
        A compiled Keras `Sequential` model.
    """
    recurrent = Bidirectional(
        GRU(128, return_sequences=True, dropout=0.1),
        input_shape=input_shape[1:],
    )
    classifier = TimeDistributed(Dense(french_vocab_size, activation='softmax'))

    model = Sequential([recurrent, classifier])
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model
#tests.test_bd_model(bd_model)  # no `tests` module is imported in this notebook
# This model has no embedding layer, so each token id is fed as a single
# feature: add a trailing axis to get (samples, timesteps, 1).
tmp_x = english_seq.reshape((*english_seq.shape, 1))
bidi_model = bd_model(
    tmp_x.shape,
    french_seq.shape[1],
    len(etok.word_index)+1,
    len(ftok.word_index)+1)
bidi_model.fit(tmp_x, french_seq, batch_size=1024, epochs=20, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(bidi_model.predict(tmp_x[:1])[0], ftok))

#**Fourth Machine Translation Model using Two RNN (Encoder and Decoder)**#

In [None]:
#Model 4: Encoder-Decoder

def encdec_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an LSTM encoder-decoder model.

    The encoder LSTM returns only its final output, which is repeated
    `output_sequence_length` times and decoded by a second LSTM followed by a
    per-timestep softmax.

    Args:
        input_shape: shape tuple of the input batch; `input_shape[1:]` is
            passed as the encoder's input shape.
        output_sequence_length: number of timesteps to decode.
        english_vocab_size: not used by this architecture (no embedding layer).
        french_vocab_size: width of the softmax output.

    Returns:
        A compiled Keras `Sequential` model.
    """
    stack = [
        LSTM(128, input_shape=input_shape[1:], return_sequences=False),
        RepeatVector(output_sequence_length),
        LSTM(128, return_sequences=True),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ]
    model = Sequential(stack)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

#tests.test_encdec_model(encdec_model)  # no `tests` module is imported in this notebook
# english_seq is already padded; add a trailing feature axis because this
# model feeds token ids straight into the LSTM (no embedding layer).
tmp_x = english_seq.reshape((-1, english_seq.shape[1], 1))
encodeco_model = encdec_model(
    tmp_x.shape,
    french_seq.shape[1],
    len(etok.word_index)+1,
    len(ftok.word_index)+1)
encodeco_model.fit(tmp_x, french_seq, batch_size=1024, epochs=20, validation_split=0.2)
print(logits_to_text(encodeco_model.predict(tmp_x[:1])[0], ftok))

#**Fifth Machine Translation Model using Two RNN (Encoder and Decoder)**#

In [None]:
#Model 5: Custom
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the embedding + bidirectional-GRU encoder-decoder.

    Args:
        input_shape: shape tuple of the input batch; `input_shape[1]` is the
            embedding layer's sequence length.
        output_sequence_length: number of timesteps to decode.
        english_vocab_size: size of the embedding table (input ids).
        french_vocab_size: width of the softmax output.

    Returns:
        A compiled Keras `Sequential` model.
    """
    learning_rate = 0.005
    model = Sequential([
        Embedding(input_dim=english_vocab_size, output_dim=128, input_length=input_shape[1]),
        # Encoder: only the final state is kept ...
        Bidirectional(GRU(256, return_sequences=False)),
        # ... and repeated once per output timestep for the decoder.
        RepeatVector(output_sequence_length),
        Bidirectional(GRU(256, return_sequences=True)),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ])
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model
#tests.test_model_final(model_final)  # no `tests` module is imported in this notebook
print('Final Model Loaded')

def final_predictions(x, y, x_tk, y_tk):
    """Train the final model on (x, y) and print two sample translations.

    Args:
        x: padded source id sequences; `x.shape` sizes the model input and
            `x.shape[-1]` is the pad width for the hand-written sample.
        y: padded target id sequences; `y.shape[1]` is the decode length.
        x_tk: fitted source tokenizer (provides `word_index` and
            `texts_to_sequences`).
        y_tk: fitted target tokenizer (provides `word_index`).

    Returns:
        None. Results are printed.
    """
    # Use the arguments rather than notebook globals so the function works on
    # whatever data it is given.
    model = model_final(
        x.shape,
        y.shape[1],
        len(x_tk.word_index) + 1,   # +1 for the padding id 0
        len(y_tk.word_index) + 1)
    model.fit(x, y, batch_size=1024, epochs=17, validation_split=0.2)

    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw a old yellow truck'
    # Let the tokenizer encode the text so preprocessing matches the training
    # data and unknown words do not raise KeyError.
    sentence = x_tk.texts_to_sequences([sentence])
    sentence = pad_sequences(sentence, maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, batch_size=len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(probs)] for probs in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(probs)] for probs in predictions[1]]))
    # np.max collapses a per-timestep label (scalar or length-1 vector) to its id.
    print(' '.join([y_id_to_word[np.max(token)] for token in y[0]]))

final_predictions(english_seq, french_seq, etok, ftok)