<a href="https://colab.research.google.com/github/nnilayy/Recurrent-Neural-Networks/blob/main/Neural_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and compile a model that combines an embedding layer, a bidirectional
    GRU encoder, a RepeatVector bridge and a bidirectional GRU decoder.

    :param input_shape: Tuple of input shape; ``input_shape[1]`` is used as the input sequence length
    :param output_sequence_length: Length of output sequence (number of RepeatVector copies)
    :param english_vocab_size: Size of the English vocabulary (Embedding ``input_dim``)
    :param french_vocab_size: Size of the French vocabulary (width of the softmax output)
    :return: Keras model built and compiled, but not trained
    """
    # Hyperparameters
    learning_rate = 0.003

    # Build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size, 128, input_length=input_shape[1], input_shape=input_shape[1:]))
    # Encoder: collapses the embedded input sequence into a single vector.
    model.add(Bidirectional(GRU(128)))
    # Feed that vector to the decoder once per output timestep.
    model.add(RepeatVector(output_sequence_length))
    # Decoder: emits one hidden state per output timestep.
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    # Previously the function fell off the end and returned None, so callers
    # never received the model they had just built.
    return model

In [None]:
# Sequential stack: Word2Vec embedding -> LSTM -> RepeatVector -> SeqSelfAttention -> LSTM -> softmax Dense.
# NOTE(review): `SeqSelfAttention` and `LEN_RU` are not defined anywhere in the visible
# notebook (the keras_self_attention import is commented out in the imports cell) —
# confirm where they come from before running this cell.
model = Sequential()
# Train a Word2Vec model on gensim's bundled `common_texts` sample corpus.
# NOTE(review): `size=` and `get_keras_embedding` look like gensim 3.x API — TODO confirm
# against the installed gensim version.
model_w2v = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
# Embedding layer backed by the Word2Vec vectors; train_embeddings=False keeps them frozen.
model.add(model_w2v.wv.get_keras_embedding(train_embeddings=False))
model.add(LSTM(512))
# Repeat the LSTM output 8 times so the following layers see an 8-step sequence.
model.add(RepeatVector(8))
model.add(SeqSelfAttention(attention_activation='sigmoid'))
# NOTE(review): this LSTM is created without return_sequences=True, so presumably the
# final Dense produces one prediction per sample rather than one per timestep — confirm intent.
model.add(LSTM(512))
model.add(Dense(LEN_RU, activation='softmax'))

In [5]:
# Compile the current `model` with RMSprop and sparse categorical cross-entropy.
# `learning_rate` is the supported keyword; the older `lr` alias is deprecated
# and rejected by recent Keras releases.
rms = optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [None]:
# Print the layer-by-layer summary of whichever `model` is currently bound in the kernel.
model.summary()

In [None]:
# Planned pipeline (checklist):
#   [ ] preprocessing
#   [ ] embedding layer
#   [ ] rnn
#   [ ] repeatvector
#   [ ] rnn
#   [ ] dense

## Dataset ------------------------------------------------------------------------------------


In [2]:
import re
import string
import warnings
from pickle import dump
from unicodedata import normalize

# Silence library warnings before the heavy third-party imports below.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import (Bidirectional, Dense, Dropout, Embedding, GRU, Input,
                          LSTM, RepeatVector, TimeDistributed)
from keras.losses import sparse_categorical_crossentropy
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from numpy import array, argmax, random, take
# from keras_self_attention import SeqSelfAttention

In [3]:
# Mount Google Drive at /content/drive so the files under MyDrive used below are reachable.
# This only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Unpack the fr-en archive stored on Drive into the current working directory;
# it contains europarl-v7.fr-en.en and europarl-v7.fr-en.fr.
!tar zxvf /content/drive/MyDrive/French_To_English/fr-en.tgz

europarl-v7.fr-en.en
europarl-v7.fr-en.fr


In [5]:
# Function to load file
def load_file(filename):
  """Read a UTF-8 text file and return its whole content as one string.

  :param filename: Path of the file to read
  :return: The decoded file content
  """
  # The context manager closes the handle even when read() raises
  # (e.g. on a decoding error), which the manual open/close pair did not.
  with open(filename, 'rt', encoding='utf-8') as file:
    return file.read()

In [6]:
# Function to Convert Document to sentences
def sentenize(text):
  """Split a document into sentences, treating every line as one sentence.

  Whitespace around the document as a whole is stripped first; the
  individual lines are returned unchanged.

  :param text: Full document as a single string
  :return: List of lines
  """
  return text.strip().split("\n")

In [7]:
# Shortest and longest sentence lengths
def sentence_lengths(sentences):
  """Return the word counts of the shortest and the longest sentence.

  Words are the whitespace-separated pieces of each sentence.

  :param sentences: Sequence of sentence strings
  :return: Tuple (shortest_length, longest_length)
  """
  word_counts = list(map(lambda sentence: len(sentence.split()), sentences))
  return min(word_counts), max(word_counts)

In [7]:
# Number of Sentences
def num_sentences(sentences):
  """Return how many sentences the collection holds.

  :param sentences: Sized collection of sentences
  :return: Number of items in ``sentences``
  """
  total = len(sentences)
  return total

In [8]:
# Preprocessing Function
def preprocess(sentences):
    """Normalise raw sentences into lower-case, ASCII, letters-only text.

    Each sentence is Unicode-decomposed and reduced to ASCII (accents are
    dropped), split on whitespace, lower-cased, stripped of punctuation and
    non-printable characters, and finally filtered so that only purely
    alphabetic tokens remain. A sentence with no surviving token becomes ''.

    :param sentences: Iterable of raw sentence strings
    :return: List of cleaned sentences, one per input sentence
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(raw):
        # NFD splits accented letters into base letter + combining mark; the
        # ASCII round-trip then discards the marks.
        ascii_text = normalize('NFD', raw).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(drop_punctuation))
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return [_clean(raw) for raw in sentences]

In [9]:
# Read the English half of the corpus, split it into lines and clean every line.
english_text = load_file('/content/europarl-v7.fr-en.en')
english_sent = preprocess(sentenize(english_text))

In [10]:
# Read the French half of the corpus, split it into lines and clean every line.
french_text = load_file('/content/europarl-v7.fr-en.fr')
french_sent = preprocess(sentenize(french_text))

In [11]:
# Minimum and maximum sentence length (in words) of the preprocessed English sentences.
# A minimum of 0 means at least one sentence is empty after preprocessing.
eng_minlen,eng_maxlen=sentence_lengths(english_sent)
print("Minimum English Sentence Length: ",eng_minlen)
print("Maximum English Sentence Length: ",eng_maxlen)

# Minimum and maximum sentence length (in words) of the preprocessed French sentences.
fr_minlen,fr_maxlen=sentence_lengths(french_sent)
print("Minimum French Sentence Length: ",fr_minlen)
print("Maximum French Sentence Length: ",fr_maxlen)

Minimum English Sentence Length:  0
Maximum English Sentence Length:  642
Minimum French Sentence Length:  0
Maximum French Sentence Length:  598


In [13]:
# Number of sentences in the English text.
# The two counts should be equal: the corpora are used as aligned sentence pairs later on.
eng_sen_num=num_sentences(english_sent)
print("English Text has " + str(eng_sen_num) + " sentences")

# Number of sentences in the French text.
fr_sen_num=num_sentences(french_sent)
print("French Text has " + str(fr_sen_num) + " sentences")

English Text has 2007723 sentences
French Text has 2007723 sentences


In [26]:
def tokenize(x):
    """Fit a fresh Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    :param x: Collection of text strings
    :return: Tuple (sequences, tokenizer) — the encoded texts and the fitted tokenizer
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    encoded = fitted.texts_to_sequences(x)
    return encoded, fitted

In [56]:
# Fit one tokenizer per language and encode every cleaned sentence as a list of word ids.
eng_tokenized,eng_tokenizer=tokenize(english_sent)
fr_tokenized,fr_tokenizer=tokenize(french_sent)

In [None]:
# Preview the first few word -> id entries of each vocabulary.
# A bare expression is echoed only when it is the last line of a cell, so the
# English mapping was never displayed, and echoing a complete word_index dumps
# the entire vocabulary into the notebook output.
print("English:", list(eng_tokenizer.word_index.items())[:10])
print("French: ", list(fr_tokenizer.word_index.items())[:10])

In [9]:
import pickle
def save(filename,content):
  """Pickle ``content`` into the file at ``filename``, overwriting it.

  :param filename: Destination path
  :param content: Any picklable object
  """
  with open(filename, "wb") as handle:
    pickle.dump(content, handle)

def load(filename):
  """Return the object pickled in the file at ``filename``.

  Unpickling can execute arbitrary code, so only use this on files you
  created yourself (e.g. with ``save``).

  :param filename: Path of a pickle file
  :return: The unpickled object
  """
  with open(filename, "rb") as handle:
    return pickle.load(handle)

In [60]:
# save("/content/drive/MyDrive/French_To_English/eng_tokenized",eng_tokenized)
# save("/content/drive/MyDrive/French_To_English/fr_tokenized",fr_tokenized)
# save("/content/drive/MyDrive/French_To_English/english_sent",english_sent)
# save("/content/drive/MyDrive/French_To_English/french_sent",french_sent)
# save("/content/drive/MyDrive/French_To_English/eng_tokenizer",eng_tokenizer)
# save("/content/drive/MyDrive/French_To_English/fr_tokenizer",fr_tokenizer)

In [10]:
# Restore the tokenized corpora, cleaned sentences and fitted tokenizers from Drive.
# These are the same paths the (commented-out) save(...) calls above write to, so this
# presumably replaces re-running the preprocessing and tokenizing cells — confirm the
# cached files are up to date before relying on them.
# NOTE(review): load() unpickles the files; only use pickles you produced yourself.
eng_tokenized=load("/content/drive/MyDrive/French_To_English/eng_tokenized")
fr_tokenized=load("/content/drive/MyDrive/French_To_English/fr_tokenized")
english_sent=load("/content/drive/MyDrive/French_To_English/english_sent")
french_sent=load("/content/drive/MyDrive/French_To_English/french_sent")
eng_tokenizer=load("/content/drive/MyDrive/French_To_English/eng_tokenizer")
fr_tokenizer=load("/content/drive/MyDrive/French_To_English/fr_tokenizer")

In [11]:
# The tokenized corpora are lists of id lists with differing lengths ("ragged").
# dtype=object must be spelled out: building an array from ragged nested lists
# without it is deprecated and raises ValueError on recent NumPy releases.
eng_tokenized=np.array(eng_tokenized, dtype=object)
fr_tokenized=np.array(fr_tokenized, dtype=object)

In [12]:
def pad(sequences,length=None):
  """Pad ``sequences`` at the end ("post") via Keras ``pad_sequences``.

  :param sequences: Collection of integer sequences
  :param length: Target length passed as ``maxlen``; ``None`` leaves the choice to ``pad_sequences``
  :return: Whatever ``pad_sequences`` returns for these arguments
  """
  return pad_sequences(sequences, maxlen=length, padding="post")

In [13]:
# Bring every sequence to a fixed length of 100 ids, zero-padding at the end ("post").
# NOTE(review): the longest sentences reported above are 642 (English) and 598 (French)
# words, so longer sequences are presumably truncated by pad_sequences — confirm which
# side is cut and whether 100 is the intended limit. The same 100 is passed as the
# timestep count to define_model further down and must stay in sync.
eng_padded=pad(eng_tokenized,100)
fr_padded=pad(fr_tokenized,100)

In [14]:
def logits_to_text(logits, tokenizer):
    """Decode a matrix of per-step scores into a space-separated string.

    For every row of ``logits`` the column with the highest score is taken as
    the word id and mapped back to its word via ``tokenizer.word_index``;
    id 0 is rendered as '<PAD>'.

    :param logits: 2-D array-like, one row per timestep
    :param tokenizer: Object exposing a ``word_index`` dict (word -> id)
    :return: The decoded words joined by single spaces
    """
    id_to_word = dict((idx, word) for word, idx in tokenizer.word_index.items())
    id_to_word[0] = '<PAD>'
    best_ids = np.argmax(logits, 1)
    return ' '.join(id_to_word[best] for best in best_ids)

In [None]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and compile a basic GRU network: GRU -> Dense(relu) -> Dropout -> Dense(softmax).
    :param input_shape: Tuple of input shape; everything after the batch axis is used
    :param output_sequence_length: Length of output sequence (not used by this model)
    :param english_vocab_size: Number of unique English words in the dataset (not used by this model)
    :param french_vocab_size: Number of unique French words in the dataset (softmax width)
    :return: Keras model built and compiled, but not trained
    """
    # Hyperparameters
    step_size = 0.005

    # One GRU emitting a state per timestep, then a per-timestep classifier.
    network = Sequential()
    stack = (
        GRU(256, input_shape=input_shape[1:], return_sequences=True),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    )
    for layer in stack:
        network.add(layer)

    # Compile model
    network.compile(loss=sparse_categorical_crossentropy,
                    optimizer=Adam(step_size),
                    metrics=['accuracy'])
    return network

# NOTE(review): `tests` is not imported or defined anywhere in the visible notebook —
# presumably a project-local helper module; confirm it is available or drop this call.
tests.test_simple_model(simple_model)

# NOTE(review): preproc_english_sentences, preproc_french_sentences,
# max_french_sequence_length, the vocab sizes and french_tokenizer are produced by the
# `preprocess(x, y)` cell further down, so that cell has to run before this one.

# Reshaping the input to work with a basic RNN: pad the English ids to the French
# sequence length and add a trailing feature axis of size 1.
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

# Build the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
simple_rnn_model.summary()

# Train the neural network
simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Print prediction(s)
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

In [None]:
def preprocess(x, y):
    """Tokenize and pad a pair of parallel corpora.

    NOTE(review): this reuses the name of the sentence-cleaning
    ``preprocess(sentences)`` defined earlier in the notebook and replaces it
    once this cell has run — consider giving one of them a distinct name.

    :param x: Source-side sentences
    :param y: Target-side sentences
    :return: Tuple (padded_x, padded_y, x_tokenizer, y_tokenizer); ``padded_y``
             carries an extra trailing axis of size 1
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)
    # Append a trailing axis of size 1 to the targets.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

# Tokenize and pad both corpora in one go.
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sent, french_sent)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
# Tokenizer word ids start at 1 and 0 is the padding id, so the largest id equals
# len(word_index). Embedding and softmax layers need "largest id + 1" entries,
# otherwise the last word of each vocabulary is out of range.
english_vocab_size = len(english_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

In [28]:
# Model Function
def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
    """Build and compile an Embedding -> LSTM -> RepeatVector -> LSTM -> softmax model.

    :param in_vocab: Input vocabulary size (Embedding ``input_dim``)
    :param out_vocab: Output vocabulary size (softmax width)
    :param in_timesteps: Length of the input sequences
    :param out_timesteps: Number of output timesteps (RepeatVector copies)
    :param units: Used both as embedding dimension and as LSTM size
    :return: Compiled Keras model, not trained
    """
    stack = [
        # mask_zero=True marks id 0 as padding for the layers that follow.
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
    return network

In [None]:
# define_model expects (in_vocab, out_vocab, ...). The training cell feeds the English
# ids as input and the French ids as targets, so English is the input vocabulary and
# French the output vocabulary; they were previously passed the other way round.
# NOTE(review): if French -> English was intended, swap the data in the fit call instead.
# english_vocab_size / french_vocab_size must already be defined when this cell runs.
model=define_model(english_vocab_size, french_vocab_size, 100, 100, 512)
model.summary()


In [None]:
# Tokenizer word ids start at 1 and 0 is the padding id, so the largest id equals
# len(word_index). Embedding and softmax layers need "largest id + 1" entries,
# otherwise the last word of each vocabulary is out of range.
english_vocab_size=len(eng_tokenizer.word_index) + 1
french_vocab_size=len(fr_tokenizer.word_index) + 1
print("English Dictionary Size: ",english_vocab_size)
print("French Dictionary Size: ",french_vocab_size)

In [31]:
# Train `model` with the padded English ids as input and the padded French ids as
# targets; the targets get a trailing axis of size 1 via reshape.
# 20% of the samples are held out for validation (validation_split = 0.2).
# NOTE(review): the recorded run stopped in epoch 1 with ResourceExhaustedError.
# Presumably the per-timestep softmax over the full vocabulary at batch_size=512 does
# not fit in memory — consider a smaller batch, fewer units or a capped vocabulary.
model.fit(eng_padded, fr_padded.reshape(fr_padded.shape[0], fr_padded.shape[1], 1),
                    epochs=30, 
                    batch_size=512, 
                    validation_split = 0.2,
                    # callbacks=[checkpoint], 
                    # verbose=0,
          )

Epoch 1/30


ResourceExhaustedError: ignored