<a href="https://colab.research.google.com/github/nimendra-ag/Deep-Learning-NLP-for-Sentiment-analysis-Translation/blob/main/Neural_Machine_Translation_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [167]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

# Data Preparation

## Data Download

In [168]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2025-03-14 17:16:04--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2025-03-14 17:16:05 (14.9 MB/s) - ‘fra-eng.zip.1’ saved [7943074/7943074]



In [169]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
replace /content/dataset/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
  inflating: /content/dataset/_about.txt  
replace /content/dataset/fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
  inflating: /content/dataset/fra.txt  


## Kaggle Dataset

If you want to go with the kaggle data set you can use it. But if you use the other data set don't run these cells which are related to the kaggle dataset.

In [170]:
# !pip install -q kaggle
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets download -d dhruvildave/en-fr-translation-dataset

In [171]:
# !unzip "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

In [172]:
# dataset = tf.data.experimental.CsvDataset(
#   "/content/dataset/en-fr.csv",
#   [
#     tf.string,
#     tf.string
#   ],
# )

## Data Processing

In [173]:
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")

In [174]:
items = []
for line in text_dataset:
  items.append(line)

len(items)

232736

In [175]:
import random
random_items = random.sample(items, int(232736*0.4))

In [176]:
lengths = np.empty(232736)
i = 0
for line in text_dataset.take(232736):
  lengths[i] = len(tf.strings.split(line, ' '))
  i = i + 1
lengths.mean()

20.628669393647737

In [177]:
lengths

array([ 11.,  10.,  12., ..., 107., 106., 119.])

In [178]:
# for line in text_dataset:
#   print(line.numpy())

In [179]:
iterater = iter(text_dataset)
first_value = next(iterater)
print(first_value)
print(type(first_value))

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [180]:
VOCAB_SIZE=20000
ENGLISH_SEQUENCE_LENGTH=64
FRENCH_SEQUENCE_LENGTH=64
EMBEDDING_DIM=300
BATCH_SIZE=64

In [181]:
english_vectorize_layer=TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH
)

In [182]:
french_vectorize_layer=TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH
)

In [183]:
for i in text_dataset.take(1):
  print(i.numpy())

b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'


In [184]:
for i in text_dataset.take(1):
  splitted_text=tf.strings.split(i,'\t')
  print(splitted_text[0], splitted_text[1])

tf.Tensor(b'Go.', shape=(), dtype=string) tf.Tensor(b'Va !', shape=(), dtype=string)


In [185]:
def selector(input_text):
  split_text=tf.strings.split(input_text,'\t')
  return {'input_1':split_text[0:1],'input_2':'starttoken '+split_text[1:2]},split_text[1:2]+' endtoken'

In [186]:
split_dataset=text_dataset.map(selector)

In [187]:
for i in split_dataset.take(1):
  print(i)
  print(i[0]['input_1'].numpy())
  print(i[0]['input_2'].numpy())
  print(i[1].numpy())

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
[b'Go.']
[b'starttoken Va !']
[b'Va ! endtoken']


In [188]:
def separator(input_text):
  split_text=tf.strings.split(input_text, '\t')
  return split_text[0:1], 'starttoken ' + split_text[1:2] + ' endtoken'

In [189]:
init_dataset=text_dataset.map(separator)

In [190]:
for i in init_dataset.take(1):
  print(i)
  print(i[0])
  print(i[1])

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va ! endtoken'], dtype=object)>)
tf.Tensor([b'Go.'], shape=(1,), dtype=string)
tf.Tensor([b'starttoken Va ! endtoken'], shape=(1,), dtype=string)


In [191]:
english_training_data=init_dataset.map(lambda x,y:x)  #input x, y and output x
english_vectorize_layer.adapt(english_training_data)

In [192]:
french_training_data=init_dataset.map(lambda x,y:y) #input x, y and output y
french_vectorize_layer.adapt(french_training_data)

In [193]:
print(len(english_vectorize_layer.get_vocabulary()))
print(len(french_vectorize_layer.get_vocabulary()))

16952
20000


In [194]:
def vectorizer(inputs, output):
  return {'input_1':english_vectorize_layer(inputs['input_1']),
          'input_2':french_vectorize_layer(inputs['input_2'])},french_vectorize_layer(output)

In [195]:
for i in split_dataset.take(1):
  print(english_vectorize_layer(i[0]['input_1']))
  print(english_vectorize_layer(i[0]['input_2']))
  print(french_vectorize_layer(i[1]))

tf.Tensor(
[[45  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]], shape=(1, 64), dtype=int64)
tf.Tensor(
[[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 64), dtype=int64)
tf.Tensor(
[[104   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]], shape=(1, 64), dtype=int64)


In [196]:
split_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [197]:
dataset=split_dataset.map(vectorizer)

In [198]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [199]:
english_vectorize_layer.get_vocabulary()[45]

'go'

In [200]:
french_vectorize_layer.get_vocabulary()[104]

'va'

In [201]:
for i in dataset.take(1):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[104,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

In [202]:
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [203]:
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [204]:
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [205]:
NUM_BATCHES = int(200000/BATCH_SIZE)
NUM_BATCHES

3125

In [206]:
train_dataset=dataset.take(int(0.9*NUM_BATCHES))
val_dataset=dataset.skip(int(0.9*NUM_BATCHES))

In [207]:
type(train_dataset)

In [208]:
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [209]:
val_dataset

<_SkipDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

## Modeling

In [210]:
NUM_UNITS=256

In [214]:
## Encoder
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype='int64', name='input_1')
x=Embedding(VOCAB_SIZE, EMBEDDING_DIM,)(input)
encoded_input=Bidirectional(GRU(NUM_UNITS))(x)

## Decoder
shifted_target=Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype='int64', name='input_2')
x=Embedding(VOCAB_SIZE, EMBEDDING_DIM,)(shifted_target)
x=GRU(NUM_UNITS*2, return_sequences=True)(x, initial_state=encoded_input)
# we need the o/p of every GRU cell, so return_sequences=True

## OUTPUT
x = Dropout(0.5)(x)
target=Dense(VOCAB_SIZE, activation='softmax')(x)

seq2seq_gru=Model([input, shifted_target], target)
seq2seq_gru.summary()