# **Natural Language Processing with Transformer Architecture**



import packages and check versions

In [43]:
import tensorflow as tf

# Check the current version of TensorFlow.
tf.__version__

'2.8.0'

In [44]:
# Uninstall the current version of TensorFlow.
!pip uninstall tensorflow

Found existing installation: tensorflow 2.8.0
Uninstalling tensorflow-2.8.0:
  Would remove:
    /usr/local/bin/estimator_ckpt_converter
    /usr/local/bin/import_pb_to_tensorboard
    /usr/local/bin/saved_model_cli
    /usr/local/bin/tensorboard
    /usr/local/bin/tf_upgrade_v2
    /usr/local/bin/tflite_convert
    /usr/local/bin/toco
    /usr/local/bin/toco_from_protos
    /usr/local/lib/python3.10/dist-packages/tensorflow-2.8.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/tensorflow/*
Proceed (Y/n)? y
  Successfully uninstalled tensorflow-2.8.0


In [None]:
# Install the desired version of TensorFlow.
!pip install tensorflow==2.8

In [46]:
# Verify the TensorFlow version.
print(tf.__version__)

2.8.0


## **import libraries**

In [47]:
# Importing libraries
import os
import pickle
import pandas as pd
import numpy as np
import random
import string
import re
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import TextVectorization

import warnings
warnings.filterwarnings("ignore")

# **Importing Drive**


In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In this section

> We will load the Nepali text dataset from an Excel file stored in Google Drive. We will then display the first few rows of the DataFrame to get an overview of the data.



In [49]:
# Specify the path to your Excel file
excel_file_path = "/content/drive/MyDrive/english-nepali.xlsx"

# Read the Excel file, specify the sheet name if needed
df = pd.read_excel(excel_file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,english_sent,nepali_sent
0,"It happened after the death of Saul, when Davi...",दाऊदले अमालेकीहरूलाई हराएर पछि सिकलग गए। यो शा...
1,"it happened on the third day, that behold, a m...",तब तेस्रो दिनमा एउटा जवान सैनिक सिकलगमा आयो। त...
2,"David said to him, ""Where do you come from?"" H...","दाऊदले त्यसलाई सोधे, “तिमी कहाँबाट आयौ?” त्यस ..."
3,"David said to him, ""How did it go? Please tell...","दाऊदले भने, “मलाई भन, के भयो?” त्यसले भन्यो, “..."
4,"David said to the young man who told him, ""How...","दाऊदले त्यस सैनिकलाई भने, “तिमीले कसरी जान्यौ ..."


In [50]:
# Drop rows with missing values
df.dropna(axis=0, inplace=True)

In [51]:
df.shape

(151923, 2)

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 151923 entries, 0 to 151936
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   english_sent  151923 non-null  object
 1   nepali_sent   151923 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


# **DEFINE PARAMETER**

In [53]:
# Tranining Hyperparameters
batch_size = 128

# Model Hyperparameters
embed_dim = 128
num_heads = 10
latent_dim = 2048
vocab_size = 20000
sequence_length = 20
dropout = 0.2

# Preprocess Text Data
In this section, we will define a function to preprocess the English and Nepali text data. The preprocessing steps include converting text to lowercase, removing URLs, digits, special characters, quotes, extra spaces, and adding [start] and [end] tags to the Nepali sentences.

In [54]:
def preprocess_text(df):
    # Lowercase the characters
    df["english_sent"] = df["english_sent"].apply(lambda x : x.lower())
    df["nepali_sent"] = df["nepali_sent"].apply(lambda x : x.lower())

    # Rmoving URLs
    df["english_sent"] = df["english_sent"].apply(lambda x : re.sub(r"http\S+", "", x))
    df["nepali_sent"] = df["nepali_sent"].apply(lambda x : re.sub(r"http\S+", "", x))

    # Removing digits
    remove_digits = str.maketrans("", "",string.digits)
    df["english_sent"] = df["english_sent"].apply(lambda x : x.translate(remove_digits))
    df["nepali_sent"] = df["nepali_sent"].apply(lambda x : x.translate(remove_digits))
    df["nepali_sent"] = df["nepali_sent"].apply(lambda x : re.sub("[a-zA-z२३०८१५७९४६]", "", x))

    # Remove special characters
    special = set(string.punctuation)
    df['english_sent'] = df['english_sent'].apply(lambda x : ''.join(ch for ch in x if ch not in special))
    df['nepali_sent'] = df['nepali_sent'].apply(lambda x : ''.join(ch for ch in x if ch not in special))

    # Remove quotes
    df['english_sent'] = df['english_sent'].apply(lambda x: re.sub("'", '', x))
    df['nepali_sent'] = df['nepali_sent'].apply(lambda x: re.sub("'", '', x))

    # Remove extra spaces
    df['english_sent'] = df['english_sent'].apply(lambda x : x.strip())
    df['nepali_sent'] = df['nepali_sent'].apply(lambda x : x.strip())
    df['english_sent'] = df['english_sent'].apply(lambda x : re.sub(" +"," ",x))
    df['nepali_sent'] = df['nepali_sent'].apply(lambda x : re.sub(" +"," ",x))


    # Add [start] and [end] tags
    df["nepali_sent"] = df["nepali_sent"].apply(lambda x : "[start] " + x + " [end]")

In [55]:
# Drop rows with Null values
df.drop(df[df["english_sent"] == " "].index, inplace = True)
df.drop(df[df["nepali_sent"] == "[start]  [end]"].index, inplace = True)

In [56]:
# Find Sentence Length
df["eng_sent_length"] = df["english_sent"].apply(lambda x : len(x.split(' ')))
df["nepali_sent_length"] = df["nepali_sent"].apply(lambda x : len(x.split(' ')))

In [57]:
def decode_sequence(input_sentence):
    nepali_vocab = nepali_vectorization.get_vocabulary()
    nepali_index_lookup = dict(zip(range(len(nepali_vocab)), nepali_vocab))
    max_decoded_sentence_length = 20

    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = nepali_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = nepali_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break

    return decoded_sentence[8:-5] # Removing [start] and [end] tokens

# Create and Format Dataset
In this section, we will define a function to format the dataset for training. The function will vectorize the English and Nepali sentences and prepare the encoder inputs, decoder inputs, and targets for the sequence-to-sequence model.

In [58]:
# For creating Dataset
def format_dataset(eng, hin):
    eng = eng_vectorization(eng)
    nepali = nepali_vectorization(hin)
    return ({"encoder_inputs" : eng, "decoder_inputs" : nepali[:, :-1],}, nepali[:, 1:])


def make_dataset(df):
    dataset = tf.data.Dataset.from_tensor_slices((df["english_sent"].values, df["nepali_sent"].values))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

In [59]:
df.shape

(151923, 4)

In [60]:
df.head()

Unnamed: 0,english_sent,nepali_sent,eng_sent_length,nepali_sent_length
0,"It happened after the death of Saul, when Davi...",दाऊदले अमालेकीहरूलाई हराएर पछि सिकलग गए। यो शा...,25,20
1,"it happened on the third day, that behold, a m...",तब तेस्रो दिनमा एउटा जवान सैनिक सिकलगमा आयो। त...,44,32
2,"David said to him, ""Where do you come from?"" H...","दाऊदले त्यसलाई सोधे, “तिमी कहाँबाट आयौ?” त्यस ...",22,14
3,"David said to him, ""How did it go? Please tell...","दाऊदले भने, “मलाई भन, के भयो?” त्यसले भन्यो, “...",40,22
4,"David said to the young man who told him, ""How...","दाऊदले त्यस सैनिकलाई भने, “तिमीले कसरी जान्यौ ...",21,12


# Filter, Sample, and Split the Dataset
In this section, we will filter the dataset to include sentences of specific length, sample a specified number of records, and split the data into training, validation, and test sets. We will also optionally save the modified datasets.

In [61]:
# Get sentences with specific length 20 or less
df = df[df["eng_sent_length"] <= 20]
df = df[df["nepali_sent_length"] <= 20]

# Define the number of records to sample for training
num_records_to_sample = min(4610, len(df))  # Take either 4,610 records or all available records

# Sample the specified number of records for training
df = df.sample(n=num_records_to_sample, random_state=500)
df = df.reset_index(drop=True)

# Split the data into train, validation, and test sets
train = df.iloc[:4000]  # 4,000 records for training
val = df.iloc[4000:4300]  # 300 records for validation
test = df.iloc[4300:]  # Remaining records for testing

# Optionally, you can save the modified datasets if needed
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)
test.to_csv('test.csv', index=False)

print("Data has been processed and split into train, validation, and test sets.")


Data has been processed and split into train, validation, and test sets.


#**Tokenizing Sentences**

In [62]:
# Using TextVectorization to create sentence vectors
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
    max_tokens = vocab_size, output_mode = "int", output_sequence_length = sequence_length
    )

nepali_vectorization = TextVectorization(
    max_tokens = vocab_size, output_mode = "int", output_sequence_length = sequence_length + 1, standardize=custom_standardization
)

eng_vectorization.adapt(df["english_sent"].values)
nepali_vectorization.adapt(df["nepali_sent"].values)

In [63]:
# Savng parameters and weights of both vectorizer
pickle.dump({'config': eng_vectorization.get_config(),
             'weights': eng_vectorization.get_weights()}
            , open("eng_vectorizer.pkl", "wb"))

pickle.dump({'config': nepali_vectorization.get_config(),
             'weights': nepali_vectorization.get_weights()}
            , open("nepali_vectorizer.pkl", "wb"))

#**Creating Dataset**

In [64]:
train_ds = make_dataset(train)
val_ds = make_dataset(val)

# Positional Embedding Layer
In this section, we define a custom PositionalEmbedding layer, which is a crucial component of the transformer architecture. This layer adds positional information to the token embeddings, allowing the model to understand the order of tokens in the input sequence.

In [65]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_len, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.sequence_len = sequence_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_embedding = layers.Embedding(
            input_dim = vocab_size, output_dim = embed_dim
        )
        self.position_embedding = layers.Embedding(
            input_dim = sequence_len, output_dim = embed_dim
        )

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start = 0, limit = length, delta = 1)
        embedded_tokens = self.token_embedding(inputs)
        embedded_positions = self.position_embedding(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

# Transformer Encoder Layer
In this section, we define a custom TransformerEncoder layer, which is an essential component of the transformer architecture. This layer will be used in sequence-to-sequence models for natural language processing (NLP) tasks.

In [66]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, dropout,**kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.attention = layers.MultiHeadAttention(
            num_heads = num_heads, key_dim = embed_dim
        )
        self.layer_norm1 = layers.LayerNormalization()
        self.layer_norm2 = layers.LayerNormalization()
        self.layer_ffn = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"),
             layers.Dropout(dropout),
             layers.Dense(embed_dim),]
            )
        self.supports_masking = True

    def call(self, inputs, mask = None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")

        attention_output = self.attention(
            query = inputs, value = inputs, key = inputs, attention_mask = padding_mask
        )
        ffn_input = self.layer_norm1(inputs + attention_output)
        ffn_output = self.layer_ffn(ffn_input)
        return self.layer_norm2(ffn_input + ffn_output)

# Transformer Decoder Layer
In this section, we define a custom TransformerDecoder layer, which is a crucial component of the transformer architecture. This layer will be used in sequence-to-sequence models for natural language processing (NLP) tasks.

In [67]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, sropout,**kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.attention1 = layers.MultiHeadAttention(
            num_heads = num_heads, key_dim = embed_dim
        )
        self.attention2 = layers.MultiHeadAttention(
            num_heads = num_heads, key_dim = embed_dim
        )
        self.layer_ffn = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"),
             layers.Dropout(dropout),
             layers.Dense(embed_dim),]
        )
        self.layer_norm1 = layers.LayerNormalization()
        self.layer_norm2 = layers.LayerNormalization()
        self.layer_norm3 = layers.LayerNormalization()

        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask = None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output1 = self.attention1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out1 = self.layer_norm1(inputs + attention_output1)

        attention_output2 = self.attention2(
            query = out1, value = encoder_outputs, key = encoder_outputs, attention_mask = padding_mask
        )
        out2 = self.layer_norm2(out1 + attention_output2)

        ffn_output = self.layer_ffn(out2)
        return self.layer_norm3(out2 + ffn_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

# Building the Transformer Model
In this section, we build the transformer model using the previously defined PositionalEmbedding, TransformerEncoder, and TransformerDecoder layers. The model consists of an encoder and a decoder, which are connected to form a sequence-to-sequence model for NLP tasks.

In [68]:
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads, dropout,name="encoder_1")(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads, dropout,name="decoder_1")(x, encoded_seq_inputs)
x = layers.Dropout(0.4)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

# Model Summary
In this section, we will generate and display a summary of the transformer model. This summary provides an overview of the model architecture, including the layers, their output shapes, and the number of parameters.

# Displaying the Transformer Model Summary

In [69]:
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_2 (Positi  (None, None, 128)   2562560     ['encoder_inputs[0][0]']         
 onalEmbedding)                                                                                   
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_1 (TransformerEncoder)  (None, None, 128)   1186304     ['positional_embedding_

# Callback Functions and Model Compilation
In this section, we define callback functions to monitor the training process and compile the transformer model for training.

In [70]:
# Defining callback functions
early_stopping = EarlyStopping(patience = 5,restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3)

# Compiling model
transformer.compile(
    optimizer = "adam",
    loss="sparse_categorical_crossentropy",
    metrics = ["accuracy"]
)

# Inspecting Samples from Datasets
In this section, we iterate over TensorFlow datasets (train_ds and val_ds) to inspect some samples of English and Nepali sentences.

python

In [71]:
# Print some samples from train_ds and val_ds to inspect the data
for sample in train_ds.take(5):  # Print the first 5 samples
    print("English Sentence:", sample[0]["encoder_inputs"].numpy())
    print("Nepali Sentence:", sample[1].numpy())


English Sentence: [[ 176    6   48 ...    0    0    0]
 [3054    2  297 ...    0    0    0]
 [1871  190   10 ...    0    0    0]
 ...
 [  20  597    4 ...    0    0    0]
 [   2 3098    4 ...    0    0    0]
 [  25  844 1390 ...    0    0    0]]
Nepali Sentence: [[2219   34   75 ...    0    0    0]
 [ 289 4641    8 ...    0    0    0]
 [ 458   11 3003 ...    0    0    0]
 ...
 [ 428  295 6841 ...    0    0    0]
 [5271  183 1190 ...    0    0    0]
 [2607 7638 9021 ...    0    0    0]]
English Sentence: [[  23  376 1338 ...    0    0    0]
 [  83 3427    0 ...    0    0    0]
 [ 394 1596   23 ...    0    0    0]
 ...
 [  38  281 5342 ...    0    0    0]
 [  35   15   51 ...    0    0    0]
 [2161   29    4 ...    0    0    0]]
Nepali Sentence: [[ 5560  1394     9 ...     0     0     0]
 [12516   477 11249 ...     0     0     0]
 [  861   780  2900 ...     0     0     0]
 ...
 [12661   234    65 ...     0     0     0]
 [  412   301   624 ...     0     0     0]
 [ 1012    27  7514 ...   

In [72]:
print(train_ds.element_spec)


({'encoder_inputs': TensorSpec(shape=(None, 20), dtype=tf.int64, name=None), 'decoder_inputs': TensorSpec(shape=(None, 20), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 20), dtype=tf.int64, name=None))


# Reducing Layers and Attention Heads, and Training the Model
In this section, we reduce the number of layers and attention heads in the transformer model and train it using the training dataset (train_ds) with validation on the validation dataset (val_ds).

# Reducing Layers and Attention Heads

In [80]:
# Reduce the number of layers
num_layers = 2
# Reduce the number of attention heads
num_heads = 2
# Training model
transformer.fit(train_ds,
                epochs = 25,
                validation_data = val_ds,
                callbacks = [early_stopping,
                             reduce_lr])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25


<keras.callbacks.History at 0x7e9069098a60>

In [81]:
# Saving weights of model
transformer.save_weights("eng-hin.h5")

In [82]:
# Sample for testing
eng = "This is an error"
print("english_sent: ",eng)
print("Translated Sentence : ",decode_sequence(eng))

english_sent:  Programme for Promoting People's Awareness
Translated Sentence :  यो एउटा त्रुटि       ।     


# Calculating BLEU Score for Test Data
In this section, we calculate the BLEU score for the test data to evaluate the quality of the translated Nepali sentences compared to the original.

In [83]:
# Calculating BLEU score for test data
eng = test["english_sent"].values
original = test["nepali_sent"].values
translated = [decode_sequence(sent) for sent in eng]
bleu = 0

for i in range(test.shape[0]):
    bleu += sentence_bleu([original[i].split()], translated[i].split(), weights = (0.5, 0.5))

print("BLEU score is : ", bleu / test.shape[0])

BLEU score is :  0.00878851258127425


# Calculating METEOR Score for Sentence Evaluation
Setup and Imports
Ensure you have downloaded the required NLTK resource for WordNet before proceeding.

In [84]:
import nltk

# Uncomment the line below and run it once to download the WordNet resource
nltk.download('wordnet')

# Now you can import meteor_score
from nltk.translate import meteor_score

# Your reference and hypothesis sentences
reference = "This is the reference sentence."
hypothesis = "This is the hypothesis sentence."

# Tokenize sentences into words
ref_tokens = reference.split()
hyp_tokens = hypothesis.split()

# Calculate METEOR score
score = meteor_score.meteor_score([ref_tokens], hyp_tokens)

print("METEOR Score:",score)

METEOR Score: 0.7500000000000001


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [78]:
!pip install sacrebleu




# Calculating BLEU Score using SacreBLEU
Imports and Loading Data
Import necessary libraries and load your reference and test data from CSV files using Pandas.

In [85]:
import pandas as pd
import sacrebleu

# Load CSV files into Pandas DataFrames
reference_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Assuming your CSV files have a column named 'sentence' for the sentences
reference_sentences = reference_df['english_sent'].tolist()
test_sentences = test_df['nepali_sent'].tolist()

# Combine reference sentences into a list of lists (as sacrebleu expects multiple references)
references = [[ref] for ref in reference_sentences]

# Calculate BLEU score using sacrebleu
bleu = sacrebleu.corpus_bleu(test_sentences, references)

print(f"BLEU Score: {bleu.score}")

BLEU Score: 1.9146030690102511
