In [1]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_text as text

# Import Dataset

In [2]:
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the IITB English-Hindi corpus and take the translation pairs of its train split.
dataset = load_dataset("cfilt/iitb-english-hindi")
list_dataset = dataset['train']['translation']

# Separate each pair into two parallel lists, one per language.
english_sent, hindi_sent = [], []
for pair in tqdm(list_dataset):
    english_sent.append(pair['en'])
    hindi_sent.append(pair['hi'])

Using custom data configuration cfilt--iitb-english-hindi-911387c6837f8b91
Reusing dataset parquet (/Users/kawaii/.cache/huggingface/datasets/parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121)
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 37.96it/s]
100%|████████████████████████████| 1659083/1659083 [00:00<00:00, 2099857.38it/s]


# MURIL Preprocessor

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [5]:
# Load the MuRIL preprocessing model published at the TF Hub URL below.
preprocessor = hub.load("https://tfhub.dev/google/MuRIL_preprocess/1")
# Expose only the model's `tokenize` function, wrapped as a Keras layer.
tokenizer = hub.KerasLayer(preprocessor.tokenize)

2023-02-10 00:38:21.135582: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [6]:
def get_tokenized_sentence(tokenizer, string):
    """Tokenize a single sentence.

    Args:
        tokenizer: Callable that is applied to a string tensor.
        string: The sentence to tokenize.

    Returns:
        Whatever `tokenizer` returns for a batch containing only `string`.
    """
    # Wrap the sentence in a list so the tokenizer receives a batch of one.
    return tokenizer(tf.constant([string]))

In [8]:
# Try the tokenizer helper on one short English sentence.
tokens = get_tokenized_sentence(tokenizer=tokenizer, string='This is a sentence')

# MURIL Decoder Input

In [10]:
def get_decoder_input(preprocessor, text_input):
    """Run the preprocessor on a single sentence.

    Args:
        preprocessor: Callable that is applied to a string tensor.
        text_input: The sentence to preprocess.

    Returns:
        Whatever `preprocessor` returns for a batch containing only `text_input`.
    """
    batch_of_one = tf.constant([text_input])
    return preprocessor(batch_of_one)

In [11]:
# Preprocess the same sample sentence with the full preprocessor.
dec_inp = get_decoder_input(preprocessor=preprocessor, text_input = 'This is a sentence' )

In [13]:
# Display the token-id tensor stored under the 'input_word_ids' key.
dec_inp['input_word_ids']

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[  104,  1475,  1121,   172, 30936,   105,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [55]:
# Build a next-token prediction pair from the preprocessed sentence:
# `labels` is the token sequence shifted one position to the left of `inputs`.
sent = np.array(dec_inp['input_word_ids'])

# Take the padded width from the tensor itself instead of hard-coding 128,
# so this cell keeps working if the preprocessor's sequence length changes.
seq_len = sent.shape[1]

# Drop the zero entries (treated as padding here) from the first and only row.
tokenized_sent = sent[0][sent[0] != 0].tolist()


def _pad_to(ids, length):
    """Return `ids` right-padded with zeros up to `length` entries."""
    return ids + [0] * (length - len(ids))


inputs = _pad_to(tokenized_sent[:-1], seq_len)  # every token except the last
labels = _pad_to(tokenized_sent[1:], seq_len)   # every token except the first

# NOTE(review): `inputs` is a flat list (giving a dense 1-D tensor) while
# `labels` is wrapped in an extra list (giving a one-row RaggedTensor). That
# asymmetry is kept because later cells add the batch axis to `inputs`
# themselves via `np.array([inputs])` -- confirm whether `labels` should match.
inputs = tf.ragged.constant(inputs, dtype=tf.int32)
labels = tf.ragged.constant([labels], dtype=tf.int32)

print(inputs)
print(labels)

tf.Tensor(
[  104  1475  1121   172 30936     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(128,), dtype=int32)
<tf.RaggedTensor [[1475, 1121, 172, 30936, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 

# Positional Embedding and Encoding

In [52]:
def positional_encoding(length, depth):
    """Build a sinusoidal positional-encoding table.

    Args:
        length: Number of positions, i.e. rows of the table.
        depth: Width of the encoding, i.e. columns of the table. Must be
            even, because the table is the concatenation of a sine half and
            a cosine half of equal width.

    Returns:
        A float32 tensor of shape (length, depth). The first depth/2 columns
        hold the sines and the remaining depth/2 columns hold the cosines.

    Raises:
        ValueError: If `depth` is odd. Without this check an odd depth
            silently produces depth + 1 columns, which no longer matches the
            embedding width it is later added to.
    """
    if depth % 2 != 0:
        raise ValueError(
            f"depth must be even to split into sine/cosine halves, got {depth}")

    half_depth = depth // 2
    positions = np.arange(length)[:, np.newaxis]                 # (length, 1)
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth   # (1, depth/2)

    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates                         # (length, depth/2)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis = -1)

    return tf.cast(pos_encoding, dtype = tf.float32)

In [63]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding combined with a fixed positional-encoding table."""

    def __init__(self, vocab_size, d_model):
        """Create the embedding and precompute the positional table.

        Args:
            vocab_size: Number of rows in the embedding table.
            d_model: Width of each embedding vector.
        """
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(
            input_dim = vocab_size,
            output_dim = d_model,
            mask_zero = True,
        )
        # The table is built once for 2048 positions; `call` slices what it needs.
        self.pos_encoding = positional_encoding(length = 2048, depth = d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed the ids in `x`, scale by sqrt(d_model) and add positions."""
        seq_len = tf.shape(x)[1]
        embedded = self.embedding(x)

        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = embedded * scale
        return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

In [66]:
# Instantiate the embedding with an 80000-entry vocabulary and 512-wide vectors.
eng_embedding = PositionalEmbedding(vocab_size = 80000, d_model = 512)

# Add a batch axis to `inputs`, embed it, and display the mask attached to the result.
eng_embedding(np.array([inputs]))._keras_mask

<tf.Tensor: shape=(1, 128), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, F

# Base Attention Layer

In [34]:
class BaseAttention(tf.keras.layers.Layer):
    """Creates the sublayers shared by the attention blocks.

    It builds a multi-head attention layer, a layer normalization and an
    addition layer. This class defines no `call`; subclasses provide it.
    """
    def __init__(self, **kwargs):
        # Note: the base Layer gets no arguments; every keyword argument is
        # forwarded to MultiHeadAttention instead.
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

# Cross Attention Layer

In [35]:
class CrossAttention(BaseAttention):
    """Attention block whose queries come from `x` and keys/values from `context`."""

    def call(self, x, context):
        """Attend from `x` to `context`, then apply the residual add and layer norm."""
        attended, scores = self.mha(
            query = x,
            key = context,
            value = context,
            return_attention_scores = True,
        )

        # Keep the scores of the most recent call so they can be inspected later.
        self.last_attn_scores = scores

        residual = self.add([x, attended])
        return self.layernorm(residual)

In [36]:
# Instantiate a cross-attention block; both keyword arguments go to MultiHeadAttention.
sample_ca = CrossAttention(num_heads = 2, key_dim = 768)

# Causal Self Attention Layer

In [39]:
class CasualSelfAttention(BaseAttention):
    """Self-attention block that applies a causal mask.

    The class name keeps its original spelling ("Casual") so that existing
    references to it continue to work.
    """

    def call(self, x):
        """Self-attend over `x` with a causal mask, then residual add and layer norm.

        Args:
            x: The sequence used as query, key and value.

        Returns:
            The layer-normalized sum of `x` and the attention output.
        """
        # The keyword is `use_causal_mask`; the previous spelling
        # `use_casual_mask` is not an argument of MultiHeadAttention.
        attn_output = self.mha(query = x,
                               value = x,
                               key = x,
                               use_causal_mask = True)
        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x

In [40]:
# Instantiate a self-attention block; both keyword arguments go to MultiHeadAttention.
sample_csa = CasualSelfAttention(num_heads = 2, key_dim = 768)

# Feed Forward

In [41]:
class FeedForward(tf.keras.layers.Layer):
    """Two dense layers plus dropout, wrapped in a residual add and layer norm."""

    def __init__(self, d_model, dff, dropout_rate = 0.1):
        """Create the sublayers.

        Args:
            d_model: Output width of the second dense layer.
            dff: Width of the first (ReLU) dense layer.
            dropout_rate: Rate of the dropout applied after the second dense layer.
        """
        super().__init__()
        expand = tf.keras.layers.Dense(dff, activation = 'relu')
        project = tf.keras.layers.Dense(d_model)
        drop = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([expand, project, drop])

        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Return layer_norm(x + seq(x))."""
        transformed = self.seq(x)
        summed = self.add([x, transformed])
        return self.layer_norm(summed)

# Decoder Layer

In [42]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward."""

    def __init__(self, *, d_model, num_heads, dff, dropout_rate = 0.1):
        """Create the three sublayers.

        Args:
            d_model: Passed as `key_dim` to both attention blocks and as the
                output width of the feed-forward block.
            num_heads: Number of attention heads in both attention blocks.
            dff: Inner width of the feed-forward block.
            dropout_rate: Dropout rate used by both attention blocks and by
                the feed-forward block.
        """
        super(DecoderLayer, self).__init__()

        self.casual_self_attention = CasualSelfAttention(
                                        num_heads = num_heads,
                                        key_dim = d_model,
                                        dropout = dropout_rate)

        self.cross_attention = CrossAttention(
                                num_heads = num_heads,
                                key_dim = d_model,
                                dropout = dropout_rate)

        # Forward dropout_rate here too; previously the feed-forward block
        # always used its own default, ignoring the value given to this layer.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x, context):
        """Run `x` through self-attention, cross-attention on `context`, then the FFN."""
        x = self.casual_self_attention(x = x)
        x = self.cross_attention(x = x, context = context)

        # Cache the last cross-attention scores so they can be plotted later.
        self.last_attn_scores = self.cross_attention.last_attn_scores

        # Shape (batch_size, seq_len, d_model)
        x = self.ffn(x)

        return x

In [43]:
# Instantiate a single decoder block; dropout_rate is left at its default.
sample_decoder_layer = DecoderLayer(d_model = 512, num_heads = 8, dff = 2048)

In [44]:
# Display the layer object to confirm it was constructed.
sample_decoder_layer

<__main__.DecoderLayer at 0x30c0bbe50>

# The Decoder

In [67]:
class Decoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of DecoderLayer blocks."""

    def __init__(self, *, num_layers, d_model, num_heads, dff,
                 vocab_size, dropout_rate = 0.1):
        """Create the embedding, the dropout and `num_layers` decoder blocks.

        Args:
            num_layers: Number of DecoderLayer blocks in the stack.
            d_model: Embedding width, also passed to every DecoderLayer.
            num_heads: Number of attention heads in every DecoderLayer.
            dff: Feed-forward inner width in every DecoderLayer.
            vocab_size: Vocabulary size of the positional embedding.
            dropout_rate: Rate for the embedding dropout and every DecoderLayer.
        """
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size = vocab_size,
            d_model = d_model,
        )
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.dec_layers = []
        for _ in range(num_layers):
            self.dec_layers.append(
                DecoderLayer(d_model = d_model, num_heads = num_heads,
                             dff = dff, dropout_rate = dropout_rate))

        # Filled in by `call` with the scores of the final decoder block.
        self.last_attn_scores = None

    def call(self, x, context):
        """Embed `x`, then pass it together with `context` through every block."""
        hidden = self.dropout(self.pos_embedding(x))

        for layer_idx in range(self.num_layers):
            block = self.dec_layers[layer_idx]
            hidden = block(hidden, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        return hidden

In [69]:
# Check the shape of `inputs` once a leading batch axis is added.
np.array([inputs]).shape

(1, 128)

In [71]:
# Same shape check on a dummy vector of 128 ones.
np.array([np.ones(128)]).shape

(1, 128)

In [None]:
sample_decoder = Decoder(num_layers = 4,
                         d_model = 512,
                         num_heads = 8,
                         dff = 2048,
                         vocab_size = 80000)

# TODO: replace this random stand-in with the real encoder output once an
# encoder exists in this notebook.
# NOTE(review): the (batch, source_len, d_model) shape is an assumption -- confirm
# against the encoder before relying on the result.
sample_context = tf.random.uniform((1, 128, 512))

# The original call was missing the comma between the arguments and had no
# value for `context`, so the cell could not be parsed.
output = sample_decoder(x = np.array([inputs]),
                        context = sample_context)