In [1]:
import logging
import pathlib
import re
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_text as text

# Import Dataset

In [2]:
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the IITB English-Hindi parallel corpus and take the training translations.
dataset = load_dataset("cfilt/iitb-english-hindi")
list_dataset = dataset['train']['translation']

# Split each translation pair into two index-aligned lists, one per language.
english_sent, hindi_sent = [], []
for pair in tqdm(list_dataset):
    english_sent.append(pair['en'])
    hindi_sent.append(pair['hi'])

Using custom data configuration cfilt--iitb-english-hindi-911387c6837f8b91
Reusing dataset parquet (/Users/kawaii/.cache/huggingface/datasets/parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121)
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 51.32it/s]
100%|████████████████████████████| 1659083/1659083 [00:00<00:00, 2322289.43it/s]


# MuRIL Preprocessor

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [7]:
# Load the MuRIL preprocessing model from TF Hub.
preprocessor = hub.load("https://tfhub.dev/google/MuRIL_preprocess/1")
# Expose only its `tokenize` function as a Keras layer so it can be called on string tensors.
tokenizer = hub.KerasLayer(preprocessor.tokenize)

In [20]:
def get_tokenized_sentence(tokenizer, string):
    """Tokenize a single sentence.

    Args:
        tokenizer: Callable applied to a one-element string tensor.
        string: The sentence to tokenize.

    Returns:
        Whatever `tokenizer` returns for the batch-of-one input.
    """
    # Wrap the sentence in a list so the tokenizer receives a batch of size 1.
    return tokenizer(tf.constant([string]))

In [10]:
# Sanity check: display the token ids produced for a short English sentence.
get_tokenized_sentence(tokenizer=tokenizer, string='This is a sentence')

<tf.RaggedTensor [[[1475],
  [1121],
  [172],
  [30936]]]>

In [12]:
# Keep the tokenized sample sentence in `tokens` (the name is reassigned by the tokenizer try-out cell later on).
tokens = get_tokenized_sentence(tokenizer=tokenizer, string='This is a sentence')

# MuRIL Encoder Input

In [21]:
def get_encoder_input(preprocessor, text_input):
    """Run the preprocessor on a single sentence.

    Args:
        preprocessor: Callable applied to a one-element string tensor.
        text_input: The sentence to preprocess.

    Returns:
        The preprocessor's output for the batch-of-one input.
    """
    # The preprocessor is fed a batch, so wrap the sentence in a list first.
    batch_of_one = tf.constant([text_input])
    return preprocessor(batch_of_one)

In [23]:
# Build the encoder input for a sample sentence; reused below to exercise the encoder.
enc_inp = get_encoder_input(preprocessor=preprocessor, text_input = 'This is a sentence' )

# MuRIL Encoder

In [18]:
# Load the encoder as a Keras layer with trainable weights.
# NOTE(review): "MuRIL_1" is presumably a local directory holding the downloaded
# MuRIL encoder SavedModel — confirm it exists relative to the notebook.
encoder = hub.KerasLayer("MuRIL_1", trainable=True)

In [22]:
def get_encoder_output(encoder, preprocessor_output):
    """Run the encoder and return its per-token representations.

    Args:
        encoder: Callable that maps the preprocessor output to a mapping
            containing a "sequence_output" entry.
        preprocessor_output: The value to feed to `encoder`.

    Returns:
        The "sequence_output" entry of the encoder result; the original
        comment gives its shape as [batch_size, seq_length, 768].
    """
    # Only the per-token output is used, so "pooled_output" is no longer read:
    # the function now also works with encoders that do not return that key.
    return encoder(preprocessor_output)["sequence_output"]

In [24]:
# Sanity check: display the encoder's sequence output for the sample sentence.
get_encoder_output(encoder=encoder, preprocessor_output=enc_inp)

<tf.Tensor: shape=(1, 128, 768), dtype=float32, numpy=
array([[[-2.25443160e-03,  2.25501438e-03, -2.43421877e-04, ...,
         -1.00110667e-02, -1.94040581e-03, -3.23787844e-03],
        [-8.95238761e-03, -3.32291890e-03,  1.57416239e-03, ...,
         -2.00973749e-02,  6.45848922e-05, -1.70063751e-03],
        [ 3.04290047e-03,  2.23215972e-03, -1.03627553e-03, ...,
         -1.47598740e-02,  1.25555089e-04, -1.07521447e-03],
        ...,
        [-5.47535345e-03,  3.62805766e-03,  2.71241507e-03, ...,
         -1.26624415e-02, -1.24334078e-03, -5.86683489e-03],
        [-5.77889616e-03,  3.81670101e-03,  2.82584829e-03, ...,
         -1.29023548e-02, -1.41847553e-03, -5.85314073e-03],
        [-5.71267074e-03,  3.95243615e-03,  2.92897830e-03, ...,
         -1.30609758e-02, -1.46879756e-03, -5.95816970e-03]]],
      dtype=float32)>

In [None]:
# Options for `text.BertTokenizer`.
bert_tokenizer_params = {"lower_case": True}
# Special tokens; their positions in this list are used below to derive the START/END ids.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 80000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [None]:
# Ids of the [START] and [END] tokens, taken as their positions in `reserved_tokens`.
#
# The comparison strings must include the square brackets: comparing against
# 'START' / 'END' never matches, so argmax returned 0 and the previous "+ 1" /
# "+ 2" offsets produced ids 1 and 2, i.e. the positions of "[UNK]" and "[START]".
# NOTE(review): assumes both vocab files list `reserved_tokens` first, in this
# order — confirm against eng_vocab.txt / hindi_vocab.txt.

START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

print(START)
print(END)

In [None]:
def add_start_end(ragged):
    """Surround every row of `ragged` with the START and END ids.

    Args:
        ragged: A ragged batch of token ids (one row per sentence).

    Returns:
        The batch with one START column prepended and one END column appended.
    """
    n_rows = ragged.bounding_shape()[0]
    # One-column blocks holding the marker id for every row.
    start_col = tf.fill([n_rows, 1], START)
    end_col = tf.fill([n_rows, 1], END)
    return tf.concat([start_col, ragged, end_col], axis=1)

In [None]:
def cleanup_text(reserved_tokens, token_txt):
    """Remove reserved tokens from token strings and join the rest per row.

    Args:
        reserved_tokens: Iterable of reserved token strings.
        token_txt: String tokens to clean up.

    Returns:
        The remaining tokens joined with single spaces along the last axis.
    """
    # One alternation pattern that matches any reserved token except "[UNK]",
    # which is deliberately kept in the output.
    droppable_re = "|".join(
        re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"
    )

    # Mask out every cell that is exactly one of the droppable tokens.
    is_reserved = tf.strings.regex_full_match(token_txt, droppable_re)
    kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)

    # Join the surviving tokens into strings.
    return tf.strings.reduce_join(kept, separator=' ', axis=-1)

In [None]:
class CustomTokenizer(tf.Module):
    """Word-piece tokenizer for one language, packaged as a `tf.Module`.

    Wraps a `text.BertTokenizer` built from a vocabulary file and exposes
    `tokenize`, `detokenize`, `lookup` and a few getters as `tf.function`s.
    """

    def __init__(self, reserved_tokens, vocab_path):
        """Build the tokenizer and trace its functions.

        Args:
            reserved_tokens: List of special token strings; `detokenize`
                passes it to `cleanup_text`.
            vocab_path: Path to the vocabulary text file. Each line of the
                file becomes one vocabulary entry.
        """
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case = True)
        self.reserved_tokens = reserved_tokens
        self.vocab_path = tf.saved_model.Asset(vocab_path)

        # Read the vocabulary explicitly as UTF-8 instead of relying on the
        # platform's default encoding, which can fail or garble non-ASCII
        # entries (e.g. the Hindi vocabulary).
        vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
        self.vocab = tf.Variable(vocab)

        # Trace each tf.function once per supported input signature.
        # NOTE(review): presumably so the signatures are available when the
        # module is exported as a SavedModel — confirm.

        # `tokenize` takes a 1-D batch of strings.
        self.tokenize.get_concrete_function(tf.TensorSpec(shape=[None], dtype = tf.string))

        # `detokenize` and `lookup` take 2-D int64 ids, either dense or ragged.
        self.detokenize.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype = tf.int64))
        self.detokenize.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype = tf.int64))

        self.lookup.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        # The getters take no arguments.
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self, strings):
        """Tokenize a batch of strings and wrap each row with the start/end ids."""
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2,-1)
        enc = add_start_end(enc)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        """Convert token ids back to text, with reserved tokens removed by `cleanup_text`."""
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self.reserved_tokens, words)

    @tf.function
    def lookup(self, token_ids):
        """Return the vocabulary entries found at the given ids."""
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        """Return the number of vocabulary entries as a scalar tensor."""
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        """Return the vocabulary file asset."""
        return self.vocab_path

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens as a string tensor."""
        return tf.constant(self.reserved_tokens)

In [None]:
# Group the two per-language tokenizers under a single tf.Module container.
tokenizers = tf.Module()
# NOTE(review): both vocab files are read from the working directory and must
# already exist; no cell visible in this notebook creates them — confirm.
tokenizers.eng = CustomTokenizer(reserved_tokens, 'eng_vocab.txt')
tokenizers.hin = CustomTokenizer(reserved_tokens, 'hindi_vocab.txt')

In [None]:
# try out the tokenizer
print("English sentence = " + english_sent[0])
tokens = tokenizers.eng.tokenize([english_sent[0]])
print("Tokenized text = " + str(tokens))
words = tokenizers.eng.detokenize(tokens)
print("Detokenized text = " + str(words.numpy()[0].decode('utf-8')))

In [None]:
# Maximum number of tokens kept per sentence.
MAX_TOKENS = 128


def prepare_batch(eng, hindi):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        eng: Batch of English sentences.
        hindi: Batch of Hindi sentences.

    Returns:
        A tuple `(hindi, en_inputs, en_labels)` of 0-padded dense tensors:
        the trimmed Hindi ids, the English ids without each row's last token,
        and the English ids without each row's first token.
    """
    # Hindi side: tokenize (ragged), trim to MAX_TOKENS, pad to a dense tensor.
    hindi_ids = tokenizers.hin.tokenize(hindi)[:, :MAX_TOKENS].to_tensor()

    # English side: keep one extra token so that the shifted inputs and labels
    # below are each at most MAX_TOKENS long.
    eng_ids = tokenizers.eng.tokenize(eng)[:, :(MAX_TOKENS + 1)]
    shifted_inputs = eng_ids[:, :-1].to_tensor()  # everything but the last token
    shifted_labels = eng_ids[:, 1:].to_tensor()   # everything but the first token

    return hindi_ids, shifted_inputs, shifted_labels

In [None]:
# `prepare_batch` works on batches: the tokenizers are traced for 1-D string
# input and `add_start_end` concatenates along axis 1, so a bare string has no
# batch axis to work with. Wrap each sentence in a list, as the tokenizer
# try-out cell does.
hindi_text, english_text, english_label = prepare_batch([english_sent[0]], [hindi_sent[0]])

In [None]:
# Show the shapes of the three tensors returned by `prepare_batch`.
print(hindi_text.shape)
print(english_text.shape)
print(english_label.shape)

In [None]:
# Compare inputs and labels: the labels are the same ids shifted left by one position.
print(english_text)
print(english_label)

In [None]:
def positional_encoding(length, depth):
    """Build a sinusoidal positional-encoding table.

    Args:
        length: Number of positions (rows).
        depth: Encoding width; the sine and cosine halves each cover depth/2
            columns, so for an even depth the result is (length, depth).

    Returns:
        A float32 tensor whose first half of columns holds the sines and
        whose second half holds the cosines of the position/frequency angles.
    """
    half_depth = depth / 2

    position_col = np.arange(length)[:, np.newaxis]                 # (length, 1)
    depth_frac_row = np.arange(half_depth)[np.newaxis, :] / half_depth  # (1, depth/2)

    # Frequencies fall geometrically from 1 towards 1/10000 across the columns.
    rates = 1 / (10000**depth_frac_row)
    angles = position_col * rates                                    # (length, depth/2)

    # Sines and cosines are concatenated, not interleaved.
    table = np.concatenate([np.sin(angles), np.cos(angles)], axis = -1)
    return tf.cast(table, dtype=tf.float32)

In [None]:
# Build a sample table for 2048 positions at the encoder's 768-wide hidden size.
pos_encoding = positional_encoding(length=2048, depth=768)

# Check the shape.
print(pos_encoding.shape)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding followed by an additive positional encoding.

    Id 0 is treated as padding (`mask_zero=True`), and the layer reports the
    embedding's mask as its own.
    """

    def __init__(self, vocab_size, d_model):
        """Create the embedding table and the positional-encoding table.

        Args:
            vocab_size: Number of rows in the embedding table.
            d_model: Width of the embeddings and of the positional encoding.
        """
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero = True)
        # The table covers 2048 positions; `call` slices it to the input length.
        self.pos_encoding = positional_encoding(length = 2048, depth = d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed the token ids in `x` and add the positional encoding."""
        seq_len = tf.shape(x)[1]
        embedded = self.embedding(x)

        # This factor sets the relative scale of the embedding and the positional encoding.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = embedded * scale

        # Add the first `seq_len` rows of the table, broadcast over the batch axis.
        return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

In [None]:
# `get_vocab_size()` is a tf.function and returns a scalar tensor, while the
# Keras Embedding layer documents its vocabulary size as an integer. Convert to
# a plain Python int before constructing the layers.
embed_hindi = PositionalEmbedding(vocab_size=int(tokenizers.hin.get_vocab_size().numpy()), d_model=768)
embed_eng = PositionalEmbedding(vocab_size=int(tokenizers.eng.get_vocab_size().numpy()), d_model=768)

# Embed the sample batch produced by `prepare_batch`.
hin_emb = embed_hindi(hindi_text)
eng_emb = embed_eng(english_text)

In [None]:
# Inspect the mask attached to the embedding output; PositionalEmbedding's
# compute_mask delegates to an Embedding built with mask_zero=True.
eng_emb._keras_mask