In [1]:
import logging
import time

import numpy as np
import tensorflow as tf

import tensorflow_text

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

# Sentences

In [8]:
import docx

def process_sentences():
    full_text_hindi = []
    full_text_garhwali = []
    
    #process hindi
    file_name = "Hindi_sentences1.docx"
    doc = docx.Document(file_name)
    for para in doc.paragraphs:
        full_text_hindi.append(para.text)
            
    #process garhwali
    file_name = "Garhwali_sentences.docx"
    doc = docx.Document(file_name)
    for para in doc.paragraphs:
        full_text_garhwali.append(para.text)
    
    
    return full_text_hindi, full_text_garhwali

In [9]:
hindi_text, garhwali_text = process_sentences()

In [11]:
len(hindi_text)

350

In [12]:
len(garhwali_text)

350

In [13]:
class Encoder():
    def __init__(self):
        self.encoder = hub.KerasLayer("MuRIL_1", trainable=True)
        self.preprocessor = hub.KerasLayer("MuRIL_preprocess_1")
        
    def generate_tokens(self, sentences):
        #list_tensor = tf.convert_to_tensor(sentences)
        #processor_output = self.preprocessor(list_tensor)
        processor_output = self.preprocessor(tf.constant([sentences]))
        encoder_output = self.encoder(processor_output)
        
        return encoder_output["sequence_output"]

In [14]:
class DecoderLabelsInputs():
    def __init__(self):
        self.preprocessor = hub.KerasLayer("MuRIL_preprocess_1")
        
    def generate_labels_inputs(self, sentences):
        #list_tensor = tf.convert_to_tensor(sentences)
        #tokens = self.preprocessor(list_tensor)['input_word_ids']
        
        tokens = self.preprocessor(tf.constant([sentences]))['input_word_ids']
        
        for i in range(tokens.shape[0]):
            token = np.array(tokens[i])
            token_list = []
            for _ in token:
                if _ != 0:
                    token_list.append(_)

            input_tokens = token_list[:-1]
            input_labels = token_list[1:]

            for j in range(128):
                input_tokens.append(0)
                input_labels.append(0)
            
            #reduce length to 128 
            input_labels = tf.ragged.constant([input_labels[:128]]).to_tensor()
            input_tokens = tf.ragged.constant([input_tokens[:128]]).to_tensor()
            
            #create decoder_label and decoder_input
            if i == 0:
                decoder_label = input_labels
                decoder_input = input_tokens
            
            #concat to decoder_label and decoder_input
            else:
                decoder_label = tf.concat([decoder_label, input_labels], axis = 0)
                decoder_input = tf.concat([decoder_input, input_tokens], axis = 0)
                
        
        return decoder_label, decoder_input

In [15]:
def positional_encoding(length, depth):
    depth = depth/2
    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :]/depth
    
    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates
    
    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis = -1)
    
    return tf.cast(pos_encoding, dtype = tf.float32)

In [16]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero = True)
        self.pos_encoding = positional_encoding(length = 2048, depth = d_model)
        
    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)
    
    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [17]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=768)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [18]:
class CausalMask():
    def __init__(self):
        super().__init__()
        
    def _compute_causal_mask(self, query, value=None):    
        q_seq_length = tf.shape(query)[1]
        v_seq_length = q_seq_length if value is None else tf.shape(value)[1]
        return tf.linalg.band_part(tf.ones((1, q_seq_length, v_seq_length), tf.bool), -1, 0)

    def _compute_attention_mask(self, query, value, key=None, attention_mask=None, use_causal_mask=False):
        query_mask = getattr(query, "_keras_mask", None)
        value_mask = getattr(value, "_keras_mask", None)
        key_mask = getattr(key, "_keras_mask", None)
        auto_mask = None
        if query_mask is not None:
            query_mask = tf.cast(query_mask, tf.bool)  # defensive casting
            # B = batch size, T = max query length
            auto_mask = query_mask[:, :, tf.newaxis]  # shape is [B, T, 1]
        if value_mask is not None:
            value_mask = tf.cast(value_mask, tf.bool)  # defensive casting
            # B = batch size, S == max value length
            mask = value_mask[:, tf.newaxis, :]  # shape is [B, 1, S]
            auto_mask = mask if auto_mask is None else auto_mask & mask
        if key_mask is not None:
            key_mask = tf.cast(key_mask, tf.bool)  # defensive casting
            # B == batch size, S == max key length == max value length
            mask = key_mask[:, tf.newaxis, :]  # shape is [B, 1, S]
            auto_mask = mask if auto_mask is None else auto_mask & mask
        if use_causal_mask:
            # the shape of the causal mask is [1, T, S]
            mask = self._compute_causal_mask(query, value)
            auto_mask = mask if auto_mask is None else auto_mask & mask
        if auto_mask is not None:
            # merge attention_mask & automatic mask, to shape [B, T, S]
            attention_mask = (
                auto_mask
                if attention_mask is None
                else tf.cast(attention_mask, bool) & auto_mask
            )
        return attention_mask

In [19]:
class CrossAttention(BaseAttention, CausalMask):
    def call(self, x, context):
        mask = CausalMask()
        mask = mask._compute_attention_mask(query=x, 
                                        value=context, 
                                        key=context, 
                                        use_causal_mask=False)
        
        attn_output = self.mha(query = x,
                                key = context,
                                value = context,
                                attention_mask = mask)
        
        #self.last_attn_scores = attn_scores
        
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        
        return x

In [20]:
class CausalAttention(BaseAttention):
    def call(self, x):
        mask = CausalMask()
        mask = mask._compute_attention_mask(query=x, 
                                        value=x, 
                                        key=x, 
                                        use_causal_mask=True)
        
        attn_output = self.mha(query = x,
                                key = x,
                                value = x,
                                attention_mask = mask)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        
        return x

In [21]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model = 768, dff = 2048, dropout_rate = 0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation = 'relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()
        
    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        
        return x

In [22]:
class DecoderLayer(tf.keras.layers.Layer):      
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.causal_self_attention = CausalAttention()
        self.cross_attention = CrossAttention()
        self.ffn = FeedForward()
        
    
    def call(self, x, context):
        x = self.causal_self_attention(x = x)
        x = self.cross_attention(x = x, context = context)
        
        x = self.ffn(x)
        return x

In [23]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers):
        super(Decoder, self).__init__()
        
        self.num_layers = num_layers
        self.pos_embedding = PositionalEmbedding(vocab_size = 197285, 
                                                 d_model = 768)
        self.dropout = tf.keras.layers.Dropout(0.1)
        
        self.dec_layers = [
            DecoderLayer()
            for _ in range(num_layers)
        ]
        
    def call(self, x, context):
        x = self.pos_embedding(x)
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, context)
            
        return x

In [24]:
class TransformerDecoder(tf.keras.Model):
    def __init__(self):
        super().__init__()        
        self.decoder = Decoder(num_layers = 4)
        self.final_layer = tf.keras.layers.Dense(197285)
    
        
    def call(self, inputs):
        encoder_output, decoder_input = inputs
 
        decoder_output = self.decoder(decoder_input, encoder_output)
        
        logits = self.final_layer(decoder_output)
        return logits

In [25]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self):
        super().__init__()
        self.d_model = 768
        self.d_model = tf.cast(self.d_model, tf.float32)
        
        self.warmup_steps = 4000
        
    def __call__(self, step):
        step = tf.cast(step, dtype = tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [26]:
learning_rate = CustomSchedule()
optimizer = tf.keras.optimizers.Adam(learning_rate, 
                                     beta_1 = 0.9,
                                     beta_2 = 0.98,
                                     epsilon = 1e-9)

In [27]:
def masked_loss(decoder_label, pred_label):
    mask = decoder_label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                                            from_logits = True,
                                            reduction = 'none')
    
    loss = loss_object(decoder_label, pred_label)
    
    mask = tf.cast(mask, dtype = loss.dtype)
    
    loss *= mask
    
    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
    return loss

In [28]:
def masked_accuracy(decoder_label, pred_label):
    pred_label = tf.argmax(pred_label, axis = 2)
    decoder_label = tf.cast(decoder_label, pred_label.dtype)
    
    match = decoder_label == pred_label
    mask = decoder_label != 0
    
    match = match & mask
    
    match = tf.cast(match, dtype = tf.float32)
    mask = tf.cast(mask, dtype = tf.float32)
    
    return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [29]:
from tqdm import tqdm

encoder_output_list = []
decoder_input_list = []
decoder_label_list = []

garhwali_sent = garhwali_text
hindi_sent = hindi_text

encoder = Encoder()
decoder_labels_inputs = DecoderLabelsInputs()

for i in tqdm(range(len(hindi_sent))):
    encoder_output = encoder.generate_tokens(hindi_sent[i])
    encoder_output_list.append(encoder_output)
    
    decoder_label, decoder_input = decoder_labels_inputs.generate_labels_inputs(garhwali_sent[i])
    decoder_input_list.append(decoder_input)
    decoder_label_list.append(decoder_label)

100%|█████████████████████████████████████████| 350/350 [00:48<00:00,  7.18it/s]


In [30]:
tf_1 = encoder_output_list[0]
tf_2 = encoder_output_list[1]

encoder_output = tf.concat([tf_1, tf_2], axis = 0)

for i in tqdm(range(len(encoder_output_list[2:]))):
    tf_2 = encoder_output_list[i]
    encoder_output = tf.concat([encoder_output, tf_2], axis = 0)

100%|████████████████████████████████████████| 348/348 [00:03<00:00, 113.71it/s]


In [31]:
tf_1 = decoder_input_list[0]
tf_2 = decoder_input_list[1]

decoder_input = tf.concat([tf_1, tf_2], axis = 0)

for i in tqdm(range(len(decoder_input_list[2:]))):
    tf_2 = decoder_input_list[i]
    decoder_input = tf.concat([decoder_input, tf_2], axis = 0)

100%|██████████████████████████████████████| 348/348 [00:00<00:00, 16940.78it/s]


In [32]:
tf_1 = decoder_label_list[0]
tf_2 = decoder_label_list[1]

decoder_label = tf.concat([tf_1, tf_2], axis = 0)

for i in tqdm(range(len(decoder_label_list[2:]))):
    tf_2 = decoder_label_list[i]
    decoder_label = tf.concat([decoder_label, tf_2], axis = 0)

100%|██████████████████████████████████████| 348/348 [00:00<00:00, 29125.95it/s]


In [34]:
transformer_decoder = TransformerDecoder()
transformer_decoder.compile(loss = masked_loss,
                          optimizer = optimizer,
                          metrics = [masked_accuracy])

In [35]:
transformer_decoder.fit(x = (encoder_output, decoder_input),
                               y = decoder_label,
                               epochs = 20,
                               verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2bbf4d700>

In [36]:
history = transformer_decoder.fit(x = (encoder_output, decoder_input),
                               y = decoder_label,
                               epochs = 10,
                               verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
