In [1]:
from byte_pair_level_transformer import Transformer,SentenceEmbedding
import torch
import numpy as np
import re
import collections

In [2]:
file_path = r'D:\\test_locally_notebooks\\test_pytorch\\solving_mak_in_BPE\\spa.txt'


In [3]:
def text_preprocessing(text):
  # text = re.sub(r"([?.!,¿])", r" \1 ", text)
  text = re.sub(r'[" "]+', " ", text)
  # text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
  text = re.sub(r"[^a-zA-Z]+", " ", text)

  text = text.strip().lower()

  return text

In [4]:
num_data = 5000

In [6]:
with open(file_path,'r',encoding='utf-8') as f:
  lines = f.readlines()
english_sentence,spanish_sentence = [],[]
for total_example,line in enumerate(lines):
  if (total_example < num_data ):
    line = line.lower()
    data = line.replace("\n","").split("\t")
    data[0] = text_preprocessing(data[0])
    data[1] = text_preprocessing(data[1])
    english_sentence.append(data[0])
    spanish_sentence.append(data[1])



In [7]:
spanish_vocabulary = []
english_vocabulary = []

In [8]:
for en,sp in zip(english_sentence,spanish_sentence):
  en_tokens = en.split()
  sp_tokens = sp.split()
  for en_token,sp_token in zip(en_tokens,sp_tokens):
    if en_token not in english_vocabulary:
      english_vocabulary.append(en_token)
    if sp_token not in spanish_vocabulary:
      spanish_vocabulary.append(sp_token)
  # print(english_vocabulary)




In [None]:
len(english_vocabulary),len(spanish_vocabulary)

In [10]:
import collections
en_word_freq_dict  = collections.defaultdict(int)

for word in english_vocabulary:
  en_word_freq_dict[" ".join(word) + ' #'] += 1
# en_word_freq_dict

In [11]:

sp_word_freq_dict  = collections.defaultdict(int)

for word in spanish_vocabulary:
  sp_word_freq_dict[" ".join(word) + ' #'] += 1
# sp_word_freq_dict

In [12]:
def get_pairs(word_freq_dict):
  '''
  goal:
      used to get the pairs dict:
                key(tuple{str,str}): represent the byte pairs
                values(int): represent the frequency of the byte pair
      and return the pairs dict
  '''
  pairs = collections.defaultdict(int)
  for word, freq in word_freq_dict.items():
    chars = word.split()
    for i in range(len(chars)-1):
      pairs[chars[i],chars[i+1]] += freq
  return pairs

def merge_byte_pairs(best_pair, word_freq_dict):
  '''
  goal:
      used to merge the byte pairs that has highest frequency
      and return the merged dict{new word_freq_dict}
  '''
  # print(best_pair)
  merged_dict = {}
  bigram = re.escape(' '.join(best_pair))
  # print(f'bigram {bigram}')
  p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
  # print(f'p {p}')
  for word in word_freq_dict:
    # print(word)
    w_out = p.sub(''.join(best_pair), word) # merging best byte pair.
    # print(f'w_out {w_out}')
    merged_dict[w_out] = word_freq_dict[word]
  return merged_dict

def get_subword_tokens(word_freq_dict):
  char_freq_dict = collections.defaultdict(int)
  for word,freq in word_freq_dict.items():
    chars = word.split()
    for char in chars:
      char_freq_dict[char] += freq
  return char_freq_dict


In [13]:
for i in range(1000):
  # if (i == 0):
    # print(f'{get_subword_tokens(en_word_freq_dict)}')
    # print('')
  pairs = get_pairs(en_word_freq_dict)
  try:
    best_pair = max(pairs,key = pairs.get)
  except:
    break
  # print(f"Iteration {i}: ")
  en_word_freq_dict = merge_byte_pairs(best_pair,en_word_freq_dict)
  en_subword_tokens = get_subword_tokens(en_word_freq_dict)
  # print(en_subword_tokens)
  # print(len(en_subword_tokens))
  # print()
  i += 1

In [14]:

for i in range(1000):
  # if (i == 0):
  #   print(f'{get_subword_tokens(sp_word_freq_dict)}')
  #   print('')
  pairs = get_pairs(sp_word_freq_dict)
  try:
    best_pair = max(pairs,key = pairs.get)
  except:
    break
  # print(f"Iteration {i}: ")
  sp_word_freq_dict = merge_byte_pairs(best_pair,sp_word_freq_dict)
  sp_subword_tokens = get_subword_tokens(sp_word_freq_dict)
  # print(sp_subword_tokens)
  # print(len(sp_subword_tokens))
  # print()
  i += 1

In [15]:
def measure_token_length(token):
    if token[-4:] == '#':
        return len(token[:-4]) + 1
    else:
        return len(token)

Byte pair vocabulary

In [16]:
en_sorted_tokens_tuple = sorted(en_subword_tokens.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sp_sorted_tokens_tuple = sorted(sp_subword_tokens.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)


In [None]:
en_sorted_tokens_tuple[0],en_sorted_tokens_tuple[-1]

In [18]:
en_vocab_tokenization = [token for (token, freq) in en_sorted_tokens_tuple]
sp_vocab_tokenization = [token for (token, freq) in sp_sorted_tokens_tuple]

In [None]:
len(en_vocab_tokenization),len(sp_vocab_tokenization)

In [None]:
en_vocab_tokenization[:10]

In [21]:
en_lng_to_index = {}
en_index_to_lng = {}
k = 4
for i in en_vocab_tokenization:
  en_lng_to_index[i] = k
  en_index_to_lng[k] = i
  k += 1


sp_lng_to_index = {}
sp_index_to_lng = {}
k = 4
for i in sp_vocab_tokenization:
  sp_lng_to_index[i] = k
  sp_index_to_lng[k] = i
  k += 1

In [22]:


START_TOKEN = '<START>'
PADDING_TOKEN = '<PAD>'
END_TOKEN = '<END>'

en_index_to_lng[0] = '<OOV>'
sp_index_to_lng[0] = '<OOV>'
en_index_to_lng[1] = '<START>'
sp_index_to_lng[1] = '<START>'
en_index_to_lng[2] = '<END>'
sp_index_to_lng[2] = '<END>'
en_index_to_lng[3] = '<PAD>'
sp_index_to_lng[3] = '<PAD>'

en_lng_to_index['<OOV>'] = 0
sp_lng_to_index['<OOV>'] = 0
en_lng_to_index['<START>'] = 1
sp_lng_to_index['<START>'] = 1
en_lng_to_index['<END>'] = 2
sp_lng_to_index['<END>'] = 2
en_lng_to_index['<PAD>'] = 3
sp_lng_to_index['<PAD>'] = 3

In [None]:
en_lng_to_index,sp_lng_to_index

In [24]:
len(english_sentence),len(spanish_sentence)

(5000, 5000)

In [None]:
english_sentence[:10]

In [None]:
spanish_sentence[:10]

In [None]:
max(len(x) for x in english_sentence),max(len(x) for x in spanish_sentence)

In [None]:
# computing avg length
print(sum(len(x) for x in english_sentence)/len(english_sentence))
print(sum(len(x) for x in spanish_sentence)/len(spanish_sentence))


In [49]:
d_model = 152
batch_size = 16
ffn_hidden = 152
num_heads = 2
drop_prob = 0.1
num_stacked = 1
max_token_length = 17
sp_vocab_size = len(sp_lng_to_index)
en_vocab_size = len(en_lng_to_index)

limit the length of the sequence

In [None]:
max(len(x) for x in english_sentence),max(len(x) for x in spanish_sentence)

In [51]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
  '''
  overriding certain methods of the Dataset class
  '''
  def __init__(self,english_sentence,spanish_sentence):
    self.english_sentence = english_sentence
    self.spanish_sentence = spanish_sentence

  def __len__(self):
    return len(self.english_sentence)

  def __getitem__(self,index):
    return self.english_sentence[index],self.spanish_sentence[index]



In [52]:
dataset = TextDataset(english_sentence,spanish_sentence)

In [None]:
len(dataset)

In [54]:

train_loader = DataLoader(dataset,batch_size=batch_size)
iterator = iter(train_loader)

In [None]:
for batch_num,batch in enumerate(iterator):
  print(batch)
  # break
  if (batch_num > 3):
    break

In [56]:
transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_stacked,
                          max_token_length,
                          sp_vocab_size,
                          en_lng_to_index,
                          sp_lng_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

In [None]:
transformer

In [58]:
from torch import nn

criterian = nn.CrossEntropyLoss(ignore_index=sp_lng_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [59]:
NEG_INFTY = -1e9

def create_masks(eng_base_mask, sp_base_mask):
    num_sentences = eng_base_mask.shape[0] # {represent batch size}
    look_ahead_mask = torch.full([max_token_length, max_token_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_token_length, max_token_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_token_length, max_token_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_token_length, max_token_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length,sp_sentence_length = 0,0
      for en_i,sp_i in zip(eng_base_mask[idx],sp_base_mask[idx]):
        if en_i.item() != 3:
          eng_sentence_length += 1
        if sp_i.item() != 3:
          sp_sentence_length += 1
      eng_sentence_length, sp_sentence_length = len(eng_batch[idx]), len(sp_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length , max_token_length)
      sp_chars_to_padding_mask = np.arange(sp_sentence_length + 1, max_token_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, sp_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, sp_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, sp_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask


In [60]:
sentence_embedding = SentenceEmbedding(
                          
                          max_token_length,
                          d_model,
                          en_lng_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

In [None]:
eng_batch = ("happy",'good','joyful','sri ram jai ram')
sp_batch = ("happy",'good','joyful','hbhwsd')

encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(sentence_embedding.batch_tokenize(eng_batch,False,False),sentence_embedding.batch_tokenize(sp_batch,False,False))
encoder_self_attention_mask.shape

In [None]:
encoder_self_attention_mask[0]

In [None]:
decoder_self_attention_mask[0]

In [None]:
decoder_cross_attention_mask[0]

In [None]:
decoder_self_attention_mask[1]

In [66]:
len(en_index_to_lng)

910

In [None]:
transformer.train() # used to set the model in training mode
# transformer.to(device)
total_loss = 0
num_epochs = 7

for epoch in range(num_epochs):
  # print(f"epoch {epoch}")
  iterator = iter(train_loader)
  '''
  looping on iterator we will get the batch of input_sequence and target_sequence
  [(batch_of_input_sequence),(batch_of_target_sequence)]
  '''

  for batch_num,batch  in enumerate(iterator):
    '''
    batch_num{int}: current batch number
    batch{list{tuple}}: batch of input_sequence and target_sequence
    '''
    transformer.train()
    eng_batch,sp_batch = batch
    # creating the mask for the eng_batch and sp_batch
    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(sentence_embedding.batch_tokenize(eng_batch,False,False),sentence_embedding.batch_tokenize(sp_batch,False,False))
    '''
    ***important***
    {the gradients of each mini-batch should be computed independantly}

    .zero-grad():set gradient to zero at the start of training of each mini-batch
                 so that while backpropogation the gradient is not accumulated:
                              (means gradient of these mini batch would not effect the gradient of other mini-batch)

    '''
    optim.zero_grad()

    sp_predictions = transformer(eng_batch,
                                 sp_batch,
                                 encoder_self_attention_mask,
                                 decoder_self_attention_mask,
                                 decoder_cross_attention_mask,
                                 enc_start_token = False,
                                 enc_end_token = False,
                                 dec_start_token = True,
                                 dec_end_token = True)
    '''
    labels{tensor}:
          converting spanish sentences into their index values based on spanish_to_index
    '''
    # print(sp_batch)
    labels = transformer.decoder.sentence_embedding.batch_tokenize(sp_batch, start_token=False, end_token=True)  # shspe (batch_size,max_sequence_length) {max_sequence_length  = num_queries}
    # print(labels.shape)
    '''
    loss = criterian(....):
        represent loss of each mini-batch
        by computimg loss over all characters(or tokens) in a batch of sentences
        loss[:max_sequnce_length] = loss of 1st sentence of batch
    '''
    loss = criterian(
        sp_predictions.view(-1,sp_vocab_size), # shape (batch_size * num_queries,sp_vocab_size)
        labels.view(-1) # shape (bach_size * num_queries.)
    ) # shape (batch_size * num_queries)
    # print(loss[:50])

    '''
    valid_indices:
                setting true value where labels are not padding token
                and false value where labels are padding token
    '''
    valid_indicies = torch.where(labels.view(-1) == sp_lng_to_index[PADDING_TOKEN], False, True) # shape (batch_size * num_queries)
    # print(valid_indicies[:50])
    # print(loss.sum())
    # print(valid_indicies.sum())
    '''
    loss = loss.sum()/valid...:
                represent a loss value(single number), where the loss of all padding tokens are ignored

    '''
    loss = loss.sum() / valid_indicies.sum()
    # print(loss)

    '''
    loss.backward():
                compute gradient or derivative using the loss function and model's parameters
                *** Theory ***
                          L = loss function
                          w = model's weight or pparameter
                          dL/dw = gradient of loss function wrt weight w
                          dL/dw:
                                represent how the loss function is changing if we change the model's parameter w(increase or decrease w value)

                                computed using chain rule of calculus

    optim.step():
              updating the model parameter with above computed gradient or derivative
              using a specific optmizer equation{Adam,Gradient descent,SGD,...}

    .item():
            return the value of torch tensor which containe only one value

    '''
    loss.backward()
    optim.step()


    if (batch_num % 100 == 0):
      print(f"epoch {epoch} batch {batch_num} loss {loss.item()}")
      print(f"English: {eng_batch[0]}")
      print(f"Spanish Translation: {sp_batch[0]}")
      '''
      sp_predictions[0]:
                  represent the output of transformer for a first trainable example in a batch
      sp_sentence_predicted:
                  represent the predicted index of spanish sentence for a first trainable example in a batch
      '''

      sp_sentence_predicted = torch.argmax(sp_predictions[0], # shape (num_queries,sp_vocab_size)
                                           axis=1)  # shape (num_queries,)
      # print(sp_sentence_predicted)
      predicted_sentence = ""
      for idx in sp_sentence_predicted:
        # print(idx.item())
        if idx == sp_lng_to_index[END_TOKEN]:
          break
        predicted_sentence += sp_index_to_lng[idx.item()]
      print(f'Spaniish Prediction: {predicted_sentence}')


      transformer.eval()  # used to set the model in evaluation mode
      '''
      ***Important***
         a =  ('dksdkk')
         type(a),len(a)
         --> str, 6
         a =  ('dksdkk',)
         type(a),len(a)
         ---> tuple, 1
      '''
      sp_sentence = ("",)
      eng_sentence  = ["everyone is happy."]
      for i,sentence in enumerate(eng_sentence):
        eng_sentence[i] = text_preprocessing(sentence)
      eng_sentence = tuple(eng_sentence)
      for word_counter in range(max_token_length):
          encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(sentence_embedding.batch_tokenize(eng_sentence,False,False),sentence_embedding.batch_tokenize(sp_sentence,False,False))
          predictions = transformer(eng_sentence,
                                          sp_sentence,
                                          encoder_self_attention_mask,
                                          decoder_self_attention_mask,
                                          decoder_cross_attention_mask,
                                          enc_start_token=False,
                                          enc_end_token=False,
                                          dec_start_token=True,
                                          dec_end_token=False)
          # print(predictions.shape)
          '''
          next_token_:
                  contains raw prediction values for each token in vocabulary
                  example:
                        "<start>i am happy<end>"
                        input token <start> feeded to transformer decoder
                        then output of transformer represent raw prediction values,
                        predicting what could be the next word (after <start> token) form the vocabulary
                        shape (sp_vocab_size,)

                                  I    <end>   a      m     h
                        output = [21.5 , 7.9  , 0.1 , -20 , -2.4 ,...] {random_values}
                        highest prediction value is the next word

                        {real raw values range depnds on type activation function that is used}


          '''
          next_token_raw_distribution = predictions[0][word_counter] # shape (sp_vocab_size,)
          # print(next_token_raw_distribution)
          next_token_index = torch.argmax(next_token_raw_distribution).item()
          # print(next_token_index)
          next_token = sp_index_to_lng[next_token_index]
          # print(next_token)
          sp_sentence = (sp_sentence[0] + next_token,)
          if (next_token == END_TOKEN):
            break

      print(f'Evaluation translation of {eng_sentence[0]} : {sp_sentence[0]}')
      print(f'-----------------------------------------------------------')


    # break








In [None]:
len(sp_index_to_lng),len(sp_lng_to_index)

In [None]:
sp_index_to_lng[3800]

In [None]:
transformer.eval()
def translate(input_sentence):
  '''
      ***Important***
         a =  ('dksdkk')
         type(a),len(a)
         --> str, 6
         a =  ('dksdkk',)
         type(a),len(a)
         ---> tuple, 1
      '''
  sp_sentence = ("",)
  eng_sentence  = [input_sentence]
  for i,sentence in enumerate(eng_sentence):
      eng_sentence[i] = text_preprocessing(sentence)
  eng_sentence = tuple(eng_sentence)
  for word_counter in range(max_token_length):
      encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(sentence_embedding.batch_tokenize(eng_sentence,False,False),sentence_embedding.batch_tokenize(sp_sentence,False,False))
      predictions = transformer(eng_sentence,
                                          sp_sentence,
                                          encoder_self_attention_mask,
                                          decoder_self_attention_mask,
                                          decoder_cross_attention_mask,
                                          enc_start_token=False,
                                          enc_end_token=False,
                                          dec_start_token=True,
                                          dec_end_token=False)
      # print(predictions.shape)
      '''
      next_token_:
              contains raw prediction values for each token in vocabulary
              example:
                    "<start>i am happy<end>"
                    input token <start> feeded to transformer decoder
                    then output of transformer represent raw prediction values,
                    predicting what could be the next word (after <start> token) form the vocabulary
                    shape (sp_vocab_size,)

                              I    <end>   a      m     h
                    output = [21.5 , 7.9  , 0.1 , -20 , -2.4 ,...] {random_values}
                    highest prediction value is the next word

                    {real raw values range depnds on type activation function that is used}


      '''
      next_token_raw_distribution = predictions[0][word_counter] # shape (sp_vocab_size,)
      # print(next_token_raw_distribution)
      next_token_index = torch.argmax(next_token_raw_distribution).item()
      # print(next_token_index)
      next_token = sp_index_to_lng[next_token_index]
      # print(next_token)
      sp_sentence = (sp_sentence[0] + next_token,)
      if (next_token == END_TOKEN):
        break

  print(f'Evaluation translation of  {eng_sentence[0]} : {sp_sentence[0]}')



In [None]:
translate("i am happy")

In [None]:
translate('i am sad')