In [1]:
from byte_pair_level_transformer import Transformer
import torch
import numpy as np
import re
import collections

In [2]:
file_path = '/content/drive/MyDrive/spa.txt'


In [3]:
def text_preprocessing(text):
  # text = re.sub(r"([?.!,¿])", r" \1 ", text)
  text = re.sub(r'[" "]+', " ", text)
  # text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
  text = re.sub(r"[^a-zA-Z]+", " ", text)

  text = text.strip().lower()

  return text

In [4]:
num_data = 70000

In [5]:
with open(file_path,'r') as f:
  lines = f.readlines()
english_sentence,spanish_sentence = [],[]
for total_example,line in enumerate(lines):
  if (total_example < num_data ):
    line = line.lower()
    data = line.replace("\n","").split("\t")
    data[0] = text_preprocessing(data[0])
    data[1] = text_preprocessing(data[1])
    english_sentence.append(data[0])
    spanish_sentence.append(data[1])



In [6]:
spanish_vocabulary = []
english_vocabulary = []

In [7]:
for en,sp in zip(english_sentence,spanish_sentence):
  en_tokens = en.split()
  sp_tokens = sp.split()
  for en_token,sp_token in zip(en_tokens,sp_tokens):
    if en_token not in english_vocabulary:
      english_vocabulary.append(en_token)
    if sp_token not in spanish_vocabulary:
      spanish_vocabulary.append(sp_token)
  # print(english_vocabulary)




In [8]:
# vocab_limit = 9000

In [9]:
# english_vocabulary = english_vocabulary[:vocab_limit]

# spanish_vocabulary = spanish_vocabulary[:vocab_limit]


In [10]:
# index_to_spanish = {index:word for index,word in enumerate(spanish_vocabulary)}
# spanish_to_index = {word:index for index,word in enumerate(spanish_vocabulary)}
# index_to_english = {index:word for index,word in enumerate(english_vocabulary)}
# english_to_index = {word:index for index,word in enumerate(english_vocabulary)}
# # print(spanish_to_index)

In [11]:
len(english_vocabulary),len(spanish_vocabulary)

(7985, 14068)

In [12]:
import collections
en_word_freq_dict  = collections.defaultdict(int)

for word in english_vocabulary:
  en_word_freq_dict[" ".join(word) + ' #'] += 1
# en_word_freq_dict

In [13]:

sp_word_freq_dict  = collections.defaultdict(int)

for word in spanish_vocabulary:
  sp_word_freq_dict[" ".join(word) + ' #'] += 1
# sp_word_freq_dict

In [14]:
def get_pairs(word_freq_dict):
  '''
  goal:
      used to get the pairs dict:
                key(tuple{str,str}): represent the byte pairs
                values(int): represent the frequency of the byte pair
      and return the pairs dict
  '''
  pairs = collections.defaultdict(int)
  for word, freq in word_freq_dict.items():
    chars = word.split()
    for i in range(len(chars)-1):
      pairs[chars[i],chars[i+1]] += freq
  return pairs

def merge_byte_pairs(best_pair, word_freq_dict):
  '''
  goal:
      used to merge the byte pairs that has highest frequency
      and return the merged dict{new word_freq_dict}
  '''
  # print(best_pair)
  merged_dict = {}
  bigram = re.escape(' '.join(best_pair))
  # print(f'bigram {bigram}')
  p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
  # print(f'p {p}')
  for word in word_freq_dict:
    # print(word)
    w_out = p.sub(''.join(best_pair), word) # merging best byte pair.
    # print(f'w_out {w_out}')
    merged_dict[w_out] = word_freq_dict[word]
  return merged_dict

def get_subword_tokens(word_freq_dict):
  char_freq_dict = collections.defaultdict(int)
  for word,freq in word_freq_dict.items():
    chars = word.split()
    for char in chars:
      char_freq_dict[char] += freq
  return char_freq_dict


In [15]:
for i in range(9000):
  # if (i == 0):
    # print(f'{get_subword_tokens(en_word_freq_dict)}')
    # print('')
  pairs = get_pairs(en_word_freq_dict)
  try:
    best_pair = max(pairs,key = pairs.get)
  except:
    break
  # print(f"Iteration {i}: ")
  en_word_freq_dict = merge_byte_pairs(best_pair,en_word_freq_dict)
  en_subword_tokens = get_subword_tokens(en_word_freq_dict)
  # print(en_subword_tokens)
  # print(len(en_subword_tokens))
  # print()
  i += 1

In [16]:

for i in range(9000):
  # if (i == 0):
  #   print(f'{get_subword_tokens(sp_word_freq_dict)}')
  #   print('')
  pairs = get_pairs(sp_word_freq_dict)
  try:
    best_pair = max(pairs,key = pairs.get)
  except:
    break
  # print(f"Iteration {i}: ")
  sp_word_freq_dict = merge_byte_pairs(best_pair,sp_word_freq_dict)
  sp_subword_tokens = get_subword_tokens(sp_word_freq_dict)
  # print(sp_subword_tokens)
  # print(len(sp_subword_tokens))
  # print()
  i += 1

In [17]:
def measure_token_length(token):
    if token[-4:] == '#':
        return len(token[:-4]) + 1
    else:
        return len(token)

Byte pair vocabulary

In [18]:
en_sorted_tokens_tuple = sorted(en_subword_tokens.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sp_sorted_tokens_tuple = sorted(sp_subword_tokens.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)


In [19]:
en_sorted_tokens_tuple[0],en_sorted_tokens_tuple[-1]

(('hyperventilating#', 1), ('q', 1))

In [20]:
en_vocab_tokenization = [token for (token, freq) in en_sorted_tokens_tuple]
sp_vocab_tokenization = [token for (token, freq) in sp_sorted_tokens_tuple]

In [21]:
len(en_vocab_tokenization),len(sp_vocab_tokenization)

(7322, 8354)

In [22]:
en_vocab_tokenization[:10]

['hyperventilating#',
 'congratulations#',
 'understandable#',
 'claustrophobic#',
 'underestimated#',
 'disappointment#',
 'discrimination#',
 'concentrating#',
 'misunderstood#',
 'flabbergasted#']

In [23]:
en_lng_to_index = {}
en_index_to_lng = {}
k = 4
for i in en_vocab_tokenization:
  en_lng_to_index[i] = k
  en_index_to_lng[k] = i
  k += 1


sp_lng_to_index = {}
sp_index_to_lng = {}
k = 4
for i in sp_vocab_tokenization:
  sp_lng_to_index[i] = k
  sp_index_to_lng[k] = i
  k += 1

In [24]:


START_TOKEN = '<START>'
PADDING_TOKEN = '<PAD>'
END_TOKEN = '<END>'

en_index_to_lng[0] = '<OOV>'
sp_index_to_lng[0] = '<OOV>'
en_index_to_lng[1] = '<START>'
sp_index_to_lng[1] = '<START>'
en_index_to_lng[2] = '<END>'
sp_index_to_lng[2] = '<END>'
en_index_to_lng[3] = '<PAD>'
sp_index_to_lng[3] = '<PAD>'

en_lng_to_index['<OOV>'] = 0
sp_lng_to_index['<OOV>'] = 0
en_lng_to_index['<START>'] = 1
sp_lng_to_index['<START>'] = 1
en_lng_to_index['<END>'] = 2
sp_lng_to_index['<END>'] = 2
en_lng_to_index['<PAD>'] = 3
sp_lng_to_index['<PAD>'] = 3

In [25]:
en_lng_to_index,sp_lng_to_index

({'hyperventilating#': 4,
  'congratulations#': 5,
  'understandable#': 6,
  'claustrophobic#': 7,
  'underestimated#': 8,
  'disappointment#': 9,
  'discrimination#': 10,
  'concentrating#': 11,
  'misunderstood#': 12,
  'flabbergasted#': 13,
  'cardiologists#': 14,
  'consciousness#': 15,
  'archaeologist#': 16,
  'disappointing#': 17,
  'pseudoscience#': 18,
  'overemotional#': 19,
  'uncomfortable#': 20,
  'extraordinary#': 21,
  'inappropriate#': 22,
  'disillusioned#': 23,
  'inconsiderate#': 24,
  'misunderstand#': 25,
  'underestimate#': 26,
  'indispensable#': 27,
  'individualist#': 28,
  'congratulated#': 29,
  'hypochondriac#': 30,
  'investigating#': 31,
  'butterfingers#': 32,
  'eavesdropping#': 33,
  'encouragingly#': 34,
  'understanding#': 35,
  'experimenting#': 36,
  'unbelievable#': 37,
  'embarrassing#': 38,
  'melodramatic#': 39,
  'homeschooled#': 40,
  'housesitting#': 41,
  'overreacting#': 42,
  'thanksgiving#': 43,
  'honeymooning#': 44,
  'disappointed#': 4

In [34]:
english_sentence = english_sentence[:20000]
spanish_sentence = spanish_sentence[:20000]

In [35]:
len(english_sentence),len(spanish_sentence)

(20000, 20000)

In [36]:
english_sentence[:10]

['go', 'go', 'go', 'go', 'hi', 'run', 'run', 'who', 'fire', 'fire']

In [37]:
spanish_sentence[:10]

['ve',
 'vete',
 'vaya',
 'v yase',
 'hola',
 'corre',
 'corred',
 'qui n',
 'fuego',
 'incendio']

In [38]:
max(len(x) for x in english_sentence),max(len(x) for x in spanish_sentence)

(19, 66)

In [39]:
# computing avg length
print(sum(len(x) for x in english_sentence)/len(english_sentence))
print(sum(len(x) for x in spanish_sentence)/len(spanish_sentence))


14.9992
16.39655


In [43]:
d_model = 152
batch_size = 16
ffn_hidden = 152
num_heads = 2
drop_prob = 0.1
num_stacked = 1
max_token_length = 7
sp_vocab_size = len(sp_lng_to_index)
en_vocab_size = len(en_lng_to_index)

limit the length of the sequence

In [45]:
max(len(x) for x in english_sentence),max(len(x) for x in spanish_sentence)

(19, 66)

In [46]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
  '''
  overriding certain methods of the Dataset class
  '''
  def __init__(self,english_sentence,spanish_sentence):
    self.english_sentence = english_sentence
    self.spanish_sentence = spanish_sentence

  def __len__(self):
    return len(self.english_sentence)

  def __getitem__(self,index):
    return self.english_sentence[index],self.spanish_sentence[index]



In [47]:
dataset = TextDataset(english_sentence,spanish_sentence)

In [48]:
len(dataset)

20000

In [49]:

train_loader = DataLoader(dataset,batch_size=batch_size)
iterator = iter(train_loader)

In [50]:
for batch_num,batch in enumerate(iterator):
  print(batch)
  # break
  if (batch_num > 3):
    break

[('go', 'go', 'go', 'go', 'hi', 'run', 'run', 'who', 'fire', 'fire', 'fire', 'help', 'help', 'help', 'jump', 'jump'), ('ve', 'vete', 'vaya', 'v yase', 'hola', 'corre', 'corred', 'qui n', 'fuego', 'incendio', 'disparad', 'ayuda', 'socorro auxilio', 'auxilio', 'salta', 'salte')]
[('stop', 'stop', 'stop', 'wait', 'wait', 'go on', 'go on', 'hello', 'i ran', 'i ran', 'i try', 'i won', 'oh no', 'relax', 'smile', 'attack'), ('parad', 'para', 'pare', 'espera', 'esperen', 'contin a', 'contin e', 'hola', 'corr', 'corr a', 'lo intento', 'he ganado', 'oh no', 'tom telo con soda', 'sonr e', 'al ataque')]
[('attack', 'get up', 'go now', 'got it', 'got it', 'got it', 'he ran', 'hop in', 'hug me', 'i fell', 'i know', 'i left', 'i lied', 'i lost', 'i quit', 'i quit'), ('atacad', 'levanta', 've ahora mismo', 'lo tengo', 'lo pillas', 'entendiste', 'l corri', 'm tete adentro', 'abr zame', 'me ca', 'yo lo s', 'sal', 'ment', 'perd', 'dimito', 'renunci')]
[('i work', 'i m', 'i m up', 'listen', 'listen', 'lis

In [51]:
transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_stacked,
                          max_token_length,
                          sp_vocab_size,
                          en_lng_to_index,
                          sp_lng_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

In [52]:
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(7326, 152)
      (position_encoder): AbsolutePositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=152, out_features=456, bias=True)
          (linear_layer): Linear(in_features=152, out_features=152, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout): Dropout(p=0.1, inplace=False)
        (ffn): PositionalwiseFeedForwrd(
          (linear1): Linear(in_features=152, out_features=152, bias=True)
          (linear2): Linear(in_features=152, out_features=152, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
      )
    )
  )
  (decoder): Decoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embe

In [53]:
from torch import nn

criterian = nn.CrossEntropyLoss(ignore_index=sp_lng_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [54]:
NEG_INFTY = -1e9

def create_masks(eng_batch, sp_batch):
    num_sentences = len(eng_batch) # {represent batch size}
    look_ahead_mask = torch.full([max_token_length, max_token_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_token_length, max_token_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_token_length, max_token_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_token_length, max_token_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, sp_sentence_length = len(eng_batch[idx]), len(sp_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_token_length)
      sp_chars_to_padding_mask = np.arange(sp_sentence_length + 1, max_token_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, sp_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, sp_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, sp_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask


Explaination 1 :

encoder_self_attention_mask and decoder_cross_attention_mask are used so that transformer do not pay attention to the padding tokens (which is done by putting zeros (till the length of the sentence + 1) and remaning part in sequence is covered by -infinity)







        example:
        max_sequence_length  = 8,   
                  sentence =  'good'(len = 4),num_sentence = 1
                  zero values should be till index len(sentence) + 1


        mask = [[[0 0 0 0 0 -inf -inf -inf],
                 [0 0 0 0 0 -inf -inf -inf],
                 [0 0 0 0 0 -inf -inf -inf],
                 [0 0 0 0 0 -inf -inf -inf],
                 [0 0 0 0 0 -inf -inf -inf],
                 [0 0 0 0 0 -inf -inf -inf],
                 [0 0 0 0 0 -inf -inf -inf],
                 [0 0 0 0 0 -inf -inf -inf]]]
                shape (num_sentence,max_sequence_length,max_sequence_length)
        
        attention_weights = [[[0.00426842, 0.00752416, 0.00349225, 0.00898395, 0.00556792, -0.00568639, -0.00325177, 0.00724002],
                             [0.00921758, 0.00367077, 0.00327958, 0.00263405, 0.00764168, -0.00392446, -0.00628386, 0.00685052],
                             [0.00997227, 0.00228168, 0.00833329, 0.00146394, 0.00922879, -0.00393896, -0.00372312, 0.00919514],
                             [0.00588389, 0.00815015, 0.00625498, 0.00393098, 0.0071409, -0.00682445, -0.00449244, 0.00170309],
                             [0.00125849, 0.00692958, 0.00917532, 0.00639848, 0.00209307, -0.00023777, -0.00540265, 0.00118428],
                             [0.00485591, 0.00195363, 0.00936389, 0.00918742, 0.00358588, -0.00993245, -0.00042846, 0.00660049],
                             [0.00730663, 0.00275739, 0.00828811, 0.00286777, 0.00250849, -0.00248524, -0.00326519, 0.00197197],
                             [0.00901291, 0.00702945, 0.00767226, 0.00873171, 0.0090118, -0.00064111, -0.00999714, 0.00365651]]]
                             shape (batch_size = 1,max_sequence_length,max_sequence_length)


        result =  (mask + attention_weights)
                         [[[0.00426842, 0.00752416, 0.00349225, 0.00898395, 0.00556792, -inf, -inf, -inf],
                         [0.00921758, 0.00367077, 0.00327958, 0.00263405, 0.00764168, -inf, -inf, -inf],
                         [0.00997227, 0.00228168, 0.00833329, 0.00146394, 0.00922879, -inf, -inf, -inf],
                         [0.00588389, 0.00815015, 0.00625498, 0.00393098, 0.0071409, -inf, -inf,-inf],
                         [0.00125849, 0.00692958, 0.00917532, 0.00639848, 0.00209307, -inf, -inf, -inf],
                         [0.00485591, 0.00195363, 0.00936389, 0.00918742, 0.00358588, -inf, -inf,-inf],
                         [0.00730663, 0.00275739, 0.00828811, 0.00286777, 0.00250849, -inf, -inf, -inf],
                         [0.00901291, 0.00702945, 0.00767226, 0.00873171, 0.0090118, -inf, -inf, -inf]]]
            


        softmax(result,dim = -1)
                  [[[0.1997, 0.2003, 0.1995, 0.2006, 0.1999, 0.0000, 0.0000, 0.0000],
                    [0.2008, 0.1997, 0.1996, 0.1995, 0.2005, 0.0000, 0.0000, 0.0000],
                    [0.2007, 0.1992, 0.2004, 0.1990, 0.2006, 0.0000, 0.0000, 0.0000],
                    [0.1999, 0.2004, 0.2000, 0.1995, 0.2002, 0.0000, 0.0000, 0.0000],
                    [0.1992, 0.2004, 0.2008, 0.2002, 0.1994, 0.0000, 0.0000, 0.0000],
                    [0.1998, 0.1992, 0.2007, 0.2007, 0.1996, 0.0000, 0.0000, 0.0000],
                    [0.2005, 0.1996, 0.2007, 0.1996, 0.1996, 0.0000, 0.0000, 0.0000],
                    [0.2001, 0.1997, 0.1999, 0.2001, 0.2001, 0.0000, 0.0000, 0.0000]]]

                         




Explaination 2 :

decoder_self_attention_mask is mainly used by the decoder's first sublayer known as masked_self_attention which is used so that while producing target token ,decoder should not able to see(or attend)  the future token or words





      example:
        max_sequence_length  = 8,   
                  sentence =  'good'(len = 4),num_sentence = 1
                  zero values should be till index len(sentence) + 1


        mask = [[[0 -inf -inf -inf -inf -inf -inf -inf],
                 [0  0  -inf  -inf -inf -inf -inf -inf],
                 [0  0   0   -inf  -inf -inf -inf -inf],
                 [0  0   0   0  -inf -inf -inf -inf],
                 [0  0   0   0   0 -inf -inf -inf],
                 [-inf -inf -inf -inf -inf  -inf -inf -inf],
                 [-inf -inf -inf -inf -inf -inf -inf -inf],
                 [-inf -inf -inf -inf -inf -inf -inf -inf ]]
                shape (num_sentence,max_sequence_length,max_sequence_length)


                Last  few vectors are filled  with infinity values
                because zeros have filled the total index which they can fill
                that is (len(good) + 1) and after that all is padding token where we do not need to pay attention that's why after paying attention to the last token of the sequence ,the next rows are filled with -infinity

                & Similar above steps...

In [55]:
eng_batch = ("happy",'good','joyful','sri ram jai ram')
sp_batch = ("happy",'good','joyful','hbhwsd')

encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch,sp_batch)
encoder_self_attention_mask.shape

torch.Size([4, 7, 7])

In [56]:
encoder_self_attention_mask[0]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09]])

In [57]:
decoder_self_attention_mask[0]

tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09]])

In [58]:
decoder_cross_attention_mask[0]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09]])

In [59]:
decoder_self_attention_mask[1]

tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09]])

In [60]:
len(en_index_to_lng)

7326

In [None]:
transformer.train() # used to set the model in training mode
# transformer.to(device)
total_loss = 0
num_epochs = 7

for epoch in range(num_epochs):
  # print(f"epoch {epoch}")
  iterator = iter(train_loader)
  '''
  looping on iterator we will get the batch of input_sequence and target_sequence
  [(batch_of_input_sequence),(batch_of_target_sequence)]
  '''

  for batch_num,batch  in enumerate(iterator):
    '''
    batch_num{int}: current batch number
    batch{list{tuple}}: batch of input_sequence and target_sequence
    '''
    transformer.train()
    eng_batch,sp_batch = batch
    # creating the mask for the eng_batch and sp_batch
    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, sp_batch)
    '''
    ***important***
    {the gradients of each mini-batch should be computed independantly}

    .zero-grad():set gradient to zero at the start of training of each mini-batch
                 so that while backpropogation the gradient is not accumulated:
                              (means gradient of these mini batch would not effect the gradient of other mini-batch)

    '''
    optim.zero_grad()

    sp_predictions = transformer(eng_batch,
                                 sp_batch,
                                 encoder_self_attention_mask,
                                 decoder_self_attention_mask,
                                 decoder_cross_attention_mask,
                                 enc_start_token = False,
                                 enc_end_token = False,
                                 dec_start_token = True,
                                 dec_end_token = True)
    '''
    labels{tensor}:
          converting spanish sentences into their index values based on spanish_to_index
    '''
    # print(sp_batch)
    labels = transformer.decoder.sentence_embedding.batch_tokenize(sp_batch, start_token=False, end_token=True)  # shspe (batch_size,max_sequence_length) {max_sequence_length  = num_queries}
    # print(labels.shape)
    '''
    loss = criterian(....):
        represent loss of each mini-batch
        by computimg loss over all characters(or tokens) in a batch of sentences
        loss[:max_sequnce_length] = loss of 1st sentence of batch
    '''
    loss = criterian(
        sp_predictions.view(-1,sp_vocab_size), # shape (batch_size * num_queries,sp_vocab_size)
        labels.view(-1) # shape (bach_size * num_queries.)
    ) # shape (batch_size * num_queries)
    # print(loss[:50])

    '''
    valid_indices:
                setting true value where labels are not padding token
                and false value where labels are padding token
    '''
    valid_indicies = torch.where(labels.view(-1) == sp_lng_to_index[PADDING_TOKEN], False, True) # shape (batch_size * num_queries)
    # print(valid_indicies[:50])
    # print(loss.sum())
    # print(valid_indicies.sum())
    '''
    loss = loss.sum()/valid...:
                represent a loss value(single number), where the loss of all padding tokens are ignored

    '''
    loss = loss.sum() / valid_indicies.sum()
    # print(loss)

    '''
    loss.backward():
                compute gradient or derivative using the loss function and model's parameters
                *** Theory ***
                          L = loss function
                          w = model's weight or pparameter
                          dL/dw = gradient of loss function wrt weight w
                          dL/dw:
                                represent how the loss function is changing if we change the model's parameter w(increase or decrease w value)

                                computed using chain rule of calculus

    optim.step():
              updating the model parameter with above computed gradient or derivative
              using a specific optmizer equation{Adam,Gradient descent,SGD,...}

    .item():
            return the value of torch tensor which containe only one value

    '''
    loss.backward()
    optim.step()


    if (batch_num % 100 == 0):
      print(f"epoch {epoch} batch {batch_num} loss {loss.item()}")
      print(f"English: {eng_batch[0]}")
      print(f"Spanish Translation: {sp_batch[0]}")
      '''
      sp_predictions[0]:
                  represent the output of transformer for a first trainable example in a batch
      sp_sentence_predicted:
                  represent the predicted index of spanish sentence for a first trainable example in a batch
      '''

      sp_sentence_predicted = torch.argmax(sp_predictions[0], # shape (num_queries,sp_vocab_size)
                                           axis=1)  # shape (num_queries,)
      # print(sp_sentence_predicted)
      predicted_sentence = ""
      for idx in sp_sentence_predicted:
        # print(idx.item())
        if idx == sp_lng_to_index[END_TOKEN]:
          break
        predicted_sentence += sp_index_to_lng[idx.item()]
      print(f'Spaniish Prediction: {predicted_sentence}')


      transformer.eval()  # used to set the model in evaluation mode
      '''
      ***Important***
         a =  ('dksdkk')
         type(a),len(a)
         --> str, 6
         a =  ('dksdkk',)
         type(a),len(a)
         ---> tuple, 1
      '''
      sp_sentence = ("",)
      eng_sentence  = ["everyone is happy."]
      for i,sentence in enumerate(eng_sentence):
        eng_sentence[i] = text_preprocessing(sentence)
      eng_sentence = tuple(eng_sentence)
      for word_counter in range(max_token_length):
          encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, sp_sentence)
          predictions = transformer(eng_sentence,
                                          sp_sentence,
                                          encoder_self_attention_mask,
                                          decoder_self_attention_mask,
                                          decoder_cross_attention_mask,
                                          enc_start_token=False,
                                          enc_end_token=False,
                                          dec_start_token=True,
                                          dec_end_token=False)
          # print(predictions.shape)
          '''
          next_token_:
                  contains raw prediction values for each token in vocabulary
                  example:
                        "<start>i am happy<end>"
                        input token <start> feeded to transformer decoder
                        then output of transformer represent raw prediction values,
                        predicting what could be the next word (after <start> token) form the vocabulary
                        shape (sp_vocab_size,)

                                  I    <end>   a      m     h
                        output = [21.5 , 7.9  , 0.1 , -20 , -2.4 ,...] {random_values}
                        highest prediction value is the next word

                        {real raw values range depnds on type activation function that is used}


          '''
          next_token_raw_distribution = predictions[0][word_counter] # shape (sp_vocab_size,)
          # print(next_token_raw_distribution)
          next_token_index = torch.argmax(next_token_raw_distribution).item()
          # print(next_token_index)
          next_token = sp_index_to_lng[next_token_index]
          # print(next_token)
          sp_sentence = (sp_sentence[0] + next_token,)
          if (next_token == END_TOKEN):
            break

      print(f'Evaluation translation of {eng_sentence[0]} : {sp_sentence[0]}')
      print(f'-----------------------------------------------------------')


    # break








epoch 0 batch 0 loss 8.971413612365723
English: go
Spanish Translation: ve
Spaniish Prediction: asfixi#cirug#eguarruinadas#despu#omeespesa#
Evaluation translation of everyone is happy : cirug#cirug#empeora#amenhabithabithabit
-----------------------------------------------------------
epoch 0 batch 100 loss 7.5302581787109375
English: be sensible
Spanish Translation: sed razonables
Spaniish Prediction: 
Evaluation translation of everyone is happy : <END>
-----------------------------------------------------------
epoch 0 batch 200 loss 6.425028324127197
English: she stood up
Spanish Translation: ella se levant
Spaniish Prediction: 
Evaluation translation of everyone is happy : <END>
-----------------------------------------------------------
epoch 0 batch 300 loss 6.201528549194336
English: life is tough
Spanish Translation: la vida es dura
Spaniish Prediction: 
Evaluation translation of everyone is happy : <END>
-----------------------------------------------------------
epoch 0 batch

In [None]:
len(sp_index_to_lng),len(sp_lng_to_index)

In [None]:
sp_index_to_lng[3800]

In [None]:
transformer.eval()
def translate(input_sentence):
  '''
      ***Important***
         a =  ('dksdkk')
         type(a),len(a)
         --> str, 6
         a =  ('dksdkk',)
         type(a),len(a)
         ---> tuple, 1
      '''
  sp_sentence = ("",)
  eng_sentence  = [input_sentence]
  for i,sentence in enumerate(eng_sentence):
      eng_sentence[i] = text_preprocessing(sentence)
  eng_sentence = tuple(eng_sentence)
  for word_counter in range(max_token_length):
      encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, sp_sentence)
      predictions = transformer(eng_sentence,
                                          sp_sentence,
                                          encoder_self_attention_mask,
                                          decoder_self_attention_mask,
                                          decoder_cross_attention_mask,
                                          enc_start_token=False,
                                          enc_end_token=False,
                                          dec_start_token=True,
                                          dec_end_token=False)
      # print(predictions.shape)
      '''
      next_token_:
              contains raw prediction values for each token in vocabulary
              example:
                    "<start>i am happy<end>"
                    input token <start> feeded to transformer decoder
                    then output of transformer represent raw prediction values,
                    predicting what could be the next word (after <start> token) form the vocabulary
                    shape (sp_vocab_size,)

                              I    <end>   a      m     h
                    output = [21.5 , 7.9  , 0.1 , -20 , -2.4 ,...] {random_values}
                    highest prediction value is the next word

                    {real raw values range depnds on type activation function that is used}


      '''
      next_token_raw_distribution = predictions[0][word_counter] # shape (sp_vocab_size,)
      # print(next_token_raw_distribution)
      next_token_index = torch.argmax(next_token_raw_distribution).item()
      # print(next_token_index)
      next_token = sp_index_to_lng[next_token_index]
      # print(next_token)
      sp_sentence = (sp_sentence[0] + next_token,)
      if (next_token == END_TOKEN):
        break

  print(f'Evaluation translation of  {eng_sentence[0]} : {sp_sentence[0]}')



In [None]:
translate("i am happy")

In [None]:
translate('i am sad')