In [1]:
# from byte_pair_level_transformer import Transformer
import torch
import numpy as np
import re
import collections

In [2]:
file_path = '/content/drive/MyDrive/spa.txt'


In [3]:
def text_preprocessing(text):
  # text = re.sub(r"([?.!,¿])", r" \1 ", text)
  text = re.sub(r'[" "]+', " ", text)
  # text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
  text = re.sub(r"[^a-zA-Z]+", " ", text)

  text = text.strip().lower()

  return text

In [4]:
num_data = 10000

In [5]:
with open(file_path,'r') as f:
  lines = f.readlines()
english_sentence,spanish_sentence = [],[]
for total_example,line in enumerate(lines):
  if (total_example < num_data ):
    line = line.lower()
    data = line.replace("\n","").split("\t")
    data[0] = text_preprocessing(data[0])
    data[1] = text_preprocessing(data[1])
    english_sentence.append(data[0])
    spanish_sentence.append(data[1])



In [6]:
spanish_vocabulary = []
english_vocabulary = []

In [7]:
for en,sp in zip(english_sentence,spanish_sentence):
  en_tokens = en.split()
  sp_tokens = sp.split()
  for en_token,sp_token in zip(en_tokens,sp_tokens):
    if en_token not in english_vocabulary:
      english_vocabulary.append(en_token)
    if sp_token not in spanish_vocabulary:
      spanish_vocabulary.append(sp_token)
  # print(english_vocabulary)




In [8]:
len(english_vocabulary),len(spanish_vocabulary)

(1983, 3979)

Symbol '#' represent end of the word


In [9]:
import collections
en_word_freq_dict  = collections.defaultdict(int)

for word in english_vocabulary:
  en_word_freq_dict[" ".join(word) + ' #'] += 1
# en_word_freq_dict

In [10]:

sp_word_freq_dict  = collections.defaultdict(int)

for word in spanish_vocabulary:
  sp_word_freq_dict[" ".join(word) + ' #'] += 1
# sp_word_freq_dict

In [11]:
def get_pairs(word_freq_dict):
  '''
  goal:
      used to get the pairs dict:
                key(tuple{str,str}): represent the byte pairs
                values(int): represent the frequency of the byte pair
      and return the pairs dict
  '''
  pairs = collections.defaultdict(int)
  for word, freq in word_freq_dict.items():
    chars = word.split()
    for i in range(len(chars)-1):
      pairs[chars[i],chars[i+1]] += freq
  return pairs

def merge_byte_pairs(best_pair, word_freq_dict):
  '''
  goal:
      used to merge the byte pairs that has highest frequency
      and return the merged dict{new word_freq_dict}
  '''
  # print(best_pair)
  merged_dict = {}
  bigram = re.escape(' '.join(best_pair))
  # print(f'bigram {bigram}')
  p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
  # print(f'p {p}')
  for word in word_freq_dict:
    # print(word)
    w_out = p.sub(''.join(best_pair), word) # merging best byte pair.
    # print(f'w_out {w_out}')
    merged_dict[w_out] = word_freq_dict[word]
  return merged_dict

def get_subword_tokens(word_freq_dict):
  char_freq_dict = collections.defaultdict(int)
  for word,freq in word_freq_dict.items():
    chars = word.split()
    for char in chars:
      char_freq_dict[char] += freq
  return char_freq_dict


In [12]:
for i in range(2000):
  # if (i == 0):
    # print(f'{get_subword_tokens(en_word_freq_dict)}')
    # print('')
  pairs = get_pairs(en_word_freq_dict)
  try:
    best_pair = max(pairs,key = pairs.get)
  except:
    break
  # print(f"Iteration {i}: ")
  en_word_freq_dict = merge_byte_pairs(best_pair,en_word_freq_dict)
  en_subword_tokens = get_subword_tokens(en_word_freq_dict)
  # print(en_subword_tokens)
  # print(len(en_subword_tokens))
  # print()
  i += 1

In [13]:

for i in range(2000):
  # if (i == 0):
  #   print(f'{get_subword_tokens(sp_word_freq_dict)}')
  #   print('')
  pairs = get_pairs(sp_word_freq_dict)
  try:
    best_pair = max(pairs,key = pairs.get)
  except:
    break
  # print(f"Iteration {i}: ")
  sp_word_freq_dict = merge_byte_pairs(best_pair,sp_word_freq_dict)
  sp_subword_tokens = get_subword_tokens(sp_word_freq_dict)
  # print(sp_subword_tokens)
  # print(len(sp_subword_tokens))
  # print()
  i += 1

In [14]:
def measure_token_length(token):
    if token[-4:] == '#':
        return len(token[:-4]) + 1
    else:
        return len(token)

Byte pair vocabulary

In [15]:
en_sorted_tokens_tuple = sorted(en_subword_tokens.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sp_sorted_tokens_tuple = sorted(sp_subword_tokens.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)


In [16]:
en_sorted_tokens_tuple[0],en_sorted_tokens_tuple[-1]

(('unbelievable#', 1), ('x', 1))

In [17]:
en_vocab_tokenization = [token for (token, freq) in en_sorted_tokens_tuple]
sp_vocab_tokenization = [token for (token, freq) in sp_sorted_tokens_tuple]

In [18]:
len(en_vocab_tokenization),len(sp_vocab_tokenization)

(1695, 1921)

In [19]:
en_vocab_tokenization[:10]

['unbelievable#',
 'contributed#',
 'exaggerated#',
 'apologized#',
 'understand#',
 'reasonable#',
 'respectful#',
 'supportive#',
 'fantastic#',
 'seriously#']

In [20]:
en_lng_to_index = {}
en_index_to_lng = {}
k = 4
for i in en_vocab_tokenization:
  en_lng_to_index[i] = k
  en_index_to_lng[k] = i
  k += 1


sp_lng_to_index = {}
sp_index_to_lng = {}
k = 4
for i in sp_vocab_tokenization:
  sp_lng_to_index[i] = k
  sp_index_to_lng[k] = i
  k += 1

In [21]:


START_TOKEN = '<START>'
PADDING_TOKEN = '<PAD>'
END_TOKEN = '<END>'

en_index_to_lng[0] = '<OOV>'
sp_index_to_lng[0] = '<OOV>'
en_index_to_lng[1] = '<START>'
sp_index_to_lng[1] = '<START>'
en_index_to_lng[2] = '<END>'
sp_index_to_lng[2] = '<END>'
en_index_to_lng[3] = '<PAD>'
sp_index_to_lng[3] = '<PAD>'

en_lng_to_index['<OOV>'] = 0
sp_lng_to_index['<OOV>'] = 0
en_lng_to_index['<START>'] = 1
sp_lng_to_index['<START>'] = 1
en_lng_to_index['<END>'] = 2
sp_lng_to_index['<END>'] = 2
en_lng_to_index['<PAD>'] = 3
sp_lng_to_index['<PAD>'] = 3

In [35]:
en_lng_to_index['<START>'],en_lng_to_index['unbelievable#']

(1, 4)

In [32]:
en_lng_to_index,sp_lng_to_index

({'unbelievable#': 4,
  'contributed#': 5,
  'exaggerated#': 6,
  'apologized#': 7,
  'understand#': 8,
  'reasonable#': 9,
  'respectful#': 10,
  'supportive#': 11,
  'fantastic#': 12,
  'seriously#': 13,
  'apologize#': 14,
  'overslept#': 15,
  'attentive#': 16,
  'confident#': 17,
  'merciless#': 18,
  'objective#': 19,
  'realistic#': 20,
  'succeeded#': 21,
  'surrender#': 22,
  'something#': 23,
  'beautiful#': 24,
  'thrilling#': 25,
  'appeared#': 26,
  'terrific#': 27,
  'chuckled#': 28,
  'disagree#': 29,
  'remember#': 30,
  'screamed#': 31,
  'creative#': 32,
  'discreet#': 33,
  'merciful#': 34,
  'prepared#': 35,
  'punctual#': 36,
  'sensible#': 37,
  'specific#': 38,
  'thorough#': 39,
  'tolerant#': 40,
  'vigilant#': 41,
  'watchful#': 42,
  'straight#': 43,
  'resigned#': 44,
  'happened#': 45,
  'memorize#': 46,
  'promised#': 47,
  'anything#': 48,
  'describe#': 49,
  'upstairs#': 50,
  'annoying#': 51,
  'arrogant#': 52,
  'horrible#': 53,
  'romantic#': 54,
  '

In [23]:

def tokenize_word(string, sorted_tokens, unknown_token='<OOV>'):

    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))
        # print(f'token {token}')
        # print(f'string {string}')
        try:
            matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
            # print(matched_positions)
            # break
        except:
          continue

        if len(matched_positions) == 0:
            continue

        print(f'token {token}')
        print(f'string {string}')
        print(f'matched_positions {matched_positions}')
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]
        print(f'substring_end_positions {substring_end_positions}')
        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            print(f'substring {substring}')
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
            print(f'string_tokens {string_tokens}')
            string_tokens += [token]
            print(f'string_tokens {string_tokens}')
            substring_start_position = substring_end_position + len(token)
            print(f'substring_start_position {substring_start_position}')
            print('---------------------------')
        remaining_substring = string[substring_start_position:]
        print(f'remaining_substring {remaining_substring}')
        print('---------------------------')
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
        print(f'string_tokens {string_tokens}')
        print('----------------------------')
        break

    return string_tokens



def get_idx(sentence,lng_to_index):

    b = []
    for j in sentence.split():
      j = j+'#'
      # print(f'word {j}')
      if j in lng_to_index:
        b.append(lng_to_index[j])
        # print(b)
      # handling oov tokens using byte pair
      else:
          temp = tokenize_word(string=j, sorted_tokens=list(lng_to_index.keys()), unknown_token='</u>')
          if temp:
            for k in temp:
              b.append(lng_to_index[k])
            # print(b)
          else:
            b.append(0)
            # print(b)
    return b

to use


In [24]:
text_sentences = 'follow the guidelines of the company'
en_vector= get_idx(text_sentences,en_lng_to_index)




token es#
string guidelines#
matched_positions [(8, 11)]
substring_end_positions [8]
substring guidelin
token lin
string guidelin
matched_positions [(5, 8)]
substring_end_positions [5]
substring guide
token gu
string guide
matched_positions [(0, 2)]
substring_end_positions [0]
substring 
string_tokens []
string_tokens ['gu']
substring_start_position 2
---------------------------
remaining_substring ide
---------------------------
token de
string ide
matched_positions [(1, 3)]
substring_end_positions [1]
substring i
token i
string i
matched_positions [(0, 1)]
substring_end_positions [0]
substring 
string_tokens []
string_tokens ['i']
substring_start_position 1
---------------------------
remaining_substring 
---------------------------
string_tokens ['i']
----------------------------
string_tokens ['i']
string_tokens ['i', 'de']
substring_start_position 3
---------------------------
remaining_substring 
---------------------------
string_tokens ['i', 'de']
----------------------------
s

In [25]:
print(en_vector)
print(f'Some ananlysis on sentence: {text_sentences}')
print(f'length of token in byte pair level encoding {len(en_vector)}')
print(f'length of token in character level encoding {len(text_sentences)}')
print(f'length of token in word level encoding {len(text_sentences.split())}')


[218, 1165, 1553, 1692, 1620, 1451, 1248, 1341, 1165, 1291, 1675, 1445, 1695]
Some ananlysis on sentence: follow the guidelines of the company
length of token in byte pair level encoding 13
length of token in character level encoding 36
length of token in word level encoding 6


adding start and end token to it

In [26]:
en_vector.insert(0,en_lng_to_index['<START>'])
en_vector.append(en_lng_to_index['<END>'])
en_vector

[1,
 218,
 1165,
 1553,
 1692,
 1620,
 1451,
 1248,
 1341,
 1165,
 1291,
 1675,
 1445,
 1695,
 2]

In [27]:






### touse

b = ''
for idx in en_vector:
  b += en_index_to_lng[idx] + ' '
b = re.sub(r'[" "]+', "", b)
b = b.replace('#',' ').strip()

print(b)

<START>follow the guidelines of the company <END>


In [28]:

b = ''
for idx in en_vector:
  print(idx)
  b = b + en_index_to_lng[idx]
  print(b)
  print('-------------')


1
<START>
-------------
218
<START>follow#
-------------
1165
<START>follow#the#
-------------
1553
<START>follow#the#gu
-------------
1692
<START>follow#the#gui
-------------
1620
<START>follow#the#guide
-------------
1451
<START>follow#the#guidelin
-------------
1248
<START>follow#the#guidelines#
-------------
1341
<START>follow#the#guidelines#of#
-------------
1165
<START>follow#the#guidelines#of#the#
-------------
1291
<START>follow#the#guidelines#of#the#com
-------------
1675
<START>follow#the#guidelines#of#the#comp
-------------
1445
<START>follow#the#guidelines#of#the#company
-------------
1695
<START>follow#the#guidelines#of#the#company#
-------------
2
<START>follow#the#guidelines#of#the#company#<END>
-------------


old vocabulary

In [29]:
len(english_vocabulary),len(spanish_vocabulary)

(1983, 3979)

new vocabulary

In [30]:

len(en_index_to_lng),len(sp_index_to_lng)

(1699, 1925)