
# **RoBERTa BPE algorithm (pre-trained tokenizer)**

In [None]:
# Import modules and make sure the third-party packages are installed.
import importlib
import pip  # kept from the original cell; no longer called directly
import subprocess
import sys
from importlib.util import find_spec

required_packages = ['transformers', 'torch']

# Install every missing package with the pip that belongs to the running
# interpreter. `pip.main` is not a supported API (it was removed from the
# top-level `pip` module), so pip is invoked as a subprocess instead.
for package in required_packages:
    if find_spec(package) is None:
        print(f'Installing package: {package}...')
        # check_call raises CalledProcessError if the installation fails,
        # instead of failing later with an unrelated ImportError.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()

import torch
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from pprint import pprint

# Name of the pre-trained checkpoint; the masked-LM model and its tokenizer
# are both loaded from this same name.
model_name = 'PlanTL-GOB-ES/roberta-base-bne'

# Load the model with hidden states enabled in its outputs and switch it to
# evaluation mode.
model = AutoModelForMaskedLM.from_pretrained(
    model_name,
    output_hidden_states=True,
)
model.eval()

# Load the tokenizer that ships with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Disabled alternative: a fill-mask pipeline built from the same checkpoint.
#model_mask = pipeline('fill-mask', model=model_name)


# Words to be analyzed, one tokenization per word.
word_list = [
    "aceptable", "brevedad", "taxista", "consumismo", "poemario",
    "alumnado", "porrazo", "rosaleda", "portero", "zapatería",
    "preescolar", "irracional", "inaceptable", "subespecie", "antidisturbios",
    "sobrevolar", "contraataque", "deshacer", "protaurino", "monoparental",
    "mueble", "prepucio", "sorpresa", "oficina", "impresionar",
    "rústico", "técnica", "indicio", "rutina", "trigo",
    "verde", "ayer", "casa", "camisa", "álbum",
    "papel", "collar", "luna", "radio", "lejos",
]

# For every word: show the input, the subword tokens produced by the
# pre-trained tokenizer, and how many tokens there are.
for marked_text in word_list:
    print(f'input: {marked_text}')

    # Tokenize the word with the tokenizer of the pre-trained checkpoint.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Report the tokens and their count, followed by a blank separator.
    print(
        f'tokenized: {tokenized_text}',
        f'number of tokens: {len(tokenized_text)}',
        "\n",
        sep="\n",
    )



# **OUR BPE tokenizer from scratch**

In [None]:
#Install and import libraries
from collections import Counter, defaultdict
from transformers import AutoTokenizer

class BPE():
    """Byte-Pair Encoding: Subword-based tokenization algorithm.

    Texts are first cut into words with the pre-tokenizer of the
    ``bert-base-uncased`` tokenizer. Every word is then split into single
    characters, and the most frequent adjacent pair of symbols is merged,
    one pair per training iteration.

    Attributes:
        corpus: Iterable of text strings used for training.
        num_iterations: Maximum number of merges to learn.
        tokenizer: Hugging Face tokenizer, used only for pre-tokenization.
        word_freqs: Maps each word to its number of occurrences.
        splits: Maps each word to its current list of symbols.
        merges: Maps each merged pair ``(left, right)`` to ``left + right``,
            in the order the merges were learned.
        vocab: ``"</w>"``, the sorted characters, then every merged symbol.
    """

    def __init__(self, corpus, num_iterations):
        """Initialize BPE tokenizer.

        Args:
            corpus: Iterable of text strings to train on.
            num_iterations: Maximum number of merge operations to learn.
        """
        self.corpus = corpus
        self.num_iterations = num_iterations

        # pre-tokenize the corpus into words, BERT pre-tokenizer is used here
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.word_freqs = defaultdict(int)
        self.splits = {}
        self.merges = {}
        self.vocab = []

    def _pre_tokenize(self, text):
        """Return the words of ``text`` as produced by the pre-tokenizer."""
        pre_tokenizer = self.tokenizer.backend_tokenizer.pre_tokenizer
        return [word for word, _offsets in pre_tokenizer.pre_tokenize_str(text)]

    @staticmethod
    def _merge_in_split(split, a, b, merged):
        """Return a copy of ``split`` with each adjacent ``a, b`` replaced by ``merged``.

        The scan goes left to right and matches do not overlap.
        """
        result = []
        i = 0
        last = len(split) - 1
        while i <= last:
            if i < last and split[i] == a and split[i + 1] == b:
                result.append(merged)
                i += 2
            else:
                result.append(split[i])
                i += 1
        return result

    def train(self):
        """Train BPE tokenizer.

        Counts the words of the corpus, splits them into characters and
        learns up to ``num_iterations`` merges. Training stops early when
        no adjacent pair is left to merge (every word is a single symbol).

        Returns:
            The ``merges`` dictionary (pair -> merged symbol).
        """
        # compute the frequencies of each word in the corpus
        for text in self.corpus:
            for word in self._pre_tokenize(text):
                self.word_freqs[word] += 1

        # compute the base vocabulary of all characters in the corpus
        alphabet = sorted({letter for word in self.word_freqs for letter in word})

        # add the special token </w> at the beginning of the vocabulary
        self.vocab = ["</w>"] + alphabet

        # split each word into individual characters before training
        self.splits = {word: list(word) for word in self.word_freqs}

        # merge the most frequent pair iteratively
        for i in range(self.num_iterations):
            # compute the frequency of each pair
            pair_freqs = self.compute_pair_freqs()
            if not pair_freqs:
                # Nothing left to merge: requesting more iterations than
                # there are possible merges must not fail.
                break

            # most frequent pair; on ties the first pair encountered wins
            best_pair = max(pair_freqs, key=pair_freqs.get)

            # report the state of the last requested iteration
            if i == self.num_iterations - 1:
                print('\niteration', i)
                print('vocabulary: ', self.vocab)
                print('best pair:', best_pair)

            # merge the best pair of subword units and record the merge
            merged = best_pair[0] + best_pair[1]
            self.splits = self.merge_pair(*best_pair)
            self.merges[best_pair] = merged
            self.vocab.append(merged)
        return self.merges

    def compute_pair_freqs(self):
        """Compute the frequency of each pair.

        Returns:
            A mapping from each adjacent pair of symbols found in the
            current splits to the summed frequency of the words that
            contain it (counted once per occurrence).
        """
        pair_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            split = self.splits[word]
            # a word made of a single symbol contributes no pair
            for pair in zip(split, split[1:]):
                pair_freqs[pair] += freq
        return pair_freqs

    def merge_pair(self, a, b):
        """Merge the given pair.

        Every adjacent occurrence of ``a`` followed by ``b`` is replaced by
        ``a + b`` in the split of every known word.

        Returns:
            The updated ``splits`` dictionary.
        """
        merged = a + b
        for word in self.word_freqs:
            split = self.splits[word]
            # words that are already a single symbol cannot contain a pair
            if len(split) == 1:
                continue
            self.splits[word] = self._merge_in_split(split, a, b, merged)
        return self.splits

    def tokenize(self, text):
        """Tokenize a given text with trained BPE tokenizer (including pre-tokenization, split, and merge).

        Args:
            text: The string to tokenize.

        Returns:
            A flat list of subword symbols for all words of ``text``.
        """
        # split the pre-tokenized text into individual characters
        splits_text = [list(word) for word in self._pre_tokenize(text)]

        # replay the learned merges in the order they were learned
        for pair, merge in self.merges.items():
            splits_text = [
                self._merge_in_split(split, pair[0], pair[1], merge)
                for split in splits_text
            ]

        # flatten the per-word symbol lists into one list
        return [symbol for split in splits_text for symbol in split]

In [None]:
# Load the training corpus, one text per line.
# 'utf-8-sig' drops a leading byte-order mark if the file has one, so that
# '\ufeff' does not end up as a character of the learned vocabulary.
with open('corpus.txt', encoding="utf-8-sig") as f:
    corpus = f.readlines()
print(corpus[:5])

# create a BPE tokenizer object
MyBPE = BPE(corpus=corpus, num_iterations=10000) #here we can adjust the number of iterations

# train BPE tokenizer with the corpus and show the learned merges
print(MyBPE.train())

# tokenize the given words with the trained tokenizer
word_list = [
    "aceptable", "brevedad", "taxista", "consumismo", "poemario",
    "alumnado", "porrazo", "rosaleda", "portero", "zapatería",
    "preescolar", "irracional", "inaceptable", "subespecie", "antidisturbios",
    "sobrevolar", "contraataque", "deshacer", "protaurino", "monoparental",
    "mueble", "prepucio", "sorpresa", "oficina", "impresionar",
    "rústico", "técnica", "indicio", "rutina", "trigo",
    "verde", "ayer", "casa", "camisa", "álbum",
    "papel", "collar", "luna", "radio", "lejos",
]
for word in word_list:
    print(MyBPE.tokenize(word))

['\ufeffThe Project Gutenberg eBook of Niebla (Nivola)\n', '    \n', 'This ebook is for the use of anyone anywhere in the United States and\n', 'most other parts of the world at no cost and with almost no restrictions\n', 'whatsoever. You may copy it, give it away or re-use it under the terms\n']

iteration 9
vocabulary:  ['</w>', '!', '#', '$', '%', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '«', '»', '¿', 'Á', 'É', 'Í', 'Ó', 'Ú', 'á', 'è', 'é', 'í', 'ñ', 'ó', 'ú', 'ü', '—', '‘', '’', '“', '”', '•', '™', '\ufeff', 'en', 'es', 'er', 'qu', 'de', 'la', 'os', 'que', 'ar']
best pair: ('d', 'o')
{('e', 'n'): 'en', ('e', 's'): 'es', ('e', 'r'): 'er', ('q',