# Step 1 (Byte Pair Encoding (BPE))

In [48]:
def flatten(t):
    """Concatenate the items of every sub-iterable of *t* into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Toy corpus: every word is repeated according to its frequency, then each
# word is split into a list of single-character symbols.
repeated_words = flatten(
    [
        ["low"] * 5,
        ["lower"] * 2,
        ["widest"] * 3,
        ["newest"] * 5,
    ]
)
corpus = [list(word) for word in repeated_words]
print(corpus)

[['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w', 'e', 'r'], ['l', 'o', 'w', 'e', 'r'], ['w', 'i', 'd', 'e', 's', 't'], ['w', 'i', 'd', 'e', 's', 't'], ['w', 'i', 'd', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't']]


In [92]:
class Tokenizer_BPE:
    """Minimal Byte Pair Encoding (BPE) tokenizer working on lists of symbols.

    ``train`` starts from the distinct symbols of the corpus and repeatedly
    merges the most frequent adjacent pair until the vocabulary reaches the
    requested size or nothing is left to merge. ``tokenize`` replays the
    vocabulary entries, in order, as merges on a new word.

    NOTE: a pair is identified by the concatenation of its two symbols, so
    different splits of the same string (e.g. ``'e' + 'st'`` and
    ``'es' + 't'``) are counted and merged as one pair.

    Attributes:
        vocab: list of vocabulary strings, or ``None`` until ``train`` runs.
        verbose: when true, intermediate training/tokenizing states are printed.
    """

    def __init__(self, verbose=True):
        self.vocab = None
        self.verbose = verbose

    def __printv(self, string):
        """Print *string* only when verbose mode is enabled."""
        if self.verbose:
            print(string)

    def __find_max_key(self, d: dict):
        """Return the key of *d* with the largest value, or ``None`` if *d* is empty.

        Ties are resolved in favour of the key that was inserted first.
        """
        if not d:
            return None
        return max(d, key=d.get)

    def __find_max_pair(self, corpus):
        """Return the most frequent adjacent pair (as a joined string) in *corpus*.

        Returns ``None`` when no word has two or more symbols, i.e. when
        there is nothing left to merge.
        """
        pair_freq = {}
        for word in corpus:
            for left, right in zip(word, word[1:]):
                pair = left + right
                pair_freq[pair] = pair_freq.get(pair, 0) + 1
        max_freq_pair = self.__find_max_key(pair_freq)
        if max_freq_pair is not None:
            self.__printv(f"{max_freq_pair} --> {pair_freq}")
        return max_freq_pair

    def __merge_pair(self, corpus, pair: str):
        """Return a new corpus in which every adjacent pair joining to *pair* is merged.

        Words are scanned left to right and merges do not overlap. The input
        corpus and its words are left untouched.
        """
        merged_corpus = []
        for word in corpus:
            merged_word = []
            last_idx = len(word) - 1
            idx = 0
            while idx <= last_idx:
                if idx < last_idx and word[idx] + word[idx + 1] == pair:
                    merged_word.append(pair)
                    idx += 2  # both symbols of the pair are consumed
                else:
                    merged_word.append(word[idx])
                    idx += 1
            merged_corpus.append(merged_word)
        return merged_corpus

    def __get_unique_chars(self, corpus):
        """Return the distinct symbols of *corpus*, sorted so the result is reproducible."""
        return sorted({symbol for word in corpus for symbol in word})

    def train(self, corpus: list, vocab_size: int) -> dict:
        """Learn a BPE vocabulary of at most *vocab_size* entries from *corpus*.

        Args:
            corpus: list of words, each word being a list of symbols
                (single characters initially). It is not modified.
            vocab_size: maximum number of vocabulary entries to keep.

        Returns:
            A dict ``{"vocab": [...]}`` holding the sorted base symbols
            followed by the learned merges in the order they were learned.
            The same list is stored in ``self.vocab``.
        """
        vocab = self.__get_unique_chars(corpus)
        while len(vocab) < vocab_size:
            self.__printv(corpus)
            max_freq_pair = self.__find_max_pair(corpus)
            if max_freq_pair is None:
                # Every word is a single symbol: no further merge is possible.
                break
            corpus = self.__merge_pair(corpus, max_freq_pair)
            vocab.append(max_freq_pair)
        # The base symbols alone may already exceed the requested size.
        vocab = vocab[:vocab_size]
        self.vocab = vocab
        return {"vocab": vocab}

    def tokenize(self, word: str):
        """Split *word* into tokens by replaying the vocabulary entries as merges.

        Args:
            word: the string to tokenize.

        Returns:
            The list of tokens. Characters never seen in training stay as
            single-character tokens.

        Raises:
            RuntimeError: if called before ``train``.
        """
        if self.vocab is None:
            raise RuntimeError("Tokenizer_BPE.tokenize() called before train()")
        word = list(word)
        for merge_pair in self.vocab:
            before_merge_len = len(word)
            word = self.__merge_pair([word], merge_pair)[0]
            if before_merge_len != len(word):
                self.__printv(f"Merged '{merge_pair}' --> {word}")
        return word

In [93]:
# Train the toy tokenizer on a copy of the corpus, then tokenize a word that
# is not part of the training corpus.
bpe_tokenizer = Tokenizer_BPE(verbose=True)

print("### Training ###")
output = bpe_tokenizer.train(list(corpus), vocab_size=100)
print(output)

print("### Tokenizing ###")
tokens = bpe_tokenizer.tokenize("lowest")
print(tokens)

### Training ###
[['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w', 'e', 'r'], ['l', 'o', 'w', 'e', 'r'], ['w', 'i', 'd', 'e', 's', 't'], ['w', 'i', 'd', 'e', 's', 't'], ['w', 'i', 'd', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't'], ['n', 'e', 'w', 'e', 's', 't']]
es --> {'lo': 7, 'ow': 7, 'we': 7, 'er': 2, 'wi': 3, 'id': 3, 'de': 3, 'es': 8, 'st': 8, 'ne': 5, 'ew': 5}
[['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w'], ['l', 'o', 'w', 'e', 'r'], ['l', 'o', 'w', 'e', 'r'], ['w', 'i', 'd', 'es', 't'], ['w', 'i', 'd', 'es', 't'], ['w', 'i', 'd', 'es', 't'], ['n', 'e', 'w', 'es', 't'], ['n', 'e', 'w', 'es', 't'], ['n', 'e', 'w', 'es', 't'], ['n', 'e', 'w', 'es', 't'], ['n', 'e', 'w', 'es', 't']]
est --> {'lo': 7, 'ow': 7, 'we': 2, 'er': 2, 'wi': 3, 'id': 3, 'des': 3, 'est': 8, 'ne': 5, 'ew': 5, 'wes': 5}
[['l'

# Step 2 (HuggingFace Tokenizer)

In [None]:
# Install the `tokenizers` package imported by the following cell.
! pip install tokenizers
# Download the WikiText-103 raw archive (unzipped below); the training cell
# reads its files from wikitext-103-raw/wiki.{test,train,valid}.raw.
! wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
# Download the plain-text book pg16457.txt, which the last cell opens and encodes.
! wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt
! unzip wikitext-103-raw-v1.zip

In [26]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# Training files: the three raw WikiText-103 splits.
files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
# Alternative corpus (the single downloaded book):
# files = ["pg16457.txt"]

# --- BPE ---
tokenizer_bpe = Tokenizer(BPE(unk_token="[UNK]"))
# Pre-split on whitespace so that a token such as "it is" is avoided.
tokenizer_bpe.pre_tokenizer = Whitespace()
trainer = BpeTrainer(
    vocab_size=3 * 10**6,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer_bpe.train(files, trainer)

# --- WordPiece ---
tokenizer_wp = Tokenizer(WordPiece(unk_token="[UNK]"))
# Same whitespace pre-tokenization as for BPE.
tokenizer_wp.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(
    vocab_size=3 * 10**6,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer_wp.train(files, trainer)

In [None]:
# Print the vocabulary size followed by the full vocabulary of each trained
# tokenizer (BPE first, then WordPiece).
for trained in (tokenizer_bpe, tokenizer_wp):
    print(trained.get_vocab_size(), trained.get_vocab())

In [27]:
# Print only the vocabulary sizes (BPE first, then WordPiece).
for trained in (tokenizer_bpe, tokenizer_wp):
    print(trained.get_vocab_size())

777366
812990


In [28]:
# One shared sample so both tokenizers are guaranteed to see identical input.
# The emoji near the end (U+1F601) is written as an escape so it cannot be
# corrupted by a file-encoding mismatch; the recorded output shows it being
# mapped to a single "[UNK]" token.
sample_text = (
    "This is a deep learning tokenization tutorial. "
    "Tokenization is the first step in a deep learning NLP pipeline. "
    "We will be comparing the tokens generated by each tokenization model. "
    "Excited much?!\U0001F601 ?"
)

output = tokenizer_bpe.encode(sample_text)
print("BPE", len(output.tokens), output.tokens)
output = tokenizer_wp.encode(sample_text)
print("WP ", len(output.tokens), output.tokens)

BPE 42 ['This', 'is', 'a', 'deep', 'learning', 'token', 'ization', 'tutorial', '.', 'Token', 'ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'NL', 'P', 'pipeline', '.', 'We', 'will', 'be', 'comparing', 'the', 'tokens', 'generated', 'by', 'each', 'token', 'ization', 'model', '.', 'Excited', 'much', '?', '!', '[UNK]', '?']
WP  40 ['This', 'is', 'a', 'deep', 'learning', 'token', '##ization', 'tutorial', '.', 'Token', '##ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'NL', '##P', 'pipeline', '.', 'We', 'will', 'be', 'comparing', 'the', 'tokens', 'generated', 'by', 'each', 'token', '##ization', 'model', '.', 'Excited', 'much', '[UNK]', '?']


In [29]:
# Read the whole downloaded book. The encoding is given explicitly so the
# text is decoded the same way on every platform instead of depending on
# the locale's default encoding.
with open("pg16457.txt", "r", encoding="utf-8") as f:
    g = f.read()

# Compare how many tokens each trained tokenizer needs for the same text.
output = tokenizer_bpe.encode(g)
print("BPE", len(output.tokens))
output = tokenizer_wp.encode(g)
print("WP ", len(output.tokens))

BPE 127262
WP  124054
