# First step

In [3]:
from collections import defaultdict

# Toy training corpus: each word repeated a fixed number of times,
# separated by single spaces with no trailing space.
corpus = " ".join(["low"] * 5 + ["lower"] * 2 + ["widest"] * 3 + ["newest"] * 5)

def BPE(curpos, vocab_size=30):
    """Learn a byte-pair-encoding vocabulary from a whitespace-separated corpus.

    Every word is split into characters followed by the end-of-word marker
    ``'__'``. The most frequent adjacent symbol pair is then merged repeatedly
    until the vocabulary holds ``vocab_size`` entries or no pair is left.

    Args:
        curpos: The training text; words are separated by whitespace.
        vocab_size: Merging stops once the working vocabulary has at least
            this many entries. Entries whose count has dropped to zero still
            count towards this limit and are only removed at the very end,
            so the returned list can be shorter than ``vocab_size``.

    Returns:
        A list of the tokens whose final count is non-zero, in the order they
        were first added to the vocabulary.

    Note:
        A pair is identified by the concatenation of its two symbols, so two
        different splits that concatenate to the same string are counted and
        merged as the same pair.
    """
    end_of_word = '__'

    # Frequency of each distinct word, keyed by its current symbol sequence.
    # Use the argument, not the notebook-level global, as the training text.
    word_freq_dict = defaultdict(int)
    for word in curpos.split():
        word_freq_dict[tuple(word) + (end_of_word,)] += 1

    # Initial vocabulary: every single symbol with its total corpus count.
    vocabulary = defaultdict(int)
    for symbols, freq in word_freq_dict.items():
        for symbol in symbols:
            vocabulary[symbol] += freq

    while len(vocabulary) < vocab_size:
        # Count every adjacent symbol pair, weighted by word frequency.
        pairs_freq_dict = defaultdict(int)
        for symbols, freq in word_freq_dict.items():
            for left, right in zip(symbols, symbols[1:]):
                pairs_freq_dict[left + right] += freq
        if not pairs_freq_dict:
            # Every word is already a single symbol; nothing left to merge.
            break

        # Ties are resolved in favour of the pair that was counted first.
        best_pair = max(pairs_freq_dict, key=pairs_freq_dict.get)
        vocabulary[best_pair] = pairs_freq_dict[best_pair]

        # Iterate over a snapshot because the dict is modified while merging.
        for symbols, freq in list(word_freq_dict.items()):
            merged = []
            index = 0
            while index < len(symbols):
                is_best_pair = (
                    index + 1 < len(symbols)
                    and symbols[index] + symbols[index + 1] == best_pair
                )
                if is_best_pair:
                    # Greedy left-to-right merge: consume both symbols and
                    # take their occurrences out of the individual counts.
                    merged.append(best_pair)
                    vocabulary[symbols[index]] -= freq
                    vocabulary[symbols[index + 1]] -= freq
                    index += 2
                else:
                    merged.append(symbols[index])
                    index += 1
            merged = tuple(merged)
            if merged != symbols:
                del word_freq_dict[symbols]
                word_freq_dict[merged] = freq

    # Drop symbols that were fully absorbed into merged tokens.
    return [token for token, freq in vocabulary.items() if freq != 0]

# Train on the toy corpus and report whether the unseen word "lowest"
# (with its end-of-word marker) ended up as a single vocabulary token.
vocabulary = BPE(corpus)
message = (
    "'lowest' is in vocabulary"
    if "lowest__" in vocabulary
    else "'lowest' is not in vocabulary"
)
print(message)

'lowest' is not in vocabulary


# Second step

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

In [6]:
# Token emitted for input the model cannot represent, plus the full set of
# special tokens handed to both trainers.
unknown_token = "<UNK>"
special_tokens = ["<UNK>", "<SEP>", "<MASK>", "<CLS>"]

# Byte-pair-encoding setup: trainer first, then the tokenizer with a
# whitespace pre-tokenizer attached.
bpe_trainer = BpeTrainer(special_tokens=special_tokens)
bpe_tokenizer = Tokenizer(BPE(unk_token=unknown_token))
bpe_tokenizer.pre_tokenizer = Whitespace()

# WordPiece setup, configured the same way as the BPE one.
wordpiece_trainer = WordPieceTrainer(special_tokens=special_tokens)
wordpiece_tokenizer = Tokenizer(WordPiece(unk_token=unknown_token))
wordpiece_tokenizer.pre_tokenizer = Whitespace()


In [8]:
def tokenize(model_name, data_files):
    """Build and train a tokenizer of the requested model type.

    Args:
        model_name: Either ``'BPE'`` or ``'WordPiece'``.
        data_files: List of file paths passed to ``Tokenizer.train``.

    Returns:
        The trained ``Tokenizer``, using a whitespace pre-tokenizer and the
        notebook-level ``unknown_token`` / ``special_tokens`` settings.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.

    Note:
        ``BPE`` must refer to the ``tokenizers.models.BPE`` class when this
        runs; the notebook also defines a function called ``BPE`` that does
        not accept ``unk_token``.
    """
    # Pick the model/trainer pair up front so an unsupported name fails
    # with a clear error instead of an unbound local variable later on.
    if model_name == 'BPE':
        model_cls, trainer_cls = BPE, BpeTrainer
    elif model_name == 'WordPiece':
        model_cls, trainer_cls = WordPiece, WordPieceTrainer
    else:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected 'BPE' or 'WordPiece'"
        )

    tokenizer = Tokenizer(model_cls(unk_token=unknown_token))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = trainer_cls(special_tokens=special_tokens)
    tokenizer.train(data_files, trainer)
    return tokenizer

In [9]:
import os
# Training corpora keyed by a size label; each value is the list of files
# handed to the tokenizer trainer.
train_data = {
    'small': [os.path.join("data", "pg16457.txt")],
    'large': [
        os.path.join("data", "wikitext-103-raw", "wiki.train.raw"),
        os.path.join("data", "wikitext-103-raw", "wiki.test.raw"),
        os.path.join("data", "wikitext-103-raw", "wiki.valid.raw"),
    ],
}

# Sample text that every trained tokenizer is asked to encode.
# NOTE(review): this name shadows the built-in `input()` for the rest of the session.
input = (
    "This is a deep learning tokenization tutorial. Tokenization is the first step in a "
    "deep learning NLP pipeline. We will be comparing the tokens generated by each "
    "tokenization model. Excited much?!😍"
)

# Train one tokenizer per (corpus size, model) combination and show its tokens.
for data_size, data_files in train_data.items():
    print(f"{data_size} data files:")
    for model_name in ('BPE', 'WordPiece'):
        print(f"{model_name}: ")
        tokenizer = tokenize(model_name, data_files)
        output = tokenizer.encode(input)
        print(output.tokens)

small data files:
BPE: 
['This', 'is', 'a', 'deep', 'learning', 'to', 'ken', 'ization', 't', 'ut', 'or', 'ial', '.', 'T', 'ok', 'en', 'ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'N', 'L', 'P', 'pi', 'pe', 'line', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', 'k', 'ens', 'generated', 'by', 'each', 'to', 'ken', 'ization', 'model', '.', 'Ex', 'c', 'ited', 'much', '?', '!', '<UNK>']
WordPiece: 
['This', 'is', 'a', 'deep', 'learning', 'to', '##ken', '##ization', 't', '##ut', '##oria', '##l', '.', 'To', '##ken', '##ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'N', '##L', '##P', 'pip', '##el', '##ine', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', '##ken', '##s', 'generated', 'by', 'each', 'to', '##ken', '##ization', 'model', '.', 'Ex', '##ci', '##ted', 'much', '<UNK>']
large data files:
BPE: 
['This', 'is', 'a', 'deep', 'learning', 'to', 'ken', 'ization', 'tut', 'orial', '.', 'Tok', 'en', 'ization', 'is', 'the', 'first', 'step',

# Third step

In [16]:
# Load the text of the 'small' training file so it can be encoded in full.
# The encoding is explicit so the decoded text (and therefore the token
# counts) does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 — confirm.
with open(f"data{os.sep}pg16457.txt", encoding="utf-8") as f:
    contents = f.read()

# Retrain each model on each corpus size and report how many tokens the
# whole text is split into.
for data_size, data_files in train_data.items():
    print(f"{data_size} data files:")
    for model_name in ['BPE', 'WordPiece']:
        print(f"{model_name}: ")
        tokenizer = tokenize(model_name, data_files)
        output = tokenizer.encode(contents)
        print(f"Number of tokens: {len(output)}")

small data files:
BPE: 
Number of tokens: 122739
WordPiece: 
Number of tokens: 122739
large data files:
BPE: 
Number of tokens: 140872
WordPiece: 
Number of tokens: 140735
