In [51]:
# WordPiece is the tokenization algorithm used by BERT. Google never published
# its training procedure, so what follows is an approximation of it.

# Toy corpus the tokenizer is trained on.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens."
]

In [11]:
from huggingface_hub import notebook_login

#notebook_login()

In [52]:
# Load the pretrained "bert-base-cased" tokenizer; its pre-tokenizer is what
# splits the corpus sentences into words.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [53]:
# count how many times each pre-tokenized word occurs in the corpus

from collections import defaultdict

word_freqs = defaultdict(int)

pre_tokenize = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str
for sentence in corpus:
    # each item is unpacked as (word, offsets); only the word is counted
    for token, _offsets in pre_tokenize(sentence):
        word_freqs[token] += 1

# for w in word_freqs.keys():
#     print(f"word_freqs.keys() = {w}")
#     print(f"w[0] = {w[0]}")

In [54]:
# create the alphabet of the corpus

# A set de-duplicates the symbols without rescanning a list for every
# character: the first character of a word is kept as-is, every following
# character gets the "##" continuation prefix.
symbols = set()
for word in word_freqs:
    symbols.add(word[0])  # first letter of each word
    symbols.update(f"##{letter}" for letter in word[1:])

alphabet = sorted(symbols)
# the extra brackets are kept so the printed output stays the same as before
print([alphabet])

[['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']]


In [55]:
# initial vocabulary: the special tokens first, then the alphabet

vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", *alphabet]
# vocab

In [56]:
# start with every word split into single characters
# (every character after the first one carries the "##" prefix)

splits = {}
for word in word_freqs:
    head = list(word[:1])
    tail = ["##" + c for c in word[1:]]
    splits[word] = head + tail

In [57]:
# compute the score of each pair of adjacent tokens

def compute_pair_scores(splits, freqs=None):
    """Score every adjacent token pair found in the current word splits.

    The score of a pair is
    ``pair_count / (count(first_token) * count(second_token))``, where every
    count is weighted by the frequency of the word the tokens occur in.

    Args:
        splits: mapping word -> list of tokens the word is currently split into.
        freqs: mapping word -> number of occurrences. When omitted, the
            notebook-level ``word_freqs`` is used, which keeps existing
            ``compute_pair_scores(splits)`` calls working unchanged.

    Returns:
        dict mapping ``(first_token, second_token)`` to its score. A word made
        of a single token adds to the token counts but produces no pair.

    Raises:
        KeyError: if a word of ``freqs`` has no entry in ``splits``.
    """
    if freqs is None:
        freqs = word_freqs

    token_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in freqs.items():
        split = splits[word]
        # every token counts once per occurrence of the word ...
        for token in split:
            token_freqs[token] += freq
        # ... and so does every pair of neighbouring tokens (none when the
        # split holds fewer than two tokens)
        for pair in zip(split, split[1:]):
            pair_freqs[pair] += freq

    scores = {
        pair: freq / (token_freqs[pair[0]] * token_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }

    return scores

pair_scores = compute_pair_scores(splits)
# preview the first six pairs only
for key in list(pair_scores)[:6]:
    print(f"{key}: {pair_scores[key]}")


('T', '##h'): 0.125
('##h', '##i'): 0.03409090909090909
('##i', '##s'): 0.02727272727272727
('i', '##s'): 0.1
('t', '##h'): 0.03571428571428571
('##h', '##e'): 0.011904761904761904


In [58]:
# now we search for the pair with the highest score

# max() keeps the first of several equally scored pairs; the default covers
# the case where there is no pair at all
best_pair, max_score = max(
    pair_scores.items(),
    key=lambda item: item[1],
    default=("", None),
)

print(f"(best_pair, max_score) = {best_pair, max_score}")

# after the merge, ('a', '##b') becomes the token 'ab', which is appended to the vocab

(best_pair, max_score) = (('a', '##b'), 0.2)


In [59]:
# function that merges the best pair inside every word split

def merge_pair(a, b, splits):
    """Merge every adjacent occurrence of the tokens ``a``, ``b`` in ``splits``.

    Args:
        a: first token of the pair.
        b: second token of the pair; a leading "##" is dropped when the two
            tokens are glued together.
        splits: mapping word -> list of tokens. Updated in place.

    Returns:
        The same ``splits`` mapping, so the call can be used in an assignment.
    """
    # the merged token is the same for every occurrence: build it once
    merged = a + b[2:] if b.startswith("##") else a + b

    # only existing keys are reassigned, so iterating the dict here is safe
    for word in splits:
        split = splits[word]
        if len(split) < 2:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                # i is not advanced: the merged token is compared with its new
                # right neighbour on the next iteration
                split = split[:i] + [merged] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

# test

# splits = merge_pair("a", "##b", splits)
# splits["about"]

In [60]:
# NOW WE CAN REGROUP ALL THIS.

target_vocab_size = 70

while len(vocab) < target_vocab_size:
    # calculate the score based on the 'word piece tokenizer' logic
    scores = compute_pair_scores(splits)

    # every word is already a single token: nothing is left to merge, so stop
    # here instead of failing on an empty best pair
    if not scores:
        break

    # finding the best_pair to merge (the first pair wins when scores tie)
    best_pair, max_score = max(scores.items(), key=lambda item: item[1])

    # merging the 'best_pair'
    splits = merge_pair(*best_pair, splits)

    # new token to append to vocab: the "##" prefix of the second half is
    # dropped when the two halves are joined
    first, second = best_pair
    new_token = first + (second[2:] if second.startswith("##") else second)
    vocab.append(new_token)

In [61]:
# show the final vocabulary: special tokens, alphabet, then the merged tokens
print(f"vocab =>\n{vocab}")

vocab =>
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully', 'Th', 'ch', '##hm', 'cha', 'chap', 'chapt', '##thm', 'Hu', 'Hug', 'Hugg', 'sh', 'th', 'is', '##thms', '##za', '##zat', '##ut']
