Let's define a Tokenizer class

In [1]:
import regex as re

In [2]:
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding. Plain "utf-8" (not "utf-8-sig") leaves any BOM
# character ('\ufeff') in the text.
with open("shakespeare.txt", "r", encoding="utf-8") as f:
    entry_text = f.read()

In [3]:

# Simpler alternative, kept for reference only (it used to be assigned and
# then immediately overwritten): runs of word characters, or any single
# character that is neither a word character nor whitespace.
#   pattern = r"[\w\d]+|[^\w\d\s]"

# Pre-tokenization pattern actually used. Alternatives are tried in order:
#   - contraction suffixes ('s, 'd, 'm, 't, 'll, 've, 're), case-insensitive
#   - a run of letters, optionally preceded by one character that is not a
#     letter, digit, CR or LF
#   - one to three digits
#   - a run of non-space, non-letter, non-digit characters with an optional
#     leading space and any trailing CR/LF
#   - whitespace ending in a line break, whitespace not followed by a
#     non-space character, then any remaining whitespace
# The \p{...} classes and possessive quantifiers (?+, ++) need the `regex`
# module, which this notebook imports under the name `re`.
pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

compiled_pattern = re.compile(pattern)
# Call the compiled pattern's own method instead of routing it back through
# the module-level function.
text_chunks = compiled_pattern.findall(entry_text)

In [4]:
def define_initial_vocab(input_text):
    """Return the distinct symbols of ``input_text`` as a sorted list.

    Iterating a string yields single characters, so for a string input the
    result is its character-level vocabulary.
    """
    return sorted(set(input_text))



In [5]:
# The corpus as a list of single-character tokens.
text = [*entry_text]

In [6]:
def count_pair_occurences(input_text):
    """Count how often each pair of adjacent tokens occurs.

    Args:
        input_text: Sequence of tokens (e.g. a list of strings, or a string).

    Returns:
        A plain ``dict`` mapping ``(left, right)`` tuples to the number of
        times that pair appears at consecutive positions, keyed in order of
        first appearance. Empty when the input has fewer than two tokens.
    """
    from collections import Counter
    from itertools import islice

    # Pair every token with its successor; islice avoids copying the sequence.
    adjacent_pairs = zip(input_text, islice(input_text, 1, None))
    # Convert back to a plain dict so a missing pair still raises KeyError
    # for callers, exactly like the hand-built dictionary did.
    return dict(Counter(adjacent_pairs))



In [7]:
def find_max_occurence(occurence_count):
    """Return the most frequent pair together with its count.

    Args:
        occurence_count: Mapping from pair to its number of occurrences.

    Returns:
        ``(pair, count)`` for the highest count; ties go to the pair that
        comes first in the mapping's iteration order. ``(None, 0)`` when the
        mapping is empty.
    """
    if not occurence_count:
        # Nothing to choose from: return a sentinel rather than failing on an
        # unassigned result variable.
        return None, 0
    # max() keeps the first maximal key it meets, matching a strict ">" scan.
    max_key = max(occurence_count, key=occurence_count.get)
    return max_key, occurence_count[max_key]

In [8]:
def merge(text, pair):
    """Replace every adjacent occurrence of ``pair`` with one merged token.

    The sequence is scanned left to right; whenever ``pair[0]`` is directly
    followed by ``pair[1]`` the two are replaced by their concatenation and
    the scan continues after them, so overlapping matches are not merged
    twice (``a a a`` with pair ``(a, a)`` becomes ``aa a``).

    Args:
        text: Sequence of string tokens.
        pair: Two-element sequence ``(left, right)`` of string tokens.

    Returns:
        A new list of tokens; ``text`` itself is not modified.
    """
    merged_token = "".join(pair)  # loop-invariant, so build it once
    last_index = len(text) - 1
    newtext = []
    i = 0
    while i <= last_index:
        # A match needs a right-hand neighbour, so the last position never matches.
        if i < last_index and text[i] == pair[0] and text[i + 1] == pair[1]:
            newtext.append(merged_token)
            i += 2
        else:
            newtext.append(text[i])
            i += 1
    return newtext




In [16]:
n_iterations = 20  # upper bound on the number of merges to learn
text_ = text  # merge() builds a new list each time, so `text` is left untouched
vocab = define_initial_vocab(entry_text)
init_vocab_size = len(vocab)
for _ in range(n_iterations):
    occurence_count = count_pair_occurences(text_)
    if not occurence_count:
        # Fewer than two tokens remain, so there is no pair left to merge.
        break
    key, val = find_max_occurence(occurence_count)
    if val == 1:
        # Even the most frequent pair occurs only once: stop merging.
        break
    vocab.append("".join(key))

    text_ = merge(text_, key)


    

In [17]:
vocab

['\t',
 '\n',
 ' ',
 '!',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'À',
 'Æ',
 'Ç',
 'É',
 'à',
 'â',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'î',
 'œ',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '•',
 '…',
 '™',
 '\ufeff',
 'e ',
 'th',
 't ',
 's ',
 '.\n',
 ', ',
 'd ',
 'er',
 'ou',
 'in',
 'an',
 'y ',
 'or',
 'o ',
 'en',
 'ar',
 'on',
 'll',
 ' th',
 'ha']

In [12]:
occurence_count

{('\ufeff', 'T'): 2,
 ('T', 'h'): 15730,
 ('h', 'e'): 85840,
 ('e', ' '): 130875,
 (' ', 'P'): 3687,
 ('P', 'r'): 1416,
 ('r', 'o'): 16686,
 ('o', 'j'): 162,
 ('j', 'e'): 1085,
 ('e', 'c'): 5497,
 ('c', 't'): 3704,
 ('t', ' '): 77593,
 (' ', 'G'): 3066,
 ('G', 'u'): 199,
 ('u', 't'): 14001,
 ('t', 'e'): 24136,
 ('e', 'n'): 35541,
 ('n', 'b'): 244,
 ('b', 'e'): 18078,
 ('e', 'r'): 58595,
 ('r', 'g'): 1843,
 ('g', ' '): 12177,
 (' ', 'e'): 11423,
 ('e', 'B'): 16,
 ('B', 'o'): 480,
 ('o', 'o'): 12789,
 ('o', 'k'): 2791,
 ('k', ' '): 6882,
 (' ', 'o'): 36755,
 ('o', 'f'): 19710,
 ('f', ' '): 23819,
 (' ', 'T'): 5572,
 (' ', 'C'): 4493,
 ('C', 'o'): 2373,
 ('o', 'm'): 16764,
 ('m', 'p'): 3521,
 ('p', 'l'): 4424,
 ('l', 'e'): 21957,
 ('e', 't'): 15220,
 (' ', 'W'): 3346,
 ('W', 'o'): 562,
 ('o', 'r'): 38574,
 ('r', 'k'): 1848,
 ('k', 's'): 1884,
 ('s', ' '): 74697,
 ('W', 'i'): 2254,
 ('i', 'l'): 15446,
 ('l', 'l'): 31309,
 ('l', 'i'): 13400,
 ('i', 'a'): 3251,
 ('a', 'm'): 8923,
 ('m', ' ')

In [None]:
def merge_vocabulary(pair_occurence_count, vocab_size):
    """Draft merge loop that mutates the outer-scope ``vocab`` and ``pairs``.

    NOTE(review): this looks unfinished — confirm before relying on it:
      * ``pair_occurence_count`` is never read.
      * ``num_iterations``, ``pairs`` and ``vocab`` are neither parameters nor
        locals, so they must already exist in an enclosing scope. ``vocab`` is
        indexed and popped by token and its values are added together;
        ``pairs`` is used with ``remove`` and index assignment.
      * Nothing is returned; the only effects are the changes made to
        ``vocab`` and ``pairs``.

    Args:
        pair_occurence_count: Unused.
        vocab_size: The loop stops once ``len(vocab)`` is at least this value.
    """
    # Perform BPE iterations
    for _ in range(num_iterations):
        # Find the most frequent pair
        # NOTE(review): the ranking key is the sum of the two tokens' values in
        # `vocab`, not a count of the pair itself — confirm this is intended.
        most_frequent_pair = max(pairs, key=lambda pair: vocab[pair[0]] + vocab[pair[1]])

        # Merge the pair
        new_token = ''.join(most_frequent_pair)
        # Both halves are popped from `vocab`; if the two halves are the same
        # token the second pop raises KeyError.
        vocab[new_token] = vocab.pop(most_frequent_pair[0]) + vocab.pop(most_frequent_pair[1])
        pairs.remove(most_frequent_pair)
        for i in range(len(pairs)):
            # Only the first element of each remaining pair is rewritten; the
            # second replace runs on the result of the first.
            pairs[i] = (pairs[i][0].replace(most_frequent_pair[0], new_token), pairs[i][1])
            pairs[i] = (pairs[i][0].replace(most_frequent_pair[1], new_token), pairs[i][1])

        # Stop if the vocabulary size reaches the desired limit
        # NOTE(review): each pass removes two keys and adds one, so len(vocab)
        # does not grow here — verify the stopping condition.
        if len(vocab) >= vocab_size:
            break

In [None]:
# Map every distinct token to the same starting value of 1.
vocab = dict.fromkeys(tokens, 1)

In [None]:
vocab