In [1]:
import json

def gec_generator_text(path='train.json'):
    """Yield ``(text, target)`` pairs read from a JSON-lines file.

    Each non-blank line must be a JSON object with string fields
    ``"input"`` and ``"target"``. Everything up to and including the first
    occurrence of ``"grammar:"`` in ``"input"`` is dropped and the remainder
    is stripped of surrounding whitespace.

    Args:
        path: File to read, one JSON object per line. Defaults to
            ``'train.json'`` in the current working directory.

    Yields:
        tuple[str, str]: the text following ``"grammar:"`` and the
        untouched ``"target"`` value.

    Raises:
        ValueError: if a line is not valid JSON, or if an ``"input"`` value
            does not contain ``"grammar:"``.
        KeyError: if a record lacks ``"input"`` or ``"target"``.
    """
    marker = "grammar:"
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would fail on them.
            if not line.strip():
                continue

            parsed_data = json.loads(line)
            input_text = parsed_data["input"]
            target_text = parsed_data["target"]

            # str.index raises ValueError when the marker is missing, so a
            # malformed record is reported rather than silently mangled.
            index_of_grammar = input_text.index(marker) + len(marker)
            yield input_text[index_of_grammar:].strip(), target_text

In [3]:
# !pip install tokenizers

In [4]:
# Small in-memory corpus of four English sentences used by the cells below
# to count word frequencies and learn merges.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [6]:
from tokenizers import pre_tokenizers

# Pre-tokenizer shared by the later cells. As the sample output below shows,
# it splits on whitespace only, so punctuation stays attached to the
# neighbouring word (e.g. 'casa.').
pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [8]:
# Sanity check on a sample sentence: the result is a list of
# (token, (start, end)) pairs with character offsets into the input string.
pre_tokenizer.pre_tokenize_str("Am fost la mare mare departare de casa. Imi place sa-mi cumpar jucarii. Hhah !")

[('Am', (0, 2)),
 ('fost', (3, 7)),
 ('la', (8, 10)),
 ('mare', (11, 15)),
 ('mare', (16, 20)),
 ('departare', (21, 30)),
 ('de', (31, 33)),
 ('casa.', (34, 39)),
 ('Imi', (40, 43)),
 ('place', (44, 49)),
 ('sa-mi', (50, 55)),
 ('cumpar', (56, 62)),
 ('jucarii.', (63, 71)),
 ('Hhah', (72, 76)),
 ('!', (77, 78))]

In [10]:
from collections import defaultdict

# Number of occurrences of every pre-tokenized word across the whole corpus.
word_freqs = defaultdict(int)

for text in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
    for word, _offsets in pre_tokenizer.pre_tokenize_str(text):
        word_freqs[word] += 1

print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course.': 1, 'chapter': 1, 'about': 1, 'tokenization.': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms.': 1, 'Hopefully,': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens.': 1})


In [11]:
# Base alphabet: every distinct character that appears in any corpus word,
# in sorted order. A set removes duplicates in one pass instead of scanning a
# growing list for each character.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)

[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']


In [20]:
# Initial vocabulary: the special end-of-text token followed by the alphabet.
vocab = ["<|endoftext|>", *alphabet]
# Every word starts out split into its individual characters.
splits = {word: list(word) for word in word_freqs}
splits

{'This': ['T', 'h', 'i', 's'],
 'is': ['i', 's'],
 'the': ['t', 'h', 'e'],
 'Hugging': ['H', 'u', 'g', 'g', 'i', 'n', 'g'],
 'Face': ['F', 'a', 'c', 'e'],
 'Course.': ['C', 'o', 'u', 'r', 's', 'e', '.'],
 'chapter': ['c', 'h', 'a', 'p', 't', 'e', 'r'],
 'about': ['a', 'b', 'o', 'u', 't'],
 'tokenization.': ['t',
  'o',
  'k',
  'e',
  'n',
  'i',
  'z',
  'a',
  't',
  'i',
  'o',
  'n',
  '.'],
 'section': ['s', 'e', 'c', 't', 'i', 'o', 'n'],
 'shows': ['s', 'h', 'o', 'w', 's'],
 'several': ['s', 'e', 'v', 'e', 'r', 'a', 'l'],
 'tokenizer': ['t', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 'algorithms.': ['a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's', '.'],
 'Hopefully,': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y', ','],
 'you': ['y', 'o', 'u'],
 'will': ['w', 'i', 'l', 'l'],
 'be': ['b', 'e'],
 'able': ['a', 'b', 'l', 'e'],
 'to': ['t', 'o'],
 'understand': ['u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],
 'how': ['h', 'o', 'w'],
 'they': ['t', 'h', 'e', 'y'],
 'are': ['a', 'r'

In [21]:
def compute_pair_freqs(splits):
    """Count adjacent token pairs, weighted by word frequency.

    Iterates over the module-level ``word_freqs`` mapping; for each word,
    every pair of neighbouring tokens in ``splits[word]`` is credited with
    that word's frequency.

    Args:
        splits: mapping from word to its current list of tokens.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` token pairs to their
        accumulated frequency, in first-seen order.
    """
    counts = defaultdict(int)
    for token, occurrences in word_freqs.items():
        pieces = splits[token]
        # Zipping a list with its own tail yields each adjacent pair, and
        # nothing at all when the word has fewer than two tokens.
        for adjacent in zip(pieces, pieces[1:]):
            counts[adjacent] += occurrences
    return counts

In [22]:
pair_freqs = compute_pair_freqs(splits)

# Preview only the first six pairs, in the order they were first counted.
for key in list(pair_freqs)[:6]:
    print(f"{key}: {pair_freqs[key]}")

('T', 'h'): 3
('h', 'i'): 3
('i', 's'): 5
('t', 'h'): 3
('h', 'e'): 2
('H', 'u'): 1


In [23]:
# Most frequent adjacent pair; on ties the pair counted first wins. With no
# pairs at all the result falls back to "" / None.
best_pair = max(pair_freqs, key=pair_freqs.get, default="")
max_freq = pair_freqs.get(best_pair)

print(best_pair, max_freq)

('i', 's') 5


In [24]:
# Record the first merge rule, ('i', 's') -> 'is' (the most frequent pair
# printed above), and add the merged token to the vocabulary.
# NOTE: re-running this cell resets `merges` but appends "is" to `vocab` again.
merges = {("i", "s"): "is"}
vocab.append("is")

In [25]:
def merge_pair(a, b, splits):
    """Fuse every adjacent ``a``, ``b`` token pair into the single token ``a + b``.

    Only words that are keys of the module-level ``word_freqs`` are visited,
    and words whose split holds exactly one token are skipped.

    Args:
        a: left token of the pair to merge.
        b: right token of the pair to merge.
        splits: mapping from word to its current list of tokens; entries are
            replaced in this mapping.

    Returns:
        The same ``splits`` mapping that was passed in.
    """
    fused = a + b
    for token in word_freqs:
        pieces = splits[token]
        if len(pieces) != 1:
            cursor = 0
            while cursor + 1 < len(pieces):
                if (pieces[cursor], pieces[cursor + 1]) == (a, b):
                    # Rebuild the list around the fused token; the cursor is
                    # deliberately not advanced after a merge.
                    pieces = [*pieces[:cursor], fused, *pieces[cursor + 2:]]
                else:
                    cursor += 1
            splits[token] = pieces
    return splits

In [26]:
# Apply the first merge to every word's split, then check that the word "is"
# has collapsed into a single token.
splits = merge_pair("i", "s", splits)
print(splits["is"])

['is']


In [27]:
# Target number of entries in `vocab` (special token + alphabet + merges).
vocab_size = 50

# Repeatedly merge the most frequent adjacent pair until the vocabulary
# reaches the target size.
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is already a single token, so nothing is left to merge.
        # Without this guard, merge_pair would be called with too few
        # arguments and raise TypeError.
        break

    # max() returns the first maximal key, i.e. ties go to the pair that was
    # counted first.
    best_pair = max(pair_freqs, key=pair_freqs.get)
    max_freq = pair_freqs[best_pair]

    merged_token = best_pair[0] + best_pair[1]
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = merged_token
    vocab.append(merged_token)

In [28]:
# Show the learned merge rules; dict order is the order they were learned.
print(merges)

{('i', 's'): 'is', ('e', 'r'): 'er', ('t', 'o'): 'to', ('e', 'n'): 'en', ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('t', 'h'): 'th', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('to', 'k'): 'tok', ('tok', 'en'): 'token', ('n', 'd'): 'nd', ('th', 'e'): 'the', ('i', 'n'): 'in', ('a', 'b'): 'ab', ('token', 'i'): 'tokeni', ('tokeni', 'z'): 'tokeniz', ('a', 't'): 'at', ('i', 'o'): 'io', ('io', 'n'): 'ion'}


In [29]:
# Show the final vocabulary: special token, base alphabet, then merged tokens.
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'is', 'er', 'to', 'en', 'Th', 'This', 'th', 'ou', 'se', 'tok', 'token', 'nd', 'the', 'in', 'ab', 'tokeni', 'tokeniz', 'at', 'io', 'ion']


In [30]:
def tokenize(text):
    """Tokenize ``text`` by applying the learned merge rules.

    The text is pre-tokenized into words with the module-level
    ``pre_tokenizer``, each word is split into single characters, and every
    rule in the module-level ``merges`` mapping is applied to every word in
    the mapping's insertion order.

    Args:
        text: the string to tokenize.

    Returns:
        A flat list of token strings covering all words, in order.
    """
    pre_tokenize_result = pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, _offset in pre_tokenize_result]
    splits = [list(word) for word in pre_tokenized_text]

    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    # Fuse the pair and re-check the same position.
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    # Flatten in one linear pass; sum(splits, []) would copy the accumulated
    # list on every addition.
    return [token for split in splits for token in split]

In [31]:
# Try the tokenizer on a new sentence; as the output shows, words with no
# applicable merge rule stay split into single characters.
tokenize("This is not a token.")

['This', 'is', 'n', 'o', 't', 'a', 'token', '.']