# 1. Vocabulary initialization

In [33]:

# Seed the vocabulary with one single-byte token per possible byte value (ids 0..255).
vocab: dict[int, bytes] = {
    byte_value: byte_value.to_bytes(1, "big") for byte_value in range(256)
}

In [34]:
"你好".encode("utf-8"), b'\xe4\xbd\xa0\xe5\xa5\xbd'.decode()

(b'\xe4\xbd\xa0\xe5\xa5\xbd', '你好')

In [35]:
# Spot-check a few entries: each id maps to the single byte with that value.
vocab[0], vocab[1], vocab[66]

(b'\x00', b'\x01', b'B')

# 2. Pre-tokenization

In [39]:
import regex as re
# Pre-tokenization pattern (this looks like the GPT-2 pattern — TODO confirm). It relies on the
# third-party `regex` module, imported as `re`, for the \p{L} / \p{N} Unicode classes.
# Alternatives, tried in order:
#   '(?:[sdmt]|ll|ve|re)   an apostrophe followed by s, d, m, t, ll, ve or re
#    ?\p{L}+               a run of letters, with an optional leading space
#    ?\p{N}+               a run of numeric characters, with an optional leading space
#    ?[^\s\p{L}\p{N}]+     a run of other non-whitespace characters, with an optional leading space
#   \s+(?!\S)              whitespace that is not followed by a non-whitespace character
#   \s+                    any remaining whitespace
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# Demo: split a sample sentence into pre-tokens.
re.findall(PAT, "some text that i'll pre-tokenize")

['some', ' text', ' that', ' i', "'ll", ' pre', '-', 'tokenize']

In [42]:
# Walk the pre-tokens one match at a time, printing each piece with its (start, end) span.
for match in re.finditer(PAT, "some text that i'll pre-tokenize"):
    print(match.group(), match.span())

some (0, 4)
 text (4, 9)
 that (9, 14)
 i (14, 16)
'll (16, 19)
 pre (19, 23)
- (23, 24)
tokenize (24, 32)


In [51]:
# NOTE(review): absolute, machine-specific path; the loading cell only runs where this file exists.
data_path = "/home/lin/AI-Learning/CS336/data/TinyStoriesV2-GPT4-valid.txt"
def get_data(path: str) -> str:
    """Read a whole UTF-8 text file into memory.

    Args:
        path: Location of the text file to read.

    Returns:
        The complete file contents as a single string (not a list of lines).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()
# Load the whole corpus into memory as one string.
data = get_data(data_path)
# Preview only the first 500 characters rather than displaying the full text.
data[:500]

'u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n<|endoftext|>\nOnce upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high a'

In [114]:
from collections import defaultdict

# Count how often each pre-token occurs, keyed by its UTF-8 encoded bytes.
test_pre_tokenized = defaultdict(int)
for pre_token in re.findall(PAT, "some text that i'll pre-tokenize some some"):
    pre_token_bytes = pre_token.encode()
    test_pre_tokenized[pre_token_bytes] += 1

In [115]:
# Display the pre-token counts built so far.
test_pre_tokenized

defaultdict(int,
            {b'some': 1,
             b' text': 1,
             b' that': 1,
             b' i': 1,
             b"'ll": 1,
             b' pre': 1,
             b'-': 1,
             b'tokenize': 1,
             b' some': 2})

In [119]:
# Count pre-tokens again, this time keyed by a tuple of the pre-token's individual characters.
# TODO: encoding to bytes is deliberately skipped for this test.
test_pre_tokenized = defaultdict(int)
for match in re.finditer(PAT, "some text that i'll pre-tokenize some some"):
    symbols = tuple(match.group())
    test_pre_tokenized[symbols] += 1

test_pre_tokenized

defaultdict(int,
            {('s', 'o', 'm', 'e'): 1,
             (' ', 't', 'e', 'x', 't'): 1,
             (' ', 't', 'h', 'a', 't'): 1,
             (' ', 'i'): 1,
             ("'", 'l', 'l'): 1,
             (' ', 'p', 'r', 'e'): 1,
             ('-',): 1,
             ('t', 'o', 'k', 'e', 'n', 'i', 'z', 'e'): 1,
             (' ', 's', 'o', 'm', 'e'): 2})

In [None]:
from collections import Counter
from tqdm import tqdm
# Running count of every adjacent symbol pair across the pre-tokenized words.
test_pairs_counter = Counter()
# Highest pair count seen so far.
max_value = 0
max_pair = None   # the most frequent pair (on a tie, the last one seen wins)

def update_pairs_counter(counter, word, num):
    """Add the adjacent-symbol pairs of one word to a running pair count.

    Args:
        counter: Mapping from (left, right) symbol pairs to counts, updated in
            place. It must accept `+=` on a pair it has not seen yet, as
            `collections.Counter` and `defaultdict(int)` do.
        word: Sliceable sequence of symbols, e.g. a tuple of characters.
        num: Number of times `word` occurs; added once per pair occurrence.

    Returns:
        None. Tracking the most frequent pair is left to the caller.
    """
    # Use the function's own arguments; the previous body read the undefined
    # names `pair` and `v` instead of `word` and `num`.
    for pair in zip(word, word[1:]):
        counter[pair] += num


# Tally every adjacent symbol pair, weighted by how often its word occurs, while
# tracking the running maximum. The `>=` comparison lets a later pair win ties.
word_counts = tqdm(test_pre_tokenized.items(), desc="Counting pairs...", unit='words', ncols=80)
for symbols, freq in word_counts:
    for left, right in zip(symbols, symbols[1:]):
        bigram = (left, right)
        test_pairs_counter[bigram] += freq
        current = test_pairs_counter[bigram]
        if current >= max_value:
            max_value, max_pair = current, bigram

Counting pairs...: 100%|███████████████████| 9/9 [00:00<00:00, 114044.52words/s]


In [134]:
# Display the accumulated adjacent-pair counts.
test_pairs_counter

Counter({('s', 'o'): 3,
         ('o', 'm'): 3,
         ('m', 'e'): 3,
         (' ', 't'): 2,
         (' ', 's'): 2,
         ('t', 'e'): 1,
         ('e', 'x'): 1,
         ('x', 't'): 1,
         ('t', 'h'): 1,
         ('h', 'a'): 1,
         ('a', 't'): 1,
         (' ', 'i'): 1,
         ("'", 'l'): 1,
         ('l', 'l'): 1,
         (' ', 'p'): 1,
         ('p', 'r'): 1,
         ('r', 'e'): 1,
         ('t', 'o'): 1,
         ('o', 'k'): 1,
         ('k', 'e'): 1,
         ('e', 'n'): 1,
         ('n', 'i'): 1,
         ('i', 'z'): 1,
         ('z', 'e'): 1})

In [135]:
# Highest pair count and the pair holding it (on a tie, the pair updated last wins).
max_value, max_pair

(3, ('m', 'e'))

In [None]:
# Scratch inputs, presumably for trying out the pair-matching helper defined in this cell.
target = ('s', 'o', 'm', 'e')
# NOTE(review): this overwrites the `max_pair` computed by the pair-counting cell;
# 's' and 'e' both occur in `target` but are not adjacent there.
max_pair = ('s', 'e')
# NOTE(review): not referenced by any other visible cell.
start_index = -1

def get_match_pair_index(target: tuple[str, ...], max_pair: tuple[str, str]) -> int:
    """Find where a symbol pair first occurs as neighbours inside a word.

    Args:
        target: The word, as a tuple of symbols.
        max_pair: The (left, right) pair to look for.

    Returns:
        The index of the left symbol of the first adjacent occurrence of
        `max_pair` in `target`, or -1 if the pair never occurs adjacently
        (including when `target` has fewer than two symbols).
    """
    # The previous guard tested membership against the undefined name `pair`;
    # the scan below already returns -1 when either symbol is absent.
    for index, adjacent in enumerate(zip(target, target[1:])):
        if adjacent == max_pair:
            return index
    return -1

def _merge_pair_in_word(word, pair):
    """Return `word` with each non-overlapping occurrence of `pair` fused into one symbol."""
    merged = []
    i = 0
    last = len(word) - 1
    while i <= last:
        if i < last and (word[i], word[i + 1]) == pair:
            merged.append(word[i] + word[i + 1])
            i += 2  # skip both halves of the pair that was just fused
        else:
            merged.append(word[i])
            i += 1
    return tuple(merged)


def update_tokenization(tokenized_dict, max_pair):
    """Apply one merge to every word of a pre-tokenized frequency table, in place.

    Each key of `tokenized_dict` is a word given as a tuple of symbols and each
    value is that word's count. Every left-to-right, non-overlapping occurrence
    of `max_pair` in a word is replaced by the concatenation of its two symbols.
    Words that become identical after merging have their counts added together.

    Args:
        tokenized_dict: Mapping from symbol tuples to counts; rewritten in place.
            A plain dict works as well as a defaultdict.
        max_pair: The (left, right) pair of symbols to merge.

    Returns:
        None.
    """
    # Build the new table separately: inserting and deleting keys while
    # iterating over the same dict is unsafe.
    merged_counts = {}
    for word, count in tokenized_dict.items():
        new_word = _merge_pair_in_word(word, max_pair)
        merged_counts[new_word] = merged_counts.get(new_word, 0) + count
    # Refill the caller's object so its identity and type are kept.
    tokenized_dict.clear()
    tokenized_dict.update(merged_counts)



In [5]:
from collections import Counter
# Count the integers 0..9 plus one extra 4, so that 4 is the only value counted twice.
c = Counter([*range(10), 4])
c

Counter({4: 2, 0: 1, 1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1})

In [6]:
# Remove the entry for key 0 from the Counter.
del c[0]

In [22]:
# Display the Counter after the deletion.
# NOTE(review): the saved output lists 4 with a count of 1, whereas the cell that builds `c`
# gives it 2; together with the out-of-sequence execution counts this suggests the cells were
# run out of order — re-run top to bottom to confirm.
c

Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1})

In [4]:
# Joining a tuple of bytes objects on an empty separator concatenates them into one bytes value.
test = (b'a', b'b')
b"".join(test)

b'ab'