### Step 1: Creating Tokens

In [1]:
# Read the whole text file into a single string. The encoding is passed
# explicitly instead of relying on the platform default.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Report the length in characters and preview the first 199 characters.
print('Total number of characters: ',  len(raw_text))
print(raw_text[:199])

Total number of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married 


#### 1.1 Basic Tokenizer Examples

In [2]:
import re

# Split on single whitespace characters. Because the pattern is a capturing
# group, every separator is kept as its own element of the output list.
text = "Hello, world. This, is a test."
result = re.compile(r'(\s)').split(text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [None]:
# Treat commas and periods as separate tokens too.
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [None]:
# Removing whitespaces from tokens (Optional)
# Removing whitespace saves memory and compute. However, keeping whitespace can be useful if we train
# models that are sensitive to the exact structure of the text (for example, Python code, which is sensitive to indentation and spacing)
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [9]:
# Split on punctuation, double dashes and whitespace (separators are kept by
# the capturing group), then keep only the non-blank pieces, stripped.
text = "Hello world. Is this-- a test?"
result = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
    if piece.strip()
]
print(result)

['Hello', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


#### 1.2 Applying Basic Tokenizer to Raw Text

In [21]:
# Tokenize the full text: split on punctuation, "--" and whitespace (the
# capturing group keeps the separators as list items), then strip every piece
# and drop the ones that are empty or whitespace-only.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
# Preview the first 30 tokens.
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [22]:
print("Total number of tokens: ", len(preprocessed))

Total number of tokens:  4690


### Step 2: Creating Token IDs

In [23]:
# Build the vocabulary: de-duplicate the tokens with a set, then sort them so
# the token order (and therefore the ids derived from it) is deterministic.
list_of_tokens = sorted(set(preprocessed))
vocab_size = len(list_of_tokens)
print("Vocab size: ", vocab_size)

Vocab size:  1130


In [25]:
vocab = {token: idx for idx, token in enumerate(list_of_tokens)}

In [26]:
# Preview the first 51 (token, id) pairs: the loop prints the entry for
# enumerate indices 0 through 50 and then stops.
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [79]:
class Tokenizer:
    """Word-level tokenizer with a single unknown-token fallback.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; punctuation becomes its own token and whitespace
    is discarded. ``train`` builds the vocabulary from a text, after which
    ``encode`` maps text to ids and ``decode`` maps ids back to text.

    Attributes:
        token_to_id: dict mapping token string -> integer id (None until trained).
        id_to_token: dict mapping integer id -> token string (None until trained).
        unknown_token: marker whose id replaces out-of-vocabulary tokens.
        trained: True once ``train`` has been called.
    """

    # The capturing group makes re.split keep the separators, so punctuation
    # survives as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of these characters is removed when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self):
        self.token_to_id = None
        self.id_to_token = None
        self.unknown_token = '<[unk]>'
        self.trained = False

    def apply_regex_splitting(self, text):
        """Split ``text`` into a list of stripped, non-empty tokens."""
        pieces = self._SPLIT_PATTERN.split(text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def train(self, text):
        """Build the vocabulary from ``text``.

        Ids are assigned in sorted token order; the unknown token gets the
        next free id.
        """
        unique_tokens = sorted(set(self.apply_regex_splitting(text)))

        self.token_to_id = {token: idx for idx, token in enumerate(unique_tokens)}
        # setdefault keeps the ids contiguous if the training text happens to
        # contain the unknown-token marker itself.
        self.token_to_id.setdefault(self.unknown_token, len(unique_tokens))

        self.id_to_token = {idx: token for token, idx in self.token_to_id.items()}
        self.trained = True

    def _require_trained(self):
        """Raise RuntimeError unless ``train`` has been called."""
        # Must test the ``trained`` flag: ``self.train`` is the bound method
        # and is always truthy, so checking it would never raise.
        if not self.trained:
            raise RuntimeError("You have to train tokenizer first...")

    def encode(self, input_text):
        """Convert ``input_text`` to a list of token ids.

        Tokens that are not in the vocabulary map to the unknown token's id.

        Raises:
            RuntimeError: if the tokenizer has not been trained.
        """
        self._require_trained()

        unknown_id = self.token_to_id.get(self.unknown_token)
        return [
            self.token_to_id.get(token, unknown_id)
            for token in self.apply_regex_splitting(input_text)
        ]

    def decode(self, ids):
        """Convert a sequence of token ids back to text.

        Tokens are joined with single spaces, then the whitespace in front of
        ``, . ? ! " ( ) '`` is removed.

        Raises:
            RuntimeError: if the tokenizer has not been trained.
            KeyError: if an id is not in the vocabulary.
        """
        self._require_trained()

        text = ' '.join(self.id_to_token[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [80]:
tokenizer = Tokenizer()
tokenizer.train(raw_text)

In [None]:
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 1130, 5, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [82]:
tokenizer.decode(ids)

'" <[unk]>, It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### Step 3: (Optional) Adding Special Context Tokens

**Some common special tokens**:

- [BOS] Beginning of sequence
- [EOS] End of sequence
- [PAD] Padding

In [109]:
class TokenizerV2:
    """Word-level tokenizer with caller-supplied special tokens.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. After the regular vocabulary is built, each
    special token (for example ``'<|unk|>'`` or ``'<|endoftext|>'``) is
    appended with the next free id.

    Attributes:
        token_to_id: dict mapping token string -> integer id (None until trained).
        id_to_token: dict mapping integer id -> token string (None until trained).
        special_tokens: list of special tokens appended to the vocabulary.
        unknown_token: the special token used for out-of-vocabulary words; it
            only takes effect if it is listed in ``special_tokens``.
        trained: True once ``train`` has been called.
    """

    # The capturing group makes re.split keep the separators, so punctuation
    # survives as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of these characters is removed when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, special_tokens: list = None):
        self.token_to_id = None
        self.id_to_token = None
        # A mutable default ([]) would be shared by every instance; copy the
        # argument so each tokenizer owns its own list.
        self.special_tokens = list(special_tokens) if special_tokens is not None else []
        self.unknown_token = '<|unk|>'
        self.trained = False

    def apply_regex_splitting(self, text):
        """Split ``text`` into a list of stripped, non-empty tokens."""
        pieces = self._SPLIT_PATTERN.split(text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def train(self, text):
        """Build the vocabulary from ``text`` and append the special tokens.

        Regular tokens get ids in sorted order; special tokens follow in the
        order they were supplied.
        """
        unique_tokens = sorted(set(self.apply_regex_splitting(text)))

        self.token_to_id = {token: idx for idx, token in enumerate(unique_tokens)}

        for token in self.special_tokens:
            # Skip tokens that already have an id so an existing id is never
            # overwritten and the id range stays contiguous.
            if token not in self.token_to_id:
                self.token_to_id[token] = len(self.token_to_id)

        self.id_to_token = {idx: token for token, idx in self.token_to_id.items()}
        self.trained = True

    def _require_trained(self):
        """Raise RuntimeError unless ``train`` has been called."""
        # Must test the ``trained`` flag: ``self.train`` is the bound method
        # and is always truthy, so checking it would never raise.
        if not self.trained:
            raise RuntimeError("You have to train tokenizer first...")

    def encode(self, input_text):
        """Convert ``input_text`` to a list of token ids.

        Out-of-vocabulary tokens map to the id of ``unknown_token`` when that
        special token was registered.

        Raises:
            RuntimeError: if the tokenizer has not been trained.
            KeyError: if a token is out of vocabulary and no unknown token was
                registered (instead of silently emitting ``None`` as an id).
        """
        self._require_trained()

        unknown_id = self.token_to_id.get(self.unknown_token)
        ids = []
        for token in self.apply_regex_splitting(input_text):
            token_id = self.token_to_id.get(token, unknown_id)
            if token_id is None:
                raise KeyError(
                    f"Token {token!r} is not in the vocabulary and "
                    f"{self.unknown_token!r} is not among the special tokens"
                )
            ids.append(token_id)
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back to text.

        Tokens are joined with single spaces, then the whitespace in front of
        ``, . ? ! " ( ) '`` is removed.

        Raises:
            RuntimeError: if the tokenizer has not been trained.
            KeyError: if an id is not in the vocabulary.
        """
        self._require_trained()

        text = ' '.join(self.id_to_token[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [110]:
tokenizer = TokenizerV2(special_tokens=['<|unk|>', '<|endoftext|>'])
tokenizer.train(raw_text)

In [111]:
text = """"Hello! It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 1130, 0, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [112]:
tokenizer.decode(ids)

'" <|unk|>! It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [113]:
# Two unrelated snippets glued together with the end-of-text marker between them.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 + " <|endoftext|> " + text2
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [114]:
tokenizer.encode(text)

[1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]

In [115]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

### Byte Pair Encoding

#### BPE Tokenizer

In [2]:
import regex as re

In [3]:
# Pre-tokenization patterns used to cut text into chunks before byte-pair
# merging. They rely on \p{L} / \p{N} Unicode property classes, which the
# standard-library `re` module does not support; they are meant for the
# third-party `regex` package that this notebook imports under the name `re`.
GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

In [38]:
class BPETokenizer():
    """Byte-level byte-pair-encoding tokenizer with regex pre-splitting.

    Text is first cut into chunks with ``pattern``; each chunk is turned into
    its UTF-8 bytes, and merges are learned and applied inside chunks only,
    never across a chunk boundary.

    The pattern should match every part of the text (anything it does not
    match is dropped by ``findall``) and should not contain capturing groups.

    Attributes:
        vocab: dict mapping token id -> bytes for that token.
        merges: dict mapping (id, id) pair -> id of the merged token, in the
            order the merges were learned.
        pattern: the split pattern string passed to the constructor.
        compiled_pattern: ``pattern`` compiled with ``re.compile``.
    """

    def __init__(self, pattern):
        self.vocab = {}
        self.merges = {}
        self.pattern = pattern
        self.compiled_pattern = re.compile(self.pattern)
        # Start with the 256 single-byte tokens so decode works before training.
        self.update_vocab(self.merges)

    def get_tokens(self, text):
        """Split ``text`` with the pattern; return one tuple of byte values per chunk."""
        text_chunks = self.compiled_pattern.findall(text)
        return [tuple(chunk.encode('utf-8')) for chunk in text_chunks]

    def get_stats(self, ids, counts=None):
        """Count adjacent pairs in ``ids``.

        If ``counts`` is given it is updated in place, which lets the caller
        accumulate statistics over many chunks. Returns the counts dict.
        """
        counts = {} if counts is None else counts
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def get_merge(self, ids, pair, idx):
        """Return a new list where each non-overlapping ``pair`` in ``ids`` is replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def update_vocab(self, merges):
        """Rebuild ``self.vocab`` from the 256 byte tokens plus ``merges``.

        Relies on ``merges`` being ordered as learned, so both halves of a
        pair already exist in the vocabulary when the pair is processed.
        """
        self.vocab = {idx:  bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]

    def train(self, text, vocab_size, verbose=False):
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Each round merges the most frequent adjacent pair (counted within
        chunks) into a new token id, starting at 256. Training stops early
        when no pair is left to merge, so the final vocabulary may be smaller
        than ``vocab_size``.
        """
        ids = self.get_tokens(text)
        num_merges = vocab_size - 256
        merges = {}
        for i in range(num_merges):
            stats = {}
            for chunk_ids in ids:
                self.get_stats(chunk_ids, stats)
            if not stats:
                # Every chunk is down to a single token (or the text is
                # empty): nothing left to merge, and max() on an empty dict
                # would raise ValueError.
                break
            pair = max(stats, key=stats.get)
            idx = 256 + i

            ids = [self.get_merge(chunk_ids, pair, idx) for chunk_ids in ids]
            merges[pair] = idx

            if verbose:
                print(f'merge {i+1}/{num_merges}: {pair} into a new token {idx}')

        self.merges = merges
        self.update_vocab(merges)

    def _encode_chunk(self, chunk_bytes):
        """Apply the learned merges, earliest-learned first, to one chunk of byte values."""
        tokens = list(chunk_bytes)
        # Two tokens are enough to form a pair, hence >= 2 (not > 2).
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Pick the present pair with the lowest merge id, i.e. the one
            # learned earliest; pairs that were never merged rank as infinity.
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break
            tokens = self.get_merge(tokens, pair, self.merges[pair])
        return tokens

    def encode(self, text):
        """Convert ``text`` to a list of token ids.

        The text is split with the same pattern used in training and each
        chunk is encoded independently, so merges never cross chunk
        boundaries.
        """
        tokens = []
        for chunk_bytes in self.get_tokens(text):
            tokens.extend(self._encode_chunk(chunk_bytes))
        return tokens

    def decode(self, ids):
        """Convert token ids back to text.

        Byte sequences that are not valid UTF-8 are rendered with the
        replacement character instead of raising.
        """
        tokens = b"".join(self.vocab[idx] for idx in ids)
        text = tokens.decode('utf-8', errors='replace')
        return text

#### Small Demo of Tokenizer Training Process

In [40]:
# Pre-split the raw text with the GPT-2 pattern and show each chunk as a tuple
# of UTF-8 byte values. The class defined in this notebook is `BPETokenizer`;
# the previous name `BaseBPETokenizer` is not defined anywhere in it.
tokenizer = BPETokenizer(GPT2_SPLIT_PATTERN)
ids = tokenizer.get_tokens(raw_text)
ids

[(73,),
 (32, 72, 65, 68),
 (32, 97, 108, 119, 97, 121, 115),
 (32, 116, 104, 111, 117, 103, 104, 116),
 (32, 74, 97, 99, 107),
 (32, 71, 105, 115, 98, 117, 114, 110),
 (32, 114, 97, 116, 104, 101, 114),
 (32, 97),
 (32, 99, 104, 101, 97, 112),
 (32, 103, 101, 110, 105, 117, 115),
 (45, 45),
 (116, 104, 111, 117, 103, 104),
 (32, 97),
 (32, 103, 111, 111, 100),
 (32, 102, 101, 108, 108, 111, 119),
 (32, 101, 110, 111, 117, 103, 104),
 (45, 45),
 (115, 111),
 (32, 105, 116),
 (32, 119, 97, 115),
 (32, 110, 111),
 (32, 103, 114, 101, 97, 116),
 (32, 115, 117, 114, 112, 114, 105, 115, 101),
 (32, 116, 111),
 (32, 109, 101),
 (32, 116, 111),
 (32, 104, 101, 97, 114),
 (32, 116, 104, 97, 116),
 (44,),
 (32, 105, 110),
 (32, 116, 104, 101),
 (32, 104, 101, 105, 103, 104, 116),
 (32, 111, 102),
 (32, 104, 105, 115),
 (32, 103, 108, 111, 114, 121),
 (44,),
 (32, 104, 101),
 (32, 104, 97, 100),
 (32, 100, 114, 111, 112, 112, 101, 100),
 (32, 104, 105, 115),
 (32, 112, 97, 105, 110, 116, 105, 11

In [27]:
stats = tokenizer.get_stats(ids)
stats

{((73,), (32, 72, 65, 68)): 1,
 ((32, 72, 65, 68), (32, 97, 108, 119, 97, 121, 115)): 1,
 ((32, 97, 108, 119, 97, 121, 115),
  (32, 116, 104, 111, 117, 103, 104, 116)): 1,
 ((32, 116, 104, 111, 117, 103, 104, 116), (32, 74, 97, 99, 107)): 1,
 ((32, 74, 97, 99, 107), (32, 71, 105, 115, 98, 117, 114, 110)): 2,
 ((32, 71, 105, 115, 98, 117, 114, 110), (32, 114, 97, 116, 104, 101, 114)): 1,
 ((32, 114, 97, 116, 104, 101, 114), (32, 97)): 1,
 ((32, 97), (32, 99, 104, 101, 97, 112)): 1,
 ((32, 99, 104, 101, 97, 112), (32, 103, 101, 110, 105, 117, 115)): 1,
 ((32, 103, 101, 110, 105, 117, 115), (45, 45)): 1,
 ((45, 45), (116, 104, 111, 117, 103, 104)): 1,
 ((116, 104, 111, 117, 103, 104), (32, 97)): 1,
 ((32, 97), (32, 103, 111, 111, 100)): 2,
 ((32, 103, 111, 111, 100), (32, 102, 101, 108, 108, 111, 119)): 1,
 ((32, 102, 101, 108, 108, 111, 119), (32, 101, 110, 111, 117, 103, 104)): 1,
 ((32, 101, 110, 111, 117, 103, 104), (45, 45)): 1,
 ((45, 45), (115, 111)): 2,
 ((115, 111), (32, 105, 116

In [29]:
max_pair = max(stats, key=stats.get)
max_pair, stats[max_pair]

(((10,), (10,)), 82)

In [32]:
newids = tokenizer.get_merge(ids, max_pair, 256 + 1)
stats = tokenizer.get_stats(newids)
max_pair = max(stats, key=stats.get)
max_pair, stats[max_pair]

(((46,), 257), 43)

#### Training BPE Tokenizer

In [39]:
tokenizer = BPETokenizer(GPT2_SPLIT_PATTERN)
tokenizer.train(raw_text, 276, verbose=True)

merge 1/20: (32, 116) into a new token 256
merge 2/20: (104, 101) into a new token 257
merge 3/20: (32, 97) into a new token 258
merge 4/20: (105, 110) into a new token 259
merge 5/20: (32, 104) into a new token 260
merge 6/20: (32, 115) into a new token 261
merge 7/20: (32, 119) into a new token 262
merge 8/20: (32, 111) into a new token 263
merge 9/20: (256, 257) into a new token 264
merge 10/20: (111, 117) into a new token 265
merge 11/20: (114, 101) into a new token 266
merge 12/20: (105, 116) into a new token 267
merge 13/20: (32, 109) into a new token 268
merge 14/20: (105, 115) into a new token 269
merge 15/20: (101, 100) into a new token 270
merge 16/20: (97, 116) into a new token 271
merge 17/20: (110, 100) into a new token 272
merge 18/20: (32, 98) into a new token 273
merge 19/20: (259, 103) into a new token 274
merge 20/20: (32, 112) into a new token 275


In [40]:
raw_text[:100]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

In [46]:
tokenizer.encode(raw_text)

[73,
 32,
 72,
 65,
 68,
 258,
 108,
 119,
 97,
 121,
 115,
 256,
 104,
 265,
 103,
 104,
 116,
 32,
 74,
 97,
 99,
 107,
 32,
 71,
 269,
 98,
 117,
 114,
 110,
 32,
 114,
 271,
 257,
 114,
 258,
 32,
 99,
 257,
 97,
 112,
 32,
 103,
 101,
 110,
 105,
 117,
 115,
 45,
 45,
 116,
 104,
 265,
 103,
 104,
 258,
 32,
 103,
 111,
 111,
 100,
 32,
 102,
 101,
 108,
 108,
 111,
 119,
 32,
 101,
 110,
 265,
 103,
 104,
 45,
 45,
 115,
 111,
 32,
 267,
 262,
 97,
 115,
 32,
 110,
 111,
 32,
 103,
 266,
 271,
 261,
 117,
 114,
 112,
 114,
 269,
 101,
 256,
 111,
 268,
 101,
 256,
 111,
 32,
 257,
 97,
 114,
 256,
 104,
 271,
 44,
 32,
 259,
 264,
 32,
 257,
 105,
 103,
 104,
 116,
 263,
 102,
 260,
 269,
 32,
 103,
 108,
 111,
 114,
 121,
 44,
 32,
 257,
 260,
 97,
 100,
 32,
 100,
 114,
 111,
 112,
 112,
 270,
 260,
 269,
 275,
 97,
 259,
 116,
 274,
 44,
 268,
 97,
 114,
 114,
 105,
 270,
 258,
 32,
 114,
 105,
 99,
 104,
 262,
 105,
 100,
 111,
 119,
 44,
 258,
 272,
 32,
 101,
 115,
 116,
 9

In [42]:
tokenizer.decode(tokenizer.encode(raw_text))[:100]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

#### Tiktoken

In [43]:
import importlib
import importlib.metadata
import tiktoken

print('tiktoken version: ', importlib.metadata.version('tiktoken'))

tiktoken version:  0.8.0


In [44]:
tiktoken_tokenizer = tiktoken.get_encoding('gpt2')

In [45]:
tiktoken_tokenizer.encode(raw_text)

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [47]:
tiktoken_tokenizer.decode(tiktoken_tokenizer.encode(raw_text)[:100])

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Th'

In [49]:
# NOTE(review): the two adjacent string literals are concatenated with no
# separator, so the text contains "terracesof" (no space) - visible in the
# decoded output of this section. Add a trailing space to the first literal
# if that is not intended.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace"
)

# allowed_special permits the literal '<|endoftext|>' in the input to be
# encoded as a special token.
tokens = tiktoken_tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print(tokens)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271]


In [50]:
tiktoken_tokenizer.decode(tokens)

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace'