In [1]:
import os
import random
import re
import urllib.request

In [6]:
# Location of the sample corpus (the-verdict.txt) and the local copy's path.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")

file_path = "the-verdict.txt"

# Download only when no local copy exists, so re-running the notebook does
# not need network access and does not re-fetch the file every time.
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

print(f"Number of characters: {len(raw_text)}")
print(raw_text[:100])

Number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [7]:
# Short sample sentence used to try out the regex splits in the next cells.
text = "Hello, world! This is a test."

In [8]:
# Split on single whitespace characters; the capturing group makes re.split
# keep every separator in the returned list.
result = re.split(r'(\s)', text)
result

['Hello,', ' ', 'world!', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']

In [9]:
# Also split on commas and periods. A delimiter directly followed by another
# delimiter (or by the end of the string) leaves an empty string in the list.
result = re.split(r'([,.]|\s)', text)
result

['Hello',
 ',',
 '',
 ' ',
 'world!',
 ' ',
 'This',
 ' ',
 'is',
 ' ',
 'a',
 ' ',
 'test',
 '.',
 '']

In [10]:
# Drop entries that are empty or whitespace-only. This reads and rebinds
# `result` from the previous cell.
result = [token for token in result if token.strip()]
result


['Hello', ',', 'world!', 'This', 'is', 'a', 'test', '.']

In [11]:
# Extended pattern: more punctuation marks plus the double dash "--".
# Each piece is stripped, and pieces that end up empty are discarded.
text = "Hello, world! Is this-- is a test?"
result = list(filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))))
result


['Hello', ',', 'world', '!', 'Is', 'this', '--', 'is', 'a', 'test', '?']

In [12]:
def tokenize(text):
    """Split ``text`` into word and punctuation tokens.

    The text is split on whitespace, on the punctuation characters
    ``, . : ; ? _ ! " ( ) '`` and on the double dash ``--``. Punctuation is
    kept as separate tokens; whitespace and empty pieces are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings in their original order.
    """
    tokens = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
        stripped = piece.strip()
        if stripped:
            tokens.append(stripped)
    return tokens

In [13]:
# Tokenize the full text, then print its size and 10 randomly sampled tokens.
# No seed is set in this cell, so the sample can differ from run to run.
preprocessed = tokenize(raw_text)
print(f"Number of tokens: {len(preprocessed)}")
print(random.sample(preprocessed, 10))

Number of tokens: 4690
['me', 'forward', 'But', 'inflexible', 'his', 'Yes', 'been', 'in', 'had', 'had']


In [14]:
# Vocabulary = the unique tokens in sorted order; the cell displays its size.
vocab = sorted(set(preprocessed))
len(vocab)

1130

In [15]:
# Token -> integer id lookup; each id is the token's position in the sorted vocab.
ctoi = { c:i for i, c in enumerate(vocab)}
list(ctoi.items())[:10]



[('!', 0),
 ('"', 1),
 ("'", 2),
 ('(', 3),
 (')', 4),
 (',', 5),
 ('--', 6),
 ('.', 7),
 (':', 8),
 (';', 9)]

In [16]:
# Inverse lookup (integer id -> token), built by flipping the ctoi pairs.
itoc = {i:c for c, i in ctoi.items()}
list(itoc.items())[:30]


[(0, '!'),
 (1, '"'),
 (2, "'"),
 (3, '('),
 (4, ')'),
 (5, ','),
 (6, '--'),
 (7, '.'),
 (8, ':'),
 (9, ';'),
 (10, '?'),
 (11, 'A'),
 (12, 'Ah'),
 (13, 'Among'),
 (14, 'And'),
 (15, 'Are'),
 (16, 'Arrt'),
 (17, 'As'),
 (18, 'At'),
 (19, 'Be'),
 (20, 'Begin'),
 (21, 'Burlington'),
 (22, 'But'),
 (23, 'By'),
 (24, 'Carlo'),
 (25, 'Chicago'),
 (26, 'Claude'),
 (27, 'Come'),
 (28, 'Croft'),
 (29, 'Destroyed')]

In [64]:
class TokenizerV1:
    """Simple tokenizer mapping word-level or character-level tokens to integer ids.

    Tokens that are not in the vocabulary are silently skipped by ``encode``.

    Attributes:
        vocab: List of known tokens, or ``None`` until a vocabulary is
            supplied or built.
        vocab_type: ``'word'`` or ``'char'``; selects how ``encode`` splits
            its input. A vocabulary passed to the constructor is treated as
            word-level.
        ctoi: Mapping from token string to integer id.
        itoc: Mapping from integer id back to token string.
        corpus: Text most recently used by ``build_vocab`` (``None`` before).
    """

    # Word-level splitter; the capturing group keeps punctuation as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Matches the whitespace that " ".join() puts in front of punctuation.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab=None):
        """Create a tokenizer, optionally from an existing vocabulary.

        Args:
            vocab: Optional ordered iterable of token strings. Each token's
                position becomes its id. When omitted, call ``build_vocab``
                before encoding or decoding.
        """
        print("TokenizerV1 initialized")
        self.corpus = None
        self.vocab_type = 'word'
        self.vocab = None
        self.ctoi = {}
        self.itoc = {}
        if vocab is not None:
            # Build the lookup tables right away so encode/decode work
            # without a separate build_vocab() call.
            self._index_vocab(list(vocab))

    def _index_vocab(self, vocab):
        """Store ``vocab`` and rebuild both lookup tables from it."""
        self.vocab = vocab
        self.ctoi = {token: index for index, token in enumerate(vocab)}
        self.itoc = {index: token for token, index in self.ctoi.items()}

    def _split(self, text, vocab_type):
        """Split ``text`` into stripped, non-empty tokens for ``vocab_type``."""
        if vocab_type == 'word':
            pieces = self._SPLIT_PATTERN.split(text)
        elif vocab_type == 'char':
            pieces = list(text)
        else:
            raise ValueError(
                f"Unknown vocabulary type {vocab_type!r}; expected 'word' or 'char'"
            )
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]

    def _require_vocab(self):
        """Raise a clear error when no vocabulary has been set up yet."""
        if self.vocab is None:
            raise ValueError(
                "No vocabulary available: pass `vocab` to the constructor "
                "or call build_vocab() first"
            )

    def build_vocab(self, type='word', corpus=None):
        """Build the vocabulary and lookup tables from a text corpus.

        Args:
            type: ``'word'`` to split on whitespace/punctuation, ``'char'``
                to use single characters as tokens.
            corpus: Text to build from. When omitted, the corpus of the
                previous ``build_vocab`` call is reused.

        Returns:
            The sorted list of unique tokens (also stored as ``self.vocab``).

        Raises:
            ValueError: If no corpus is available or ``type`` is unknown.
        """
        if corpus is None:
            corpus = self.corpus
        if corpus is None:
            raise ValueError(
                "build_vocab() needs a corpus: none was passed and none is stored"
            )
        # _split validates `type` before any state is modified.
        vocab = sorted(set(self._split(corpus, type)))
        self.corpus = corpus
        self.vocab_type = type
        self._index_vocab(vocab)
        return vocab

    def encode(self, text):
        """Convert ``text`` into a list of token ids, skipping unknown tokens.

        Raises:
            ValueError: If no vocabulary has been supplied or built.
        """
        self._require_vocab()
        ids = []
        for token in self._split(text, self.vocab_type):
            # Dict lookup instead of scanning the vocab list for every token.
            token_id = self.ctoi.get(token)
            if token_id is not None:
                ids.append(token_id)
        return ids

    def decode(self, tokens):
        """Convert token ids back into text.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed. An id that is not in the vocabulary raises
        ``KeyError``.

        Raises:
            ValueError: If no vocabulary has been supplied or built.
        """
        self._require_vocab()
        text = " ".join(self.itoc[token] for token in tokens)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)



In [69]:
# Build a word-level vocabulary from the full text with TokenizerV1, then
# print its size, a random sample, the first 30 entries and the itoc size.
tokenizer = TokenizerV1()
vocab = tokenizer.build_vocab(type='word', corpus=raw_text)
print(f"Number of tokens: {len(vocab)}")
print(random.sample(vocab, 10))
print(vocab[:30])
print(len(tokenizer.itoc))

TokenizerV1 initialized
Number of tokens: 1130
['so', 'much', 'grace', 'To', 'part', 'then', 'hour', 'deerhound', 'waves', 'suddenly']
['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed']
1130


In [70]:
# Round-trip check. The apostrophe is split off as its own token, which is
# why the decoded text reads "It' s" instead of "It's".
test= tokenizer.encode("It's the last he painted, you know, Mrs. Gisburn said with pardonable pride.")
print(test)
print(tokenizer.decode(test))

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]
It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.


In [98]:
class TokenizerV2:
    """Word-level or character-level tokenizer with special tokens.

    The vocabulary always ends with ``"<|endoftext|>"`` and ``"<|unk|>"``.
    Tokens that are not in the vocabulary are encoded as ``"<|unk|>"``.

    Attributes:
        vocab: List of known tokens including the special tokens; holds the
            constructor argument unchanged until a vocabulary is indexed.
        vocab_type: ``'word'`` or ``'char'``; selects how ``encode`` splits
            its input. A vocabulary passed to the constructor is treated as
            word-level.
        ctoi: Mapping from token string to integer id.
        itoc: Mapping from integer id back to token string.
    """

    # Appended to every vocabulary, in this order.
    SPECIAL_TOKENS = ("<|endoftext|>", "<|unk|>")
    # Word-level splitter; the capturing group keeps punctuation as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Matches the whitespace that " ".join() puts in front of punctuation.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab=None, corpus=None):
        """Create a tokenizer from an existing vocabulary or from a corpus.

        Args:
            vocab: Optional ordered iterable of token strings. Each token's
                position becomes its id; missing special tokens are appended.
            corpus: Optional text. Used to build a word-level vocabulary when
                no (non-empty) ``vocab`` is given.
        """
        print("TokenizerV2 initialized")
        self.vocab = vocab
        self.vocab_type = 'word'
        self.ctoi = {}
        self.itoc = {}
        if vocab:
            # Build the lookup tables right away so encode/decode work
            # without a separate build_vocab() call.
            self._index_vocab(vocab)
        elif corpus:
            self.build_vocab(corpus=corpus, type='word')

    def _index_vocab(self, tokens):
        """Store a copy of ``tokens`` plus special tokens and rebuild the lookups."""
        vocab = list(tokens)
        for special in self.SPECIAL_TOKENS:
            if special not in vocab:
                vocab.append(special)
        self.vocab = vocab
        self.ctoi = {token: index for index, token in enumerate(vocab)}
        self.itoc = {index: token for token, index in self.ctoi.items()}

    def _split(self, text, vocab_type):
        """Split ``text`` into stripped, non-empty tokens for ``vocab_type``."""
        if vocab_type == 'word':
            pieces = self._SPLIT_PATTERN.split(text)
        elif vocab_type == 'char':
            pieces = list(text)
        else:
            raise ValueError(
                f"Unknown vocabulary type {vocab_type!r}; expected 'word' or 'char'"
            )
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]

    def _require_vocab(self):
        """Raise a clear error when no vocabulary has been set up yet."""
        if not self.ctoi:
            raise ValueError(
                "No vocabulary available: pass `vocab` or `corpus` to the "
                "constructor or call build_vocab() first"
            )

    def build_vocab(self, corpus, type='word'):
        """Build the vocabulary and lookup tables from a text corpus.

        Args:
            corpus: Text to build the vocabulary from.
            type: ``'word'`` to split on whitespace/punctuation, ``'char'``
                to use single characters as tokens.

        Returns:
            The sorted list of unique tokens followed by the special tokens
            (also stored as ``self.vocab``).

        Raises:
            ValueError: If ``type`` is neither ``'word'`` nor ``'char'``.
        """
        # _split validates `type` before any state is modified.
        found = set(self._split(corpus, type))
        self.vocab_type = type
        # Special tokens that occur literally in the corpus are removed here
        # so they are not listed twice once they are appended at the end.
        self._index_vocab(sorted(found.difference(self.SPECIAL_TOKENS)))
        return self.vocab

    def encode(self, text):
        """Convert ``text`` into token ids; unknown tokens map to ``"<|unk|>"``.

        Raises:
            ValueError: If no vocabulary has been supplied or built.
        """
        self._require_vocab()
        unk_id = self.ctoi["<|unk|>"]
        # Dict lookup instead of scanning the vocab list for every token.
        return [self.ctoi.get(token, unk_id)
                for token in self._split(text, self.vocab_type)]

    def decode(self, tokens):
        """Convert token ids back into text.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed. An id that is not in the vocabulary raises
        ``KeyError``.

        Raises:
            ValueError: If no vocabulary has been supplied or built.
        """
        self._require_vocab()
        text = " ".join(self.itoc[token] for token in tokens)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [117]:
# The constructor already builds the word-level vocabulary from `corpus`,
# so reuse it instead of tokenizing the whole corpus a second time.
tokenizer = TokenizerV2(corpus=raw_text)
vocab = tokenizer.vocab
tokens = tokenizer.encode("It's the last he painted, you know, Mrs. Gisburn said with pardonable pride.")
print(tokens)
print(tokenizer.decode(tokens))

TokenizerV2 initialized
[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]
It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.


In [118]:
# Words missing from the vocabulary (e.g. "Hello", "world") are encoded as
# the "<|unk|>" id and decode back to "<|unk|>".
tokens = tokenizer.encode("Hello, world! This is a test. Painted, you, Mrs. Gisburn")
print(tokens)
print(tokenizer.decode(tokens))

[1131, 5, 1131, 0, 97, 584, 115, 1131, 7, 1131, 5, 1126, 5, 67, 7, 38]
<|unk|>, <|unk|>! This is a <|unk|>. <|unk|>, you, Mrs. Gisburn


In [119]:
# Round-trip the first 500 characters of the corpus through encode/decode.
tokens = tokenizer.encode(raw_text[:500])
print(tokens)
print(tokenizer.decode(tokens))

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120, 530, 208, 85, 734, 34, 7, 4, 1, 93, 538, 722, 549, 496, 1, 6, 987, 1077, 1089, 988, 1112, 242, 585, 7, 53, 244, 535, 67, 7, 37, 100, 6, 549, 602, 25, 897, 6, 326, 549, 1042, 116, 7, 1, 73, 297, 585, 2]
I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdicati

In [120]:
# Join two sample texts with the "<|endoftext|>" marker, then round-trip the
# combined string through encode/decode.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terrace of the palace."
text = " <|endoftext|> ".join([text1, text2])
tokens = tokenizer.encode(text)
print(tokens)
print(tokenizer.decode(tokens))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 983, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terrace of the <|unk|>.


In [122]:
%pip install tiktoken

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [125]:
import tiktoken

In [129]:
# `text` is the "<|endoftext|>"-joined sample defined in an earlier cell.
# NOTE: `tokenizer` is rebound here from the TokenizerV2 instance to the
# tiktoken GPT-2 encoding.
tokenizer = tiktoken.get_encoding("gpt2")
# "<|endoftext|>" has to be explicitly allowed to be encoded as a special token.
tokens = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(tokens)
print(tokenizer.decode(tokens))

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 558, 286, 262, 20562, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terrace of the palace.


In [130]:
# Round-trip an arbitrary string through the GPT-2 encoding.
text = ("Akwirw ier")
tokens = tokenizer.encode(text)
print(tokens)
print(tokenizer.decode(tokens))

[33901, 86, 343, 86, 220, 959]
Akwirw ier


In [132]:
# Decode each token id on its own to show which piece of text it stands for.
t = [tokenizer.decode([token]) for token in tokens]
t

['Ak', 'w', 'ir', 'w', ' ', 'ier']