In [1]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x29927654bf0>)

In [2]:
with open(file_path, "r", encoding='utf-8') as f:
    raw_text = f.read()

In [3]:
print("Total number of character in the text: ", len(raw_text))
print(raw_text[:99])

Total number of character in the text:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Tokenizing

In [4]:
# basic tokenizer
import re

text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\\s)', text)
print(result)

result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', ' world', '.', ' Is this', '--', ' a test', '?', '']
['Hello', ',', 'world', '.', 'Is this', '--', 'a test', '?']


In [5]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [6]:
print(len(preprocessed))

4690


## Tokens to token IDs
It is an intermediate step before converting the token IDs into embedding vectors.

In [7]:
# Build a vocabulary
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print("Vocabulary size:", vocab_size)

Vocabulary size: 1130


In [8]:
vocab = {token:integer for integer,token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i > 10:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)


In [9]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}    # inverse vocab that maps token IDs back to original text tokens

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)   # remove whitespace before punctuation
        return text

In [10]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids) 

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [11]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


## Special context tokens
so the tokenizer can handle unknown (out-of-vocabulary) words. Also to address the usage and addition of special context tokens that can enhance a model's understanding of context or other relevant information in the text.

In [12]:
all_tokens = sorted(list(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab.items()))

1132


In [13]:
{k: vocab[k] for k in list(vocab)[-5:]}

{'younger': 4685,
 'your': 4688,
 'yourself': 4689,
 '<|endoftext|>': 4690,
 '<|unk|>': 4691}

In [14]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int     # replace unknown words with <|unk|>
            else "<|unk|>" for item in preprocessed
        ]
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)  # replace whitespace before punctuation
        return text

In [15]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [16]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[4691, 489, 2009, 4684, 2906, 3889, 865, 4690, 1102, 4131, 3836, 3903, 3265, 4131, 4691, 798]


In [17]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## Byte Pair Encoding (BPE)
was used to train GPT-2, GPT-3, and the original model used in ChatGPT. We'll used BPE implementation from `tiktoken` (version 0.7.0)

In [18]:
from importlib.metadata import version
import tiktoken
print(version("tiktoken"))

0.7.0


In [19]:
tokenizer = tiktoken.get_encoding("gpt2")

In [22]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunknownPalace."
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print("Token IDs:", integers)

Token IDs: [15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 617, 34680, 11531, 558, 13]


In [23]:
strings = tokenizer.decode(integers)
print("Decoded text:", strings)

Decoded text: Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunknownPalace.


- BPE tokenizer has a total vocabulary size of 50,257 tokens, with `<|endoftext|>` beign assigned the largest token ID.
- BPE tokenizer encodes and decodes unknown words, such as `someunknownPlace`, correctly. It can handle any unknown word without using `<|unk|>` tokens.
- The underlying algorithm breaks down unknown words into smaller subword units or even individual characters. This allows the model to process and understand the unknown word based on its subword components.

The ability to break down unknown words into individual characters ensures that the tokenizer and, consequently, the LLM that is trained with it can process any text, even if it contains words that were not present in its training data.

In short, BPE builds its vocabulary by iteratively merging the most frequent characters into subwords and frequent subwords into words.

For example, BPE starts with adding individual single characters to its vocabulary ("a", "b", "c", etc.). In the next stage, it merges character combinations that frequently occur together into subwords. For example, "d" and "e" may be merged into "de", which is common in many English words like "define", "defend", etc. The merges are determined by a frequency cutoff.

In [27]:
text = "Akwirw ier"
print("Original text:", text)
tk_ids = tokenizer.encode(text, allowed_special={"|endoftext|"})
print("Token IDs:", tk_ids)
tk = tokenizer.decode(tk_ids)
print("Decoded text:", tk)

Original text: Akwirw ier
Token IDs: [33901, 86, 343, 86, 220, 959]
Decoded text: Akwirw ier


In [33]:
{token:token_id for token, token_id in zip([tokenizer.decode([t]) for t in tk_ids], tk_ids)}

{'Ak': 33901, 'w': 86, 'ir': 343, ' ': 220, 'ier': 959}

## Data Sampling with a sliding window