In [1]:
# BYTE PAIR ENCODING (BPE)
# We implemented a simple tokenization scheme in the previous sections for illustration purposes.

# This section covers a more sophisticated tokenization scheme based on a concept called byte pair encoding (BPE).

# The BPE tokenizer covered in this section was used to train LLMs such as GPT-2, GPT-3, and the original model used in ChatGPT.

# Since implementing BPE can be relatively complicated, we will use an existing Python open-source library called tiktoken (https://github.com/openai/tiktoken).

# This library implements the BPE algorithm very efficiently based on source code in Rust.

In [2]:
import sys
# Print the path of the Python interpreter that is running this kernel.
print(sys.executable)

C:\Users\Dell\.conda\envs\llm_env\python.exe


In [3]:
!pip install tiktoken
import importlib

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.12.0


In [4]:
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that `importlib.metadata` is available as an attribute.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.12.0


In [5]:
# Load the "gpt2" encoding from tiktoken; later cells use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [6]:
# NOTE: the two adjacent string literals are concatenated with no space between
# them, so the resulting text contains "terracesof".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# Encode the text, allowing the special token "<|endoftext|>" to appear in it.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [7]:
# The code above prints the token IDs of the encoded text.

# We can then convert the token IDs back into text using the decode method, similar to our SimpleTokenizerV2 earlier:

In [8]:
# Convert the token IDs back into a text string.
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [9]:
# We can make two noteworthy observations based on the token IDs and decoded text above.

# First, the <|endoftext|> token is assigned a relatively large token ID, namely, 50256.

# In fact, the BPE tokenizer, which was used to train models such as GPT-2, GPT-3, and the original model used in ChatGPT, has a
# total vocabulary size of 50,257, with <|endoftext|> being assigned the largest token ID.

# Second, the BPE tokenizer above encodes and decodes unknown words, such as "someunknownPlace" correctly.

# The BPE tokenizer can handle any unknown word. How does it achieve this without using <|unk|> tokens?

# The algorithm underlying BPE breaks down words that aren't in its predefined vocabulary into smaller subword units or even individual characters.

# This enables it to handle out-of-vocabulary words.

# So, thanks to the BPE algorithm, if the tokenizer encounters an unfamiliar word during tokenization, it can represent
# it as a sequence of subword tokens or characters.

# Let us take another simple example to illustrate how the BPE tokenizer deals with unknown tokens.

In [10]:
# Encode a made-up word pair and show the token IDs it is split into.
integers = tokenizer.encode("Akwirw ier")
print(integers)

# Decoding those IDs gives back the original string.
strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


In [11]:
# DATA SAMPLING WITH SLIDING WINDOW

In [13]:
# Read the whole of the-verdict.txt into a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Tokenize the entire text and report how many token IDs it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [22]:
# Drop the first 50 token IDs, then preview the first 50 IDs of what remains.
enc_sample = enc_text[50:]
print(enc_sample[:50])

[290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536]


In [27]:
# Number of token IDs in each input window.
context_size = 4

# x holds the first `context_size` tokens; y is the same window shifted one
# position to the right, i.e. the next-token targets for x.
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]

print(f"x: {x}", f"y:      {y}", sep="\n")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [17]:
# For every prefix length from 1 to context_size, show the prefix (the context)
# next to the token ID that immediately follows it (the prediction target).
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [18]:
# Same context/target walk as with the raw IDs, but decoded back to text so the
# pairs are human-readable.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    context_text = tokenizer.decode(context)
    desired_text = tokenizer.decode([desired])
    print(f"{context_text} ----> {desired_text}")

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [28]:
# We've now created the input-target pairs that we can use for LLM training in upcoming chapters.

# There's only one more task before we can turn the tokens into embeddings: implementing an efficient data loader that iterates over
# the input dataset and returns the inputs and targets as PyTorch tensors, which can be thought of as multidimensional arrays.

# In particular, we are interested in returning two tensors: an input tensor containing the text that the LLM sees and a target tensor
# that includes the targets for the LLM to predict.

# IMPLEMENTING A DATA LOADER
# For the efficient data loader implementation, we will use PyTorch's built-in Dataset and DataLoader classes.
# Step 1: Tokenize the entire text

# Step 2: Use a sliding window to chunk the book into overlapping sequences of max_length

# Step 3: Return the total number of rows in the dataset

# Step 4: Return a single row from the dataset

In [36]:
pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [37]:
# torch is imported here because the class below calls torch.tensor; without it
# the cell only works if another cell has already imported torch.
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. Every window of ``max_length`` token IDs becomes
    an input row, and the same window shifted one position to the right becomes
    the corresponding target row.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of token IDs per input/target row; must be positive.
        stride: Offset between the starts of consecutive windows; must be positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise silently yield an empty or
        # malformed dataset (negative stride, zero/negative window length).
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the text into overlapping sequences of
        # max_length. The range stops at len - max_length so the target window,
        # which extends one token further than the input, still fits.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) rows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [38]:
# The GPTDatasetV1 class in listing 2.5 is based on the PyTorch Dataset class.

# It defines how individual rows are fetched from the dataset.

# Each row consists of a number of token IDs (based on a max_length) assigned to an input_chunk tensor.

# The target_chunk tensor contains the corresponding targets.

# I recommend reading on to see what the data returned from this dataset looks like when we combine the dataset with a PyTorch DataLoader -- this will bring additional intuition and clarity.

# The following code will use the GPTDatasetV1 to load the inputs in batches via a PyTorch DataLoader:
# Step 1: Initialize the tokenizer

# Step 2: Create dataset

# Step 3: drop_last=True drops the last batch if it is shorter than the specified batch_size to prevent loss spikes during training

# Step 4: The number of CPU processes to use for preprocessing

In [40]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    Args:
        txt: Raw text that is tokenized with the tiktoken "gpt2" encoding.
        batch_size: Passed to the DataLoader as ``batch_size``.
        max_length: Window length handed to GPTDatasetV1.
        stride: Window step handed to GPTDatasetV1.
        shuffle: Passed to the DataLoader as ``shuffle``.
        drop_last: Passed to the DataLoader as ``drop_last``.
        num_workers: Passed to the DataLoader as ``num_workers``.

    Returns:
        A DataLoader wrapping a GPTDatasetV1 built from ``txt``.
    """
    # Tokenizer first, then the dataset that chunks the tokenized text.
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # Hand the dataset straight to the loader with the caller's batching options.
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [42]:
# Read the-verdict.txt into a single string (the same file an earlier cell loads).
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [44]:
import torch
print("PyTorch version:", torch.__version__)
# Build a loader with one row per batch, a 4-token window, a stride of 1 and
# shuffling turned off.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# Turn the loader into an iterator and pull the first batch from it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 2.10.0+cpu
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
# The first_batch variable contains two tensors: the first tensor stores the input token IDs, and the second tensor stores the target token IDs.

# Since the max_length is set to 4, each of the two tensors contains 4 token IDs.

# Note that an input size of 4 is relatively small and only chosen for illustration purposes. It is common to train LLMs with input sizes of at least 256.

# To illustrate the meaning of stride=1, let's fetch another batch from this dataset: