<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


# The Main Data Loading Pipeline Summarized

The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).

This notebook contains the main takeaway, the data loading pipeline without the intermediate steps.

Packages that are being used in this notebook:

In [2]:
# NBVAL_SKIP
from importlib.metadata import version

# Report the installed version of each package this notebook relies on.
for label, dist_name in (("torch version:", "torch"),
                         ("tiktoken version:", "tiktoken")):
    print(label, version(dist_name))

torch version: 2.8.0
tiktoken version: 0.12.0


In [6]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once, then cut into windows of ``max_length`` token
    ids. Each target window is the corresponding input window shifted one
    token to the right, which is the layout needed for next-token prediction.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` and
            returning a list of integer token ids.
        max_length: Number of token ids per input/target window.
        stride: Step (in tokens) between the starts of consecutive windows.
            ``stride < max_length`` yields overlapping windows.

    Note:
        Only full windows are kept: a text with ``max_length`` tokens or fewer
        produces an empty dataset rather than an error.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Transfer Input Text into Tokenized Text into Token Ids
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        print(f"Total tokens in the book: {len(token_ids)}")
        print(f"First 10 token ids: {token_ids[:10]}")

        # Use a sliding window to chunk the book into overlapping sequences of max_length.
        # The upper bound `len(token_ids) - max_length` guarantees that the shifted
        # target window (which needs one extra token) is always complete.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

        num_sequences = len(self.input_ids)
        print(f"Total sequences created: {num_sequences}")

        # The previews below are purely diagnostic; only index windows that
        # actually exist so that short texts do not raise IndexError here.
        if num_sequences >= 1:
            print(f"First input sequence (token ids): {self.input_ids[0]}")
            print(f"First target sequence (token ids): {self.target_ids[0]}")
        if num_sequences >= 2:
            print(f"Secondend input sequence (token ids): {self.input_ids[1]}")
            print(f"Secondend target sequence (token ids): {self.target_ids[1]}")

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window token-id batches from raw text.

    Args:
        txt: Raw text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Number of token ids per window.
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # Tokenizer -> windowed dataset
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # Batching options are passed through unchanged
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)


# Batching hyperparameters: 8 windows per batch, 4 token ids per window.
batch_size = 8
max_length = 4

# Load the full training text into memory.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# stride == max_length, so consecutive input windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text, batch_size=batch_size, max_length=max_length, stride=max_length
)

Total tokens in the book: 5145
First 10 token ids: [40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]
Total sequences created: 1286
First input sequence (token ids): tensor([  40,  367, 2885, 1464])
First target sequence (token ids): tensor([ 367, 2885, 1464, 1807])
Secondend input sequence (token ids): tensor([1807, 3619,  402,  271])
Secondend target sequence (token ids): tensor([ 3619,   402,   271, 10899])


In [None]:
# Seed the global RNG so the randomly initialized embedding weights (and the
# shuffle order the DataLoader draws from the global RNG) are reproducible
# under "Restart Kernel -> Run All".
torch.manual_seed(123)

vocab_size = 50257  # Size of the GPT-2 BPE vocabulary (not the number of tokens in the book)
output_dim = 256
context_length = 1024

# Token Id -> Token Embedding
# 50257 x 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# Print the module itself; `.type` is a bound method, not a description of the layer.
print(token_embedding_layer)

# Position Id -> Position Embedding
# 1024 x 256
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
print(pos_embedding_layer)


# Only the first batch is inspected; the loop exits after one iteration.
for batch in dataloader:
    # 8 x 4
    inputs, targets = batch
    print("Input batch shape (token ids):", inputs.shape)
    print("Target batch shape (token ids):", targets.shape)

    # 8 x 4 x 256
    token_embeddings = token_embedding_layer(inputs)
    print("Token embeddings shape:", token_embeddings.shape)
    print("Token embeddings:", token_embeddings[:1])  # Print the first 1 token embeddings for brevity

    # Derive the sequence length from the batch itself so this cell does not
    # depend on a possibly stale global `max_length`.
    seq_len = inputs.shape[1]
    assert seq_len <= context_length, "sequence length exceeds the position-embedding table"

    # 4 x 256
    pos_embeddings = pos_embedding_layer(torch.arange(seq_len))
    print("Position embeddings shape:", pos_embeddings.shape)
    print("Position embeddings:", pos_embeddings[:1])  # Print the first 1 position embeddings for brevity

    # 8 x 4 x 256 + 4 x 256 --> 8 x 4 x 256 (position embeddings broadcast over the batch dimension)
    input_embeddings = token_embeddings + pos_embeddings
    print("Input embeddings shape:", input_embeddings.shape)
    print("Input embeddings:", input_embeddings)
    break

<bound method Module.type of Embedding(50257, 256)>
<bound method Module.type of Embedding(1024, 256)>
Input batch shape (token ids): torch.Size([8, 4])
Target batch shape (token ids): torch.Size([8, 4])
Token embeddings shape: torch.Size([8, 4, 256])
Token embeddings: tensor([[[-0.7671, -0.2551,  2.1522,  ..., -0.8214, -1.0617,  1.5718],
         [-0.2754,  0.5838, -0.0417,  ..., -1.0125,  0.8731,  0.2547],
         [ 0.4039, -0.3161, -0.0123,  ..., -0.5068,  0.6774,  0.2407],
         [-0.2328,  0.9488, -1.3629,  ...,  0.6063,  1.1937,  0.9867]]],
       grad_fn=<SliceBackward0>)
Position embeddings shape: torch.Size([4, 256])
Position embeddings: tensor([[ 1.6189,  0.5117,  0.4124,  0.1558,  0.4487,  0.9925, -1.8128, -0.0837,
          0.8679, -2.0128, -0.1819,  0.5260, -1.2032,  0.8039, -1.3730, -0.0836,
         -0.9995, -0.6364,  0.6700,  1.3646, -0.1285,  0.6210, -2.2872, -1.1659,
          0.0679, -1.1508, -1.1403, -0.2682, -0.9396, -1.7957,  0.4869, -0.1173,
          0.8913, 