# LLM Textbook Chapter 2: Working with text data

This chapter covers common ways to process text data for use with deep learning and LLMs specifically. First, text passages are split into words and punctuation marks and these are converted into numerical tokens. Then, these tokens are embedded as vectors. This chapter covers ways to do that.

In [3]:
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken")) 

torch version: 2.2.2
tiktoken version: 0.9.0


In [4]:
# download "The Verdict" by Edith Wharton, which is our example text
import os
import urllib.request

if not os.path.exists("the-verdict.txt"):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

In [5]:
# take a peek at some of the text and the total character count
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [6]:
# split into tokens with regex
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [7]:
# better regex
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [8]:
# number of tokens
print(len(preprocessed))

4690


In [9]:
# create a "vocabulary" of tokens mapping them to integers
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

vocab = {token:integer for integer,token in enumerate(all_words)}

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 20:
        break

1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)


### Tokenizer class

We can now implement this in a "tokenizer" class that can take in raw text and encode it into integer tokens, and decode tokens back into text.

In [10]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; every non-empty piece is one token.

    Args:
        vocab: Mapping from token string to integer ID.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping, used by decode()
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop the empty strings and whitespace-only pieces produced by the split
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        return [self.str_to_int[s] for s in preprocessed]

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join inserted before punctuation.
        # ':' and ';' are included because encode() splits them off as
        # separate tokens too; without them "a; b" would decode as "a ; b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [11]:
# test our tokenizer given the vocab and some text. first we encode.
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [12]:
# now try decoding that
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### Special context tokens

Here we show how to deal with special cases such as unknown words and end-of-text markers (which separate different input texts so the model knows they are independent).

In [13]:
# here we can see that we get an error if we encounter an unknown token -- in this case, "Hello"

tokenizer = SimpleTokenizerV1(vocab)

text = "Hello, do you like tea. Is this-- a test?"

tokenizer.encode(text)

KeyError: 'Hello'

In [14]:
# now we add special tokens for end-of-text and unknown words to our vocab

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [15]:
# we redo our class to map tokens not in our vocab to <|unk|>
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer ID. It must contain
            ``"<|unk|>"`` for unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse lookup table for decode()
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their integer IDs."""
        known = self.str_to_int
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Whitespace-only or empty piece produced by the split
                continue
            if token not in known:
                token = "<|unk|>"
            ids.append(known[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy up punctuation."""
        words = [self.int_to_str[i] for i in ids]
        joined = " ".join(words)
        # Strip the whitespace that the join put in front of punctuation marks
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [16]:
# now let's try this tokenizer on our text

tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

print(tokenizer.encode(text))

# note that the unknown tokens "Hello" and "palace" were mapped to <|unk|>
print(tokenizer.decode(tokenizer.encode(text)))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## Byte Pair Encoding used in GPT2

Here we show a more advanced version of tokenization, byte pair encoding. The implementation is complicated, so we will just use an existing one. Briefly, the idea is to break unknown words into single characters, pairs of characters, trios of characters, and so on, keeping only the common word parts. So even unknown tokens will get some form of informative integer ID.

The tiktoken package has the byte pair encoding used in GPT2.

In [17]:
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that `importlib.metadata` has been loaded, so the attribute access below
# could fail in a fresh interpreter.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [18]:
# use tiktoken GPT2 encoding
tokenizer = tiktoken.get_encoding("gpt2")

In [19]:
# try encoding and decoding using this tokenizer

# we make someunknownPlace one word to demonstrate handling of an unknown token
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

strings = tokenizer.decode(integers)

print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


### Sampling data from a sliding window

We sample text data using a sliding window along tokens in the input text. This is used to create training and test examples where the LLM tries to predict the next token given previous ones.

In [20]:
# load our input text and encode it
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [21]:
# sample tokens from a sliding window
enc_sample = enc_text[50:]
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [22]:
# here's what the prediction task will look like
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


### Creating PyTorch dataloaders for the tokenized text

Here we show how to wrap the tokenized text in a PyTorch Dataset and DataLoader. This will give us batches of input chunks (and their next-token targets) to run our LLM on.

In [23]:
import torch
print("PyTorch version:", torch.__version__)

PyTorch version: 2.2.2


In [25]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

In [26]:
# define the dataset -- given a text and tokenizer class, encode the text 
#   and use sliding windows to convert into training and test data

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is encoded once with ``tokenizer``. Each sample is a window of
    ``max_length`` consecutive token IDs, paired with the same window shifted
    one position to the right (the next-token targets).

    Args:
        txt: Text to encode.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
            a sliceable sequence of token IDs.
        max_length: Number of tokens in every input and target window.
        stride: Distance between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the full text in one go
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A window starts only where a full shifted target window still fits
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[s:s + max_length]) for s in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[s + 1:s + max_length + 1]) for s in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


In [27]:
# now define the dataloader, which uses the data class to generate batches

def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over GPT-2-tokenized sliding windows of ``txt``.

    Args:
        txt: Text to tokenize.
        batch_size: Passed to ``DataLoader``.
        max_length: Window length passed to ``GPTDatasetV1``.
        stride: Window stride passed to ``GPTDatasetV1``.
        shuffle: Passed to ``DataLoader``.
        drop_last: Passed to ``DataLoader``.
        num_workers: Passed to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # Byte pair encoding tokenizer ("gpt2" encoding from tiktoken)
    windows = GPTDatasetV1(
        txt, tiktoken.get_encoding("gpt2"), max_length, stride
    )
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [28]:
# now let's load the data and generate an encoded batch

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [29]:
# now let's look at input and target text to predict
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### PyTorch Embeddings for tokens

Here we show how to use PyTorch's nn.Embedding module to create embeddings for tokens.

In [31]:
# create token embeddings using nn.Embedding
# for a 6-token vocabulary, with each token represented by a 3-dimensional vector
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [32]:
# these are the randomly initialized weights; requires_grad=True means they can be updated during training
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [33]:
# now we can map input token IDs to their embedded vectors
input_ids = torch.tensor([2, 3, 5, 1])
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


As currently implemented, this is basically a lookup table. In practice, in LLMs, embeddings are learned along with everything else.

### Encoding word position

Transformers have no inherent sense of the positions of words. So we have to encode that position information, because the meanings of words can change based on their orders in sentences. 

There are two ways of doing this: "absolute" positional encoding, and "relative" positional encoding, where the latter encodes the positions of words relative to one another rather than their absolute position. The latter can be tricky to do well, so GPT uses absolute embeddings, and so will we.

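Note that GPT's positional embeddings are learned during the training process, in contrast to the sinusoidal encodings proposed in the original Transformer paper, which are fixed (not learned). The sinusoidal encodings are also absolute, but because sine and cosine are periodic, the encoding of a position at a fixed offset is a linear function of the current position's encoding, which makes relative positions easy for the model to use.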

In [34]:
# now we'll embed vocab from byte pair encoding into 256-dim vectors
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [35]:
# load our data using our data loader

max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [36]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [37]:
# now let's take a look at the embeddings
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

# print the embeddings to see what they look like
print(token_embeddings)

torch.Size([8, 4, 256])
tensor([[[ 0.4913,  1.1239,  1.4588,  ..., -0.3995, -1.8735, -0.1445],
         [ 0.4481,  0.2536, -0.2655,  ...,  0.4997, -1.1991, -1.1844],
         [-0.2507, -0.0546,  0.6687,  ...,  0.9618,  2.3737, -0.0528],
         [ 0.9457,  0.8657,  1.6191,  ..., -0.4544, -0.7460,  0.3483]],

        [[ 1.5460,  1.7368, -0.7848,  ..., -0.1004,  0.8584, -0.3421],
         [-1.8622, -0.1914, -0.3812,  ...,  1.1220, -0.3496,  0.6091],
         [ 1.9847, -0.6483, -0.1415,  ..., -0.3841, -0.9355,  1.4478],
         [ 0.9647,  1.2974, -1.6207,  ...,  1.1463,  1.5797,  0.3969]],

        [[-0.7713,  0.6572,  0.1663,  ..., -0.8044,  0.0542,  0.7426],
         [ 0.8046,  0.5047,  1.2922,  ...,  1.4648,  0.4097,  0.3205],
         [ 0.0795, -1.7636,  0.5750,  ...,  2.1823,  1.8231, -0.3635],
         [ 0.4267, -0.0647,  0.5686,  ..., -0.5209,  1.3065,  0.8473]],

        ...,

        [[-1.6156,  0.9610, -2.6437,  ..., -0.9645,  1.0888,  1.6383],
         [-0.3985, -0.9235, -1.31

In [40]:
# positional embeddings: one learnable vector per position in the context window
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# print the embedding layer weights to see what they look like
print(pos_embedding_layer.weight)

Parameter containing:
tensor([[ 0.3544, -1.1020,  1.6459,  ..., -0.6570, -0.4390, -0.0461],
        [-2.1400, -2.5262,  1.4213,  ..., -1.1446,  2.2331, -1.3071],
        [ 0.2795, -1.1833, -0.0892,  ...,  0.4960,  0.8412,  0.0323],
        [-0.4270, -2.8876, -1.3638,  ..., -0.4342, -0.1795,  1.0636]],
       requires_grad=True)


In [42]:
# gpt absolute position embeddings -- we input the index, and learn weights for those indices
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

# print the positional embeddings to see what they look like
print(pos_embeddings)

torch.Size([4, 256])
tensor([[ 0.3544, -1.1020,  1.6459,  ..., -0.6570, -0.4390, -0.0461],
        [-2.1400, -2.5262,  1.4213,  ..., -1.1446,  2.2331, -1.3071],
        [ 0.2795, -1.1833, -0.0892,  ...,  0.4960,  0.8412,  0.0323],
        [-0.4270, -2.8876, -1.3638,  ..., -0.4342, -0.1795,  1.0636]],
       grad_fn=<EmbeddingBackward0>)


In [43]:
# To create the input embeddings used in an LLM, we simply add the token and the positional embeddings:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

# print the input embeddings to see what they look like
print(input_embeddings)

torch.Size([8, 4, 256])
tensor([[[ 8.4575e-01,  2.1894e-02,  3.1048e+00,  ..., -1.0565e+00,
          -2.3125e+00, -1.9061e-01],
         [-1.6919e+00, -2.2725e+00,  1.1558e+00,  ..., -6.4488e-01,
           1.0340e+00, -2.4915e+00],
         [ 2.8798e-02, -1.2379e+00,  5.7950e-01,  ...,  1.4578e+00,
           3.2149e+00, -2.0498e-02],
         [ 5.1875e-01, -2.0218e+00,  2.5532e-01,  ..., -8.8868e-01,
          -9.2549e-01,  1.4119e+00]],

        [[ 1.9005e+00,  6.3488e-01,  8.6119e-01,  ..., -7.5740e-01,
           4.1942e-01, -3.8816e-01],
         [-4.0022e+00, -2.7175e+00,  1.0401e+00,  ..., -2.2610e-02,
           1.8835e+00, -6.9800e-01],
         [ 2.2642e+00, -1.8316e+00, -2.3071e-01,  ...,  1.1194e-01,
          -9.4324e-02,  1.4801e+00],
         [ 5.3777e-01, -1.5902e+00, -2.9845e+00,  ...,  7.1201e-01,
           1.4002e+00,  1.4605e+00]],

        [[-4.1687e-01, -4.4478e-01,  1.8122e+00,  ..., -1.4613e+00,
          -3.8482e-01,  6.9653e-01],
         [-1.3354e+00, -2.0