# This notebook covers text processing and data sampling to prepare input data for an LLM.

Packages that are used in text processing.

In [1]:
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.5.1
tiktoken version: 0.8.0


# 1. Tokenizing text

### The goal is to open "the-verdict.txt" and embed the text inside.

In [2]:
# Read the-verdict.txt text file
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


- Let's develop a simple tokenizer on a sample text that we can later apply to the text of the-verdict.txt loaded above.

In [3]:
"""
This cell splits the words inside the sample text using regex (regular expression).
"""

import re


text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)  # split the text on whitespaces

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


- We can modify the above code to also split on commas and periods, not just whitespace.

In [4]:
# Split the text on whitespaces, commas, and periods
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


- This creates empty strings, as we can see in the above cell's output. Let's remove them.

In [5]:
# Using a list comprehension, strip surrounding whitespace from each item
# and keep only the items that are non-empty afterwards. This drops the
# empty strings and the whitespace-only items produced by re.split.
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


- Looks good! However, we still need to handle other punctuation characters, such as hyphens, question marks, and so on...

In [6]:
text = "Hello, NUISTers. Is this-- a sample text?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', text)  # handle punctuation characters
result = [item.strip() for item in result if item.strip()]  # filter empty strings
print(result)

['Hello', ',', 'NUISTers', '.', 'Is', 'this', '--', 'a', 'sample', 'text', '?']


### Apply the above tokenization to the text inside the "the-verdict.txt" file

The tokenizer we created includes:
1. Splitting text into words
2. Handling special characters
3. Filtering empty strings

In [7]:
# Here we break down the whole text into tokens (words and punctuation marks)
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]  # all non-empty tokens in text
print(preprocessed[:50])  # First 50 tokens

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself']


Calculate the total number of tokens (words and punctuation marks)

In [8]:
print("Total number of words:", len(preprocessed))

Total number of words: 4690


# 2. Token IDs

- Check number of unique tokens

In [9]:
all_words = sorted(set(preprocessed))  # alphabetically sorted unique tokens

print("Total number of Unique words:", len(all_words))

Total number of Unique words: 1130


- Construct a vocabulary that maps each unique token to a token ID.

In [10]:
# Pair every sorted unique token with its position, which becomes its token ID
vocab = dict(zip(all_words, range(len(all_words))))  # Python dictionary

# Show the first 21 (token, token ID) pairs
for item in list(vocab.items())[:21]:
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)


#### Create a simple tokenizer class.

In [11]:
class SimpleTokenizerV1:
    """
    Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on "--" and on the punctuation characters
    , . : ; ? _ ! " ( ) and '. Every resulting token must be a key of the
    vocabulary; there is no fallback for unknown tokens.
    """

    def __init__(self, vocab: dict) -> None:
        self.str_to_int: dict = vocab  # vocabulary {"word": id} for encoding
        self.int_to_str: dict = {i: s for s, i in vocab.items()}  # vocabulary {id: "word"} for decoding

    def encode(self, text: str) -> list[int]:
        """
        Split the text into tokens and map each token to its token ID.

        Args:
            text: the text to tokenize.

        Returns:
            The token IDs in the order the tokens appear in the text.

        Raises:
            KeyError: if a token is not part of the vocabulary.
        """
        pieces: list[str] = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop the empty and whitespace-only pieces produced by the split
        tokens: list[str] = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids: list[int]) -> str:
        """
        Reconstruct text from token IDs.

        Args:
            ids: token IDs that are keys of the reverse vocabulary.

        Returns:
            The tokens joined by single spaces, with the space in front of
            the punctuation marks , . : ; ? ! " ( ) ' removed.

        Raises:
            KeyError: if an ID is not part of the vocabulary.
        """
        joined: str = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join inserted before punctuation. ':' and
        # ';' are included because encode() splits on them too; this is the
        # same rule SimpleTokenizerV2.decode applies.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)


#### Encode the sample text using tokenizer class to generate token IDs.
  
![Map tokens to ids](./images/LLMs_text_processing-encode_text.png "Mapping tokens")

In [12]:
tokenizer = SimpleTokenizerV1(vocab)  # create tokenizer object

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)  # a list of token ids
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [13]:
print(tokenizer.encode(','))

[5]


#### Decode token IDs to reconstruct the original text.

![Reconstruct sample text](./images/LLMs_text_processing-decode_text.png "Reconstruct sample text")

In [14]:
tokenizer.decode(ids)  # Python string

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### Let's try to run the following cell and see what happens!

In [15]:
text = "Hello, do you like tea. Is this-- a test?"

# Encoding fails with a KeyError as soon as a token is missing from the
# vocabulary. Catch the error and print it so the failure is visible while
# the notebook still runs from top to bottom.
try:
    tokenizer.encode(text)
except KeyError as err:
    print(f"KeyError: {err}")

#### Encoding this text raises a `KeyError` because:
- The word "Hello" doesn't occur in "The Verdict" short story.
- Therefore, it is not contained in our vocabulary and cannot be mapped to a token ID.

# 3. Special context tokens

<!-- ![Context tokens](../images/LLMs-Context_tokens.png "Context tokens") -->
<img src="./images/LLMs_text_processing-Context_tokens.png" alt="Context tokens" width="750" height="600">

- Some tokenizers use special tokens to help the LLM with additional context

- Some of these special tokens are
  - `[BOS]` = (beginning of sequence) marks the beginning of text
  - `[EOS]` = (end of sequence) marks where the text ends (this is usually used to concatenate multiple unrelated texts, e.g., two different Wikipedia articles or two different books, and so on)
  - `[PAD]` = (padding) if we train LLMs with a batch size greater than 1 (we may include multiple texts with different lengths; with the padding token we pad the shorter texts to the longest length so that all texts have an equal length)
  
  - `[UNK]` = (unknown) to represent words that are not included in the vocabulary

## Let's add the context tokens to our vocabulary!

In [16]:
# Sorted unique tokens of the text, followed by the two special context tokens
all_tokens = [*sorted(set(preprocessed)), "<|endoftext|>", "<|unk|>"]

# Each token's position in all_tokens becomes its token ID
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [17]:
len(vocab.items())  # unique token and token id pairs in vocabulary

1132

- It was 1130 previously, now it's 1132.

In [18]:
# Check last 5 items in vocabulary (the two special tokens were appended at the end)
for item in list(vocab.items())[-5:]:
    print(item)


('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


#### Modify SimpleTokenizer Class to Include Context Tokens

In [19]:
class SimpleTokenizerV2:
    """
    Word-level tokenizer that replaces out-of-vocabulary tokens by "<|unk|>".
    """

    def __init__(self, vocab: dict) -> None:
        self.str_to_int: dict = vocab
        self.int_to_str: dict = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text: str) -> list[int]:
        """
        Split the text on whitespace, "--" and punctuation, substitute
        "<|unk|>" for every token missing from the vocabulary, and return
        the token IDs in order of appearance.
        """
        ids: list[int] = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty or whitespace-only piece produced by the split
                continue
            if token not in self.str_to_int:
                # Handling unknown tokens
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids: list[int]) -> str:
        """
        Join the tokens behind the given IDs with single spaces and remove
        the space in front of the punctuation marks , . : ; ? ! " ( ) '.
        """
        words = [self.int_to_str[token_id] for token_id in ids]
        spaced = " ".join(words)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', spaced)


#### Let's check our modified tokenizer to see if it can handle unknown tokens or not!

In [20]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the UnkownPlace."

text = " <|endoftext|> ".join((text1, text2))  # insert <|endoftext|> between two texts

print(text)


Hello, do you like tea? <|endoftext|> In the sunlit terraces of the UnkownPlace.


In [21]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [22]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

# 4. Byte Pair Encoding (BPE)

In [23]:
# Import the submodule explicitly: a plain `import importlib` does not
# guarantee that `importlib.metadata` has been loaded.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.8.0


### Initialize GPT-2 tokenizer with BPE

In [24]:
tokenizer = tiktoken.get_encoding("gpt2")  # tokenizer used to train GPT-2

In [25]:
tokenizer.n_vocab  # vocabulary size of this tokenizer

50257

### Encode message

In [26]:
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 791, 74, 593, 27271, 13]


### Reconstruct original text

In [27]:
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the UnkownPlace.


<img src="./images/LLMs_text_processing-BPE.png" alt="Byte pair encoding" width="700" height="500">

### Check how BPE handles unknown words

In [28]:
# Unknown text or words
unk_text = ("Aw e some")
print(f"Unknown text: {unk_text}")

# Encode unknown words
unk_int = tokenizer.encode(unk_text, allowed_special={"<|endoftext|>"})
print(f"Unknown text integers: {unk_int}")

# Reconstruct unknown words from their token IDs
unk_rec = tokenizer.decode(unk_int)
print(f"Reconstructed unknown text: {unk_rec}")

Unknown text: Aw e some
Unknown text integers: [23155, 304, 617]
Reconstructed unknown text: Aw e some


# 5. Data sampling with a sliding window

In [29]:
# Read the story with an explicit encoding, as the other cells that open this
# file do, so the result does not depend on the platform's default encoding.
with open("./the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)  # token IDs of the whole story
print(enc_text[:10])
print(len(enc_text))

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]
5145


In [30]:
enc_sample = enc_text[50:]

<img src="./images/LLMs_text_processing-dataloader.png" alt="Context tokens" width="700" height="510">

In [31]:
# Illustrates how the sliding window pairs an input with its target

context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]  # same window, shifted 1 position to the right

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


<img src="./images/LLMs_text_processing-Sliding_window.png" alt="Context tokens" width="600" height="510">

In [32]:
# Illustrates next-token prediction: each growing context is paired with the token that follows it
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)  # shows token IDs


[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [33]:

for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    # Original text
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))


 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [34]:
import torch
print("PyTorch version:", torch.__version__)


PyTorch version: 2.5.1


<img src="./images/LLMs_text_processing-dataloader.png" alt="Context tokens" width="700" height="510">

### Sliding window data sampling using PyTorch Dataset class and DataLoader function

In [35]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """
    Dataset of (input, target) token-ID windows cut from a single text.

    Every target window is its input window shifted one token to the right.
    """
    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start offsets of the sliding window; stopping at
        # len(token_ids) - max_length leaves room for the shifted target
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in starts
        ]

    def __len__(self):
        """Number of input-target pairs held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """The input tensor and target tensor stored at position idx."""
        pair = (self.input_ids[idx], self.target_ids[idx])
        return pair


In [36]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """
    Build a DataLoader over sliding-window input-target pairs of a text.

    The text is tokenized with the "gpt2" tiktoken encoding and chunked by
    GPTDatasetV1 using max_length and stride. batch_size, shuffle, drop_last
    and num_workers are passed to the DataLoader unchanged.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


In [37]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()


<img src="./images/LLMs_text_processing-dataloader_stride.png" alt="Context tokens" width="600" height="510">

Demonstrating how stride value affects data sampling with sliding window

In [38]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# batch_size is the number of input-target pairs returned at each iteration
# max_length is the number of tokens in each window (the context size)
# stride is the number of token positions the window moves between consecutive pairs

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(f"First batch: {first_batch}\n")
second_batch = next(data_iter)
print(f"Second batch: {second_batch}")


First batch: [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]

Second batch: [tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [39]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)


Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


# 6. Creating Token Embeddings

<img src="./images/LLMs-embedding_layer_weight_matrix.png" alt="Context tokens" width="250" height="510">

In [40]:
vocab_size = 6
output_dim = 3  # embedding size

torch.manual_seed(321)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)  # random small values


In [41]:
print(embedding_layer.weight)


Parameter containing:
tensor([[-0.1302,  0.4343, -0.4491],
        [-1.0824,  2.5830, -0.3784],
        [-0.6681, -0.4460, -0.4942],
        [-1.0153,  0.9791,  1.5577],
        [-0.3924,  0.4283,  0.6376],
        [-0.5494,  0.7509,  1.7671]], requires_grad=True)


<img src="./images/LLMs-embedding_matrix.png" alt="Context tokens" width="550" height="520">

In [42]:
input_ids = torch.tensor([5, 1, 3, 2])


In [43]:
print(embedding_layer(torch.tensor([3])))


tensor([[-1.0153,  0.9791,  1.5577]], grad_fn=<EmbeddingBackward0>)


In [44]:
print(embedding_layer(input_ids))


tensor([[-0.5494,  0.7509,  1.7671],
        [-1.0824,  2.5830, -0.3784],
        [-1.0153,  0.9791,  1.5577],
        [-0.6681, -0.4460, -0.4942]], grad_fn=<EmbeddingBackward0>)


## Creating input embeddings for the verdict story

In [45]:
vocab_size = 50257
output_dim = 256  # Embedding size

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [46]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)


In [47]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)


Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [48]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)
# 8 input sequences in the batch, 4 tokens in each sequence, 256 embedding dimensions per token.


torch.Size([8, 4, 256])


# Positional Embedding

<img src="./images/LLMs_text_processing-same_embedding_vector.png" alt="Context tokens" width="500" height="500">

- Note: The self-attention mechanism has no built-in notion of token position or order. 
- The token embedding layer doesn't inject such information either: it maps a token ID to the same vector wherever the token appears, hence the input embeddings at this point are position-independent. 
- Therefore, we need to add token positional information to the input embeddings.

### OpenAI GPT-2 uses absolute position embeddings.

In [49]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)  # APE layer


In [50]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)


torch.Size([4, 256])


<img src="./images/LLMs_text_processing-Positional_embedding.png" alt="Context tokens" width="800" height="300">

In [51]:
input_embeddings = token_embeddings + pos_embeddings  # element-wise addition; the (4, 256) positional embeddings are broadcast over the batch of 8
print(input_embeddings.shape)


torch.Size([8, 4, 256])


<img src="./images/LLMs_text_processing-Full_text_processing.png" alt="Context tokens" width="700" height="600">

### Now we are done with the preprocessing step, step one of stage one.

# Great Job!