# Setup

In [45]:
# noqa: PLR0913
import re
import urllib.request
from pathlib import Path

import tiktoken
import torch
from dataset import GPTDatasetV1
from tokenizer import REGEX_PATTERN, SimpleTokenizerV1, SimpleTokenizerV2

# Word tokenizers (preprocessing)

In [46]:
# Location of the short-story text used as the corpus in the rest of the notebook.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)
file_path = "the-verdict.txt"

# Download only when no local copy exists, so re-running the cell does not hit the network again.
if not Path(file_path).exists():
    urllib.request.urlretrieve(url, file_path)

# Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with Path(file_path).open("r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20480
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Splitting text into tokens

In [47]:
# Run the same split-and-filter step on two sample sentences.
for some_text in (
    "Hello, world. This, is a test.",
    "Hello, world. Is this-- a test?",
):
    pieces = re.split(REGEX_PATTERN, some_text, flags=re.IGNORECASE)
    # Keep only pieces that are non-empty and not pure whitespace.
    result = [piece for piece in pieces if piece and piece.strip()]
    print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']
['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


### Applying regex to short story

In [48]:
# Split the whole story with the shared pattern and keep only pieces that are
# non-empty and not pure whitespace.
pre_prcessed = [
    piece
    for piece in re.split(REGEX_PATTERN, raw_text, flags=re.IGNORECASE)
    if piece and piece.strip()
]
print(len(pre_prcessed))
print(pre_prcessed[:30])

4649
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Assigning ids to tokens

In [49]:
# De-duplicate the tokens, then sort them in place.
all_words = list(set(pre_prcessed))
all_words.sort()
# The vocabulary size is the number of distinct tokens.
vocab_size = len(all_words)
vocab_size

1159

In [50]:
# Map every token to an integer id equal to its position in `all_words`.
vocab = dict(zip(all_words, range(len(all_words))))
max_tokens = 50
# Preview the entries at indices 0 through `max_tokens` inclusive.
for item in list(vocab.items())[: max_tokens + 1]:
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)


## Using simple tokenizer

### Encoding

In [51]:
# Build the first tokenizer from the token -> id vocabulary created above.
tokenizer = SimpleTokenizerV1(vocab)

# Sample passage; the line break and indentation inside the literal are part of the input text.
some_text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
# Convert the passage to token ids and display them as the cell output.
ids = tokenizer.encode(some_text)
ids

[1,
 58,
 2,
 872,
 1013,
 615,
 541,
 763,
 5,
 1155,
 608,
 5,
 1,
 69,
 7,
 39,
 873,
 1136,
 773,
 812,
 7]

### Decoding

In [52]:
# Turn the ids from the encoding cell back into text and display the result.
result = tokenizer.decode(ids)
result

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### Applying it to previously unseen text

In [53]:
some_text = "Hello, do you like tea?"
# Left disabled on purpose: per the original author's note, the call below raises
# because "Hello" is not part of the vocabulary.
# print(tokenizer.encode(some_text))

### Adding special tokens `<|unk|>` and `<|endoftext|>`

In [54]:
# Sorted unique tokens followed by the two special tokens, which therefore get the highest ids.
all_tokens = [*sorted(set(pre_prcessed)), "<|endoftext|>", "<|unk|>"]
# Rebuild the token -> id mapping over the extended token list.
vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

1161

In [55]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
# Place the end-of-text marker, padded with one space on each side, between the two sample texts.
separator = " <|endoftext|> "
text = separator.join([text1, text2])
text

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [56]:
# Switch to the second tokenizer, built on the vocabulary that includes the special tokens.
tokenizer = SimpleTokenizerV2(vocab)
# Encode once, then decode those same ids to show the round trip.
round_trip_ids = tokenizer.encode(text)
print(round_trip_ids)
print(tokenizer.decode(round_trip_ids))

[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


# BPE Tokenizer


In [57]:
# Load the GPT-2 encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
# "<|endoftext|>" is passed in `allowed_special` for this call.
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print("Tokens and their text:")
# Decode each id on its own to show which piece of text it stands for.
for token_id in token_ids:
    print(f"Token ID: {token_id}, Text: {tokenizer.decode([token_id])}")

Tokens and their text:
Token ID: 15496, Text: Hello
Token ID: 11, Text: ,
Token ID: 466, Text:  do
Token ID: 345, Text:  you
Token ID: 588, Text:  like
Token ID: 8887, Text:  tea
Token ID: 30, Text: ?
Token ID: 220, Text:  
Token ID: 50256, Text: <|endoftext|>
Token ID: 554, Text:  In
Token ID: 262, Text:  the
Token ID: 4252, Text:  sun
Token ID: 18250, Text: lit
Token ID: 8812, Text:  terr
Token ID: 2114, Text: aces
Token ID: 286, Text:  of
Token ID: 617, Text:  some
Token ID: 34680, Text: unknown
Token ID: 27271, Text: Place
Token ID: 13, Text: .


In [58]:
# Decode the full id sequence in one call and print the resulting text.
strings = tokenizer.decode(token_ids)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


## BPE encoding for unknown words exercise

In [59]:
# Load the GPT-2 encoding again so this cell does not rely on the earlier BPE cell.
tokenizer = tiktoken.get_encoding("gpt2")
# Made-up words, used to see how the encoding handles text it has no whole-word entry for.
text = "Rouchi malaga proutokoul !! Whesh manakol!"
token_ids = tokenizer.encode(text)
print(token_ids)
print("Tokens and their text:")
# Decode each id individually to show the piece of text it represents.
for token_id in token_ids:
    print(f"Token ID: {token_id}, Text: {tokenizer.decode([token_id])}")

[49, 7673, 72, 6428, 8126, 778, 448, 482, 2852, 37867, 370, 956, 71, 582, 461, 349, 0]
Tokens and their text:
Token ID: 49, Text: R
Token ID: 7673, Text: ouch
Token ID: 72, Text: i
Token ID: 6428, Text:  mal
Token ID: 8126, Text: aga
Token ID: 778, Text:  pr
Token ID: 448, Text: out
Token ID: 482, Text: ok
Token ID: 2852, Text: oul
Token ID: 37867, Text:  !!
Token ID: 370, Text:  W
Token ID: 956, Text: hes
Token ID: 71, Text: h
Token ID: 582, Text:  man
Token ID: 461, Text: ak
Token ID: 349, Text: ol
Token ID: 0, Text: !


## Sliding window data loader

The sliding window technique is a fundamental concept in language modeling that helps us create training data for predicting the next word in a sequence. Here's why it's important:


1. **Context Window**: Language models need to understand the context of words to make predictions. A sliding window creates fixed-size sequences of tokens that serve as the context.


2. **Sequential Learning**: By sliding a window over the text, we create multiple training examples from a single text, where each example teaches the model to predict the next token based on the previous ones.


3. **Data Efficiency**: This approach allows us to generate many training examples from a limited amount of text, making efficient use of our data.

Let's see how this works in practice with our encoded text.

In this example, we'll implement a sliding window data loader that creates context-target pairs from our tokenized text. This technique allows us to generate training examples where the model learns to predict the next token based on a sequence of previous tokens.

In [60]:
# Encode the full story with the tokenizer currently bound to `tokenizer`
# (the tiktoken GPT-2 encoding when the notebook is run top to bottom).
encoded_text = tokenizer.encode(raw_text)
# Skip the first 50 token ids; the sample starts at index 50.
encoded_sample = encoded_text[50:]
# Note: this reports the length of the full encoding, not of the sample.
print(len(encoded_text))

5146


### Creating Context-Target Pairs

Now that we have our encoded text, we'll create pairs of context and target tokens. This is where the actual sliding window comes into play:

1. **Context Size**: We define a fixed context size (in our example, 4 tokens) that determines how many previous tokens the model can "see" to make its prediction.

2. **Input-Output Pairs**: For each position in the text, we create:
   - An input sequence (x) containing the context tokens
   - A target sequence (y) containing the tokens we want the model to predict

This creates a supervised learning problem where the model learns to predict the next token given a sequence of previous tokens.

In [61]:
# Number of token ids in the example input window.
context_size = 4
# x is the first `context_size` ids of the sample; y is the same window shifted
# one position to the right, so y[k] is the id that follows x[k] in the sample.
x = encoded_sample[:context_size]
y = encoded_sample[1 : context_size + 1]
print(f"x: {x}")
# The extra spaces after "y:" are part of the printed string.
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


### Sliding Window Process

Let's examine how the sliding window creates training examples. For each step, we'll see:
- The current context (input sequence)
- The target token we want to predict

This visualization helps understand how the window "slides" through the text, creating multiple training examples from a single sequence. Each example teaches the model to predict the next token based on an increasingly longer context.

In [62]:
# Grow the context one token at a time; the token right after it is the one to predict.
for i in range(1, context_size + 1):
    context, desired = encoded_sample[:i], encoded_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [63]:
# Same walk as the previous cell, but showing decoded text instead of raw ids.
for i in range(1, context_size + 1):
    context = encoded_sample[:i]
    desired = encoded_sample[i]
    context_text = tokenizer.decode(context)
    # `decode` is given a list, so the single target id is wrapped in one.
    desired_text = tokenizer.decode([desired])
    print(context_text, "---->", desired_text)

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


## PyTorch dataset and dataloaders

### DataLoader factory

In [64]:
def create_data_loader_v1(  # noqa: PLR0913
    txt: str,
    batch_size: int = 4,
    context_window_size: int = 256,
    stride: int = 128,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0,
) -> torch.utils.data.DataLoader:
    """Factory function to create a DataLoader over a ``GPTDatasetV1`` built from ``txt``.

    The text is passed to ``GPTDatasetV1`` together with the GPT-2 ``tiktoken`` encoding,
    and the resulting dataset is wrapped in a ``torch.utils.data.DataLoader``.

    Args:
        txt (str): The input text to be tokenized and processed.
        batch_size (int, optional): Number of samples per batch. Defaults to 4.
        context_window_size (int, optional): Size of the context window for token sequences,
            forwarded to ``GPTDatasetV1``. Defaults to 256.
        stride (int, optional): Number of tokens to skip between consecutive windows,
            forwarded to ``GPTDatasetV1``. Defaults to 128.
        shuffle (bool, optional): Whether to shuffle the data. Defaults to True.
        drop_last (bool, optional): Whether to drop the last incomplete batch. Defaults to True.
        num_workers (int, optional): Number of worker processes for data loading. Defaults to 0.

    Returns:
        torch.utils.data.DataLoader: A PyTorch DataLoader instance wrapping the dataset.
    """
    # The encoding is fixed to GPT-2 here; callers only supply the raw text.
    tokenizer = tiktoken.get_encoding("gpt2")
    data_set = GPTDatasetV1(text=txt, tokenizer=tokenizer, context_window_size=context_window_size, stride=stride)

    return torch.utils.data.DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [65]:
# Smallest possible setup: one sample per batch, a 4-token window, stride 1, and no shuffling
# so the first batch comes from the start of the dataset.
data_loader = create_data_loader_v1(raw_text, batch_size=1, context_window_size=4, stride=1, shuffle=False)
# Pull a single batch by hand instead of looping over the loader.
data_iter = iter(data_loader)
first_batch = next(data_iter)
first_batch

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]

In [73]:
# Batch of 8 with the stride set equal to the window size (4).
data_loader = create_data_loader_v1(
    raw_text,
    batch_size=8,
    context_window_size=4,
    stride=4,
    shuffle=False,
)

# Take the first batch and unpack it into its two tensors.
data_iter = iter(data_loader)
first_pair = next(data_iter)
inputs, targets = first_pair
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
