## Download Example text

In [3]:
import inspect
from pathlib import Path

from llm_from_scratch.ch_02.text_file import TextFile

url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)

# Resolve the data directory relative to the package instead of hard-coding a
# machine-specific absolute path, so the notebook runs on any checkout.
# Assumes TextFile is defined in <package>/ch_02/text_file.py (as the import
# path suggests), which makes parents[1] the package root -- verify if the
# class is ever re-exported from another module.
package_root = Path(inspect.getfile(TextFile)).resolve().parents[1]
data_dir = package_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists

# Passed as str, matching the type the original call handed to TextFile.
file_path = str(data_dir / "the-verdict.txt")
print(file_path)

tf = TextFile(url, file_path)
tf.download()

/Users/mustafa/Workspaces/learning/llms-from-scratch/src/llm_from_scratch/data/the-verdict.txt


## Simple Tokenizer V1

### Encode

In [4]:
from llm_from_scratch.ch_02.tokenizer import SimpleTokenizerV1

# Load the downloaded text; tf.raw_text is read right afterwards to build
# the tokenizer's vocabulary.
tf.load()
tokenizer = SimpleTokenizerV1(tf.raw_text)

# Sample passage. The explicit "\n" keeps the line break, and the spaces on
# either side of it, visible instead of hiding them in a triple-quoted literal.
text = (
    '"It\'s the last he painted, you know," \n'
    ' Mrs. Gisburn said with pardonable pride.'
)
ids = tokenizer.encode(text)
print(ids)

File: /Users/mustafa/Workspaces/learning/llms-from-scratch/src/llm_from_scratch/data/the-verdict.txt
Length: 20479 characters

First 500 characters:
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it'

... (truncated)
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28

### Decode

In [None]:
# Turn the token IDs produced by the encode cell back into text.
print(tokenizer.decode(ids))

### SimpleTokenizerV1: Encoding including Unknown Tokens

In [None]:
# This sentence presumably contains words that are missing from the vocabulary
# built from the story, so the V1 tokenizer is expected to fail on it. The
# failure is the point of the demonstration, so it is caught and displayed.
text = "Hello, do you like tea?"
try:
    ids = tokenizer.encode(text)
except Exception as e:
    print(f"Error encoding text: {e}")
else:
    print(ids)

## Simple Tokenizer V2
### Encode

In [None]:
from llm_from_scratch.ch_02.tokenizer import SimpleTokenizerV2

# Rebuild the tokenizer from the same raw text, this time with the V2 class.
tokenizer = SimpleTokenizerV2(tf.raw_text)

# Same sample passage as in the V1 section. The explicit "\n" keeps the line
# break and its surrounding spaces visible.
text = (
    '"It\'s the last he painted, you know," \n'
    ' Mrs. Gisburn said with pardonable pride.'
)

ids = tokenizer.encode(text)
print(ids)

### Decode

In [None]:
# Decode the IDs from the V2 encode cell to check the round trip back to text.
print("SimpleTokenizerV2: Decoding Ids to tokens ")
print(tokenizer.decode(ids))

### Encoding including Unknown Tokens

In [None]:
# Same sentence that was used for the V1 unknown-token demonstration, now
# encoded with the V2 tokenizer without any error handling around it.
text = "Hello, do you like tea?"
ids = tokenizer.encode(text)
print(ids)

### Decoding including Unknown Tokens

In [None]:
# Decode the IDs of the unknown-token sentence to see how it reads after the round trip.
print(tokenizer.decode(ids))

### Encode and decode multiple documents

In [None]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Glue the two texts together with the <|endoftext|> marker between them.
text = text1 + " <|endoftext|> " + text2
print(text)

# Show the token IDs first, then the text recovered from a fresh encode.
print(tokenizer.encode(text))
print(tokenizer.decode(tokenizer.encode(text)))


## BytePair encoding

In [None]:
from importlib.metadata import version
import tiktoken

print("tiktoken version:", version("tiktoken"))

# From here on `tokenizer` refers to the GPT-2 byte-pair encoder.
tokenizer = tiktoken.get_encoding("gpt2")

# Adjacent string literals are concatenated with nothing in between, so the
# first fragment must end with a space; without it the sentence reads
# "terracesof someunknownPlace.".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknownPlace."
)

# tiktoken rejects special-token text by default, so <|endoftext|> has to be
# allowed explicitly for it to be encoded as the special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

# Decode the IDs again to confirm the round trip.
strings = tokenizer.decode(integers)
print(strings)

In [None]:
# Tokenize the whole story with the current tokenizer and report how many tokens it yields.
enc_text = tokenizer.encode(tf.raw_text)
print(len(enc_text))

Next, we remove the first 50 tokens from the dataset for demonstration purposes, as it results in a slightly more interesting text passage in the next steps:

In [None]:
# Skip the first 50 token IDs; the cells below work on the remaining sample.
enc_sample = enc_text[50:]

One of the easiest and most intuitive ways to create the input–target pairs for the next-word prediction task is to create two variables, x and y, where x contains the input tokens and y contains the targets, which are the inputs shifted by 1:

In [None]:
# Number of tokens in each input window.
context_size = 4

# The targets are the same window moved one token to the right.
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]
print(f"x: {x}")
print(f"y:      {y}")

By processing the inputs along with the targets, which are the inputs shifted by one position, we can create the next-word prediction tasks as follows:

In [None]:
# Grow the context one token at a time; the token right after it is the target.
for end in range(1, context_size + 1):
    print(enc_sample[:end], "---->", enc_sample[end])

In [None]:
# Same context/target pairs as in the previous cell, decoded back to text.
for end in range(1, context_size + 1):
    context_text = tokenizer.decode(enc_sample[:end])
    target_text = tokenizer.decode([enc_sample[end]])  # decode() is given a list of IDs
    print(context_text, "---->", target_text)

In [None]:
from llm_from_scratch.ch_02.gpt_dataset import create_dataloader_v1

# One sample per batch, windows of 4 tokens, stride of 1, original order kept.
dataloader = create_dataloader_v1(
    text=tf.raw_text, batch_size=1, max_length=4, stride=1,
    shuffle=False, drop_last=True,
)

# Keep the iterator in data_iter so the next cell can continue from it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

In [None]:
# Pull the next batch from the iterator created in the previous cell.
second_batch = next(data_iter)
print(second_batch)

#### Exercise 2.2 Data loaders with different strides and context sizes

In [5]:
from llm_from_scratch.ch_02.gpt_dataset import create_dataloader_v1


def show_first_two_batches(max_length, stride):
    """Print the first two batches for one max_length/stride combination.

    Builds a dataloader over ``tf.raw_text`` with ``batch_size=1``,
    ``shuffle=False`` and ``drop_last=True``, prints a label naming the two
    settings, then prints the first two batches.

    Args:
        max_length: Value forwarded as ``max_length`` to ``create_dataloader_v1``.
        stride: Value forwarded as ``stride`` to ``create_dataloader_v1``.
    """
    dataloader = create_dataloader_v1(
        text=tf.raw_text,
        batch_size=1,
        max_length=max_length,
        stride=stride,
        shuffle=False,
        drop_last=True,
    )
    data_iter = iter(dataloader)
    print(f"max_length={max_length} and stride={stride}")
    for _ in range(2):
        print(next(data_iter))


# One helper call per configuration instead of a copy-pasted block for each.
for window, step in [(2, 2), (8, 2)]:
    show_first_two_batches(max_length=window, stride=step)

max_length=2 and stride=2
[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[tensor([[2885, 1464]]), tensor([[1464, 1807]])]
max_length=8 and stride=2
[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]


#### Batch sizes

In [6]:
# Eight windows per batch. The stride equals max_length (4), which avoids any
# overlap between the windows; more overlap could lead to increased overfitting.
dataloader = create_dataloader_v1(
    text=tf.raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Each batch unpacks into an inputs tensor and a targets tensor.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])



### Creating token embeddings

In [7]:
import torch

# Toy setup: a vocabulary of only 6 tokens, embedded into 3 dimensions.
vocab_size, output_dim = 6, 3

# Four example token IDs, kept small for the sake of simplicity.
input_ids = torch.tensor([2, 3, 5, 1])

# Seed right before creating the layer so its random initial weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


The weight matrix of the embedding layer contains small, random values. These values are optimized during LLM training as part of the LLM optimization itself. Moreover, we can see that the weight matrix has six rows and three columns. There is one row for each of the six possible tokens in the vocabulary, and there is one column for each of the three embedding dimensions.

In [8]:
# Look up a single token ID: the result matches the row at index 3 of the weight matrix printed above.
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


The embedding layer is essentially a lookup operation that retrieves rows from the embedding layer’s weight matrix via a token ID.

In [9]:
# Embed all four input_ids at once; each output row is the weight-matrix row for that ID.
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


#### Positional embeddings

In [10]:
# Embedding table sized for the GPT-2 vocabulary (50257 tokens), with
# 256 dimensions per token.
vocab_size, output_dim = 50257, 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

# Sample batches of 8 windows with 4 tokens each. Once every token is embedded
# into a 256-dimensional vector, such a batch becomes an 8 x 4 x 256 tensor.
max_length = 4
dataloader = create_dataloader_v1(
    text=tf.raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,  # no overlap between inputs
    shuffle=False,
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [11]:
# Map every token ID in the batch to its embedding vector; this adds a trailing
# dimension of size output_dim (256) to the batch shape.
token_embeddings = token_embedding_layer(inputs)

print(token_embeddings.shape)

torch.Size([8, 4, 256])


#### GPT model’s absolute embedding approach

In [12]:
# One vector per position in the context window, with the same width as the
# token embeddings so the two can be added together.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# The positions are simply 0 .. context_length - 1.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [13]:
# pos_embeddings has shape (4, 256) and token_embeddings has shape (8, 4, 256), so the
# addition broadcasts: the same positional vectors are added to each of the 8 sequences.
# Every token vector then encodes its position in the sequence as well.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
