In [9]:
import torch
import tiktoken

- As the first step, the raw text is broken into tokens, which can be words or characters.
- Then, the tokens are converted into integer representations, termed token IDs.

In [None]:
# Load the complete short story into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the corpus size and preview its first 99 characters.
print("total number of characters: ", len(raw_text))
print(raw_text[:99])

total number of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re
# Split on whitespace, "--" and a fixed set of punctuation characters. The
# capture group makes re.split keep the delimiters as separate pieces.
# NOTE(review): ':' and ';' are not in the delimiter set, so they stay
# attached to the neighbouring word (e.g. 'Carlo;' becomes one token).
split_pieces = re.split(r'([,.?_!"()\']|--|\s)', raw_text)

# Discard whitespace-only and empty pieces; strip the remaining ones.
preprocessed = [piece.strip() for piece in split_pieces if piece.strip()]
print(len(preprocessed))
print(preprocessed[:30])

4649
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [None]:
# Deduplicate the tokens and sort them so that ID assignment is deterministic.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1159


In [None]:
# Map every token to its index in the sorted vocabulary list.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 52 (token, ID) pairs.
for position, entry in enumerate(vocab.items()):
    if position >= 52:
        break
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)
('Her', 51)


In [None]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-ID vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . ? _ ! " ( ) '``. Every resulting token must already exist in the
    vocabulary.

    NOTE(review): ``:`` and ``;`` are not delimiters, so they stay attached
    to the neighbouring word (``"Carlo;"`` is a single token).
    """

    def __init__(self, vocab) -> None:
        """Keep ``vocab`` (token -> ID) and build the inverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((integer, token) for token, integer in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:  # skip whitespace-only and empty pieces
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then drop the whitespace
        that ends up in front of ``, . ? ! " ( ) '``."""
        tokens = [self.int_to_str[token_id] for token_id in ids]
        spaced = " ".join(tokens)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
text = "She raised her eyebrows with a hint of good-humoured surprise."
ids = tokenizer.encode(text)
print(ids)

[91, 837, 547, 423, 1136, 119, 558, 738, 510, 986, 7]


In [None]:
print(tokenizer.decode(ids))

She raised her eyebrows with a hint of good-humoured surprise.


### =====================
- Special tokens, such as <|unk|> and <|endoftext|>, can be added to enhance the model's understanding and handle various contexts, such as unknown words or marking the boundary between unrelated texts.

In [None]:
# Append the two special tokens after the sorted vocabulary so that they
# receive the two highest IDs.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = {token: index for index, token in enumerate(all_tokens)}
print(len(vocab))

1161


In [None]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . ? _ ! " ( ) '``. Tokens missing from the vocabulary are replaced by
    the ``"<|unk|>"`` entry, which the vocabulary is expected to contain.

    NOTE(review): ``:`` and ``;`` are not delimiters, so a word followed by
    one of them is looked up together with that character.
    """

    def __init__(self, vocab) -> None:
        """Keep ``vocab`` (token -> ID) and build the inverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((integer, token) for token, integer in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:  # skip whitespace-only and empty pieces
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then drop the whitespace
        that ends up in front of ``, . ? ! " ( ) '``."""
        tokens = [self.int_to_str[token_id] for token_id in ids]
        spaced = " ".join(tokens)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [None]:
tokenizer = SimpleTokenizerV2(vocab)
text  = "When should you build an agent?"
ids = tokenizer.encode(text)
print(ids)

[114, 904, 1155, 1160, 161, 1160, 10]


In [None]:
print(tokenizer.decode(ids))

When should you <|unk|> an <|unk|>?


In [None]:
tokenizer = SimpleTokenizerV2(vocab)
text1  = "In its most fundamental form, an agent consists of three core components:"
text2 = "Here’s what this looks like in code when using OpenAI"
text = " <|endoftext|> ".join((text1,text2))
ids = tokenizer.encode(text)
print(ids)

[57, 598, 702, 1160, 472, 5, 161, 1160, 1160, 738, 1030, 1160, 1160, 1159, 1160, 1116, 1024, 1160, 642, 579, 1160, 1117, 1160, 1160]


In [None]:
print(tokenizer.decode(ids))

In its most <|unk|> form, an <|unk|> <|unk|> of three <|unk|> <|unk|> <|endoftext|> <|unk|> what this <|unk|> like in <|unk|> when <|unk|> <|unk|>


In [None]:
import tiktoken

: 

- The byte pair encoding (BPE) tokenizer used for LLMs like GPT-2 and GPT-3 can efficiently handle unknown words by breaking them down into subword units or individual characters.

In [None]:
import tiktoken
from importlib.metadata import version
print("tiktoken version:", version("tiktoken"))

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     " of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [None]:
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.'

- We use a sliding window approach on tokenized data to generate input-target pairs for LLM training.

In [None]:
# Re-read the story so this cell does not depend on the earlier load cell.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Encode the whole corpus with the current `tokenizer` and report the token count.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [None]:
# Skip the first 50 tokens, then form one input/target pair: the target is the
# input window shifted one token to the right.
enc_sample = enc_text[50:]
context_size = 4
x = enc_sample[0:context_size]
y = enc_sample[1:1 + context_size]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

In [4]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once with ``tokenizer.encode``. A window of
    ``max_length`` token IDs is then moved over the result in steps of
    ``stride``; each input window is paired with a target window shifted one
    token to the right.

    If the text yields ``max_length`` tokens or fewer, no window fits and the
    dataset is empty.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        self.tokenizer = tokenizer

        token_ids = tokenizer.encode(text)
        # The last start index must leave room for the shifted target window,
        # hence the upper bound of len(token_ids) - max_length.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) tensor pair."""
        return self.input_ids[idx], self.target_ids[idx]

In [5]:
def create_dataloader_v1(text,
                         batch_size=4,
                         max_length=256,
                         stride=128,
                         shuffle=True,
                         drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenized with tiktoken's "gpt2" encoding and windowed by
    GPTDatasetV1 using ``max_length`` and ``stride``. The remaining arguments
    (``batch_size``, ``shuffle``, ``drop_last``, ``num_workers``) are passed
    through to ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(text, bpe_tokenizer, max_length, stride)
    return DataLoader(windowed_dataset, **loader_options)



In [10]:
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# With batch_size=1 and stride=1, each batch is the previous window moved
# forward by exactly one token; shuffle=False keeps them in text order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


### Embedding

- tokenizing text -> convert tokens to token IDs -> embedding vectors
<img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/15.webp" width="400px">

- initialize embedding weights with random values
- train embedding weights with backpropagation later (the values will be optimized during LLM training)

For example, we have vocab_size = 6 and embedding_size = 3

In [3]:
vocab_size = 6
output_dim = 3

In [10]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer)
print(embedding_layer.weight)
print("third word in vocab has embedding weight of: \n")
print(embedding_layer(torch.tensor([2])))
print("===========================================================================")

Embedding(6, 3)
Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
third word in vocab has embedding weight of: 

tensor([[ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)


In [11]:
input_ids = torch.tensor([2, 3, 5, 1])
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


- The embedding layer is a look-up operation that retrieves rows from the embedding layer's weight matrix via token ID
- Embedding layer is just a more efficient implementation equivalent to the one-hot encoding and matrix-multiplication approach, it can be seen as a neural network layer that can be optimized via backpropagation.


#### Encode positional information

- A minor shortcoming of LLMs is that their self-attention mechanism, which will be covered later, doesn't have a notion of position or order for the tokens within a sequence.
- The way the previously introduced embedding layer works is that the same token ID always gets mapped to the same vector representation, regardless of where the token ID is positioned in the input sequence

There are two categories of position-aware embedding:
- Relative positional embedding
- Absolute positional embedding

Absolute positional embedding:
- For each position in the input sequence, a unique embedding is added to the token's embedding to convey its exact location

Relative positional embedding
- the emphasis of relative positional embeddings is on the relative position or distance between tokens. This means the model learns the relationships in terms of "how far apart" rather than "at which exact position."
- The advantage here is that the model can generalize better to sequences of varying lengths, even if it hasn't seen such lengths during training.

The choice between them often depends on the specific application and the nature of the data being processed.

OpenAI's GPT models use absolute positional embeddings that are optimized during the training process rather than being fixed or predefined like the positional encodings in the original Transformer model.

Positional embeddings are added to the token embedding vector to create the input embeddings for an LLM (The positional vectors have the same dimension as the original token embeddings).

<img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/18.webp" width="500px">

In GPT-3, the embedding size is 12,288 dimensions (here we use 256 to keep the example small).
Also, vocab_size = 50,257.

In [6]:
# Seed the RNG first: the embedding weights are initialised randomly, and
# without a seed they differ on every kernel restart. The toy embedding
# example in this notebook is seeded with the same value.
torch.manual_seed(123)
vocab_size = 50257  # vocabulary size quoted in the text above
output_dim = 256    # embedding width used for this example
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [13]:
max_length = 4

# stride == max_length, so consecutive windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

# Look up one embedding vector per token ID in the batch.
token_embeddings = token_embedding_layer(inputs)
print("\nToken Embedding shape:\n", token_embeddings.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])

Token Embedding shape:
 torch.Size([8, 4, 256])


- The data batch consists of 8 text samples with 4 tokens each, and each token is represented by a 256-dimensional embedding vector

- For a GPT model's absolute embedding approach, we just need to create another embedding layer that has the same dimension as the token_embedding_layer

In [15]:
# One learnable vector per position in the context window, with the same
# width as the token embeddings so the two can be added.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Position indices 0 .. context_length - 1, i.e. tensor([0, 1, 2, 3]) here.
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
print(pos_embeddings.shape)

torch.Size([4, 256])


In [19]:
# PyTorch broadcasting: pos_embeddings has shape [4, 256] and token_embeddings
# has shape [8, 4, 256] (see the outputs above), so the same positional
# vectors are added to each of the 8 samples in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


<img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/19.webp" width="400px">