<a href="https://colab.research.google.com/github/lizhieffe/canonical_llm_impl/blob/main/LLM_from_scratch_chap_02_Process_Text_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Tutorial for "LLM from Scratch" Chapter 02

https://drive.google.com/drive/u/1/folders/1a9jbhCJr_dddOT-m-4G9MgBTpOdaCs7Q

In [None]:
# @title Install Dependencies
!pip install uv && !uv pip install --system -r https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/requirements.txt

Collecting uv
  Downloading uv-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.7.0
/bin/bash: line 1: !uv: command not found


## Download training data text

In [None]:
import urllib.request
# Fetch the chapter's sample text from the rasbt/LLMs-from-scratch repository
# and save it in the working directory under the same file name.
file_path = "the-verdict.txt"
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7e4089b8e650>)

In [None]:
import re
def get_tokens(text: str):
  """Split `text` into word and punctuation tokens.

  The text is cut at whitespace, at "--" and at each of the characters
  , . : ; ? _ ! " ( ) '. The delimiters themselves are kept as tokens;
  whitespace-only and empty pieces are dropped.

  Args:
      text: The raw string to tokenize.

  Returns:
      The list of non-empty, whitespace-stripped tokens in order.
  """
  pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
  result = []
  for piece in pieces:
    stripped = piece.strip()
    if stripped:
      result.append(stripped)
  return result

# Load the downloaded text into memory and tokenize it with the regex splitter.
with open(file_path, mode="r", encoding="utf-8") as text_file:
  raw_text = text_file.read()

tokens = get_tokens(raw_text)

# Report the token count and preview the first 100 tokens.
print(f"# of words: {len(tokens)}")
print(tokens[:100])

# of words: 4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--']


## Build Vocab

In [None]:
# @title Simple Vocab V1

# The vocabulary is every distinct token, sorted, numbered by its position.
all_words = sorted(set(tokens))
vocab_size = len(all_words)
print(f"{vocab_size=}")

vocab: dict[str, int] = dict(zip(all_words, range(vocab_size)))

# Preview the first 11 entries of the mapping.
for i in range(11):
  word = all_words[i]
  print(f"{word} -> {vocab[word]}")

class SimpleTokenzierV1:
  """Regex-based tokenizer backed by a fixed vocabulary.

  Out-of-vocabulary tokens are not supported: `encode` raises `KeyError`
  when the text contains a token that is missing from the vocabulary.
  """

  def __init__(self, vocab: dict[str, int]):
    """Ctor.

    Args:
        vocab: String (token) to int mapping.
    """
    self.str_to_int = vocab
    # Reverse mapping used by decode().
    self.int_to_str = {v:k for k, v in vocab.items()}

  def encode(self, text: str) -> list[int]:
    """Convert text into a list of token ids.

    Args:
        text: The string to tokenize with `get_tokens`.

    Returns:
        One id per token, in order.

    Raises:
        KeyError: If a token is not in the vocabulary.
    """
    return [self.str_to_int[token] for token in get_tokens(text)]

  def decode(self, tokens: list[int]) -> str:
    """Convert token ids back into a string.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed.

    Args:
        tokens: Token ids produced by `encode`.

    Returns:
        The reconstructed text.

    Raises:
        KeyError: If an id is not in the vocabulary.
    """
    text = " ".join(self.int_to_str[it] for it in tokens)
    # ':' and ';' are listed too: the tokenizer splits them into their own
    # tokens, so without them the join would leave "a ; b" instead of "a; b".
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

# Test: round-trip one sentence through the V1 tokenizer and compare it with
# the original text.
simple_tokenizer_v1 = SimpleTokenzierV1(vocab)
text = "It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."

encoded = simple_tokenizer_v1.encode(text)
print(f"{encoded=}")

decoded = simple_tokenizer_v1.decode(encoded)
print(f"{decoded=}")
print(f"{text=}")


vocab_size=1130
! -> 0
" -> 1
' -> 2
( -> 3
) -> 4
, -> 5
-- -> 6
. -> 7
: -> 8
; -> 9
? -> 10
encoded=[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]
decoded="It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride."
text="It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."


In [None]:
# @title Simple Vocab V2

# Same sorted vocabulary as V1, with the two special tokens appended at the end
# so they receive the two highest ids.
all_words = sorted(set(tokens)) + ["<|endoftext|>", "<|unk|>"]
vocab_size = len(all_words)
print(f"{vocab_size=}")

vocab: dict[str, int] = dict(zip(all_words, range(vocab_size)))

# Preview the last 10 entries, which include the special tokens.
for i in range(vocab_size - 10, vocab_size):
  word = all_words[i]
  print(f"{word} -> {vocab[word]}")

class SimpleTokenzierV2:
  """Regex-based tokenizer with special-token support.

  Compared to V1, this supports extra things:
  1) Out-of-vocabulary tokens, which are encoded as <|unk|>.
  2) EOS (end of sequence), marked in the text with <|endoftext|>.

  Both special tokens must be present in the vocabulary passed to the ctor.
  """

  def __init__(self, vocab: dict[str, int]):
    """Ctor.

    Args:
        vocab: String (token) to int mapping.
    """
    self.str_to_int = vocab
    # Reverse mapping used by decode().
    self.int_to_str = {v:k for k, v in vocab.items()}

  def encode(self, text: str) -> list[int]:
    """Convert text into a list of token ids.

    Args:
        text: The string to tokenize with `get_tokens`.

    Returns:
        One id per token, in order; unknown tokens map to the id of <|unk|>.

    Raises:
        KeyError: If an unknown token is met and <|unk|> is not in the
            vocabulary.
    """
    return [
        self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
        for token in get_tokens(text)
    ]

  def decode(self, tokens: list[int]) -> str:
    """Convert token ids back into a string.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed.

    Args:
        tokens: Token ids produced by `encode`.

    Returns:
        The reconstructed text.

    Raises:
        KeyError: If an id is not in the vocabulary.
    """
    text = " ".join(self.int_to_str[it] for it in tokens)
    # ':' and ';' are listed too: the tokenizer splits them into their own
    # tokens, so without them the join would leave "a ; b" instead of "a; b".
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

# Test: two unrelated sentences separated by the end-of-text marker. Words that
# are missing from the vocabulary should come back as <|unk|>.
simple_tokenizer_v2 = SimpleTokenzierV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f"{text1} <|endoftext|> {text2}"
print(text)

encoded = simple_tokenizer_v2.encode(text)
print(f"{encoded=}")

decoded = simple_tokenizer_v2.decode(encoded)
print(f"{decoded=}")


vocab_size=1132
year -> 1122
years -> 1123
yellow -> 1124
yet -> 1125
you -> 1126
younger -> 1127
your -> 1128
yourself -> 1129
<|endoftext|> -> 1130
<|unk|> -> 1131
Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
encoded=[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
decoded='<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'


### BPE Vocab (Byte pair encoding)

Used by GPT-2 and GPT-3. The `tiktoken` library provides an implementation.

BPE builds its vocabulary by iteratively merging frequent characters into
subwords and frequent subwords into words.

In [None]:
!uv pip install tiktoken

from importlib.metadata import version
import tiktoken
print(f"{version('tiktoken')=}")

# Load the BPE encoding named "gpt2" from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
print(f"{tokenizer.name=}")

# Adjacent string literals are concatenated with nothing in between, so the
# first piece must end with a space to keep "terraces" and "of" separate words.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknownPlace."
)
# The end-of-text marker has to be allowed explicitly to be encoded as a
# special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(f"{integers=}")

# Decode the ids back to text to check the round trip.
decoded = tokenizer.decode(integers)
print(f"{decoded=}")

[2mUsing Python 3.11.12 environment at: /usr[0m
[2K[2mResolved [1m7 packages[0m [2min 366ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0

# Build Training dataset and dataloader

- Use BPE tokenizer.

In [None]:
# Encode the whole training text with the BPE tokenizer.
enc_text = tokenizer.encode(raw_text)
# Report the length of enc_text itself, not of `encoded`, which is left over
# from the earlier SimpleTokenzierV2 cell.
print(f"{len(enc_text)=}")

# Skip the first 50 token ids before taking the sample window.
enc_sample = enc_text[50:]

# x is the input window; y is the same window shifted one token to the right,
# so y[i] is the token that follows x[:i+1].
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y: \t{y}\n")

# Show each growing context next to the token it should predict.
for i in range(context_size):
  context = x[:i+1]
  target = y[i]
  print(f"{tokenizer.decode(context)} ---------> {tokenizer.decode([target])}")

len(encoded)=16
x: [290, 4920, 2241, 287]
y: 	[4920, 2241, 287, 257]

 and --------->  established
 and established --------->  himself
 and established himself --------->  in
 and established himself in --------->  a


In [None]:
# @title Use Pytorch Dataset and DataLoader

import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is encoded once with `tokenizer.encode`. A window of `max_length`
  ids is then moved over the result in steps of `stride`; each target is the
  input window shifted right by one token.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Encode `txt` and pre-compute every window.

    Args:
        txt: The text to encode.
        tokenizer: Object whose `encode(txt)` returns a sequence of token ids.
        max_length: Number of token ids in each input/target window.
        stride: Distance between the start positions of consecutive windows.
    """
    token_ids = tokenizer.encode(txt)

    # The largest start is len(token_ids) - max_length - 1, so the target
    # slice ends at most at len(token_ids) and never runs past the end.
    starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
        torch.tensor(token_ids[start:start + max_length]) for start in starts
    ]
    self.target_ids = [
        torch.tensor(token_ids[start + 1:start + max_length + 1])
        for start in starts
    ]

  def __len__(self):
    """Return the number of windows."""
    return len(self.input_ids)

  def __getitem__(self, index):
    """Return the (input, target) tensor pair at `index`."""
    return self.input_ids[index], self.target_ids[index]

# Test: build the dataset with 8-token windows and a stride of 1, then decode
# one sample to read the input/target pair as text.
dataset = GPTDatasetV1(raw_text, tokenizer, max_length=8, stride=1)
print(f"{len(dataset)=}")

context_ids, target_ids = dataset[2]
context = tokenizer.decode(context_ids.numpy())
target = tokenizer.decode(target_ids.numpy())
print(f"{context=}  {target=}")

len(dataset)=5137
context='AD always thought Jack Gisburn rather'  target=' always thought Jack Gisburn rather a'


In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
  """Build a DataLoader over sliding windows of GPT-2 BPE token ids.

  Args:
      txt: The text to encode and window.
      batch_size: Passed through to DataLoader.
      max_length: Window length in tokens, passed to GPTDatasetV1.
      stride: Step between consecutive windows, passed to GPTDatasetV1.
      shuffle: Passed through to DataLoader.
      drop_last: Passed through to DataLoader.
      num_workers: Passed through to DataLoader.

  Returns:
      A DataLoader wrapping a GPTDatasetV1 built from `txt`.
  """
  gpt2_tokenizer = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)
  return DataLoader(
      windows,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )

# Test: with shuffle off and stride 1, consecutive rows are the same window
# moved one token to the right.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=2,
    max_length=max_length,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)

first_batch = next(data_iter)
print(f"{first_batch=}\n")

second_batch = next(data_iter)
print(f"\n{second_batch=}\n")

first_batch=[tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807]]), tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])]


second_batch=[tensor([[2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]]), tensor([[1464, 1807, 3619,  402],
        [1807, 3619,  402,  271]])]



# Token Embedding

In [None]:
# Fix the RNG seed before the layer is created so its random initial weights
# are reproducible.
torch.manual_seed(123) # reproducibility

# One embedding row per id in the tokenizer's vocabulary.
EMBEDDING_DIM = 256
vocab_size = tokenizer.n_vocab
print(f"{vocab_size=}")

embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=EMBEDDING_DIM,
)
print(f"{embedding_layer.weight.shape=}")

vocab_size=50257
embedding_layer.weight.shape=torch.Size([50257, 256])


In [None]:
# Embed the input half of the first batch and display the resulting shape:
# one EMBEDDING_DIM-sized vector per token id.
embedding_layer(first_batch[0]).shape # [B, S, H_DIM]

torch.Size([2, 4, 256])

In [None]:
# Print the embedded input batch itself; rows for repeated token ids are equal.
print(embedding_layer(first_batch[0]))

tensor([[[-0.0640,  0.3317,  0.1070,  ...,  0.5349, -0.8024, -2.3238],
         [-0.3525,  0.3509,  0.9873,  ..., -1.8466, -1.7034,  0.3223],
         [ 1.0017,  0.9299, -1.2633,  ..., -1.2256,  1.1179,  0.1343],
         [ 0.7996,  2.2837, -0.6525,  ..., -1.1217,  0.4706,  0.1531]],

        [[-0.3525,  0.3509,  0.9873,  ..., -1.8466, -1.7034,  0.3223],
         [ 1.0017,  0.9299, -1.2633,  ..., -1.2256,  1.1179,  0.1343],
         [ 0.7996,  2.2837, -0.6525,  ..., -1.1217,  0.4706,  0.1531],
         [-0.1082, -1.2723, -1.2217,  ..., -0.9199,  2.0073, -1.4138]]],
       grad_fn=<EmbeddingBackward0>)


# Positional Encoding

- OpenAI’s GPT models use **absolute** positional embeddings that are **optimized during training**, rather than being fixed or predefined like the positional
encodings in the original transformer model.

In [None]:
# One learnable embedding row per position in the context window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, EMBEDDING_DIM)

# Position indices 0 .. context_length - 1.
positions = torch.arange(context_length)
print(f"{positions=}")

pos_embeddings = pos_embedding_layer(positions)
print(f"pos_embedding_layer(positions).shape={pos_embeddings.shape!r}")
print(f"{pos_embeddings.shape=}")

assert pos_embeddings.shape == (context_length, EMBEDDING_DIM)

positions=tensor([0, 1, 2, 3])
pos_embedding_layer(positions).shape=torch.Size([4, 256])
pos_embeddings.shape=torch.Size([4, 256])


In [None]:
# The positional embeddings have the per-sample shape of the token embeddings,
# so adding them broadcasts over the batch dimension without changing the shape.
token_embeddings = embedding_layer(first_batch[0])
assert token_embeddings.shape[1:] == pos_embeddings.shape
assert (token_embeddings + pos_embeddings).shape == token_embeddings.shape