### 2.2 Tokenizing Text

In [12]:
import os
import requests
import re

In [None]:
# Download `the-verdict.txt` once; later cells read it from disk.
url = ("https://raw.githubusercontent.com/rasbt/"
 "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
 "the-verdict.txt"
 )
file_path = "the-verdict.txt"

# Skip the network round-trip when a local copy already exists.
if not os.path.exists(file_path):
    resp = requests.get(url, timeout=30)
    # Fail here, with the HTTP status, instead of silently skipping the write
    # and letting a later cell crash on a missing file.
    if resp.status_code != 200:
        raise RuntimeError(
            f"Could not download {url}: HTTP status {resp.status_code}"
        )
    with open(file_path, "wb") as f:
        f.write(resp.content)

In [11]:
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

print("The file length:", len(raw_text), "symbols")
print(raw_text[:100])

The file length: 20479 symbols
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [34]:
text = "Hello, world! This, is-- a test?"
[item.strip() for item in re.split(r'(\b)', text) if item.strip()]

['Hello', ',', 'world', '!', 'This', ',', 'is', '--', 'a', 'test', '?']

In [36]:
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split return the delimiters too, so punctuation survives as tokens;
# whitespace-only and empty pieces are dropped after stripping.
pat = r'([,.:;?_!"()\']|--|\s)'
preprocessed = [piece for piece in map(str.strip, re.split(pat, raw_text)) if piece]

print(len(preprocessed), 'tokens')
preprocessed[:30]

4690 tokens


['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in']

### 2.3 Converting tokens into token IDs

In [43]:
all_words = sorted(set(preprocessed))
len(all_words)

1130

In [45]:
all_words[:10]

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';']

In [46]:
vocab = {token:i for i, token in enumerate(all_words)}


In [47]:

{k:v for k,v in vocab.items() if v <= 5}

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5}

In [61]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    A token that is not a key of the vocabulary makes `encode` raise
    `KeyError`; an id that is not in the reverse mapping makes `decode`
    raise `KeyError`.
    """

    def __init__(self, vocab):
        # Keep both directions of the mapping: token -> id and id -> token.
        self.vocab = vocab
        self.vocab_r = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text: str):
        """Split `text` into tokens and return the list of their ids."""
        splitter = r'([,.:;?_!"()\']|--|\s)'
        token_ids = []
        for piece in re.split(splitter, text):
            piece = piece.strip()
            if piece:  # skip whitespace-only / empty fragments
                token_ids.append(self.vocab[piece])
        return token_ids

    def decode(self, t_ids:list):
        """Map ids back to tokens and join them into one string."""
        joined = " ".join(self.vocab_r[token_id] for token_id in t_ids)
        # Drop the space that the join put in front of punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [62]:
tokenizer = SimpleTokenizerV1(vocab)
text = """
"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride.
"""
ids = tokenizer.encode(text)
ids

[1,
 56,
 2,
 850,
 988,
 602,
 533,
 746,
 5,
 1126,
 596,
 5,
 1,
 67,
 7,
 38,
 851,
 1108,
 754,
 793,
 7]

In [63]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [64]:
text = "Hello, do you like tea?"
tokenizer.encode(text)

KeyError: 'Hello'

### 2.4 Adding special context tokens

In [67]:
# Append each special token only when it is missing, so re-running this cell
# does not add duplicate entries to the vocabulary source list.
for special_token in ("<|endoftext|>", "<|unk|>"):
    if special_token not in all_words:
        all_words.append(special_token)
len(all_words)

1132

In [68]:
vocab = {token:i for i, token in enumerate(all_words)}

In [76]:
for k in list(vocab.keys())[-5:]:
    print(k, vocab[k])

younger 1127
your 1128
yourself 1129
<|endoftext|> 1130
<|unk|> 1131


In [80]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to `<|unk|>`.

    Args:
        vocab: mapping from token string to integer id. It must contain the
            key "<|unk|>" for any non-empty text to be encoded.
    """

    def __init__(self, vocab):
        # Keep both directions of the mapping: token -> id and id -> token.
        self.vocab = vocab
        self.vocab_r = {v:k for k,v in vocab.items()}

    def encode(self, text: str):
        """Split `text` into tokens and return the list of their ids.

        Tokens missing from the vocabulary are replaced by the id of
        "<|unk|>".

        Raises:
            KeyError: if `text` yields at least one token and the vocabulary
                has no "<|unk|>" entry.
        """
        pat_ = r'([,.:;?_!"()\']|--|\s)'
        tokens_ = [item.strip() for item in re.split(pat_, text) if item.strip()]
        if not tokens_:
            return []
        # Use this instance's vocabulary, not a module-level `vocab`, so the
        # tokenizer works with whatever mapping it was constructed with.
        unk_id = self.vocab["<|unk|>"]
        return [self.vocab.get(token, unk_id) for token in tokens_]

    def decode(self, t_ids:list):
        """Map ids back to tokens and join them into one string.

        Raises:
            KeyError: if an id is not present in the reverse vocabulary.
        """
        text = " ".join([self.vocab_r[token_id] for token_id in t_ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)   # remove spaces before the specified punctuation
        return text

In [78]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
text

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [81]:
tokenizer = SimpleTokenizerV2(vocab)
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [83]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

### 2.5 Byte pair encoding

In [85]:
import tiktoken

In [86]:
tiktoken.__version__

'0.12.0'

In [87]:
tokenizer = tiktoken.get_encoding("gpt2")

In [90]:
text = (
 "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
 "of someunknownPlace."
)

ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
ids

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 286,
 617,
 34680,
 27271,
 13]

In [91]:
tokenizer.decode(ids)

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.'

#### Exercise 2.1 Byte pair encoding of unknown words

In [96]:
word = "Akwirw ier"

tkn = tokenizer.encode(word)

tkn

[33901, 86, 343, 86, 220, 959]

In [99]:
# Decode every id on its own to show how the unknown word was split.
# The loop variable is not called `id`: at notebook top level that would
# rebind the builtin `id()` for all later cells.
for token_id in tkn:
    print(tokenizer.decode([token_id]), end="  ")

Ak  w  ir  w     ier  

In [98]:
tokenizer.decode(tkn)

'Akwirw ier'

### 2.6 Data sampling with a sliding window

In [15]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [102]:
enc_text = tokenizer.encode(raw_text)
len(enc_text)

5145

In [111]:
enc_sample = enc_text[50:]

In [108]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(x)
print(" " * 4, y)

[290, 4920, 2241, 287]
     [4920, 2241, 287, 257]


In [109]:
for i in range(1, context_size + 1):
    x = enc_sample[:i]
    y = enc_sample[i]
    print(f'{tokenizer.decode(x)} ---> {tokenizer.decode([y])}')

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


---

In [23]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

In [20]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with `tokenizer.encode`. Every window of
    `max_length` ids becomes an input; its target is the same window
    shifted one position to the right.

    Args:
        txt: text passed to `tokenizer.encode`.
        tokenizer: object exposing `encode(txt)` that returns a sequence of ids.
        max_length: number of ids per window.
        stride: distance between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        encoded = tokenizer.encode(txt)
        # Stop early enough that the shifted target window still fits.
        starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + 1 + max_length]) for start in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx:int):
        """Return the `(input, target)` tensor pair at position `idx`."""
        return self.input_ids[idx], self.target_ids[idx]


In [21]:
def create_dataloader_v1(txt,
                         batch_size=4,
                         max_length=256,
                         stride=128,
                         shuffle=True,
                         drop_last=True,
                         num_workers=0) -> DataLoader:
    """Build a `DataLoader` over sliding windows of `txt`.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    `GPTDatasetV1`; `max_length` and `stride` are passed to that dataset, the
    remaining arguments are passed to `DataLoader`.

    Returns:
        The configured `DataLoader`.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        dataset=windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [128]:
raw_text[:100]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

In [145]:
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=8,
    stride=2,
    shuffle=False)

In [146]:
data_iter = iter(dataloader)
next(data_iter)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]),
 tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]

In [147]:
next(data_iter)

[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]),
 tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]

In [24]:
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False)

In [25]:
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

inputs

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

In [26]:
targets

tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])

### 2.7 Creating token embeddings

In [1]:
import torch

In [2]:
vocab_size = 6
output_dim = 3

In [3]:
torch.manual_seed(123)

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [6]:
next(embedding_layer.parameters())

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [10]:
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [11]:
input_ids = torch.tensor([2, 3, 5, 1])

embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

### 2.8 Encoding word positions

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [12]:
vocab_size = 50257
output_dim = 256

In [None]:
# Seed the RNG so the randomly initialised embedding weights (and the
# outputs shown below) are the same on every Restart & Run All.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [14]:
token_embedding_layer.weight.shape

torch.Size([50257, 256])

In [None]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with `tokenizer.encode`. Every window of
    `max_length` ids becomes an input; its target is the same window
    shifted one position to the right.

    Args:
        txt: text passed to `tokenizer.encode`.
        tokenizer: object exposing `encode(txt)` that returns a sequence of ids.
        max_length: number of ids per window.
        stride: distance between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        encoded = tokenizer.encode(txt)
        # Stop early enough that the shifted target window still fits.
        starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + 1 + max_length]) for start in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx:int):
        """Return the `(input, target)` tensor pair at position `idx`."""
        return self.input_ids[idx], self.target_ids[idx]


In [None]:
def create_dataloader_v1(txt,
                         batch_size=4,
                         max_length=256,
                         stride=128,
                         shuffle=True,
                         drop_last=True,
                         num_workers=0) -> DataLoader:
    """Build a `DataLoader` over sliding windows of `txt`.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    `GPTDatasetV1`; `max_length` and `stride` are passed to that dataset, the
    remaining arguments are passed to `DataLoader`.

    Returns:
        The configured `DataLoader`.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        dataset=windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [27]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False
    )

In [29]:
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

inputs, inputs.shape

(tensor([[   40,   367,  2885,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  922,  5891,  1576,   438],
         [  568,   340,   373,   645],
         [ 1049,  5975,   284,   502],
         [  284,  3285,   326,    11]]),
 torch.Size([8, 4]))

In [30]:
targets, targets.shape

(tensor([[  367,  2885,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   922],
         [ 5891,  1576,   438,   568],
         [  340,   373,   645,  1049],
         [ 5975,   284,   502,   284],
         [ 3285,   326,    11,   287]]),
 torch.Size([8, 4]))

In [32]:
token_embeddings = token_embedding_layer(inputs)

token_embeddings.shape

torch.Size([8, 4, 256])

In [33]:
token_embeddings[0], token_embeddings[0].shape

(tensor([[ 0.4913,  1.1239,  1.4588,  ..., -0.3995, -1.8735, -0.1445],
         [ 0.4481,  0.2536, -0.2655,  ...,  0.4997, -1.1991, -1.1844],
         [-0.2507, -0.0546,  0.6687,  ...,  0.9618,  2.3737, -0.0528],
         [ 0.9457,  0.8657,  1.6191,  ..., -0.4544, -0.7460,  0.3483]],
        grad_fn=<SelectBackward0>),
 torch.Size([4, 256]))

In [35]:
context_length = max_length

pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

In [36]:
torch.arange(context_length)

tensor([0, 1, 2, 3])

In [38]:
pos_embeddings, pos_embeddings.shape

(tensor([[ 0.1265, -0.0513,  1.3903,  ...,  0.7732, -0.0831,  0.0133],
         [ 1.8435, -0.4803, -0.6885,  ..., -1.9100, -0.1373,  1.5089],
         [-0.8424,  0.2939, -0.5960,  ...,  1.3296,  0.1291,  0.5257],
         [ 0.0491, -1.4016,  0.1980,  ..., -0.4796, -1.4831, -0.8476]],
        grad_fn=<EmbeddingBackward0>),
 torch.Size([4, 256]))

In [41]:
input_embeddings = token_embeddings + pos_embeddings

input_embeddings.shape

torch.Size([8, 4, 256])

In [42]:
token_embeddings[0]

tensor([[ 0.4913,  1.1239,  1.4588,  ..., -0.3995, -1.8735, -0.1445],
        [ 0.4481,  0.2536, -0.2655,  ...,  0.4997, -1.1991, -1.1844],
        [-0.2507, -0.0546,  0.6687,  ...,  0.9618,  2.3737, -0.0528],
        [ 0.9457,  0.8657,  1.6191,  ..., -0.4544, -0.7460,  0.3483]],
       grad_fn=<SelectBackward0>)

In [44]:
input_embeddings[0]

tensor([[ 0.6178,  1.0725,  2.8492,  ...,  0.3736, -1.9567, -0.1312],
        [ 2.2916, -0.2266, -0.9540,  ..., -1.4103, -1.3364,  0.3245],
        [-1.0931,  0.2393,  0.0728,  ...,  2.2913,  2.5028,  0.4729],
        [ 0.9948, -0.5359,  1.8171,  ..., -0.9341, -2.2291, -0.4993]],
       grad_fn=<SelectBackward0>)