In [1]:
!pip install transformers tiktoken



# Data tokenize

In [2]:
with open("/content/Build-a-Large-Language-Model-_From-Scratch_-.txt", "r", encoding="utf-8") as f:
    raw_data = f.read()

print(f"Total characters: {len(raw_data)}")
print(f"Data head: {raw_data[1000:1100]}")

Total characters: 420297
Data head: y. This book has been a long-standing idea in my mind, and I'm
thrilled to finally have the opportun


In [3]:
import re

text = "Hello, world. This, is a example"
result = re.split(r"(\s)+", text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'example']


In [4]:
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'example']


In [5]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'example']


In [6]:
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [7]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_data)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['Build', 'a', 'Large', 'Language', 'Model', '(', 'From', 'Scratch', ')', '1', '.', 'welcome', '2', '.', '1', '_', 'Understanding', '_', 'Large', '_', 'Language', '_', 'Models', '3', '.', '2', '_', 'Working', '_', 'with']


In [8]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

5949


In [9]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [10]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
('#', 2)
('###', 3)
('#A', 4)
('#B', 5)
('#C', 6)
('#D', 7)
('#E', 8)
('#F', 9)
('#G', 10)
('#H', 11)
('#I', 12)
('#J', 13)
('#K', 14)
('#L', 15)
('#M', 16)
('$30', 17)
('$4', 18)
('$690', 19)
('%', 20)
('%timeit', 21)
("'", 22)
('(', 23)
(')', 24)
('*', 25)
('*[DummyTransformerBlock', 26)
('*[TransformerBlock', 27)
('+', 28)
('+=', 29)
(',', 30)
('-', 31)
('--', 32)
('-0', 33)
('-1', 34)
('-10', 35)
('-11', 36)
('-12', 37)
('-1]', 38)
('-2', 39)
('-3', 40)
('-4', 41)
('-9', 42)
('-context', 43)
('-i', 44)
('-inf', 45)
('-inf]', 46)
('-torch', 47)
('-∞', 48)
('.', 49)
('/', 50)


In [11]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('م\u202crefres', 5944)
('—', 5945)
('−', 5946)
('≈', 5947)
('⋅', 5948)


In [12]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [13]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [14]:
tokenizer.encode(text)

[1471,
 30,
 3010,
 5892,
 3963,
 5439,
 1157,
 1148,
 1500,
 5485,
 5394,
 5471,
 4343,
 5485,
 4491,
 49]

In [15]:
tokenizer.decode(tokenizer.encode(text))

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [16]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [17]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text)

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [18]:
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [19]:
integers = tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


# All text tokenize

In [20]:
def tokenize_like_tiktoken(text, tokenizer, max_length=None):
    if max_length is None:
        encoding = tokenizer(text, truncation=False, padding=False, return_tensors=None)
        return encoding['input_ids']
    else:
        tokens = []
        for i in range(0, len(text), max_length):
            chunk = text[i:i + max_length]
            chunk_tokens = tokenizer.encode(chunk, add_special_tokens=False)
            tokens.extend(chunk_tokens)
        return tokens

In [21]:
with open("/content/Build-a-Large-Language-Model-_From-Scratch_-.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenize_like_tiktoken(raw_text, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (122002 > 1024). Running this sequence through the model will result in indexing errors


In [22]:
len(enc_text)

122002

In [23]:
enc_sample = enc_text[50:]

In [24]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [220, 220, 513, 62]
y:      [220, 513, 62, 34]


In [25]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[220] ----> 220
[220, 220] ----> 513
[220, 220, 513] ----> 62
[220, 220, 513, 62] ----> 34


In [26]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

  ---->  
   ---->  3
   3 ----> _
   3_ ----> C


In [27]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [28]:
import tiktoken
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [29]:
import torch
print("PyTorch version:", torch.__version__)
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 2.8.0+cu126
[tensor([[  200,   200, 15580,   257]]), tensor([[  200, 15580,   257, 13601]])]


In [30]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  200, 15580,   257, 13601]]), tensor([[15580,   257, 13601, 15417]])]


In [31]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  200,   200, 15580,   257],
        [13601, 15417,  9104,   357],
        [ 4863,  1446, 36722,     8],
        [  198,   352,    13,   220],
        [  220,  7062,   198,   362],
        [   13,   220,   220,   352],
        [   62, 43467,    62, 21968],
        [   62, 32065,    62,  5841]])

Targets:
 tensor([[  200, 15580,   257, 13601],
        [15417,  9104,   357,  4863],
        [ 1446, 36722,     8,   198],
        [  352,    13,   220,   220],
        [ 7062,   198,   362,    13],
        [  220,   220,   352,    62],
        [43467,    62, 21968,    62],
        [32065,    62,  5841,  1424]])


In [32]:
input_ids = torch.tensor([2, 3, 5, 1])

In [33]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [34]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [35]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [36]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [37]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [38]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [39]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[  200,   200, 15580,   257],
        [13601, 15417,  9104,   357],
        [ 4863,  1446, 36722,     8],
        [  198,   352,    13,   220],
        [  220,  7062,   198,   362],
        [   13,   220,   220,   352],
        [   62, 43467,    62, 21968],
        [   62, 32065,    62,  5841]])

Inputs shape:
 torch.Size([8, 4])


In [40]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [41]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [42]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [43]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


# Implementing simple attention

In [44]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [45]:
inputs.shape

torch.Size([6, 3])

In [46]:
query = inputs[1]

attn_scores_2 = torch.empty(inputs.shape[0])
for i, vec in enumerate(inputs):
    attn_scores_2[i] = torch.dot(vec, query)

print(f"Scores: {attn_scores_2}")

Scores: tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [47]:
attn_weight_2_tmp = attn_scores_2 / torch.sum(attn_scores_2, dim=-1)
print(f"Weight attention scores: {attn_weight_2_tmp}")

Weight attention scores: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])


In [48]:
def softmax_naive(x):
    return torch.exp(x) / torch.sum(torch.exp(x), dim=-1)

attn_naiwe_softmax = softmax_naive(attn_scores_2)
print(f"Naive scores: {attn_naiwe_softmax}")

Naive scores: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [49]:
attn_softmax = torch.softmax(attn_scores_2, dim=0)
print(f"Torch softmax scores: {attn_softmax}")

Torch softmax scores: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [50]:
query = inputs[1]

context_vec_2 = torch.zeros(query.shape[0])
for i, vec in enumerate(inputs):
    context_vec_2 += vec * attn_softmax[i]

print(f"Context vec 2: {context_vec_2}")

Context vec 2: tensor([0.4419, 0.6515, 0.5683])


In [51]:
attn_scores = inputs @ inputs.T
attn_scores_weigth = torch.softmax(attn_scores, dim=0)
print(attn_scores_weigth)

tensor([[0.2098, 0.1385, 0.1390, 0.1435, 0.1526, 0.1385],
        [0.2006, 0.2379, 0.2369, 0.2074, 0.1958, 0.2184],
        [0.1981, 0.2333, 0.2326, 0.2046, 0.1975, 0.2128],
        [0.1242, 0.1240, 0.1242, 0.1462, 0.1367, 0.1420],
        [0.1220, 0.1082, 0.1108, 0.1263, 0.1879, 0.0988],
        [0.1452, 0.1581, 0.1565, 0.1720, 0.1295, 0.1896]])


In [52]:
all_context_vecs = attn_scores_weigth @ inputs
print(all_context_vecs)

tensor([[0.4017, 0.5023, 0.5059],
        [0.5595, 0.7824, 0.6953],
        [0.5538, 0.7686, 0.6834],
        [0.3369, 0.4647, 0.4119],
        [0.3525, 0.4059, 0.3657],
        [0.3856, 0.5761, 0.5077]])


In [53]:
print("Previous 2nd context vector:", context_vec_2)

Previous 2nd context vector: tensor([0.4419, 0.6515, 0.5683])


# Attention moment

In [54]:
from torch import nn

class SelfAttentionLayer(nn.Module):

    def __init__(self, in_dim, out_dim, rqrd_bias=False):
        super().__init__()
        self.W_query = nn.Linear(in_dim, out_dim, bias=rqrd_bias)
        self.W_key = nn.Linear(in_dim, out_dim, bias=rqrd_bias)
        self.W_value = nn.Linear(in_dim, out_dim, bias=rqrd_bias)

    def forward(self, x):
        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)

        z = nn.functional.softmax(
            (query @ key.T) / key.shape[-1], dim=-1
        ) @ value
        return z

In [55]:
atten_m = SelfAttentionLayer(inputs.shape[-1], 2)
print(atten_m(inputs))

tensor([[-0.3417,  0.1183],
        [-0.3413,  0.1182],
        [-0.3414,  0.1182],
        [-0.3438,  0.1200],
        [-0.3438,  0.1200],
        [-0.3431,  0.1194]], grad_fn=<MmBackward0>)


# Complect causal attention

In [59]:
class CausalSelfAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length, dropout, rqrd_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=rqrd_bias)
        self.W_keys = nn.Linear(d_in, d_out, bias=rqrd_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=rqrd_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length)))

    def forward(self, x):
        b, tokens, d = x.shape
        mask = self.mask[:tokens, :tokens]

        query = self.W_query(x)
        keys = self.W_keys(x)
        value = self.W_value(x)

        weights = query @ torch.transpose(keys, 1, 2)
        masked_weights = weights.masked_fill_(mask.bool(), -torch.inf)
        norm_masked_weights = nn.functional.softmax(masked_weights, dim=-1)

        attn_vec = self.dropout(norm_masked_weights)
        context_vec = norm_masked_weights @ value
        return context_vec

In [60]:
batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape)

torch.Size([2, 6, 3])


In [61]:
d_in, d_out = 3, 2

torch.manual_seed(123)
context_length = batch.shape[1]
ca = CausalSelfAttention(d_in, d_out, context_length, 0.0)
context_vecs = ca(batch)
print("context_vecs.shape:", context_vecs.shape)

context_vecs.shape: torch.Size([2, 6, 2])


# Multihead attention with weights per batches

In [70]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, num_heads, dropout=0.1, bias=False):
        super().__init__()
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.qkv_proj = nn.Linear(d_in, 3 * d_out, bias=bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Более эффективная маска
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        B, T, C = x.shape

        # Одной проекцией получаем Q, K, V
        qkv = self.qkv_proj(x).reshape(B, T, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(2)  # [B, T, num_heads, head_dim]

        q = q.transpose(1, 2)  # [B, num_heads, T, head_dim]
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Attention
        attn_scores = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # Маска
        if T <= self.mask.size(0):
            mask = self.mask[:T, :T]
        else:
            mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1)

        attn_scores = attn_scores.masked_fill(mask.unsqueeze(0).unsqueeze(0).bool(), -torch.inf)

        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        out = attn_weights @ v  # [B, num_heads, T, head_dim]
        out = out.transpose(1, 2).contiguous().reshape(B, T, self.d_out)

        return self.out_proj(out)

In [63]:
a = torch.tensor([[[[0.2745, 0.6584, 0.2775, 0.8573], #A
[0.8993, 0.0390, 0.9268, 0.7388],
[0.7179, 0.7058, 0.9156, 0.4340]],
[[0.0772, 0.3565, 0.1479, 0.5331],
[0.4066, 0.2318, 0.4545, 0.9737],
[0.4606, 0.5159, 0.4220, 0.5786]]]])

<div class="alert alert-block alert-info">

The shape of this tensor is (b, num_heads, num_tokens, head_dim) = (1, 2, 3, 4)
</div>

In [71]:
torch.manual_seed(123)
batch_size, context_length, d_in = batch.shape
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 2, 0)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]],

        [[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])
