In [1]:
import os
import urllib.request
if not os.path.exists("the-verdict.txt"):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
import importlib
import tiktoken
print("tiktoken version:", importlib.metadata.version("tiktoken"))
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
#print(enc_text)

tiktoken version: 0.9.0


In [3]:
import importlib
import tiktoken
#print("tiktoken version:", importlib.metadata.version("tiktoken"))
tokenizer = tiktoken.get_encoding("gpt2")
enc_sample = enc_text[50:]
context_size = 4
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))
    print(context, "---->", desired)

 and ---->  established
[290] ----> 4920
 and established ---->  himself
[290, 4920] ----> 2241
 and established himself ---->  in
[290, 4920, 2241] ----> 287
 and established himself in ---->  a
[290, 4920, 2241, 287] ----> 257


In [4]:
import torch
#print("PyTorch version:", torch.__version__)
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [5]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [6]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=4, max_length=4, stride=1, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]])

Targets:
 tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402],
        [1807, 3619,  402,  271]])


In [7]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)
for i in range(len(dataloader.dataset)):
  print(dataloader.dataset[i])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(tensor([ 326,   11, 1770,   13]), tensor([  11, 1770,   13, 8759]))
(tensor([  11, 1770,   13, 8759]), tensor([1770,   13, 8759, 2763]))
(tensor([1770,   13, 8759, 2763]), tensor([  13, 8759, 2763,  438]))
(tensor([  13, 8759, 2763,  438]), tensor([8759, 2763,  438, 1169]))
(tensor([8759, 2763,  438, 1169]), tensor([2763,  438, 1169, 2994]))
(tensor([2763,  438, 1169, 2994]), tensor([ 438, 1169, 2994,  284]))
(tensor([ 438, 1169, 2994,  284]), tensor([1169, 2994,  284,  943]))
(tensor([1169, 2994,  284,  943]), tensor([ 2994,   284,   943, 17034]))
(tensor([ 2994,   284,   943, 17034]), tensor([  284,   943, 17034,   318]))
(tensor([  284,   943, 17034,   318]), tensor([  943, 17034,   318,   477]))
(tensor([  943, 17034,   318,   477]), tensor([17034,   318,   477,   314]))
(tensor([17034,   318,   477,   314]), tensor([318, 477, 314, 892]))
(tensor([318, 477, 314, 892]), tensor([477, 314, 892, 286]))
(tensor([477, 314,

In [8]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)
data_iter=iter(dataloader)
inputs, target =next(data_iter)
print(inputs)
print(target)

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [9]:
input_ids = torch.tensor([2, 3, 5, 1])
print(input_ids)
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

print(embedding_layer.weight)


output = embedding_layer(input_ids)
print(output)

tensor([2, 3, 5, 1])
Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [10]:
vocab_size = 50257
output_dim = 3
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4,
    stride=1, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(inputs.shape)
token_embeddings = embedding_layer(inputs)
print(token_embeddings.shape)
print(token_embeddings)


torch.Size([8, 4])
torch.Size([8, 4, 3])
tensor([[[-0.6171, -0.8334,  0.4839],
         [ 0.8888, -0.5905,  0.5722],
         [-0.6917,  0.4100, -0.5865],
         [ 2.3808,  0.2818, -1.5755]],

        [[ 0.8888, -0.5905,  0.5722],
         [-0.6917,  0.4100, -0.5865],
         [ 2.3808,  0.2818, -1.5755],
         [-0.2305, -0.7572, -0.3657]],

        [[-0.6917,  0.4100, -0.5865],
         [ 2.3808,  0.2818, -1.5755],
         [-0.2305, -0.7572, -0.3657],
         [ 1.0826,  0.4289, -0.7858]],

        [[ 2.3808,  0.2818, -1.5755],
         [-0.2305, -0.7572, -0.3657],
         [ 1.0826,  0.4289, -0.7858],
         [-0.5539,  0.0569,  0.4340]],

        [[-0.2305, -0.7572, -0.3657],
         [ 1.0826,  0.4289, -0.7858],
         [-0.5539,  0.0569,  0.4340],
         [ 0.2517, -0.9284,  1.8260]],

        [[ 1.0826,  0.4289, -0.7858],
         [-0.5539,  0.0569,  0.4340],
         [ 0.2517, -0.9284,  1.8260],
         [ 0.8731, -0.0938,  0.4529]],

        [[-0.5539,  0.0569,  0.4340

In [11]:
max_length = 4
output_dim = 3
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 3])


In [12]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 3])


**Coding attention Mechanism**

In [13]:
x=input_embeddings[1]
d_in = x.shape[1]
d_out = 3

In [14]:
torch.manual_seed(123)

W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key   = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
print(W_query.shape)
print(W_key.shape)
print(W_value.shape)

torch.Size([3, 3])
torch.Size([3, 3])
torch.Size([3, 3])


In [15]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value

        attn_scores = queries @ keys.T # omega
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )

        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(x))

tensor([[ 0.2212,  0.1389,  0.5761],
        [ 0.2813,  0.5377,  0.6781],
        [ 0.9852,  1.5106,  2.1774],
        [-0.3355, -1.3020, -0.9845]], grad_fn=<MmBackward0>)




*   We can streamline the implementation above using PyTorch's Linear layers, which are equivalent to a matrix multiplication if we disable the bias units

*   Another big advantage of using nn.Linear over our manual nn.Parameter(torch.rand(...) approach is that nn.Linear has a preferred weight initialization scheme, which leads to more stable model training



In [16]:
import torch.nn as nn

class SelfAttention_v2(nn.Module):

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out,bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out,bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out,bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.T # omega
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )

        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(x))

tensor([[ 0.1239, -0.4548,  0.8591],
        [-0.2733,  0.1706, -0.4582],
        [-0.4527, -0.2961,  0.0946],
        [-0.1437, -0.0529, -0.1443]], grad_fn=<MmBackward0>)


In [17]:
queries = sa_v2.W_query(x)
keys = sa_v2.W_key(x)
values = sa_v2.W_value(x)
attn_scores = queries @ keys.T
context_length = attn_scores.shape[0]
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)

print(masked)


attn_weights2 = torch.softmax(masked / keys.shape[-1]**0.5, dim=-1)
attn_weights1 = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
values
context_vec1 = attn_weights1 @ values
context_vec2 = attn_weights2 @ values
print("context_vector without masking:",context_vec1)
print("context_vector with masking:",context_vec2)


tensor([[-1.7896,    -inf,    -inf,    -inf],
        [ 2.4716, -3.8191,    -inf,    -inf],
        [-0.7732,  0.7350,  2.8338,    -inf],
        [ 0.7663, -0.8704, -1.6972,  0.3211]], grad_fn=<MaskedFillBackward0>)
context_vector without masking: tensor([[ 0.1239, -0.4548,  0.8591],
        [-0.2733,  0.1706, -0.4582],
        [-0.4527, -0.2961,  0.0946],
        [-0.1437, -0.0529, -0.1443]], grad_fn=<MmBackward0>)
context_vector with masking: tensor([[ 0.1977,  0.7329, -0.8358],
        [ 0.2044,  0.7022, -0.7795],
        [-0.4300, -0.2496,  0.1034],
        [-0.1437, -0.0529, -0.1443]], grad_fn=<MmBackward0>)


**Masking additional attention weights with droputs**

1) In addition, we also apply dropout to reduce overfitting during training

2) Dropout can be applied in several places:

3) for example, after computing the attention weights;
or after multiplying the attention weights with the value vectors
Here, we will apply the dropout mask after computing the attention weights because it's more common

4) Furthermore, in this specific example, we use a dropout rate of 50%, which means randomly masking out half of the attention weights. (When we train the GPT model later, we will use a lower dropout rate, such as 0.1 or 0.2

In [18]:
import torch.nn as nn

class CasualAttention(nn.Module):

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out,bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out,bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out,bias=qkv_bias)
        self.dropout = nn.Dropout(0.1)  #new

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.T # omega
        context_length = attn_scores.shape[0]
        mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)  #new
        masked = attn_scores.masked_fill(mask.bool(), -torch.inf)                  #new

        attn_weights = torch.softmax(
            masked / keys.shape[-1]**0.5, dim=-1         #new
        )
        attn_weights = self.dropout(attn_weights)        #new
        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)
ca = CasualAttention(d_in, d_out)
print(ca(x))

tensor([[ 0.2197,  0.8144, -0.9287],
        [ 0.2271,  0.7803, -0.8661],
        [-0.4778, -0.2773,  0.1149],
        [ 0.0874,  0.2220, -0.1609]], grad_fn=<MmBackward0>)


In [19]:
batch = torch.stack((x, x), dim=0)
print(x.shape)
print(x)
print(batch.shape)
class CausalAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length,
                 dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout) # New
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) # New

    def forward(self, x):
        b, num_tokens, d_in = x.shape # New batch dimension b
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.transpose(1, 2) # Changed transpose
        attn_scores.masked_fill_(  # New, _ ops are in-place
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)  # `:num_tokens` to account for cases where the number of tokens in the batch is smaller than the supported context_size
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )
        attn_weights = self.dropout(attn_weights) # New

        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)

context_length = batch.shape[1]
ca = CausalAttention(d_in, d_out, context_length, 0.0)

context_vecs = ca(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

torch.Size([4, 3])
tensor([[ 1.2923, -0.8822,  1.3157],
        [-3.0635,  1.9203, -0.7746],
        [ 2.9541,  1.2904, -1.4919],
        [ 0.1111, -1.3175, -1.0572]], grad_fn=<SelectBackward0>)
torch.Size([2, 4, 3])
tensor([[[ 0.1977,  0.7329, -0.8358],
         [ 0.2044,  0.7022, -0.7795],
         [-0.4300, -0.2496,  0.1034],
         [-0.1437, -0.0529, -0.1443]],

        [[ 0.1977,  0.7329, -0.8358],
         [ 0.2044,  0.7022, -0.7795],
         [-0.4300, -0.2496,  0.1034],
         [-0.1437, -0.0529, -0.1443]]], grad_fn=<UnsafeViewBackward0>)
context_vecs.shape: torch.Size([2, 4, 3])


In [20]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

torch.manual_seed(123)

batch_size, context_length, d_in = batch.shape
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)

context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)


tensor([[[ 0.4617,  0.4784],
         [-0.0888,  0.9037],
         [ 0.1325,  0.2891],
         [ 0.1205,  0.6875]],

        [[ 0.4617,  0.4784],
         [-0.0888,  0.9037],
         [ 0.1325,  0.2891],
         [ 0.1205,  0.6875]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 4, 2])


In [21]:
from importlib.metadata import version

import matplotlib
import tiktoken
import torch

In [22]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [23]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Use a placeholder for TransformerBlock
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        # Use a placeholder for LayerNorm
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # A simple placeholder

    def forward(self, x):
        # This block does nothing and just returns its input.
        return x


class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # The parameters here are just to mimic the LayerNorm interface.

    def forward(self, x):
        # This layer does nothing and just returns its input.
        return x

# **layer normalization**

In [24]:
torch.manual_seed(123)

# create 2 training examples with 5 dimensions (features) each
batch_example = torch.randn(2, 5)
print(batch_example)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(out)

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [25]:
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)

print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [26]:
out_norm = (out - mean) / torch.sqrt(var)
print("Normalized layer outputs:\n", out_norm)

mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Normalized layer outputs:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[9.9341e-09],
        [1.9868e-08]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [27]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [28]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
torch.set_printoptions(sci_mode=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [29]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [30]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

In [31]:

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x

In [32]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [33]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
batch = []

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)

model = GPTModel(GPT_CONFIG_124M)

out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3040, -0.5946, -0.0769,  ...,  1.1078, -0.0504,  0.1565],
         [ 0.1506,  0.5771, -0.2291,  ...,  0.1398,  0.5562, -0.4891],
         [-0.1035, -0.0737, -0.0147,  ...,  0.1901,  0.3644, -0.5351],
         [ 0.9624,  0.2590,  1.3362,  ...,  0.0066, -0.8251,  0.0241]],

        [[-0.1246, -0.5475, -0.4489,  ...,  0.3256,  0.0283,  0.3715],
         [-0.5463,  0.0735, -0.2820,  ..., -0.5704,  0.7376, -0.9539],
         [-0.6029, -0.1722, -0.0734,  ...,  0.1868,  0.4553, -0.1255],
         [ 0.1704, -0.2963,  0.4654,  ..., -0.3236,  0.2745,  0.4302]]],
       grad_fn=<UnsafeViewBackward0>)


In [34]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

total_params_gpt2 =  total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

total_size_bytes = total_params * 4
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total number of parameters: 163,009,536
Number of trainable parameters considering weight tying: 124,412,160
Total size of the model: 621.83 MB


In [35]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)

        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

In [38]:
start_context = "Hello, I am"

encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [39]:
model.eval() # disable dropout

out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716, 36819, 10592, 24848, 36502, 32080,  8357]])
Output length: 10


In [40]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Exactly Leon 152 scramblingDetailedarms
