In [52]:
import torch
import torch.nn as nn

In [53]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

### 4.1 Coding LLM Architecture

In [54]:
class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

    def forward(self, x):
        return x

In [55]:
class DummyLayerNorm(nn.Module):
    def __init__(self, emb_dim, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, x):
        return x



In [56]:
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        #Shouldn't there be the Positional Embedding implemented?
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trns_blocks = nn.Sequential(*[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias = False)

    def forward(self, x_in):
        batch_size, seq_length = x_in.shape
        x = x_in
        tok_embeds  = self.tok_emb(x_in)
        pos_emb = self.pos_emb(torch.arange(seq_length, device=x_in.device))
        x = tok_embeds + pos_emb
        x = self.drop_emb(x)

        x = self.trns_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

In [57]:
import tiktoken

In [58]:
tokenizer = tiktoken.get_encoding("gpt2")

In [59]:
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

In [60]:
tokenizer.encode(txt1)

[6109, 3626, 6100, 345]

In [61]:
batch = []
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)

In [62]:
batch

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

In [63]:
torch.manual_seed(123)
gptModel = DummyGPTModel(cfg=GPT_CONFIG_124M)

In [64]:
logits = gptModel(batch)

In [65]:
logits.shape

torch.Size([2, 4, 50257])

In [66]:
logits

tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)

### 4.2 Layer Normalization

In [124]:
torch.manual_seed(123)

batch_example = torch.randn(2,5)
batch_example

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])

In [125]:
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
out

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [126]:
mean = out.mean(dim = -1, keepdim = True)
var = out.var(dim = -1, keepdim = True)

In [127]:
torch.set_printoptions(sci_mode=False)

In [128]:
normed = (out-mean) / torch.sqrt(var)

In [129]:
normed.mean(dim = -1)

tensor([0.0000, 0.0000], grad_fn=<MeanBackward1>)

In [130]:
normed.var(dim = -1)

tensor([1.0000, 1.0000], grad_fn=<VarBackward0>)

In [139]:
class LayerNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))
        self.shift = nn.Parameter(torch.zeros(dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)

        return self.scale * norm_x + self.shift

In [140]:
ln = LayerNorm(dim = 5)
ln_out = ln(batch_example)


In [141]:
batch_example

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])

In [142]:
ln_out

tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)

In [143]:
ln_out.mean(dim = -1)

tensor([-0.0000,  0.0000], grad_fn=<MeanBackward1>)

In [144]:
ln_out.var(dim = -1)

tensor([1.2499, 1.2500], grad_fn=<VarBackward0>)

4.3 Implementing FFN with GELU activation


In [145]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor((2 / torch.pi))) * (x + 0.044715 * torch.pow(x, 3))))

In [146]:
gelu = GELU()
gelu(torch.tensor(-0.8))

tensor(-0.1696)

In [147]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        """
            FFN sub-module of Transofrmer Block learning richer representations
            and extract more meaningful information from the input sequence.
            Typical Deep Learning strategy.
        """
        super().__init__()

        self.layers = nn.Sequential(nn.Linear(cfg["emb_dim"], cfg["emb_dim"]*4),
                                         GELU(),
                                         nn.Linear(cfg["emb_dim"]*4, cfg["emb_dim"])
                                         )

    def forward(self, x):
        out = self.layers(x)
        return out


In [148]:
ffn = FeedForward(cfg=GPT_CONFIG_124M)
ffn(torch.randn(2, 10, 768)).shape

torch.Size([2, 10, 768])

In [149]:
ffn.layers

Sequential(
  (0): Linear(in_features=768, out_features=3072, bias=True)
  (1): GELU()
  (2): Linear(in_features=3072, out_features=768, bias=True)
)

In [150]:
ffn.layers[0]

Linear(in_features=768, out_features=3072, bias=True)

### 4.4. Adding shortcut connections

In [151]:
class DeepNN(nn.Module):
     def __init__(self, layer_sizes, use_shortcut=False):
         super().__init__()
         self.use_shortcut = use_shortcut
         self.layers = nn.ModuleList([
                                       nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
                                       ])

     def forward(self, x):
        out = x
        for layer in self.layers:
            layer_output = layer(out)
            if self.use_shortcut and x.shape == layer_output.shape:
                out = out + layer_output
            else:
                out = layer_output

        return out




In [152]:
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., -1.]])
torch.manual_seed(123)                            #1
model_without_shortcut = DeepNN(
    layer_sizes, use_shortcut=False
)

In [153]:
model_without_shortcut(sample_input)

tensor([[0.0610]], grad_fn=<MulBackward0>)

In [154]:
def print_gradients(model, x):
    output = model(x)             #1
    target = torch.tensor([[0.]])

    loss = nn.MSELoss()
    loss = loss(output, target)    #2

    loss.backward()          #3

    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")
#1 Forward pass

In [155]:
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.0001201116101583466
layers.2.0.weight has gradient mean of 0.0007152041653171182
layers.3.0.weight has gradient mean of 0.001398873864673078
layers.4.0.weight has gradient mean of 0.005049646366387606


In [156]:
torch.manual_seed(123)
model_with_shortcut = DeepNN(layer_sizes, use_shortcut=True)

In [157]:
model_with_shortcut(sample_input)

tensor([[0.7669]], grad_fn=<MulBackward0>)

In [158]:
sample_input_2 = torch.tensor([[1., 0., -1.]])
print_gradients(model_with_shortcut, sample_input_2)

layers.0.0.weight has gradient mean of 0.22169792652130127
layers.1.0.weight has gradient mean of 0.20694106817245483
layers.2.0.weight has gradient mean of 0.32896995544433594
layers.3.0.weight has gradient mean of 0.2665732502937317
layers.4.0.weight has gradient mean of 1.3258541822433472


### 4.5. Connecting attention and linear layers into transformer blocks

In [159]:
from ch03 import MultiHeadAttention

In [160]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        """
        self.attn = MultiHeadAttention(
                        d_in = cfg["emb_dim"],
                        d_out = cfg["emb_dim"],
                        context_length = cfg["context_length"],
                        dropout = cfg["drop_rate"],
                        num_heads = cfg["n_heads"],
                        qkv_bias = cfg["qkv_bias"])
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
        self.ff = FeedForward(cfg)
        """
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])


    def forward(self, x):
        """
        #MHA Block
        out_attn = self.attn(self.norm1(x))
        out_attn = self.dropout(out_attn)
        out_attn = x + out_attn

        #FFN Block
        out_ffn = self.ffn(self.norm2(out_attn))
        out_ffn = self.dropout(out_ffn)
        out_ffn = out_attn + out_ffn

        return out_ffn
        """
         # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x

In [161]:
torch.manual_seed(123)

x = torch.rand(2, 4, 768)  # Shape: [batch_size, num_tokens, emb_dim]
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [162]:
x

tensor([[[0.2961, 0.5166, 0.2517,  ..., 0.9541, 0.8567, 0.4604],
         [0.2238, 0.3047, 0.3019,  ..., 0.5465, 0.4532, 0.7598],
         [0.6945, 0.2478, 0.4111,  ..., 0.8838, 0.4898, 0.5963],
         [0.0890, 0.7804, 0.9223,  ..., 0.4507, 0.6357, 0.5833]],

        [[0.5716, 0.9297, 0.3396,  ..., 0.0477, 0.4564, 0.2797],
         [0.0936, 0.2211, 0.3806,  ..., 0.3948, 0.4545, 0.4536],
         [0.6788, 0.1741, 0.2084,  ..., 0.5557, 0.5930, 0.0959],
         [0.3894, 0.4083, 0.0662,  ..., 0.9861, 0.9341, 0.1319]]])

In [163]:
output

tensor([[[-0.0055,  0.0972, -0.1122,  ...,  1.2889,  0.2623,  0.6685],
         [ 0.0023, -0.2369,  0.1720,  ...,  0.5952,  0.2497,  0.7447],
         [ 0.4673,  0.4472,  0.1791,  ...,  1.2525,  0.3045,  0.7750],
         [ 0.0662,  0.7224,  0.9206,  ...,  0.4790,  0.7428,  0.7015]],

        [[ 0.3622,  1.2144,  0.5221,  ...,  0.1854,  0.0111, -0.5034],
         [-0.0225,  0.7789,  0.2770,  ...,  0.1734,  0.5419,  0.1143],
         [ 0.7425,  0.4013,  0.3211,  ...,  0.3268,  0.7523, -0.1642],
         [ 0.5745,  0.6241,  0.4410,  ...,  1.1963,  1.2650,  0.2243]]],
       grad_fn=<AddBackward0>)

### 4.6 Coding GPT Model

In [164]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        #Shouldn't there be the Positional Embedding implemented?
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trns_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias = False)

    def forward(self, x_in):
        batch_size, seq_length = x_in.shape
        #x = x_in
        tok_embeds  = self.tok_emb(x_in)
        pos_emb = self.pos_emb(torch.arange(seq_length, device=x_in.device))
        x = tok_embeds + pos_emb
        x = self.drop_emb(x)

        x = self.trns_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

In [165]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4222, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)


In [166]:
model.parameters()

<generator object Module.parameters at 0x000002A38571FA00>

In [167]:
sum([p.numel() for p in model.parameters()])

163009536

### 4.7 Generating Text

Fundamental questions:
1. Does LLM training process consider entire sequence within one batch iteration? I think that yes.
2. Is Loss function - cross-entropy - computed for entire non-masked sequence?

In [168]:
start_context = "Hello, I am"
encoded_text = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded_text).unsqueeze(0)
encoded_tensor.shape

torch.Size([1, 4])

In [169]:
def generate_text_simplified(model, idx, max_new_tokens, context_size):

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]

        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1) #, keepdim = True)
        idx_next = torch.argmax(probs, dim=-1, keepdim = True)
        idx = torch.cat((idx, idx_next), dim = 1)

    return idx

In [170]:
out = generate_text_simplified(
                               model,
                               idx = encoded_tensor,
                               max_new_tokens = 6,
                               context_size = GPT_CONFIG_124M["context_length"]
                               )

In [171]:
out

tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 38891, 34320]])