In [1]:
import torch
import torch.nn as nn

In [22]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

### 4.1 Coding LLM Architecture

In [13]:
class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

    def forward(self, x):
        return x

In [14]:
class DummyLayerNorm(nn.Module):
    def __init__(self, emb_dim, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, x):
        return x



In [15]:
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        #Shouldn't there be the Positional Embedding implemented?
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trns_blocks = nn.Sequential(*[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias = False)

    def forward(self, x_in):
        batch_size, seq_length = x_in.shape
        x = x_in
        tok_embeds  = self.tok_emb(x_in)
        pos_emb = self.pos_emb(torch.arange(seq_length, device=x_in.device))
        x = tok_embeds + pos_emb
        x = self.drop_emb(x)

        x = self.trns_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

In [16]:
import tiktoken

In [17]:
tokenizer = tiktoken.get_encoding("gpt2")

In [29]:
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

In [30]:
tokenizer.encode(txt1)

[6109, 3626, 6100, 345]

In [34]:
batch = []
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)

In [35]:
batch

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

In [40]:
torch.manual_seed(123)
gptModel = DummyGPTModel(cfg=GPT_CONFIG_124M)

In [41]:
logits = gptModel(batch)

In [42]:
logits.shape

torch.Size([2, 4, 50257])

### 4.2 Layer Normalization

In [58]:
torch.manual_seed(123)

batch_example = torch.randn(2,5)
batch_example

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])

In [59]:
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
out

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [60]:
mean = out.mean(dim = -1, keepdim = True)
var = out.var(dim = -1, keepdim = True)

In [61]:
torch.set_printoptions(sci_mode=False)

In [62]:
normed = (out-mean) / torch.sqrt(var)

In [63]:
normed.mean(dim = -1)

tensor([0.0000, 0.0000], grad_fn=<MeanBackward1>)

In [64]:
normed.var(dim = -1)

tensor([1.0000, 1.0000], grad_fn=<VarBackward0>)

In [67]:
class LayerNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))
        self.shift = nn.Parameter(torch.zeros(dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True) #, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)

        return self.scale * norm_x + self.shift

In [69]:
ln = LayerNorm(6)
ln_out = ln(out)
ln_out

tensor([[ 0.6157,  1.4123, -0.8717,  0.5871, -0.8717, -0.8717],
        [-0.0189,  0.1121, -1.0875,  1.5171,  0.5647, -1.0875]],
       grad_fn=<AddBackward0>)

In [70]:
ln_out.mean(dim = -1)

tensor([-0.0000,  0.0000], grad_fn=<MeanBackward1>)

In [71]:
ln_out.var(dim = -1)

tensor([0.9996, 0.9997], grad_fn=<VarBackward0>)

4.3 Implementing FFN with GELU activation


In [80]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor((2 / torch.pi))) * (x + 0.044715 * torch.pow(x, 3))))

In [81]:
gelu = GELU()
gelu(torch.tensor(-0.8))

tensor(-0.1696)

In [86]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        """
            FFN sub-module of Transofrmer Block learning richer representations
            and extract more meaningful information from the input sequence.
            Typical Deep Learning strategy.
        """
        super().__init__()

        self.layers = nn.Sequential(nn.Linear(cfg["emb_dim"], cfg["emb_dim"]*4),
                                         GELU(),
                                         nn.Linear(cfg["emb_dim"]*4, cfg["emb_dim"])
                                         )

    def forward(self, x):
        out = self.layers(x)
        return out


In [87]:
ffn = FeedForward(cfg=GPT_CONFIG_124M)
ffn(torch.randn(2, 10, 768)).shape

torch.Size([2, 10, 768])

In [88]:
ffn.layers

Sequential(
  (0): Linear(in_features=768, out_features=3072, bias=True)
  (1): GELU()
  (2): Linear(in_features=3072, out_features=768, bias=True)
)

In [89]:
ffn.layers[0]

Linear(in_features=768, out_features=3072, bias=True)

In [109]:
class DeepNN(nn.Module):
     def __init__(self, layer_sizes, use_shortcut=False):
         super().__init__()
         self.use_shortcut = use_shortcut
         self.layers = nn.ModuleList([
                                       nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
                                       nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
                                       ])

     def forward(self, x):
        out = x
        for layer in self.layers:
            layer_output = layer(out)
            if self.use_shortcut and x.shape == layer_output.shape:
                out += layer_output
            else:
                out = layer_output

        return out




In [110]:
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., -1.]])
torch.manual_seed(123)                            #1
model_without_shortcut = DeepNN(
    layer_sizes, use_shortcut=False
)

In [114]:
model_without_shortcut(sample_input)

tensor([[0.0610]], grad_fn=<MulBackward0>)

In [113]:
def print_gradients(model, x):
    output = model(x)             #1
    target = torch.tensor([[0.]])

    loss = nn.MSELoss()
    loss = loss(output, target)    #2

    loss.backward()          #3

    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")
#1 Forward pass

In [115]:
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.0001201116101583466
layers.2.0.weight has gradient mean of 0.0007152041653171182
layers.3.0.weight has gradient mean of 0.001398873864673078
layers.4.0.weight has gradient mean of 0.005049646366387606


In [119]:
model_with_shortcut = DeepNN(layer_sizes, use_shortcut=True)

In [120]:
print_gradients(model_with_shortcut, sample_input)

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 3]], which is output 0 of AddBackward0, is at version 4; expected version 3 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).