# Setup

In [1]:
import tiktoken
import torch
from models import DummyGPTModel, GPTConfig, LayerNorm
from torch import nn

In [2]:
config = GPTConfig(
    vocab_size=50257, context_length=1024, embed_dim=768, n_heads=12, n_layers=12, drop_rate=0.1, qkv_bias=False
)

# Dummy implementation

## Setting up tokens

In [3]:
tokenizer = tiktoken.get_encoding("gpt2")
batch: list[torch.Tensor] = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch = torch.stack(batch, dim=0)  # converts list of tensors to a single tensor with dimension 2*4
batch

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

## Initializing a 124-million param gpt mdel

In [4]:
torch.manual_seed(123)
model = DummyGPTModel(config)
logits = model(batch)  # context verctors
print(f"Output shape: {logits.shape}")  # 2 rows (batch size) * 4 (tokens) * 50257 (vocab size)
logits

Output shape: torch.Size([2, 4, 50257])


tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)

## Layer normalisation

We need to normalize the outputs of the model to have a mean of 0 and a variance of 1 (unit variance), which will speed up and make training more reliable.

ReLU is popular because:
It's computationally efficient (just a max operation)
It helps with the vanishing gradient problem
It introduces non-linearity into the network, allowing it to learn more complex patterns
It's simple to implement and works well in practice

nn.Sequential is a container in PyTorch that allows you to chain multiple neural network layers together in sequence. It's like a pipeline where the output of one layer becomes the input of the next layer.

In [5]:
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
print(f"Batch example shape: {batch_example.shape}")
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())  # 5 inputs, 6 outputs
out = layer(batch_example)
out

Batch example shape: torch.Size([2, 5])


tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [6]:
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


Layer normalisation applied to preceding layer outputs

Note that the value –5.9605e-08 in the output tensor is the scientific notation for –5.9605 × 10-8, which is –0.000000059605 in decimal form. This value is very close to 0, but it is not exactly 0 due to small numerical errors that can accumulate because of the finite precision with which computers represent numbers.

In [7]:
out_norm = (out - mean) / torch.sqrt(
    var
)  # subtract the mean from each value in out then devide by the square root of the variance
print("Normalized layer outputs:\n", out_norm)

mean = out_norm.mean(dim=-1, keepdim=True)
print("Mean:\n", mean)

var = out_norm.var(dim=-1, keepdim=True)
print("Variance:\n", var)

Normalized layer outputs:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[9.9341e-09],
        [5.9605e-08]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


Improving readability

In [8]:
torch.set_printoptions(sci_mode=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


Use custom LayerNorm and apply it to the batch input

In [9]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

TypeError: LayerNorm.__init__() got an unexpected keyword argument 'emb_dim'. Did you mean 'embed_dim'?