# Setup

In [1]:
import matplotlib.pyplot as plt
import tiktoken
import torch
from torch import nn

from src.build_llm_from_scratch_book.modules import (
    GELU,
    DummyGPTModel,
    ExampleDeepNeuralNetwork,
    FeedForward,
    GPTConfig,
    GPTModel,
    LayerNorm,
    TransformerBlock,
)

ModuleNotFoundError: No module named 'src'

In [2]:
# Shared hyperparameters for the models built in this notebook
# (the notebook refers to this setup as the 124-million-parameter GPT model).
config = GPTConfig(
    vocab_size=50257,
    context_length=1024,
    embed_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

# Dummy implementation

## Setting up tokens

In [None]:
# Encode two short prompts with the GPT-2 BPE tokenizer and stack them into one batch tensor.
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Keep the per-prompt tensors under their own name so that `batch` is only ever a
# tensor (previously `batch` was annotated as a list and then rebound to a tensor).
token_rows: list[torch.Tensor] = [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)]

# torch.stack needs rows of equal length; both prompts are expected to encode to
# 4 tokens each, giving a tensor of shape (2, 4).
batch = torch.stack(token_rows, dim=0)
batch

## Initializing a 124-million-parameter GPT model

In [None]:
# Seed the global RNG so the randomly initialised model weights are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(config)
# Forward pass over the token batch.
# NOTE(review): these are presumably per-token scores over the vocabulary rather than context vectors — confirm against DummyGPTModel.
logits = model(batch)
# Expected shape: 2 rows (batch size) x 4 (tokens) x 50257 (vocab size).
print(f"Output shape: {logits.shape}")
logits

## Layer normalisation

Layer normalisation rescales the outputs (activations) of a layer so that they have a mean of 0 and a variance of 1 (unit variance). This speeds up training and makes it more reliable.

ReLU is popular because:
- It's computationally efficient (just a max operation).
- It helps with the vanishing gradient problem.
- It introduces non-linearity into the network, allowing it to learn more complex patterns.
- It's simple to implement and works well in practice.

nn.Sequential is a container in PyTorch that allows you to chain multiple neural network layers together in sequence. It's like a pipeline where the output of one layer becomes the input of the next layer.

In [None]:
# Seed first: the random batch and then the Linear layer's initial weights are
# both drawn from the global RNG, in exactly this order.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
print(f"Batch example shape: {batch_example.shape}")

# One linear layer mapping 5 input features to 6 output features, followed by ReLU.
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=6),
    nn.ReLU(),
)
out = layer(batch_example)
out

In [None]:
# Per-row statistics over the last (feature) dimension. keepdim=True keeps that
# dimension with size 1 so the results broadcast against `out`.
mean = out.mean(dim=-1, keepdim=True)
# No `unbiased` argument is passed here, so torch's default estimator is used.
var = out.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Layer normalisation applied to preceding layer outputs

Note that the value -5.9605e-08 in the output tensor is scientific notation for -5.9605 × 10⁻⁸, which is -0.000000059605 in decimal form. This value is very close to 0, but it is not exactly 0 because of small numerical errors that accumulate due to the finite precision with which computers represent numbers.

In [None]:
# Standardise every row of `out`: subtract the row mean, then divide by the row
# standard deviation (the square root of the variance).
# NOTE: no epsilon is added, so a row with zero variance would divide by zero.
std = torch.sqrt(var)
out_norm = (out - mean) / std
print("Normalized layer outputs:\n", out_norm)

# Re-measure the statistics on the normalised rows (this overwrites `mean` and `var`).
mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Improving readability

In [None]:
# Turn off scientific notation when printing tensors so near-zero values are
# easier to read. This is a global print setting and affects later cells too.
torch.set_printoptions(sci_mode=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Use custom LayerNorm and apply it to the batch input

In [None]:
# Apply the project's LayerNorm module to the raw batch (5 features per row).
ln = LayerNorm(embed_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
# unbiased=False selects the population (divide-by-n) variance.
# NOTE(review): presumably chosen to match the estimator LayerNorm uses internally — confirm.
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Comparing the GELU and the ReLU functions

In [None]:
# Plot GELU (Gaussian Error Linear Unit) next to ReLU (Rectified Linear Unit).
# Both are activation functions used in neural networks; GELU is a smooth
# approximation of ReLU that has been shown to work better in transformer
# models like GPT.
gelu, relu = GELU(), nn.ReLU()

# 100 evenly spaced inputs on [-3, 3], pushed through both activations.
x = torch.linspace(-3, 3, 100)
y_gelu, y_relu = gelu(x), relu(x)

# One figure, two side-by-side panels (subplot indices start at 1).
plt.figure(figsize=(8, 3))
for i, (y, label) in enumerate(((y_gelu, "GELU"), (y_relu, "ReLU")), start=1):
    plt.subplot(1, 2, i)
    plt.plot(x, y)
    plt.title(f"{label} activation function")
    plt.xlabel("x")
    plt.ylabel(f"{label}(x)")
    plt.grid(True)
plt.tight_layout()
plt.show()

# What to look for in the plots:
# 1. GELU is smooth and differentiable everywhere; ReLU has a sharp corner at x=0.
# 2. GELU lets small negative values through (slightly negative output for x < 0).
# 3. GELU's smoothness helps with gradient flow during training.
# 4. GELU has been shown to work better in transformer models like GPT.

Applying the custom FeedForward module (a simple network: one linear layer, a GELU activation, and a second linear layer)

In [None]:
ffn = FeedForward(config)
# Random input with a batch of 2 and 3 tokens; the last dimension is taken from
# the config instead of a hard-coded 768 so it cannot drift from the model width.
# (Assumes GPTConfig exposes `embed_dim` as an attribute, as it does `context_length`.)
x = torch.rand(2, 3, config.embed_dim)
out = ffn(x)
print(out.shape)

# Example deep neural network usage

In [12]:
# Seed up front so the network's random initial weights are reproducible.
torch.manual_seed(123)

# Layer widths handed to ExampleDeepNeuralNetwork, and a single 3-feature sample.
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1.0, 0.0, -1.0]])

model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)

Next, we implement a function that computes the gradients in the model’s backward pass:

In [13]:
def print_gradients(model: nn.Module, x: torch.Tensor) -> None:
    """Print the mean absolute gradient of every weight parameter of *model*.

    Runs one forward pass on ``x``, computes the MSE loss against an all-zero
    target shaped like the output, back-propagates it, and prints one line per
    parameter whose name contains ``"weight"``.

    Args:
        model: Module to inspect; its output must be usable with ``nn.MSELoss``.
        x: Input passed to ``model`` unchanged.
    """
    # backward() accumulates into .grad, so clear anything left over from an
    # earlier call; otherwise repeated calls would report inflated values.
    model.zero_grad()

    output = model(x)
    # Zero target with the same shape, dtype and device as the output.
    target = torch.zeros_like(output)

    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)
    loss.backward()

    for name, param in model.named_parameters():
        # Parameters that did not take part in the forward pass have no gradient.
        if "weight" not in name or param.grad is None:
            continue
        print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [None]:
# Report the per-layer weight gradients of the network built without shortcuts.
print_gradients(model=model_without_shortcut, x=sample_input)

Now let's use shortcut connections to avoid the vanishing-gradient problem:

In [None]:
# Same seed as the no-shortcut model so both networks are initialised from the same RNG state.
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=True)

# Report the per-layer weight gradients of the shortcut variant on the same sample.
print_gradients(model=model_with_shortcut, x=sample_input)

In [None]:
# Run a random input through a single transformer block and compare shapes.
torch.manual_seed(123)
# Batch of 2 with 4 tokens each; the last dimension is taken from the config
# instead of a hard-coded 768 so it cannot drift from the model width.
# (Assumes GPTConfig exposes `embed_dim` as an attribute, as it does `context_length`.)
x = torch.rand(2, 4, config.embed_dim)
block = TransformerBlock(config)
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

# Wrapping it up in a GPT model 

The model consists of embedding layers whose output is passed through the transformer blocks; in the forward pass the result is then normalized and converted to logits.

In [None]:
# Seed before constructing the model so its random initial weights are reproducible.
torch.manual_seed(123)
model = GPTModel(config)

# Forward pass over the token batch built earlier in the notebook.
# NOTE(review): model.eval() has not been called at this point, so any dropout
# configured via drop_rate is presumably still active — confirm.
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

In [None]:
# Count every scalar entry across all parameter tensors of the model.
param_sizes = [p.numel() for p in model.parameters()]
total_params = sum(param_sizes)
print(f"Total number of parameters: {total_params:,}")

In [None]:
# Compare the weight shapes of the token-embedding layer and the output head.
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

In [None]:
# Parameter count with the output head left out, i.e. the count if the head
# shared ("tied") its weights with another layer instead of owning them.
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
# One f-string instead of two implicitly concatenated ones; the printed text is unchanged.
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

# Exercise 4.2 Initializing larger GPT models

Implement the larger GPT-2 variants:

- GPT-2 medium (1,024-dimensional embeddings, 24 transformer blocks, 16 multi-head attention heads)

- GPT-2 large (1,280-dimensional embeddings, 36 transformer blocks, 20 multi-head attention heads)

- GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 multi-head attention heads)

In [None]:
# Seed BEFORE constructing the model so its randomly initialised weights (and
# therefore the printed output) are reproducible. Previously the seed was set
# only after GPTModel(...) had already drawn its initial weights.
torch.manual_seed(123)

# GPT-2 medium: 1,024-dimensional embeddings, 24 transformer blocks, 16 attention heads.
config_medium = GPTConfig(
    vocab_size=50257,
    context_length=1024,
    embed_dim=1024,
    n_heads=16,
    n_layers=24,
    drop_rate=0.1,
    qkv_bias=False,
)
model_medium = GPTModel(config_medium)

out_medium = model_medium(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out_medium.shape)
print(out_medium)

In [22]:
def generate_text_simple(
    model: nn.Module, idx: torch.Tensor, max_new_tokens: int, context_size: int
) -> torch.Tensor:
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    At every step the model sees at most the last ``context_size`` tokens, and
    the id with the highest probability at the final position is appended.

    Args:
        model: Callable mapping a ``(batch, n_tokens)`` tensor of token ids to
            logits of shape ``(batch, n_tokens, vocab_size)``.
        idx: Token ids of shape ``(batch, n_tokens)``; the caller's tensor is not modified.
        max_new_tokens: Number of tokens to append; ``0`` returns ``idx`` as is.
        context_size: Maximum number of trailing tokens fed to the model per step.

    Returns:
        Tensor of shape ``(batch, n_tokens + max_new_tokens)`` holding the
        original ids followed by the generated ones.
    """
    # Generation needs no gradients; disable tracking for the whole loop.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Crop the running sequence to the model's supported context.
            idx_cond = idx[:, -context_size:]
            logits = model(idx_cond)

            # Only the prediction at the last position decides the next token.
            last_logits = logits[:, -1, :]
            probas = torch.softmax(last_logits, dim=-1)
            idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # shape (batch, 1)

            # torch.cat builds a new tensor, so the input tensor stays untouched.
            idx = torch.cat((idx, idx_next), dim=1)

    return idx

In [None]:
# Tokenise the prompt and wrap the ids in a leading batch dimension of size 1.
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

encoded_tensor = torch.tensor([encoded])
print("encoded_tensor.shape:", encoded_tensor.shape)

In [None]:
# Put the model into evaluation mode before generating.
model.eval()

# Append 6 tokens to the encoded prompt, using the model's full context window.
out = generate_text_simple(model, encoded_tensor, max_new_tokens=6, context_size=config.context_length)
print("Output:", out)
print("Output length:", out.shape[1])

In [None]:
# Drop the leading dimension, convert the ids to a plain Python list and map
# them back to text with the tokenizer.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)