In [1]:
!pip install tiktoken -q


[notice] A new release of pip is available: 24.0 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Build your LLM from scratch

In [2]:
import os
import pandas as pd
import numpy as np

import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Working with text data

In [3]:
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
with open(os.path.join("data", "the-verdict.txt"), "r", encoding="utf-8") as f:
    raw_text = f.read()

print(raw_text[:100])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [5]:
# Length of a sample text tokenized by GPT-2 BPE tokenizer
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


## Data sampling with a sliding window approach

Using torch's `Dataset` class to create a simple textual dataset for GPT training. The code uses sliding window approach (combined with additional strides) to create a new dataset.

![sliding_window](https://camo.githubusercontent.com/9c738e75095f70d3dc4f6b3630008dd67607b5fa92e3bf776b0ed2cbb68db299/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830325f636f6d707265737365642f31332e776562703f313233)

In [6]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length: int = 256, stride: int = 0) -> Dataset:
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt) # tokenize entire text

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [7]:
def create_dataloader_v1(txt, batch_size: int = 4, max_length: int = 256,
                         stride: int = 128, shuffle = True, drop_last = True,
                         num_workers: int = 0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [8]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


Creating embedding layers

In [9]:
vocab_size = 50257
output_dim = 256
max_length = 4
context_length = max_length
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

Using embedding layers the procedure is the following:

In [10]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token ID's", inputs)
print("Shape", inputs.shape)

Token ID's tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Shape torch.Size([8, 4])


In [11]:
# Embedded tokens
token_embedding_layer(inputs).shape

torch.Size([8, 4, 256])

In [12]:
# Absolute embeddings for positional encoding are created using the following procedure

pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


Positional embeddings are combined with the token embedding vector to form the input embeddings for a large language model:
![pos_embddings](https://camo.githubusercontent.com/e53fdceda6a07218acfe115d81dc930241569fbbcd5c3e533856dee1959a8a93/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830325f636f6d707265737365642f31382e77656270)

In [13]:
input_embeddings = token_embedding_layer(inputs) + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


# Attention implementation

Simple attention idea is the following: context-aware vector the represents attention mechanism is calculated for each token in an input sequence in order to create vector representations that contain the most valuable information for each part of the input sequence.

## Simple attention

Simple self-attention implements an immutable attention mechanism based solely on matrix multiplication. Attention calulation is perfomed in the following way:
1. For each `Query` vector compute dot products with each `Key` vectors;
2. Transform results of dot products using *softmax* function in order to normalize attention multipliers (**interpretation:** probabilities of relative importance for each token);
3. Compute attention scores.

In [14]:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

Inputs serve as Q, K, V vectors at the same time.

In [15]:
attn_scores = inputs @ inputs.T
print('Attention scores', attn_scores, sep = '\n')

attn_weights = torch.softmax(attn_scores, dim = -1)
print('Attention weights', attn_weights, sep = '\n')
print('Attention weights check', attn_weights.sum(dim = -1), sep = '\n')

Attention scores
tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
Attention weights
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
Attention weights check
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [16]:
context_vectors = attn_weights @ inputs
print('Final context vectors', context_vectors, sep = '\n')

Final context vectors
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


## Trainable self-attention

Self-attention mechanism that is so-called **Scaled Dot-Product Attention**. This attention mechanism is trainable and can perform necessary update due to use of $W_q; W_k; W_v$ matrices that represent *Query, Key, Value* weight matrices.

**Attention scores** are dynamically updated coefficients that change due to input information mutations, however **QKV** weight matrices are static matrices that update only while training.
$$\text{Attention} = \text{softmax} \bigl( \frac{Q \cdot K}{\sqrt{d_{keys}}}\bigr) V$$


In [17]:
d_in = inputs.shape[1] # the input embedding size, d=3
d_out = 2

In [18]:
class SelfAttention_v1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out)) # (emb_size, attn_emb_size)
        self.W_key = nn.Parameter(torch.rand(d_in, d_out)) # (emb_size, attn_emb_size)
        self.W_value = nn.Parameter(torch.rand(d_in, d_out)) # (emb_size, attn_emb_size)

    def forward(self, x):
        queries = x @ self.W_query # (n_tokens, emb_size) X (emb_size, attn_emb_size) -> (n_tokens, attn_emb_size)
        keys = x @ self.W_key # (n_tokens, emb_size) X (emb_size, attn_emb_size) -> (n_tokens, attn_emb_size)
        values = x @ self.W_value # (n_tokens, emb_size) X (emb_size, attn_emb_size) -> (n_tokens, attn_emb_size)

        attn_scores = queries @ keys.T # (n_tokens, attn_emb_size) X (attn_emb_size, n_tokens) -> (n_tokens, n_tokens)
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim = -1
        )
        context_vectors = attn_weights @ values # (n_tokens, n_tokens) X (n_tokens, attn_emb_size) -> (n_tokens, attn_emb_size)
        return context_vectors

In [19]:
torch.manual_seed(123)

sa_v1 = SelfAttention_v1(d_in, d_out)

# Calculate context vectors
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


![image.png](https://camo.githubusercontent.com/5edb6b2e02db4ad16761a1c6a6de4f75b16d6d03e2eac6d52c263669ea358308/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830335f636f6d707265737365642f31382e77656270)

Usage of `nn.Linear` layers is preferred in the self attention class due to a more efficient implementation.

In [20]:
class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias: bool = False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, qkv_bias) # (emb_size, attn_emb_size)
        self.W_key = nn.Linear(d_in, d_out, qkv_bias) # (emb_size, attn_emb_size)
        self.W_value = nn.Linear(d_in, d_out, qkv_bias) # (emb_size, attn_emb_size)

    def forward(self, x):
        queries = self.W_query(x) # (n_tokens, emb_size) X (emb_size, attn_emb_size) -> (n_tokens, attn_emb_size)
        keys = self.W_key(x) # (n_tokens, emb_size) X (emb_size, attn_emb_size) -> (n_tokens, attn_emb_size)
        values = self.W_value(x) # (n_tokens, emb_size) X (emb_size, attn_emb_size) -> (n_tokens, attn_emb_size)

        attn_scores = queries @ keys.T # (n_tokens, attn_emb_size) X (attn_emb_size, n_tokens) -> (n_tokens, n_tokens)
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim = -1
        )
        context_vectors = attn_weights @ values # (n_tokens, n_tokens) X (n_tokens, attn_emb_size) -> (n_tokens, attn_emb_size)
        return context_vectors

In [21]:
torch.manual_seed(789)

sa_v2 = SelfAttention_v2(d_in, d_out)

# Calculate context vectors
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


## Causal attention

Causal attention (*masked attention*) is used for restricting LM. *Masked attention* forces LM to consider only previous tokens when generating the output.

**Causal attention masks future tokens.**
![image.png](https://camo.githubusercontent.com/ae6a1857af914fbb7d57da177ce6bff4b57dbebe3bd11395855b3315ae13d1e1/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830335f636f6d707265737365642f31392e77656270)

Normalization steps performed during attention calculation can cause information leakage during this procedure, however *properties* of the softmax function can nullify this error.

*The mathematical elegance of softmax is that despite initially including all positions in the denominator, after masking and renormalizing, the effect of the masked positions is nullified—they don’t contribute to the softmax score in any meaningful way.*

*In simpler terms, after masking and renormalization, the distribution of attention weights is as if it was calculated only among the unmasked positions to begin with. This ensures there’s no information leakage from future (or otherwise masked) tokens as we intended.*

*Causal attention* can be improved using `inf` values that will faster the computations due to lesser number of operations in attention mechanism.

![image](https://camo.githubusercontent.com/085c511ba76dafdd9bc7d9fc8e15fdf24770dfa7d8061cc2441675411051c5f0/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830335f636f6d707265737365642f32302e77656270)

In [22]:
batch = torch.stack((inputs, inputs), dim = 0)
print(batch.shape)

torch.Size([2, 6, 3])


In [23]:
class CausalAttention(nn.Module):

  def __init__(self, d_in, d_out, context_length, dropout, qkv_bias: bool = False) -> None:
      super().__init__()
      self.W_query = nn.Linear(d_in, d_out, qkv_bias)
      self.W_key = nn.Linear(d_in, d_out, qkv_bias)
      self.W_value = nn.Linear(d_in, d_out, qkv_bias)

      self.dropout = nn.Dropout(dropout)

      # Transformers context length defines attention mask that can be applied to the data
      # Create an upper triangular attention mask (mask is applied on a ones matrix) that is used for causal attention
      self.register_buffer('mask', torch.triu(torch.ones((context_length, context_length)), diagonal = 1))

  def forward(self, x):
    batch, num_tokens, d_in = x.shape # B, T, C
    queries = self.W_query(x) # T, CS
    keys = self.W_key(x) # T, CS
    values = self.W_value(x) # T, CS

    # Transpose keys matrices in each batch element
    attn_scores = queries @ keys.transpose(1, 2) # T, T

    # Omit scores that are not used in calculation
    attn_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)

    attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim = -1)
    attn_weights = self.dropout(attn_weights)

    context_vectors = attn_weights @ values
    return context_vectors

In [24]:
torch.manual_seed(123)

causal = CausalAttention(d_in, d_out, batch.shape[1], 0.1)

# Calculate context vectors
print(causal(batch).shape)

torch.Size([2, 6, 2])


## Multihead attention mechanism

Multi-head attention is the form of causal attention that implements multiple independent attention heads that will analyze the sequence in parallel.

In [25]:
class MultiHeadAttention(nn.Module):

  def __init__(self, d_in, d_out,
               num_heads, context_length,
               dropout, qkv_bias: bool = True):
    super().__init__()

    assert (d_out % num_heads == 0), 'SA Head embedding size must be divisible by number of heads'
    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = d_out // num_heads
    self.W_query = nn.Linear(d_in, d_out, qkv_bias)
    self.W_key = nn.Linear(d_in, d_out, qkv_bias)
    self.W_value = nn.Linear(d_in, d_out, qkv_bias)
    self.out_proj = nn.Linear(d_out, d_out)
    self.dropout = nn.Dropout(dropout)
    self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

  def forward(self, x):
    batch, num_tokens, emb_size = x.shape

    Q = self.W_query(x)
    K = self.W_key(x)
    V = self.W_value(x)


    Q = Q.view(batch, num_tokens, self.num_heads, self.head_dim)
    K = K.view(batch, num_tokens, self.num_heads, self.head_dim)
    V = V.view(batch, num_tokens, self.num_heads, self.head_dim)

    Q = Q.transpose(1, 2)
    K = K.transpose(1, 2)
    V = V.transpose(1, 2)

    attn_scores = Q @ K.transpose(2, 3)
    mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

    # masked_fill_ (underscore) changes tensor inplace reducing unnecessary copying
    attn_scores.masked_fill_(mask_bool, -torch.inf) 

    attn_weights = torch.softmax(attn_scores / K.shape[-1] ** 0.5, dim = -1)
    attn_weights = self.dropout(attn_weights)

    context_vectors = (attn_weights @ V).transpose(1, 2)
    context_vectors = context_vectors.contiguous().view(batch, num_tokens, self.d_out)

    context_vectors = self.out_proj(context_vectors)    #11
    return context_vectors

In [26]:
torch.manual_seed(123)
heads = 2

mha = MultiHeadAttention(d_in, d_out, heads, batch.shape[1], 0.1)

# Calculate context vectors
print(mha(batch).shape)

torch.Size([2, 6, 2])


In [27]:
mha(batch)

tensor([[[ 0.7963, -0.1726],
         [ 0.7935, -0.1266],
         [ 0.7910, -0.1149],
         [ 0.7689, -0.1457],
         [ 0.7696, -0.1541],
         [ 0.7314, -0.2289]],

        [[ 0.7963, -0.1726],
         [ 0.7935, -0.1266],
         [ 0.7910, -0.1149],
         [ 0.7689, -0.1457],
         [ 0.7696, -0.1541],
         [ 0.7625, -0.1589]]], grad_fn=<ViewBackward0>)

# Implementing GPT

We use short variable names to avoid long lines of code later:
- "vocab_size" indicates a vocabulary size of 50,257 words, supported by the BPE tokenizer;
- "context_length" represents the model's maximum input token count, as enabled by positional embeddings;
- "emb_dim" is the embedding size for token inputs, converting each input token into a 768-dimensional vector;
- "n_heads" is the number of attention heads in the multi-head attention mechanism;
- "n_layers" is the number of transformer blocks within the model;
- "drop_rate" is the dropout mechanism's intensity;
- "qkv_bias" decides if the Linear layers in the multi-head attention mechanism should include a bias vector when computing query (Q), key (K), and value (V) tensors.

In [28]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [29]:
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Using placeholders for transformer blocks create a dummy GPT
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        # Use a placeholder for LayerNorm
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
            torch.arange(seq_len, device = in_idx.device)
        )
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # A simple placeholder

    def forward(self, x):
        # This block does nothing and just returns its input.
        return x


class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # The parameters here are just to mimic the LayerNorm interface.

    def forward(self, x):
        # This layer does nothing and just returns its input.
        return x

In [30]:
# Simple data pipeline for GPT model
batch = []

txt1 = 'Every effort moves you'
txt2 = "Every day holds a"

batch.extend([torch.tensor(tokenizer.encode(i)) for i in [txt1, txt2]])

batch

[tensor([6109, 3626, 6100,  345]), tensor([6109, 1110, 6622,  257])]

In [31]:
batch = torch.stack(batch, dim = 0)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [32]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)

logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


## LayerNormalization

- Layer normalization, also known as LayerNorm (Ba et al. 2016), centers the activations of a neural network layer around a mean of 0 and normalizes their variance to 1
- This stabilizes training and enables faster convergence to effective weights
- Layer normalization is applied both before and after the multi-head attention module within the transformer block; it's also applied before the final output layer

![image](https://camo.githubusercontent.com/1bb0018e68d16529b969ccc6b0bca371ed5a7b9a514d1bd3f345c4941c0f463d/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f30352e77656270)

In [39]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

**Scale and shift**

- Note that in addition to performing the normalization by subtracting the mean and dividing by the variance, we added two trainable parameters, a scale and a shift parameter
- The initial scale (multiplying by 1) and shift (adding 0) values don't have any effect; however, scale and shift are trainable parameters that the LLM automatically adjusts during training if it is determined that doing so would improve the model's performance on its training task
- This allows the model to learn appropriate scaling and shifting that best suit the data it is processing
- Note that we also add a smaller value (eps) before computing the square root of the variance; this is to avoid division-by-zero errors if the variance is 0

**Biased variance**

- In the variance calculation above, setting `unbiased=False` means using the formula $\frac{\sum{(x_i-\hat{x_i})^2}}{n}$ to compute the variance where `n` is the sample size (here, the number of features or columns); this formula does not include Bessel's correction (which uses `n-1` in the denominator), thus providing a biased estimate of the variance

- For LLMs, where the embedding dimension `n` is very large, the difference between using `n` and `n-1` is negligible

- However, GPT-2 was trained with a biased variance in the normalization layers, which is why we also adopted this setting for compatibility reasons with the pretrained weights that we will load in later chapters

## GELU activations

- In deep learning, ReLU (Rectified Linear Unit) activation functions are commonly used due to their simplicity and effectiveness in various neural network architectures
- In LLMs, various other types of activation functions are used beyond the traditional ReLU; two notable examples are **GELU** (*Gaussian Error Linear Unit*) and **SwiGLU** (*Swish-Gated Linear Unit*)
- **GELU** and **SwiGLU** are more complex, smooth activation functions incorporating Gaussian and sigmoid-gated linear units, respectively, offering better performance for deep learning models, unlike the simpler, piecewise linear function of ReLU

**GELU** or *Gaussian Error Linear Unit*. The GELU nonlinearity weights inputs by their percentile, rather than gates inputs by their sign as in ReLU.
$$\text{GELU}\left(x\right) = x{P}\left(X\leq{x}\right) = x\Phi\left(x\right) = x \cdot \frac{1}{2}\left[1 + \text{erf}\left(\frac{x}{\sqrt{2}}\right)\right]$$
where  $X \sim \mathcal{N}(0, 1)$

**GELU** approximation can be written the following way:
$$\text{GELU}(x) \approx 0.5x\left(1+\tanh\left[\sqrt{\frac{2}{\pi}}\left(x + 0.044715x^{3}\right)\right]\right)$$

In [34]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [35]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

## Shortcut connections

*Shortcut* (*skip-*, *residual-*) connections that were firstly introduced in field of computer vision to solve the problem of vanishing gradients are used as shorter paths for pushing inputs to the outputs of each layer. They play a crucial role for gradients stabilization.

- A shortcut connection creates an alternative shorter path for the gradient to flow through the network
- This is achieved by adding the output of one layer to the output of a later layer, usually skipping one or more layers in between

![image](https://camo.githubusercontent.com/b1cb95fee4a11c35cb6ce14a2399ac09875c45dc92d344713a80913e05e04c20/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f31322e776562703f313233)

## Transformer block

Transformer block forms a solid structure that is the heart of the transformer that contains of the following:
- Skip-connections;
- Layer Norms;
- Multihead attention block;
- Feed-Forward netwok with a GELU activation;
- Dropout block

![image](https://camo.githubusercontent.com/6c8c392f72d5b9e86c94aeb9470beab435b888d24135926f1746eb88e0cc18fb/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f31332e776562703f31)

In [40]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in = cfg["emb_dim"],
            d_out = cfg["emb_dim"],
            context_length = cfg["context_length"],
            num_heads = cfg["n_heads"],
            dropout = cfg["drop_rate"],
            qkv_bias = cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.dropout_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.dropout_shortcut(x)
        x = x + shortcut

        return x

In [41]:
torch.manual_seed(123)

x = torch.rand(2, 4, 768)  # Shape: [batch_size, num_tokens, emb_dim]
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


## GPT model

In [49]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias = False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape

        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device = in_idx.device))

        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [75]:
torch.manual_seed(789)
model = GPTModel(GPT_CONFIG_124M)

out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 1.2428, -0.0599, -0.3957,  ..., -0.2280, -1.0819,  0.7927],
         [ 0.8036, -0.1291,  0.5082,  ..., -0.7195,  0.1451, -0.4633],
         [ 0.4693, -0.8252,  0.4037,  ..., -0.2769, -0.4315, -0.6399],
         [ 1.2097, -0.3363,  0.4473,  ...,  0.4035,  0.0572,  0.5602]],

        [[ 1.2733, -0.1520, -1.0307,  ...,  0.1332, -0.8642,  0.5514],
         [ 0.3403, -0.5964,  0.6355,  ...,  0.3735,  0.4378, -1.1490],
         [-0.0091, -0.4894, -0.0107,  ..., -0.1857, -0.7640, -0.4280],
         [ 0.3347, -0.8868, -0.9892,  ..., -0.5269,  0.1438, -0.2993]]],
       grad_fn=<UnsafeViewBackward0>)


In [62]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


- As we see above, this model has 163M, not 124M parameters; why?
- In the original GPT-2 paper, the researchers applied weight tying, which means that they reused the token embedding layer (tok_emb) as the output layer, which means setting `self.out_head.weight` = `self.tok_emb.weight`
- The token embedding layer projects the 50,257-dimensional one-hot encoded input tokens to a 768-dimensional embedding representation
- The output layer projects 768-dimensional embeddings back into a 50,257-dimensional representation so that we can convert these back into words (more about that in the next section)
- So, the embedding and output layer have the same number of weight parameters, as we can see based on the shape of their weight matrices
- However, a quick note about its size: we previously referred to it as a 124M parameter model
- In the original GPT-2 paper, the researchers reused the token embedding matrix as an output matrix
- Correspondingly, if we subtracted the number of parameters of the output layer, we'd get a 124M parameter mode

In [63]:
total_params_gpt2 =  total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Number of trainable parameters considering weight tying: 124,412,160


- The memory requirements of the model can be computed as follows, which can be a helpful reference point:

In [64]:
# Calculate the total size in bytes (assuming float32, 4 bytes per parameter)
total_size_bytes = total_params * 4

# Convert to megabytes
total_size_mb = total_size_bytes / (1024 * 1024)

print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


# Generating text

LLM's are autoregressive and can be used to generate text the following way

![image](https://camo.githubusercontent.com/be7b35733665766c48c64f651586173df9d1dd3a9ca985eca3593df5355db6a1/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f31362e77656270)

- The following `generate_text_simple` function implements **greedy decoding**, which is a simple and fast method to generate text
- In greedy decoding, at each step, the model chooses the word (or token) with the highest probability as its next output (the highest logit corresponds to the highest probability, so we technically wouldn't even have to compute the softmax function explicitly)
- However there exist a big variety of text generating strategies that implement quite sophisticated methods of text generation like: **top-k sampling**, **beam search** and others.

**Typical text generation**

The process of text generation contains of multiple steps that form an autoregressive nature of a generative model:
1. Output tensors decoding;
2. Token selection (based on probability or other strategies);
3. Token conversion to human-like text.

![img](https://camo.githubusercontent.com/1e55260d5caca3ac6bd1215ca36973e0a0779995487ed9486f510e5baa108072/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f31372e77656270)

In [1]:
# Greedy decoding

def generate_text_simple(model, idx, max_new_tokens, context_size): # idx - [batch, n_tokens]
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:] # crops current context in order to comply with the available context window
        with torch.no_grad():
            logits = model(idx_cond)

        logits = logits[:, -1, :]  # chose last vector of vocab_size corresponding to the next token probability distribution
        probas = torch.softmax(logits, dim = -1)
        idx_next = torch.argmax(probas, dim = -1, keepdim = True)
        idx = torch.cat((idx, idx_next), dim = 1)

    return idx

In [78]:
start_context = "Hello, I am a computer"

encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716, 257, 3644]
encoded_tensor.shape: torch.Size([1, 6])


In [79]:
model.eval() # disable dropout

out = generate_text_simple(
    model=model,
    idx=encoded_tensor, 
    max_new_tokens=2, 
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716,   257,  3644, 15496,    11,   314,   716,
           257,  3644, 15496,    11,   314,   716,   257,  3644, 15496,    11,
           314,   716,   257,  3644]])
Output length: 24


In [80]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am a computerHello, I am a computerHello, I am a computerHello, I am a computer
