In [None]:
# Exercise 4.1 Number of parameters in feed forward and attention modules 
# Calculate and compare the number of parameters that are contained in the feed forward module 
# and those that are contained in the multi-head attention module.

In [1]:
import torch
import torch.nn as nn
import tiktoken

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [3]:
from gpt_model import TransformerBlock

# Hyperparameters of the 124M-parameter GPT model used to build the block below
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # size of the token vocabulary
    "context_length": 1024,  # context length
    "emb_dim": 768,          # embedding dimension
    "n_heads": 12,           # attention heads per multi-head attention module
    "n_layers": 12,          # number of layers (transformer blocks)
    "drop_rate": 0.1,        # dropout probability
    "qkv_bias": False,       # whether the query/key/value projections use a bias
}

In [4]:
# Build a single Transformer block from the 124M configuration and print its
# module tree so the sub-modules and their layer shapes can be inspected.
block = TransformerBlock(GPT_CONFIG_124M)
print(block)

TransformerBlock(
  (norm1): LayerNorm()
  (att): MultiHeadAttention(
    (W_query): Linear(in_features=768, out_features=768, bias=False)
    (W_key): Linear(in_features=768, out_features=768, bias=False)
    (W_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (norm2): LayerNorm()
  (feedforward): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
)


In [12]:
# Feed forward sub-module of the Transformer block: add up the element count
# of every parameter tensor it holds
total_params = sum(
    tensor.numel() for tensor in block.feedforward.parameters()
)
print(f"Total number of parameters for feed forward module: {total_params :,}")

Total number of parameters for feed forward module: 4,722,432


In [13]:
# Multi-head attention sub-module of the Transformer block: add up the element
# count of every parameter tensor it holds
total_params_attn = sum(
    tensor.numel() for tensor in block.att.parameters()
)
print(f"Total number of parameters in multi head attention module: {total_params_attn :,}")

Total number of parameters in multi head attention module: 2,360,064


Mathematical breakdown using an embedding dimension of 768

Feed forward module:

1st Linear layer: 768 inputs × 4×768 outputs + 4×768 bias units = 2,362,368
2nd Linear layer: 4×768 inputs × 768 outputs + 768 bias units = 2,360,064
Total: 1st Linear layer + 2nd Linear layer = 2,362,368 + 2,360,064 = 4,722,432

Attention module:

W_query: 768 inputs × 768 outputs = 589,824
W_key: 768 inputs × 768 outputs = 589,824
W_value: 768 inputs × 768 outputs = 589,824
out_proj: 768 inputs × 768 outputs + 768 bias units = 590,592
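Total: W_query + W_key + W_value + out_proj = 3×589,824 + 590,592 = 2,360,064

Comparison: the feed forward module (4,722,432 parameters) contains roughly twice as many parameters as the multi-head attention module (2,360,064 parameters).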

In [None]:
# Exercise 4.2 Initializing larger GPT models
# We initialized a 124-million-parameter GPT model, which is known as “GPT-2 small.”
# Without making any code modifications besides updating the configuration file, use
# the GPTModel class to implement GPT-2 medium (using 1,024-dimensional embeddings,
# 24 transformer blocks, 16 multi-head attention heads), GPT-2 large
# (1,280-dimensional embeddings, 36 transformer blocks, 20 multi-head attention
# heads), and GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks,
# 25 multi-head attention heads). As a bonus, calculate the total number of
# parameters in each GPT model.

GPT2-small (the 124M configuration we already implemented):

"emb_dim" = 768
"n_layers" = 12
"n_heads" = 12

GPT2-medium:

"emb_dim" = 1024
"n_layers" = 24
"n_heads" = 16

GPT2-large:

"emb_dim" = 1280
"n_layers" = 36
"n_heads" = 20

GPT2-XL:

"emb_dim" = 1600
"n_layers" = 48
"n_heads" = 25

In [19]:
# GPT-2 small: 768-dimensional embeddings, 12 transformer blocks, 12 heads
GPT_CONFIG_SMALL = {
    "vocab_size": 50257,     # size of the token vocabulary
    "context_length": 1024,  # context length
    "emb_dim": 768,          # embedding dimension
    "n_heads": 12,           # attention heads per multi-head attention module
    "n_layers": 12,          # number of layers (transformer blocks)
    "drop_rate": 0.1,        # dropout probability
    "qkv_bias": False,       # whether the query/key/value projections use a bias
}

In [20]:
# GPT-2 medium: 1,024-dimensional embeddings, 24 transformer blocks, 16 heads
GPT_CONFIG_MEDIUM = {
    "vocab_size": 50257,     # size of the token vocabulary
    "context_length": 1024,  # context length
    "emb_dim": 1024,         # embedding dimension
    "n_heads": 16,           # attention heads per multi-head attention module
    "n_layers": 24,          # number of layers (transformer blocks)
    "drop_rate": 0.1,        # dropout probability
    "qkv_bias": False,       # whether the query/key/value projections use a bias
}

In [21]:
# GPT-2 large: 1,280-dimensional embeddings, 36 transformer blocks, 20 heads.
# Only emb_dim, n_heads and n_layers differ from the small configuration; the
# context length stays at 1024.
GPT_CONFIG_LARGE = {
    "vocab_size": 50257,     # size of the token vocabulary
    "context_length": 1024,  # context length (same as the other model sizes)
    "emb_dim": 1280,         # embedding dimension (1280 / 20 heads = 64 per head)
    "n_heads": 20,           # attention heads per multi-head attention module
    "n_layers": 36,          # number of layers (transformer blocks)
    "drop_rate": 0.1,        # dropout probability
    "qkv_bias": False,       # whether the query/key/value projections use a bias
}

In [22]:
# GPT-2 XL: 1,600-dimensional embeddings, 48 transformer blocks, 25 heads.
# Only emb_dim, n_heads and n_layers differ from the small configuration; the
# context length stays at 1024.
GPT_CONFIG_XL = {
    "vocab_size": 50257,     # size of the token vocabulary
    "context_length": 1024,  # context length (same as the other model sizes)
    "emb_dim": 1600,         # embedding dimension (1600 / 25 heads = 64 per head)
    "n_heads": 25,           # attention heads per multi-head attention module
    "n_layers": 48,          # number of layers (transformer blocks)
    "drop_rate": 0.1,        # dropout probability
    "qkv_bias": False,       # whether the query/key/value projections use a bias
}

In [18]:
def calculate_params(model):
    """Print how many parameter elements ``model`` holds in total.

    Iterates over ``model.parameters()``, adds up ``numel()`` of every entry,
    and prints the sum with thousands separators. Returns None.
    """
    total_params = 0
    for tensor in model.parameters():
        total_params += tensor.numel()
    print(f"Total number of parameters: {total_params :,}")

In [None]:
from gpt_model import GPTModel

# Instantiate each GPT-2 size from its configuration dict and report its
# parameter count. calculate_params() prints the same unlabelled line for every
# model, so a label is printed first to tell the four counts apart.
# NOTE(review): all four models stay referenced after this cell runs; delete
# the ones no longer needed if kernel memory is tight.
print("GPT-2 small:")
model_small = GPTModel(GPT_CONFIG_SMALL)
calculate_params(model_small)

print("GPT-2 medium:")
model_medium = GPTModel(GPT_CONFIG_MEDIUM)
calculate_params(model_medium)

print("GPT-2 large:")
model_large = GPTModel(GPT_CONFIG_LARGE)
calculate_params(model_large)

print("GPT-2 XL:")
model_xl = GPTModel(GPT_CONFIG_XL)
calculate_params(model_xl)
