In [3]:
import torch
import torch.nn as nn

class GPTModel(nn.Module):
    """Decoder-only (GPT-style) language model built on PyTorch transformer layers.

    The model sums token embeddings and learned positional embeddings, applies
    dropout, runs the result through a stack of pre-LayerNorm
    ``nn.TransformerEncoderLayer`` blocks with a causal attention mask, applies
    a final LayerNorm and projects to vocabulary logits.

    Args:
        config: Mapping that must provide ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"``, ``"n_heads"`` and
            ``"n_layers"``. Other keys are ignored by this class.
    """

    def __init__(self, config):
        super().__init__()
        self.embed = nn.Embedding(config["vocab_size"], config["emb_dim"])
        # Learned positional embeddings, stored with a leading broadcast (batch) axis.
        self.pos_embed = nn.Parameter(torch.zeros(1, config["context_length"], config["emb_dim"]))
        self.drop = nn.Dropout(config["drop_rate"])
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config["emb_dim"],
                nhead=config["n_heads"],
                dim_feedforward=4 * config["emb_dim"],
                dropout=config["drop_rate"],
                activation='gelu',
                batch_first=True,
                norm_first=True
            ) for _ in range(config["n_layers"])
        ])
        self.ln_f = nn.LayerNorm(config["emb_dim"])
        self.head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, idx):
        """Compute next-token logits.

        Args:
            idx: Integer tensor of token ids with shape ``(batch, seq_len)``.

        Returns:
            Tensor of logits with shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of positional
                embeddings (``context_length``).
        """
        seq_len = idx.size(1)
        max_len = self.pos_embed.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model context length {max_len}"
            )

        x = self.embed(idx) + self.pos_embed[:, :seq_len, :]
        x = self.drop(x)

        # Boolean mask, True = "may not attend". Masking the strict upper
        # triangle stops each position from looking at later tokens; without
        # it the encoder layers attend bidirectionally.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=idx.device),
            diagonal=1,
        )
        for layer in self.layers:
            x = layer(x, src_mask=causal_mask)

        x = self.ln_f(x)
        logits = self.head(x)
        return logits

In [4]:
# Hyperparameters handed to GPTModel. Every entry except "qkv_bias" is read by
# GPTModel.__init__; "qkv_bias" is kept in the config but the GPTModel defined
# above never looks it up.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=256,
    emb_dim=768,
    n_layers=12,
    n_heads=12,
    drop_rate=0.1,
    qkv_bias=False,  # flag for learned query/key/value projection biases
)

# Fix the RNG seed before building the model so the initial weights are reproducible.
torch.manual_seed(42)
model = GPTModel(GPT_CONFIG_124M)
print(model)

GPTModel(
  (embed): Embedding(50257, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=3072, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=3072, out_features=768, bias=True)
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=768, out_features=50257, bias=False)
)


In [5]:
model.eval()  # Switch to evaluation mode so the Dropout modules are inactive; returns the model, hence the repr shown below

GPTModel(
  (embed): Embedding(50257, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=3072, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=3072, out_features=768, bias=True)
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
%pip install tiktoken
import tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.9.0-cp310-cp310-win_amd64.whl (894 kB)
   ---------------------------------------- 0.0/894.0 kB ? eta -:--:--
   --------------------------------------- 894.0/894.0 kB 10.2 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
 import urllib.request
 from pathlib import Path

 # NOTE(security): this fetches a Python helper from the repository's mutable
 # `main` branch, and a later cell imports and runs it. Review the downloaded
 # file, and prefer a URL pinned to a specific commit.
 url = (
     "https://raw.githubusercontent.com/rasbt/"
     "LLMs-from-scratch/main/ch05/"
     "01_main-chapter-code/gpt_download.py"
 )
 filename = url.split('/')[-1]

 # Only hit the network when the helper is not already next to the notebook,
 # so "Restart & Run All" neither re-downloads nor silently swaps the file.
 if not Path(filename).exists():
     urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x208280c1f90>)

In [2]:
 pip install tensorflow>=2.15.0  tqdm>=4.66

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from gpt_download import download_and_load_gpt2

# Fetch the 124M checkpoint files into the local "gpt2" directory and load them.
# `params` is the nested dict of arrays that later cells index via params["blocks"].
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 97.1kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 556kiB/s] 
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 9.88kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [02:55<00:00, 2.83MiB/s]   
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 2.78MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:01<00:00, 351kiB/s]  
vocab.bpe: 100%|██████████| 456k/456k [00:01<00:00, 342kiB/s]  


In [10]:
# Inspect the attention input projection of the first transformer block.
# The printed dict holds a bias 'b' of shape (2304,) and a weight 'w' of shape
# (768, 2304); the loader below treats these as three equal parts, one each for
# query, key and value.
c_attn = params["blocks"][0]["attn"]["c_attn"]
print(c_attn)

{'b': array([ 0.48033914, -0.5254326 , -0.42926455, ...,  0.01257301,
       -0.04987717,  0.00324764], shape=(2304,), dtype=float32), 'w': array([[-0.4738484 , -0.26136586, -0.09780374, ...,  0.05132535,
        -0.0584389 ,  0.02499568],
       [ 0.08742206,  0.1473427 ,  0.23870145, ..., -0.05253514,
        -0.01125987, -0.01558759],
       [ 0.00388936,  0.06946629,  0.3668052 , ...,  0.11428114,
         0.03629516, -0.03184864],
       ...,
       [-0.25919554, -0.01636625,  0.19914557, ...,  0.00953369,
        -0.05159837,  0.03186192],
       [ 0.15165617,  0.2170211 ,  0.10434178, ...,  0.02933884,
        -0.04287174, -0.04746685],
       [-0.41001597, -0.19235404, -0.2400296 , ..., -0.00459218,
         0.00697855,  0.01984419]], shape=(768, 2304), dtype=float32)}


In [None]:
import numpy as np
import torch

def assign(tensor, array):
    """Copy the contents of a NumPy array into an existing tensor, in place.

    Args:
        tensor: Destination ``torch.Tensor`` or ``nn.Parameter``; it is
            overwritten and keeps its identity, dtype and device.
        array: Source ``numpy.ndarray`` whose shape must equal ``tensor``'s.

    Returns:
        The same ``tensor`` object, now holding the values of ``array``.

    Raises:
        ValueError: If the shapes differ. ``copy_`` would otherwise broadcast
            silently, e.g. tiling a vector across every row of a matrix.
    """
    source = torch.from_numpy(array)
    if tuple(source.shape) != tuple(tensor.shape):
        raise ValueError(
            f"Shape mismatch: destination {tuple(tensor.shape)} vs source {tuple(source.shape)}"
        )
    # Copy outside autograd instead of going through `.data`.
    with torch.no_grad():
        tensor.copy_(source)
    return tensor

def load_weigths(gpt, params):
    """Copy pretrained GPT-2 arrays into the ``GPTModel`` defined in this notebook.

    The function name keeps its original spelling so existing calls still work.

    Args:
        gpt: A ``GPTModel`` instance exposing ``embed``, ``pos_embed``,
            ``layers`` (``nn.TransformerEncoderLayer`` modules), ``ln_f`` and
            ``head``.
        params: Nested dict of NumPy arrays with top-level keys ``"wte"``
            (token embeddings), ``"wpe"`` (positional embeddings), ``"g"`` /
            ``"b"`` (final layer-norm scale / shift) and ``"blocks"``, a list
            with one dict per transformer block holding ``"attn"``, ``"mlp"``,
            ``"ln_1"`` and ``"ln_2"`` entries.

    Raises:
        ValueError: If the model has more positions than the checkpoint
            provides, or if the number of blocks differs from the number of
            model layers.

    NOTE(review): the layers are built with ``activation='gelu'``, while the
    original GPT-2 code used the tanh approximation of GELU, so outputs may
    deviate slightly from the reference implementation.
    """
    # --- embeddings -------------------------------------------------------
    wpe = params["wpe"]
    context_length = gpt.pos_embed.shape[1]
    if context_length > wpe.shape[0]:
        raise ValueError(
            f"Model context length {context_length} exceeds the "
            f"{wpe.shape[0]} positions available in the checkpoint"
        )
    # pos_embed is a bare Parameter of shape (1, context_length, emb_dim):
    # keep the first `context_length` positions and add the leading axis.
    assign(gpt.pos_embed, wpe[:context_length][None, :, :])
    assign(gpt.embed.weight, params["wte"])

    # --- transformer blocks ----------------------------------------------
    blocks = params["blocks"]
    if len(blocks) != len(gpt.layers):
        raise ValueError(
            f"Checkpoint has {len(blocks)} blocks but the model has {len(gpt.layers)} layers"
        )

    for layer, block in zip(gpt.layers, blocks):
        attn, mlp = block["attn"], block["mlp"]

        # The checkpoint stores weights as (in_features, out_features); PyTorch
        # keeps (out_features, in_features), hence every `.T` below.
        # c_attn packs query, key and value side by side, which after the
        # transpose matches the row order of MultiheadAttention.in_proj_weight.
        assign(layer.self_attn.in_proj_weight, attn["c_attn"]["w"].T)
        assign(layer.self_attn.in_proj_bias, attn["c_attn"]["b"])

        # Projection of the concatenated heads back to the embedding dimension.
        assign(layer.self_attn.out_proj.weight, attn["c_proj"]["w"].T)
        assign(layer.self_attn.out_proj.bias, attn["c_proj"]["b"])

        # Position-wise feed-forward network: expand, then project back.
        assign(layer.linear1.weight, mlp["c_fc"]["w"].T)
        assign(layer.linear1.bias, mlp["c_fc"]["b"])
        assign(layer.linear2.weight, mlp["c_proj"]["w"].T)
        assign(layer.linear2.bias, mlp["c_proj"]["b"])

        # Layer norms: "g" is the scale (gamma), "b" the shift (beta).
        assign(layer.norm1.weight, block["ln_1"]["g"])
        assign(layer.norm1.bias, block["ln_1"]["b"])
        assign(layer.norm2.weight, block["ln_2"]["g"])
        assign(layer.norm2.bias, block["ln_2"]["b"])

    # --- final norm and output head --------------------------------------
    assign(gpt.ln_f.weight, params["g"])
    assign(gpt.ln_f.bias, params["b"])
    # The output head reuses the token-embedding matrix (copied, not tied).
    assign(gpt.head.weight, params["wte"])