In [1]:
import tensorflow as tf
import tqdm
import torch

In [2]:
from gpt_model import MultiHeadAttention, LayerNorm, GELU, FeedForward, TransformerBlock, GPTModel
from gpt_model_generate import generate_text_simple, generate
from gpt_model_data_loader import GPTDatasetV1, create_dataloader_v1
from gpt_model_loss import calc_loss_batch, calc_loss_loader
from gpt_model_utility import text_to_token_ids, token_ids_to_text

In [3]:
# Base hyperparameters for the 124M GPT-2 configuration.
# NOTE: the head-count key is "n_heads" (underscore), matching the key used by
# the per-size overrides in `model_configs`.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

In [5]:
# Show which tqdm and TensorFlow versions are installed in this kernel (one per line)
print(tqdm.__version__, tf.__version__, sep="\n")

4.67.0
2.18.0


In [7]:
from gpt_download3 import download_and_load_gpt2

In [8]:
# Fetch the 124M GPT-2 checkpoint into ./gpt2 (files already present are reported
# as up-to-date) and unpack the returned settings and parameter dictionary.
settings, params = download_and_load_gpt2(
    model_size="124M",
    models_dir="gpt2",
)



File already exists and is up-to-date: gpt2\124M\checkpoint




File already exists and is up-to-date: gpt2\124M\encoder.json




File already exists and is up-to-date: gpt2\124M\hparams.json




File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2\124M\model.ckpt.index




File already exists and is up-to-date: gpt2\124M\model.ckpt.meta




File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [9]:
# Inspect the loaded hyperparameters and the top-level keys of the weight dictionary
print(f"Settings: {settings}")
print(f"Parameter dictionary keys: {params.keys()}")

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [10]:
# Shape of the "wte" array, which is later loaded as the token-embedding weight
params["wte"].shape

(50257, 768)

In [11]:
# Per-size architecture overrides, keyed by a human-readable model name.
# Each entry only lists the settings that differ between GPT-2 sizes.
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

# Build the working configuration: base settings first, then the overrides of
# the selected model size layered on top (the base dictionary is not modified).
model_name = "gpt2-small (124M)"
NEW_CONFIG = {**GPT_CONFIG_124M, **model_configs[model_name]}

In [12]:
# The checkpoint loaded below provides bias vectors for query/key/value, so the
# model must be built with those biases enabled.
NEW_CONFIG["qkv_bias"] = True

# Instantiate the model and switch it to evaluation mode
gpt = GPTModel(NEW_CONFIG)
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [13]:
def assign(left, right):
    """Return a new trainable parameter holding the values of ``right``.

    Args:
        left: The existing parameter/tensor being replaced. Its shape is used
            for validation; when it is a ``torch.Tensor`` its dtype and device
            are also carried over to the result.
        right: The replacement values, either an array-like accepted by
            ``torch.tensor`` (e.g. a NumPy array) or a ``torch.Tensor``.

    Returns:
        torch.nn.Parameter: An independent copy of ``right``.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    if isinstance(right, torch.Tensor):
        # torch.tensor(tensor) emits a copy-construct warning; clone explicitly instead.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)

    if isinstance(left, torch.Tensor):
        # Keep the replaced parameter's dtype/device so e.g. float64 arrays or a
        # model that already lives on the GPU do not end up with mismatched weights.
        values = values.to(dtype=left.dtype, device=left.device)

    return torch.nn.Parameter(values)

In [14]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy the arrays of a loaded GPT-2 checkpoint into the parameters of ``gpt``.

    Every target parameter is replaced through ``assign``, which checks that the
    checkpoint array and the existing parameter have the same shape.

    Args:
        gpt: The model to fill. It must expose ``pos_emb``, ``tok_emb``,
            ``trf_blocks`` (each block with ``att``, ``ff``, ``norm1``,
            ``norm2``), ``final_norm`` and ``out_head``.
        params: Checkpoint dictionary with the keys ``"wpe"``, ``"wte"``,
            ``"g"``, ``"b"`` and ``"blocks"`` (one entry per transformer block
            holding ``"attn"``, ``"mlp"``, ``"ln_1"`` and ``"ln_2"``).

    Raises:
        ValueError: If the checkpoint and the model have a different number of
            transformer blocks, or if any array shape does not match its
            target parameter.
    """
    checkpoint_blocks = params["blocks"]
    # Fail before touching any weight: with fewer checkpoint blocks than model
    # blocks the remaining blocks would silently keep their random initialisation.
    if len(gpt.trf_blocks) != len(checkpoint_blocks):
        raise ValueError(
            f"Block count mismatch. Model: {len(gpt.trf_blocks)}, "
            f"Checkpoint: {len(checkpoint_blocks)}"
        )

    # Position and token embeddings
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for block, block_params in zip(gpt.trf_blocks, checkpoint_blocks):
        attention = block.att
        feed_forward = block.ff.layers
        attn_params = block_params["attn"]
        mlp_params = block_params["mlp"]

        # The checkpoint stores query, key and value fused in "c_attn"; split it
        # into three equal parts along the last axis. Weight matrices are
        # transposed before assignment so their shapes match the Linear weights.
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        attention.W_query.weight = assign(attention.W_query.weight, q_w.T)
        attention.W_key.weight = assign(attention.W_key.weight, k_w.T)
        attention.W_value.weight = assign(attention.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        attention.W_query.bias = assign(attention.W_query.bias, q_b)
        attention.W_key.bias = assign(attention.W_key.bias, k_b)
        attention.W_value.bias = assign(attention.W_value.bias, v_b)

        # Attention output projection
        attention.out_proj.weight = assign(attention.out_proj.weight, attn_params["c_proj"]["w"].T)
        attention.out_proj.bias = assign(attention.out_proj.bias, attn_params["c_proj"]["b"])

        # Feed-forward network: layers[0] is the first linear layer, layers[2] the
        # projection back (layers[1] is the activation and has no parameters here)
        feed_forward[0].weight = assign(feed_forward[0].weight, mlp_params["c_fc"]["w"].T)
        feed_forward[0].bias = assign(feed_forward[0].bias, mlp_params["c_fc"]["b"])
        feed_forward[2].weight = assign(feed_forward[2].weight, mlp_params["c_proj"]["w"].T)
        feed_forward[2].bias = assign(feed_forward[2].bias, mlp_params["c_proj"]["b"])

        # Layer normalisation 1 and 2: "g" is loaded as scale, "b" as shift
        block.norm1.scale = assign(block.norm1.scale, block_params["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, block_params["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, block_params["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, block_params["ln_2"]["b"])

    # Final normalisation layer
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])

    # Output head reuses the token-embedding values (GPT-2 weight tying). Note that
    # assign() makes a copy, so the two parameters hold equal values but are not shared.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [15]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [16]:
# Replace the model's parameters with the pretrained checkpoint values first,
# then move the whole model to the selected device.
load_weights_into_gpt(gpt, params)
gpt.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [17]:
import tiktoken
# Load tiktoken's "gpt2" encoding for converting between text and token ids
tokenizer = tiktoken.get_encoding("gpt2")

In [31]:
# Seed torch's RNG before generating
torch.manual_seed(123)

# Encode the prompt and place it on the device the model was moved to
prompt_ids = text_to_token_ids("World largest democracy", tokenizer).to(device)

# Generate a continuation of the prompt with the pretrained model
token_ids = generate(
    model=gpt,
    idx=prompt_ids,
    max_new_tokens=50,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=0.5,
)

decoded_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", decoded_text)

Output text:
 World largest democracy in the world, the United States has a long history of providing a platform for peaceful protest. The United States has a long history of providing a platform for peaceful protest.

In October, President Barack Obama issued a statement calling for an independent commission
