In [29]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import torch
from transformers import GPT2Model
from src.model import MODEL_ARCHITECTURES, GPTModel

In [31]:
# Hub identifier of the pretrained checkpoint and the local directory passed as
# the download cache.
HF_MODEL_ID = "openai-community/gpt2"
HF_CACHE_DIR = "checkpoints"

gpt_pretrained = GPT2Model.from_pretrained(HF_MODEL_ID, cache_dir=HF_CACHE_DIR)

In [32]:
import numpy as np

def assign_check(left, right):
    """Wrap ``right`` in a new ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Tensor currently held by the destination; only its ``shape``
            is inspected.
        right: Tensor supplying the replacement values.

    Returns:
        torch.nn.Parameter: built from a cloned, detached copy of ``right``.

    Raises:
        ValueError: If ``left`` and ``right`` do not have the same shape.
    """
    shapes_agree = left.shape == right.shape
    if shapes_agree:
        # Clone first so the new parameter does not share storage with the source tensor.
        source_copy = right.clone().detach()
        return torch.nn.Parameter(source_copy)

    raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

def load_weights(gpt, gpt_hf, model_configs):
    """Copy the parameters of a pretrained GPT-2 checkpoint into ``gpt``.

    Every tensor is routed through ``assign_check``, so a shape disagreement
    between the two models raises ``ValueError`` instead of being silently
    accepted.

    Args:
        gpt: Destination model. Must expose ``pos_emb``, ``tok_emb``,
            ``transformer_decoders`` (indexable by layer), ``final_norm`` and
            ``out``.
        gpt_hf: Source model; its ``state_dict()`` must use the key names
            read below (``wpe.weight``, ``wte.weight``, ``h.<i>...``,
            ``ln_f...``).
        model_configs: Mapping whose ``"n_layers"`` entry gives the number of
            transformer blocks to copy.

    Raises:
        ValueError: Propagated from ``assign_check`` on any shape mismatch.
        KeyError: If an expected key is missing from the source state dict or
            from ``model_configs``.
    """
    hf_state = gpt_hf.state_dict()

    gpt.pos_emb.weight = assign_check(gpt.pos_emb.weight, hf_state["wpe.weight"])
    gpt.tok_emb.weight = assign_check(gpt.tok_emb.weight, hf_state["wte.weight"])

    for b in range(model_configs["n_layers"]):
        block = gpt.transformer_decoders[b]
        attn = block.multi_head_attention
        ff_layers = block.feed_foward.layers
        prefix = f"h.{b}"

        # The source keeps query/key/value fused along the last dimension;
        # split it into three equal parts. Source projection weights are
        # transposed relative to the destination Linear layers, hence the .T
        # on every projection matrix below.
        qkv_w = hf_state[f"{prefix}.attn.c_attn.weight"]
        q_w, k_w, v_w = torch.split(qkv_w, qkv_w.size(-1) // 3, dim=-1)
        attn.W_query.weight = assign_check(attn.W_query.weight, q_w.T)
        attn.W_key.weight = assign_check(attn.W_key.weight, k_w.T)
        attn.W_value.weight = assign_check(attn.W_value.weight, v_w.T)

        qkv_b = hf_state[f"{prefix}.attn.c_attn.bias"]
        q_b, k_b, v_b = torch.split(qkv_b, qkv_b.size(-1) // 3, dim=-1)
        attn.W_query.bias = assign_check(attn.W_query.bias, q_b)
        attn.W_key.bias = assign_check(attn.W_key.bias, k_b)
        attn.W_value.bias = assign_check(attn.W_value.bias, v_b)

        attn.out_proj.weight = assign_check(attn.out_proj.weight, hf_state[f"{prefix}.attn.c_proj.weight"].T)
        attn.out_proj.bias = assign_check(attn.out_proj.bias, hf_state[f"{prefix}.attn.c_proj.bias"])

        # Feed-forward: layers[0] and layers[2] are the two linear layers.
        ff_layers[0].weight = assign_check(ff_layers[0].weight, hf_state[f"{prefix}.mlp.c_fc.weight"].T)
        ff_layers[0].bias = assign_check(ff_layers[0].bias, hf_state[f"{prefix}.mlp.c_fc.bias"])
        ff_layers[2].weight = assign_check(ff_layers[2].weight, hf_state[f"{prefix}.mlp.c_proj.weight"].T)
        ff_layers[2].bias = assign_check(ff_layers[2].bias, hf_state[f"{prefix}.mlp.c_proj.bias"])

        block.norm_layer1.weight = assign_check(block.norm_layer1.weight, hf_state[f"{prefix}.ln_1.weight"])
        block.norm_layer1.bias = assign_check(block.norm_layer1.bias, hf_state[f"{prefix}.ln_1.bias"])

        block.norm_layer2.weight = assign_check(block.norm_layer2.weight, hf_state[f"{prefix}.ln_2.weight"])
        block.norm_layer2.bias = assign_check(block.norm_layer2.bias, hf_state[f"{prefix}.ln_2.bias"])

    # Model-level parameters are loaded once, after the per-layer loop: they do
    # not depend on the layer index, and must be loaded even when n_layers is 0.
    gpt.final_norm.weight = assign_check(gpt.final_norm.weight, hf_state["ln_f.weight"])
    gpt.final_norm.bias = assign_check(gpt.final_norm.bias, hf_state["ln_f.bias"])
    # The output head receives its own copy of the token-embedding matrix
    # (same values as tok_emb, but not a shared parameter).
    gpt.out.weight = assign_check(gpt.out.weight, hf_state["wte.weight"])

In [33]:
# Architecture settings for the small GPT-2 variant, as declared by the project.
base_configs = MODEL_ARCHITECTURES['gpt2-small']

# Settings layered on top of the base architecture.
custom_configs = dict(
    vocab_size=50257,       # Vocabulary size
    context_length=1024,    # Context length
    drop_rate=0.0,          # Dropout rate
    qkv_bias=True,          # Query-Key-Value bias
)

# Merge the two mappings; entries from custom_configs are the right-hand operand.
model_configs = base_configs | custom_configs

In [34]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the model from the merged configuration, then overwrite its parameters
# with the pretrained ones.
gpt = GPTModel(model_configs)
load_weights(gpt, gpt_pretrained, model_configs)

In [35]:
# Put the model in evaluation mode before generating text; the value returned by
# eval() is the cell's last expression, so its repr is shown as the cell output.
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (transformer_decoders): Sequential(
    (0): TransformerDecoder(
      (multi_head_attention): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (norm_layer1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm_layer2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (feed_foward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      

In [36]:
import tiktoken
from src.token import text_to_token_ids, token_ids_to_text
from src.generate import generate

# Seed PyTorch's RNG before generation.
torch.manual_seed(123)

tokenizer = tiktoken.get_encoding("gpt2")

# Place the model and the encoded prompt on the same device.
gpt_on_device = gpt.to(device)
prompt_ids = text_to_token_ids("Every effort moves", tokenizer).to(device)

token_ids = generate(
    model=gpt_on_device,
    idx=prompt_ids,
    max_new_tokens=30,
    context_size=model_configs["context_length"],
    top_k=1,
    temperature=1.0,
)

generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", generated_text)

Output text:
 Every effort moves forward, but it's not enough.

"I'm not going to sit here and say, 'I'm not going to do this,'
