### Requirements

In [1]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install -q transformers
# %pip install -q huggingface_hub

### Load the GPT model and tokenizer from the HF library

In [2]:
import os
import sys

from transformers import AutoModelForCausalLM, AutoTokenizer

# Make the local `mingpt` and `config` modules importable.
# The path is resolved against the current working directory: take the absolute
# path of "gpt2_model.ipynb" and go two levels up, i.e. the parent of the
# directory the kernel is running in.
notebook_path = os.path.abspath("gpt2_model.ipynb")
parent_of_notebook_dir = os.path.dirname(os.path.dirname(notebook_path))
sys.path.append(parent_of_notebook_dir)

from mingpt import GPT
from config import MyGPTConfig

# pretrained "gpt2" checkpoint and its matching tokenizer
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [3]:
## Save the tokenizer to inspect
# tokenizer.save_pretrained("gpt2-tokenizer")

In [4]:
# round-trip a sample sentence through the GPT-2 tokenizer
sent = "A man was walking"

# encode: calling the tokenizer directly is equivalent to .encode_plus()
tokenized_sent = tokenizer(sent, return_tensors="pt")
print(tokenized_sent)

# decode the first (and only) sequence back to text, keeping any special tokens
first_sequence_ids = tokenized_sent["input_ids"][0]
decoded_sent = tokenizer.decode(first_sequence_ids, skip_special_tokens=False)
print(decoded_sent)

{'input_ids': tensor([[  32,  582,  373, 6155]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
A man was walking


In [5]:
# Print the module tree of the Hugging Face GPT-2 model to inspect its layers
# (embeddings, the stacked transformer blocks, final layer norm and LM head).
print(gpt2)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [6]:
# list every named parameter of the HF model together with its tensor shape
for param_name, param in gpt2.named_parameters():
    print(f"{param_name} : shape ==> {param.shape}")

transformer.wte.weight : shape ==> torch.Size([50257, 768])
transformer.wpe.weight : shape ==> torch.Size([1024, 768])
transformer.h.0.ln_1.weight : shape ==> torch.Size([768])
transformer.h.0.ln_1.bias : shape ==> torch.Size([768])
transformer.h.0.attn.c_attn.weight : shape ==> torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias : shape ==> torch.Size([2304])
transformer.h.0.attn.c_proj.weight : shape ==> torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias : shape ==> torch.Size([768])
transformer.h.0.ln_2.weight : shape ==> torch.Size([768])
transformer.h.0.ln_2.bias : shape ==> torch.Size([768])
transformer.h.0.mlp.c_fc.weight : shape ==> torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias : shape ==> torch.Size([3072])
transformer.h.0.mlp.c_proj.weight : shape ==> torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias : shape ==> torch.Size([768])
transformer.h.1.ln_1.weight : shape ==> torch.Size([768])
transformer.h.1.ln_1.bias : shape ==> torch.Size([768])
transforme

In [7]:
from transformers import GPT2LMHeadModel
# Load GPT-2 again through the explicit GPT2LMHeadModel class. The printout of `gpt2`
# earlier in the notebook shows AutoModelForCausalLM already resolved to
# GPT2LMHeadModel, so `lmhead` is a second instance of the same architecture.
lmhead = GPT2LMHeadModel.from_pretrained("gpt2")

### Look at the generation response

In [8]:
from transformers import pipeline

# wrap the already-loaded model and tokenizer in a text-generation pipeline
pipe = pipeline("text-generation", model=gpt2, tokenizer=tokenizer)

# generate a continuation for a short prompt and show the raw pipeline result
prompt = "My name is"
output = pipe(prompt)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "My name is Matt, and I'm one of the few guys in my life that has ever thought of being a professional. This is my first game as a professional, and at least for me that means something. For years I had to sit and"}]


### Generate through the model's `generate()` method with a `GenerationConfig`

In [9]:
# generate through the model's own `generate()` method
from transformers import GenerationConfig

input_ids = tokenized_sent['input_ids']

# Greedy decoding: one beam and no sampling.
# Only `max_new_tokens` bounds the length. The previous version also passed
# `max_length=1024`; when both are set transformers lets `max_new_tokens` take
# precedence and emits a warning, so the redundant setting is dropped.
gen_config = GenerationConfig(
    max_new_tokens=50,
    # temperature=0.8,
    # GPT-2 has no dedicated pad token; reuse the EOS id (50256, per the
    # pipeline log earlier in the notebook) instead of hard-coding the number.
    pad_token_id=tokenizer.eos_token_id,
    num_beams=1,
    do_sample=False,
)

# Pass the config by keyword and hand over the attention mask the tokenizer
# produced: with pad_token_id == eos_token_id the mask cannot be inferred
# reliably from the ids alone.
out = lmhead.generate(
    input_ids,
    generation_config=gen_config,
    attention_mask=tokenized_sent['attention_mask'],
)

# total length = prompt tokens + newly generated tokens
print(len(out[0]))
tokenizer.decode(out[0])

54


"A man was walking down the street when he saw a man with a gun. He ran to the man's house and shot him.\n\nThe man was taken to the hospital where he was pronounced dead.\n\nThe man's family said he was a good kid"

In [10]:
# Tokenize the same sentence with the Llama-2 tokenizer for comparison.
# NOTE(review): meta-llama repositories are presumably gated on the Hub, so this
# may require an authenticated Hugging Face login — confirm.
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# reuse the EOS token as the padding token
llama_tokenizer.pad_token = llama_tokenizer.eos_token

# optional padding settings kept for reference:
#   padding="max_length", max_length=512
llama_encode_kwargs = dict(add_special_tokens=True, return_tensors='pt')
llama_tokenized = llama_tokenizer(sent, **llama_encode_kwargs)
print(llama_tokenized)

# decode the first (and only) sequence, keeping special tokens visible
llama_ids = llama_tokenized["input_ids"][0]
decoded_llama = llama_tokenizer.decode(llama_ids, skip_special_tokens=False)
print(decoded_llama)


{'input_ids': tensor([[    1,   319,   767,   471, 22049]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
<s> A man was walking


In [11]:
# Write the Llama tokenizer files to a local "llama-tokenizer/" directory for inspection;
# the returned value (shown as the cell output) lists the saved file paths.
llama_tokenizer.save_pretrained("llama-tokenizer/")

('llama-tokenizer/tokenizer_config.json',
 'llama-tokenizer/special_tokens_map.json',
 'llama-tokenizer/tokenizer.model',
 'llama-tokenizer/added_tokens.json',
 'llama-tokenizer/tokenizer.json')

### Look at the parameter counts

In [12]:
# Sum the element counts of all parameters of the HF model.
# The printed value is in millions, floored by the integer division.
gpt2_param_count = sum(p.numel() for p in gpt2.parameters())
print("Number of parameters in GPT2-HF: ", gpt2_param_count // 10**6)

Number of parameters in GPT2-HF:  124


### Check our Karpathy-inspired minGPT model

In [13]:
# Instantiate our minGPT implementation from the default MyGPTConfig.
# NOTE(review): no checkpoint is loaded here, so the weights are presumably
# freshly initialised — confirm in mingpt.GPT.
# Alternative kept for reference: build the model from the pretrained "gpt2" checkpoint.
# mingpt = GPT.load_pretrained("gpt2")
config = MyGPTConfig()
mingpt = GPT(config)

number of parameters: 124.44M


In [14]:
mingpt_sd = mingpt.state_dict()

# number of state-dict entries once the `attn.masked_bias` entries are excluded
keys = list(filter(lambda name: not name.endswith("attn.masked_bias"), mingpt_sd))
print(len(keys))

# list every entry (masked_bias included) together with its tensor shape
for entry_name, tensor in mingpt_sd.items():
    print(f"{entry_name} : shape ==>  {tensor.shape}")

149
transformer.wte.weight : shape ==>  torch.Size([50257, 768])
transformer.wpe.weight : shape ==>  torch.Size([1024, 768])
transformer.h.0.ln_1.weight : shape ==>  torch.Size([768])
transformer.h.0.ln_1.bias : shape ==>  torch.Size([768])
transformer.h.0.attn.masked_bias : shape ==>  torch.Size([1, 1, 1024, 1024])
transformer.h.0.attn.c_attn.weight : shape ==>  torch.Size([2304, 768])
transformer.h.0.attn.c_attn.bias : shape ==>  torch.Size([2304])
transformer.h.0.attn.c_proj.weight : shape ==>  torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias : shape ==>  torch.Size([768])
transformer.h.0.ln_2.weight : shape ==>  torch.Size([768])
transformer.h.0.ln_2.bias : shape ==>  torch.Size([768])
transformer.h.0.mlp.c_fc.weight : shape ==>  torch.Size([3072, 768])
transformer.h.0.mlp.c_fc.bias : shape ==>  torch.Size([3072])
transformer.h.0.mlp.c_proj.weight : shape ==>  torch.Size([768, 3072])
transformer.h.0.mlp.c_proj.bias : shape ==>  torch.Size([768])
transformer.h.1.ln_1.weight :

In [15]:
# Build our GPT through its `load_pretrained` constructor with the "gpt2" checkpoint name.
# NOTE(review): presumably this copies the Hugging Face GPT-2 weights into our
# model — confirm in mingpt.GPT.load_pretrained.
pre_load = GPT.load_pretrained("gpt2")

number of parameters: 124.44M


In [None]:
# shape of the prompt ids taken from the GPT-2 tokenizer output earlier in the notebook
input_ids.shape


torch.Size([1, 4])

In [18]:
# sample a continuation of the prompt with our own GPT implementation
pre_load.eval()  # put the model in evaluation mode before generating

# sampling settings: up to 100 new tokens, temperature 0.9, stochastic decoding
sampling_kwargs = dict(max_new_tokens=100, temperature=0.9, do_sample=True)
out_tokens = pre_load.generate(input_ids, **sampling_kwargs)

# turn the generated ids of the first sequence back into text (special tokens kept)
generated_ids = out_tokens[0]
output = tokenizer.decode(generated_ids, skip_special_tokens=False)
output

'A man was walking in central London, wearing a white lace dress and blue tie. Another man arriving later said he saw the man leave the markets for the night alone.\n\nHe said he purchased a pair of tennis shoesد تحمدد, which he brought home for their holiday.\n\nThe man had said he had heard the man locked himself in his bathroom.\n\nHe said he had wandered out at about 3.45am, when he saw the man take off his shirt he'

In [None]:
# Compare the weights of the HF `lmhead` model with our `pre_load` model,
# entry by entry. The cell passes silently when every tensor matches.
our_sd = pre_load.state_dict()
sd_hf = lmhead.state_dict()

keys = [k for k in sd_hf if not k.endswith("attn.masked_bias")]
# Weights whose layout differs between the two models: the parameter listings
# earlier in the notebook show e.g. c_attn.weight as [768, 2304] in HF (Conv1D)
# but [2304, 768] in our model, so these must be compared after a transpose.
transposed = ("attn.c_attn.weight", "attn.c_proj.weight", "mlp.c_fc.weight", "mlp.c_proj.weight")

# sanity check: the HF state dict carries no `attn.masked_bias` entries,
# so the filter above must not have removed anything
assert len(keys) == len(sd_hf)

for k in keys:
    # fail with a readable message instead of a bare KeyError
    assert k in our_sd, f"{k} is missing from our state dict"
    hf_tensor, our_tensor = sd_hf[k], our_sd[k]

    if k.endswith(transposed):
        # transpose the conv weights in the original GPT model
        assert (
            hf_tensor.shape[::-1] == our_tensor.shape
        ), f"Transpose shape mismatch for {k}. Found {hf_tensor.shape} and {our_tensor.shape}"
        assert hf_tensor.t().equal(our_tensor), f"Value mismatch (after transpose) for {k}"
    else:
        # all other entries must match as-is
        assert (
            hf_tensor.shape == our_tensor.shape
        ), f"Shape mismatch for {k}. Found {hf_tensor.shape} and {our_tensor.shape}"
        assert hf_tensor.equal(our_tensor), f"Value mismatch for {k}"

149