## explore HF gpt2

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained 'gpt2' checkpoint together with its tokenizer.
# Other checkpoint names: gpt2-medium, gpt2-large, gpt2-xl.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Print every state-dict entry name alongside the shape of its tensor.
sd_hf = model.state_dict()
for name, tensor in sd_hf.items():
    print(f'{name} --> {tensor.shape}')

In [None]:
# Tokenize a sample prompt; return_tensors='pt' requests PyTorch tensors,
# and only the 'input_ids' entry of the tokenizer output is kept.
prompt = "Hi, I'm a language model,"
input_ids = tokenizer(prompt, return_tensors='pt')['input_ids']
input_ids  # shown as the cell output

In [None]:
# Sampling configuration forwarded verbatim to model.generate().
gen_kwargs = {
    'max_new_tokens': 30,
    'do_sample': True,
    'top_k': 50,    # labelled "pipeline default" by the original author -- TODO confirm
    'top_p': 0.95,  # labelled "pipeline default" by the original author -- TODO confirm
    'temperature': 1.0,
    'num_return_sequences': 2,
    # Reuse the end-of-sequence token id as the padding id.
    'pad_token_id': tokenizer.eos_token_id,
}

# Sample continuations of the prompt; one row of token ids per returned sequence.
output_ids = model.generate(input_ids, **gen_kwargs)
output_ids

In [None]:
# Decode every sampled continuation back to text. Iterating over the rows of
# output_ids (instead of hard-coding indices 0 and 1) keeps this cell in step
# with whatever num_return_sequences was used for generation.
for sequence_ids in output_ids:
    print("> ", tokenizer.decode(sequence_ids, skip_special_tokens=True))

In [None]:
from transformers import pipeline, set_seed

# Same experiment through the high-level text-generation pipeline.
generator = pipeline('text-generation', model='gpt2')
# set_seed(42)
generator(
    "Hello, I'm a language model,",
    max_new_tokens=30,
    num_return_sequences=2,
)

## weight tying property

In [None]:
# Compare the token-embedding matrix with the LM-head matrix:
# first their shapes, then whether every element is equal.
wte_weight = sd_hf['transformer.wte.weight']
lm_head_weight = sd_hf['lm_head.weight']
print(wte_weight.shape, lm_head_weight.shape)
print((wte_weight == lm_head_weight).all())

In [None]:
# data_ptr() is the memory address of a tensor's first element; identical
# values here would mean both state-dict entries point at the same memory.
print(sd_hf['transformer.wte.weight'].data_ptr())
print(sd_hf['lm_head.weight'].data_ptr())

## causal self-attention: elaborating the affinity scores

In [None]:
import torch

In [None]:
# Toy dimensions: b = batch, t = sequence length, c = channels,
# nh = number of heads, hs = head size (note nh * hs == c).
b, t, c = 2, 3, 4
nh, hs = 2, 2

# One random fused projection of width 3*c, cut along the last axis into
# three equally sized (b, t, c) pieces: query, key and value.
qkv = torch.randn(b, t, 3 * c)
q, k, v = qkv.split(c, dim=-1)

In [None]:
# Sanity check: q is exactly the first c channels of the fused qkv tensor.
torch.allclose(qkv[:,:,:c], q)

In [None]:
# Shapes at each step: (b,t,c) -> view (b,t,nh,hs) -> transpose to (b,nh,t,hs).
q.shape, q.view(b,t,nh,hs).shape, q.view(b,t,nh,hs).transpose(1,2).shape

In [None]:
# q is a slice of qkv along the last axis and therefore not contiguous, so
# q.view(b,nh,t,hs) raises a RuntimeError (the t and c axes cannot be merged
# without a copy). reshape() copies when needed and lets the cell run.
# Note: this merely regroups elements in row-major order and is NOT the same
# layout as q.view(b,t,nh,hs).transpose(1,2).
q.reshape(b,nh,t,hs)

In [None]:
# Split the channel axis into (nh, hs) and move the head axis ahead of time,
# giving q, k, v of shape (b, nh, t, hs).
q, k, v = (
    proj.view(b, t, nh, hs).transpose(1, 2)
    for proj in (q, k, v)
)

# Raw query-key dot products per head; no scaling factor is applied here.
affinity_scores = q @ k.transpose(-2, -1)  # (b,nh,t,t)
affinity_scores.shape

In [None]:
# Lower-triangular matrix of ones: entry (i, j) is 1 when j <= i, else 0.
tril = torch.tril(torch.ones(t,t, dtype=torch.long))
tril # (t,t)

In [None]:
# Lower-triangular mask: position i may only look at positions j <= i.
tril = torch.tril(torch.ones(t, t, dtype=torch.long))

# Recompute the raw scores, set masked (future) entries to -inf so they get
# zero weight, then normalise each row into a probability distribution.
affinity_scores = q @ k.transpose(-2, -1)  # (b,nh,t,t)
affinity_scores = torch.nn.functional.softmax(
    affinity_scores.masked_fill(tril == 0, float('-inf')),
    dim=-1,
)
affinity_scores  # (b,nh,t,t)

In [None]:
# Weighted sum of the value vectors using the attention weights, per head.
y = affinity_scores @ v # (b,nh,t,t) @ (b,nh,t,hs) -> (b,nh,t,hs)
y.shape

In [None]:
# Display the per-head attention output, still laid out as (b,nh,t,hs).
y

In [None]:
# Naive merge: reinterprets the (b,nh,t,hs) layout directly as (b,t,c) without
# first moving the head axis next to hs, so each output row mixes values from
# different time steps. Contrast with the transpose(1,2).contiguous().view(b,t,c)
# form used in the following cell.
y.view(b,t,c)

In [None]:
# Merge the heads back together:
#   1) swap the head and time axes -> (b, t, nh, hs)
#   2) make the memory contiguous so the last two axes can be flattened
#   3) flatten (nh, hs) into the channel axis -> (b, t, c)
y = y.transpose(1, 2)
y = y.contiguous().view(b, t, c)
y

In [None]:
# Display y again after the heads were merged back into (b,t,c).
y

## input data prep

In [None]:
import torch

# Pick the compute backend: prefer CUDA, then Apple MPS, otherwise stay on CPU.
# (The original assigned to a misspelled name, so CUDA was never selected.)
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

print(f'device = {device}')

In [None]:
# Read the whole training corpus into memory. The context manager closes the
# file handle deterministically, and the explicit encoding avoids depending on
# the platform's default text encoding.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text[:200])

In [None]:
import tiktoken

# Encode the first 1000 characters of the corpus with the 'gpt2' encoding and
# keep the token ids as a long tensor on the selected device.
text_sample = text[:1000]
enc = tiktoken.get_encoding('gpt2')
tokens = torch.tensor(
    enc.encode(text_sample),
    dtype=torch.long,
    device=device,
)
print(tokens[:25])

In [None]:
# Build one (B, T) batch: B = 4 sequences of T = 6 tokens each.
# B*T + 1 tokens are taken so that the targets can be the inputs shifted by one.
B, T = 4, 6
tokens_sample = tokens[:B * T + 1]
x, y = (
    window.view(B, T).to(device)
    for window in (tokens_sample[:-1], tokens_sample[1:])
)
print(x)  # inputs
print(y)  # targets: the next token for each input position

In [None]:
from gpt2 import GPT2, GPT2Config

# Instantiate the local GPT2 implementation with its default config and move
# it to the selected device. Note: this rebinds the name `model`.
config = GPT2Config()
model = GPT2(config=config)
model = model.to(device)

# AdamW over all model parameters with a fixed learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [None]:
# Single forward pass on the batch; the model is called with inputs and
# targets and returns a (logits, loss) pair.
logits, loss  = model(x,y)
print(logits.shape) # 4,6,50257
print(loss.item())  # scalar loss value before any optimisation step

In [None]:
# Run 50 optimisation steps on the same (x, y) batch, logging the loss
# every 10th step.
for step in range(50):
    optimizer.zero_grad()        # clear gradients left over from the previous step
    logits, loss = model(x, y)   # forward pass
    loss.backward()              # backpropagate
    optimizer.step()             # apply the parameter update
    if step % 10:
        continue
    print(f'iteration: {step} --> loss: {loss.item()}')