In [1]:
from model.iGPT import iGPT,iGPTConfig, NotMyModel
import torch

# Hyperparameters of the small iGPT variant (4 main layers, 2 idea layers).
# NOTE(review): `config` is only needed for the commented-out `NotMyModel(config)`
# path below; the checkpoint loader is called without it.
config = iGPTConfig(
    block_size=256,
    vocab_size=50304,
    n_layer_main=4,
    n_layer_idea=2,
    n_head=8,
    n_embd_main=768,
    n_embd_idea=512,
    idea_dim=768,
)

# Larger variant: same settings except n_layer_main=12 and n_layer_idea=12.
# config = iGPTConfig(
#     block_size=256,
#     vocab_size=50304,
#     n_layer_main=12,
#     n_layer_idea=12,
#     n_head=8,
#     n_embd_main=768,
#     n_embd_idea=512,
#     idea_dim=768,
# )

# model = NotMyModel(config)  # alternative: build a fresh model from `config`
model = NotMyModel.load_from_checkpoint("checkpoints/version_0/checkpoints/epoch=3-step=6424.ckpt")

# Place the model explicitly instead of relying on where the checkpoint loader
# happens to put it: the sampling cells move their inputs to the GPU with .cuda(),
# so model and inputs must end up on the same device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Inference mode for dropout / normalisation layers.
model.eval()
model

  from .autonotebook import tqdm as notebook_tqdm


NotMyModel(
  (network): iGPT(
    (wte_i): Embedding(50304, 512)
    (wpe_i): Embedding(256, 512)
    (blocks_i): ModuleList(
      (0-1): 2 x Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=512, out_features=1536, bias=True)
          (c_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
      )
    )
    (ln_f_i): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (idea_head): Linear(in_features=512, out_features=768, bias=False)
    (wte_g): Embedding(50304, 768)
    (wpe_g): Embedding(256, 768)
    (blocks_g): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNo

In [2]:
import tiktoken
# Start from the stock GPT-2 BPE encoding.
enc = tiktoken.get_encoding("gpt2")

# Id assigned to the additional sentence-boundary token; it must not collide
# with any id already used by the base encoding.
eos = 50257
new_special_tokens = {"<|endofsent|>": eos}

# Special-token table of the derived encoding: base entries first, then ours.
special_tokens = dict(enc._special_tokens)
special_tokens.update(new_special_tokens)

# Derived encoding: same split pattern and merge ranks as GPT-2, with the
# enlarged special-token table.
extended_enc = tiktoken.Encoding(
    name="gpt2_extended",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens=special_tokens,
)

In [3]:
# Sampling settings shared by the cells below.
num_return_sequences = 5
max_length = 30

# Context sentence that is passed to the model next to the sequence being generated.
mysent = "In September 2010 , a teaser website was revealed by Sega , hinting at a new Valkyria Chronicles game ."

# Encode, add a leading batch dimension of size 1 and move to the GPU.
sent = (
    torch.tensor(extended_enc.encode(mysent), dtype=torch.long)
    .unsqueeze(0)
    .cuda()
)

In [4]:
# Seed every sequence with the single <|endofsent|> token: encode it (special
# tokens must be allowed explicitly), add a batch dimension, replicate the row
# once per sequence to sample, and move the result to the GPU.
init_sent = (
    torch.tensor(
        extended_enc.encode("<|endofsent|>", allowed_special={'<|endofsent|>'}),
        dtype=torch.long,
    )
    .unsqueeze(0)
    .repeat(num_return_sequences, 1)
    .cuda()
)
init_sent

tensor([[50257],
        [50257],
        [50257],
        [50257],
        [50257]], device='cuda:0')

In [5]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

import torch.nn.functional as F

# Autoregressive top-k sampling: grow `init_sent` one token per iteration until
# every row holds `max_length` tokens. The model receives the sequence so far
# together with the encoded context sentence `sent`.
# NOTE: `init_sent` is rebound in place, so re-running this cell is a no-op once
# the sequences have reached `max_length`; re-run the cell that creates
# `init_sent` first to sample again.
#
# This is pure inference, so the whole loop runs under no_grad: otherwise autograd
# records a graph for every forward pass, which wastes memory and time.
with torch.no_grad():
    while init_sent.size(1) < max_length:
        # forward the model to get the logits
        logits = model((init_sent, sent)) # (B, T, vocab_size)
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs here becomes (B, 50), topk_indices is (B, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # map the sampled position back to the vocabulary index
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        init_sent = torch.cat((init_sent, xcol), dim=1)

In [6]:
# Display the token ids of the encoded context sentence.
sent

tensor([[  818,  2693,  3050,   837,   257, 38388,  3052,   373,  4602,   416,
         29490,   837,  9254,   278,   379,   257,   649,   569, 18354,  7496,
         17740,   983,   764]], device='cuda:0')

In [7]:
# small models (4 layers decoder, 2 layer encoder) - 4 epochs
# Decode each sampled row (truncated to max_length tokens) and print it.
for seq_idx in range(num_return_sequences):
    row_ids = init_sent[seq_idx, :max_length].tolist()
    print(">", extended_enc.decode(row_ids))

> <|endofsent|>In deep before the A. " , guitar solo performances in both the things you at the launch of the Tour of the Temple ( including the sea
> <|endofsent|>In close at the base , " black comedy video attended the scene in both the creation of the Simpsons , and the former residence of the epogue
> <|endofsent|>Its so the early Geah GQ song focuses on , she won in the bottom of the top of the top forty focuses on theThese ,
> <|endofsent|>In close at the edge , the drums – Me songwriting and the singing at first performed in an attempt at the afternoon of the verse ( reads
> <|endofsent|>In exposed early descriptions of the instrumental Album credits , video and production on the song in a rebellion on the 29 slopes of the evening theme line @


In [8]:
# big models (12 layers decoder, 12 layer encoder) - 1 epochs
# This cell decodes whatever `init_sent` currently holds; the label above only
# applies if the model-loading and sampling cells were re-run with that checkpoint.
for seq_idx in range(num_return_sequences):
    text_out = extended_enc.decode(init_sent[seq_idx, :max_length].tolist())
    print(">", text_out)

> <|endofsent|> however @-@ @- '@ 6 Reception<|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|>
> <|endofsent|> History ' history of the album<|endofsent|>.<|endofsent|> was plans<|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|>
> <|endofsent|> the @-@ to 20 @.<|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|><|endofsent|>
> <|endofsent|> in on the television @- "<|endofsent|><|endofsent|><|endofsent|><|endofsent|><|e

In [None]:
In September 2010 , a teaser website was revealed by Sega , hinting at a new Valkyria Chronicles game .

In [None]:
from transformers import GPT2LMHeadModel

In [None]:
# Load the pretrained Hugging Face GPT-2 (124M) and keep its state dict around
# for the inspection cells below.
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd_hf = model_hf.state_dict()

# List every parameter/buffer name with its shape.
for name, tensor in sd_hf.items():
    print(name, tensor.shape)

In [None]:
# First 20 values of the flattened position-embedding weight.
sd_hf["transformer.wpe.weight"].view(-1)[:20]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Render the whole position-embedding weight matrix as a grayscale image.
plt.imshow(sd_hf["transformer.wpe.weight"], cmap="gray")

In [None]:
# Plot three individual columns of the position-embedding matrix against the row index.
wpe = sd_hf["transformer.wpe.weight"]
plt.plot(wpe[:, 150])
plt.plot(wpe[:, 200])
plt.plot(wpe[:, 250])

In [None]:
# Grayscale view of the top-left 300x300 corner of the `transformer.h.1.attn.c_attn` weight.
plt.imshow(sd_hf["transformer.h.1.attn.c_attn.weight"][:300,:300], cmap="gray")

In [None]:
from transformers import pipeline, set_seed
# Reference generation through the Hugging Face text-generation pipeline.
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
prompt = "Hello, I'm a language model,"
generator(prompt, max_length=30, num_return_sequences=5)


In [None]:
# Same generation as the pipeline call, but with a hand-written top-k sampling loop.
import torch
from torch.nn import functional as F

# NOTE: this rebinds the notebook-level name `model` to the Hugging Face GPT-2.
model = GPT2LMHeadModel.from_pretrained("gpt2") # 124M
model.eval()
model.to('cuda')
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Token ids of the prompt "Hello, I'm a language model,"
tokens = torch.tensor([15496, 11, 314, 1101, 257, 3303, 2746, 11], dtype=torch.long) # (8,)
# one copy of the prompt per sequence to generate
tokens = tokens.unsqueeze(0).repeat(5, 1) # (5, 8)
x = tokens.to('cuda')

# Generate until every row has 30 tokens (max_length=30); no gradients needed.
with torch.no_grad():
    while x.size(1) < 30:
        # logits of the last position only
        last_logits = model(x)[0][:, -1, :] # (B, vocab_size)
        probs = F.softmax(last_logits, dim=-1)
        # keep the 50 most likely tokens (huggingface pipeline default)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1) # both (5, 50)
        # sample within the top-k; multinomial does not need the weights to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # translate the sampled slot into a vocabulary id
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # extend every sequence by its new token
        x = torch.cat((x, xcol), dim=1)

# Decode and print the generated sequences.
import tiktoken
enc = tiktoken.get_encoding('gpt2')
for i in range(5):
    tokens = x[i, :30].tolist()
    print(">", enc.decode(tokens))

In [None]:
# tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Read the whole file, keep the first 1,000 characters and preview the first 100.
with open('input.txt', 'r') as fh:
    text = fh.read()
data = text[:1000]
print(data[:100])

In [None]:
import tiktoken
# Tokenize the text excerpt with the GPT-2 encoding and show the first 24 token ids.
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode(data)
print(tokens[:24])

In [None]:
import torch
# Build one (B, T) input batch and its next-token targets from B*T + 1 tokens:
# the targets are the same tokens shifted left by one position.
B, T = 4, 6
buf = torch.tensor(tokens[:B * T + 1])
x = buf[:-1].view(B, T)
y = buf[1:].view(B, T)
print(x)
print(y)

In [None]:
# Compare the shapes of the output head and the token-embedding weight.
print(sd_hf["lm_head.weight"].shape)
print(sd_hf["transformer.wte.weight"].shape)

In [None]:
# Check whether the two weight tensors are equal in every element.
(sd_hf["lm_head.weight"] == sd_hf["transformer.wte.weight"]).all()

In [None]:
# Print the data pointers of both tensors; identical values mean they share the same memory.
print(sd_hf["lm_head.weight"].data_ptr())
print(sd_hf["transformer.wte.weight"].data_ptr())

In [None]:

# Residual-stream toy model: add n independent random contributions to a zero
# vector, each scaled by 1/sqrt(n), then look at the resulting standard deviation.
n = 100 # e.g. 100 layers
x = torch.zeros(768)
for _ in range(n):
    x += torch.randn(768) * n**-0.5

print(x.std())

In [None]:
import torch

# Minimal two-layer MLP used to compare full-batch and accumulated gradients.
# NOTE: the network is created before the seed is set, so its initial weights
# (and therefore the printed gradients) differ between kernel sessions.
net = torch.nn.Sequential(
    torch.nn.Linear(16, 32),
    torch.nn.GELU(),
    torch.nn.Linear(32, 1),
)
torch.random.manual_seed(42)
# a batch of 4 examples: inputs first, then targets
x, y = torch.randn(4, 16), torch.randn(4, 1)

# one full-batch forward/backward pass
net.zero_grad()
yhat = net(x)
loss = torch.nn.functional.mse_loss(yhat, y)
loss.backward()
print(net[0].weight.grad.view(-1)[:10])

# With the default reduction='mean' the objective is
# L = 1/4 * [
#            (y[0] - yhat[0])**2 +
#            (y[1] - yhat[1])**2 +
#            (y[2] - yhat[2])**2 +
#            (y[3] - yhat[3])**2
#           ]
# i.e. the sum of squared errors carries a 1/4 normalizer.

In [None]:
# Same gradient, now via gradient accumulation: 4 micro-steps with B=1.
# Calling backward() repeatedly ADDS gradients, which corresponds to SUMMING
# the per-example losses:
#   L = L0 + L1 + L2 + L3,   with Li = (y[i] - yhat[i])**2
# The mean over the batch (the 1/4 "normalizer") is therefore lost unless each
# micro-step loss is scaled by 1/4 before its backward pass.
net.zero_grad()
for xi, yi in zip(x, y):
    yhat = net(xi)
    # scale by the number of accumulation steps to restore the mean
    loss = torch.nn.functional.mse_loss(yhat, yi) / 4
    loss.backward()
print(net[0].weight.grad.view(-1)[:10])


In [None]:
# parse and visualize the logfile
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

sz = "124M"

# Reference numbers for the selected model size. `.get` returns None when a
# table has no entry for `sz`, in which case the matching reference line is
# simply not drawn (a plain `[sz]` lookup would raise KeyError instead).
loss_baseline = {
    "124M": 3.2924,
}.get(sz)
hella2_baseline = { # HellaSwag for GPT-2
    "124M": 0.294463,
    "350M": 0.375224,
    "774M": 0.431986,
    "1558M": 0.488946,
}.get(sz)
hella3_baseline = { # HellaSwag for GPT-3
    "124M": 0.337,
    "350M": 0.436,
    "774M": 0.510,
    "1558M": 0.547,
}.get(sz)

# load the log file
with open("log124M_40B/log.txt", "r") as f:
    lines = f.readlines()

# parse the individual lines ("<step> <stream> <value>"), grouped by stream
# name; the streams read below are "train", "val" and "hella"
streams = {}
for line in lines:
    fields = line.split()
    if not fields:
        continue # tolerate blank lines (e.g. a trailing newline at end of file)
    step, stream, val = fields
    streams.setdefault(stream, {})[int(step)] = float(val)

# convert each stream from {step: val} to (steps, vals), sorted by step,
# so it's easier for plotting
streams_xy = {}
for k, v in streams.items():
    xy = sorted(v.items())
    # unpack the list of (step, val) tuples into a pair of sequences
    streams_xy[k] = list(zip(*xy))

# create figure
plt.figure(figsize=(16, 6))

# Panel 1: losses: both train and val
plt.subplot(121)
xs, ys = streams_xy["train"] # training loss
ys = np.array(ys)
plt.plot(xs, ys, label=f'nanogpt ({sz}) train loss')
print("Min Train Loss:", min(ys))
xs, ys = streams_xy["val"] # validation loss
plt.plot(xs, ys, label=f'nanogpt ({sz}) val loss')
# horizontal line at GPT-2 baseline
if loss_baseline is not None:
    plt.axhline(y=loss_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) checkpoint val loss")
plt.xlabel("steps")
plt.ylabel("loss")
plt.yscale('log')
plt.ylim(top=4.0)
plt.legend()
plt.title("Loss")
# `ys` still holds the validation stream here
print("Min Validation Loss:", min(ys))

# Panel 2: HellaSwag eval
plt.subplot(122)
xs, ys = streams_xy["hella"] # HellaSwag eval
ys = np.array(ys)
plt.plot(xs, ys, label=f"nanogpt ({sz})")
# horizontal lines at the GPT-2 / GPT-3 baselines, when known for this size
if hella2_baseline is not None:
    plt.axhline(y=hella2_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) checkpoint")
if hella3_baseline is not None:
    plt.axhline(y=hella3_baseline, color='g', linestyle='--', label=f"OpenAI GPT-3 ({sz}) checkpoint")
plt.xlabel("steps")
plt.ylabel("accuracy")
plt.legend()
plt.title("HellaSwag eval")
print("Max Hellaswag eval:", max(ys))