In [None]:
from transformers import GPT2LMHeadModel

In [None]:
# Load the pretrained "gpt2" checkpoint and keep a handle on its state dict,
# which later cells index by parameter name.
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd_hf = model_hf.state_dict()

# Walk the state dict in its stored order, printing each parameter name
# next to the shape of its tensor.
for k in sd_hf:
    v = sd_hf[k]
    print(k, v.shape)

In [None]:
# Flatten `transformer.wpe.weight` to 1-D and display its first 20 values.
wpe_flat = sd_hf["transformer.wpe.weight"].view(-1)
wpe_flat[:20]

In [None]:
import matplotlib.pyplot as plt

# Draw the full `transformer.wpe.weight` matrix as a grayscale image.
wpe = sd_hf["transformer.wpe.weight"]
plt.imshow(wpe, cmap="gray")
plt.show()

In [None]:
# Overlay three individual columns (150, 200, 250) of `transformer.wpe.weight`
# on a single set of axes; each curve runs over the rows of the matrix.
for col in (150, 200, 250):
    plt.plot(sd_hf["transformer.wpe.weight"][:, col])
plt.show()

In [None]:
# Grayscale view of the first 300 rows and first 300 columns of
# `transformer.h.1.attn.c_attn.weight`.
attn_block = sd_hf["transformer.h.1.attn.c_attn.weight"][:300, :300]
plt.imshow(attn_block, cmap="gray")
plt.show()

In [None]:
from transformers import pipeline, set_seed

# Build a text-generation pipeline backed by the 'gpt2' model.
generator = pipeline('text-generation', model='gpt2')

# Seed immediately before generating so the sampled continuations start
# from a fixed random state.
set_seed(42)

# Request five continuations of the prompt, each capped at max_length=30.
prompt = "Hello, I'm a language model,"
generator(prompt, max_length=30, num_return_sequences=5)

In [None]:
# Residual-stream std demo.
# Adding n independent unit-variance vectors would give a sum whose std is
# about sqrt(n). Scaling every contribution by n**-0.5 cancels that growth,
# so the printed std should come out close to 1.
import torch

n = 100  # layers
# Private, seeded generator: the printed value is the same on every run and
# the global torch RNG state used by other cells is left untouched.
rng = torch.Generator().manual_seed(42)

x = torch.zeros(768)
for i in range(n):
    # One scaled "layer" contribution added into the running sum.
    x += n**-0.5 * torch.randn(768, generator=rng)

print(x.std())

In [None]:
import math
import matplotlib.pyplot as plt

class LRScheduler:
    """Learning-rate schedule: linear warmup, cosine decay, then a constant floor.

    Args:
        max_lr: peak rate; reached on the last warmup step and at the start
            of the decay phase.
        min_lr: rate returned for every step at or beyond ``max_steps``.
        warmup_steps: number of initial steps spent on the linear ramp.
        max_steps: step index at which the cosine decay phase ends.
    """

    def __init__(self, max_lr, min_lr, warmup_steps, max_steps):
        self.max_lr, self.min_lr = max_lr, min_lr
        self.warmup_steps, self.max_steps = warmup_steps, max_steps

    def get_lr(self, step):
        """Return the learning rate for the given 0-based ``step``."""
        # Warmup: ramp linearly; the +1 makes step 0 yield max_lr / warmup_steps
        # rather than zero.
        if step < self.warmup_steps:
            return self.max_lr * (step + 1) / self.warmup_steps

        # Decay: half-cosine from max_lr (progress 0) towards min_lr (progress 1).
        if step < self.max_steps:
            span = self.max_steps - self.warmup_steps
            progress = (step - self.warmup_steps) / span
            coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
            return self.min_lr + coeff * (self.max_lr - self.min_lr)

        # Beyond the schedule: hold at the floor.
        return self.min_lr

In [None]:
# Example schedule: peak 0.5, floor 0.1, 100 warmup steps, decay ending at step 1000.
lr_sched = LRScheduler(max_lr=0.5, min_lr=0.1, warmup_steps=100, max_steps=1000)

# Evaluate 1100 steps so the curve also shows the flat tail after max_steps.
X = list(range(1100))
Y = list(map(lr_sched.get_lr, X))

plt.plot(X, Y)
plt.show()