# Dissecting Transformers

The first step for us is to run `python3 data/nlab/prepare.py` after having cloned the `nlab-content` submodule. This will create a file `data/nlab/input.md` of roughly 91 MiB.

In [1]:
with open("../data/nlab/input.md", "r", encoding="utf-8") as fd:
    text = fd.read()

print(len(text))

95159406


In [2]:
vocab = list(sorted(list(set(text))))
vocab_size = len(vocab)
print("".join(vocab))
print(vocab_size)

	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ §¨«­¯°²³´µ·¹»ÀÁÄÅÆÉÎÓÖ×ØÜßàáâãäåæçèéêëìíîïñòóôöøùúûüýĀāăąĆćĈČčđĕęěğīĭİıķŁłńōőŒœŗřŚśŝŞşŠšţťūűŻżŽžſșțȩɐɪʰʲʹʼˆˈ̧̣̀́̂̃̄̈̌͡ΑΒΓΔΕΘΛΜΠΣΦΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύϑϒϕϖϵАБВГДЕЖЗИКЛМНОПРСТУФХШЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёіћᵒᵖᵢᶜṣṬỳἀἐἓἕἢἣἰὁὄὅὐὑὗὰὲὴὶὸῃῆῇῖῦῶῷ ​‌‍‎‐‑–—‘’“”„†•…  ′⁻ⁿ₀₁₂₄₆₇₈₉ℂℋℓ№ℚℤΩℵ⅋Ⅱ←→↦⇓⇔⇸∀∂∈−∗√∞∧∼≅≈≠≡≤≥≺⊂⊗⋮─◦♧♭✄【】のオダネノブヨ下五何信分夫学山幾式引形微德徹志数方李村田程空米系經经群蕉论谷豊辻道郎间香ﬀﬁﬂﬃ
479


In [3]:
# import tiktoken

# enc = tiktoken.get_encoding("gpt2")
# enc.n_vocab
# t = enc.encode(text[:50])
# print(t)
# print(enc.decode(t))
ctoi = {ch: i for i, ch in enumerate(vocab)}
itoc = {i: ch for i, ch in enumerate(vocab)}

encode = lambda s: [ctoi[c] for c in s]
decode = lambda v: "".join([itoc[i] for i in v])

print(encode(text[:50]))

[1, 13, 15, 15, 2, 93, 28, 2, 16, 84, 75, 73, 74, 86, 42, 67, 80, 70, 53, 75, 70, 71, 95, 1, 13, 15, 15, 2, 93, 28, 2, 16, 86, 81, 69, 2, 16, 69, 78, 75, 69, 77, 38, 81, 89, 80, 2, 86, 67, 68]


In [4]:
import torch

data = torch.tensor(encode(text), dtype=torch.long, device="cuda")
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([95159406]) torch.int64
tensor([ 1, 13, 15, 15,  2, 93, 28,  2, 16, 84, 75, 73, 74, 86, 42, 67, 80, 70,
        53, 75, 70, 71, 95,  1, 13, 15, 15,  2, 93, 28,  2, 16, 86, 81, 69,  2,
        16, 69, 78, 75, 69, 77, 38, 81, 89, 80,  2, 86, 67, 68, 75, 80, 70, 71,
        90, 31,  4, 18,  4, 95,  1,  5,  5,  5,  2, 37, 81, 80, 86, 71, 90, 86,
         1,  5,  5,  5,  5,  2, 50, 74, 75, 78, 81, 85, 81, 82, 74, 91,  1, 13,
        15, 15,  2, 93, 28,  2, 16, 74, 75, 70, 71, 95,  1, 61, 61,  3, 75, 80,
        69, 78, 87, 70, 71,  2, 82, 74, 75, 78, 81, 85, 81, 82, 74, 91,  2, 15,
         2, 69, 81, 80, 86, 71, 80, 86, 85, 63, 63,  1, 31, 15, 15,  1, 31, 15,
        15,  1, 31, 15, 15,  1,  1,  5, 37, 81, 80, 86, 71, 80, 86, 85,  5,  1,
        12,  2, 86, 67, 68, 78, 71,  2, 81, 72,  2, 69, 81, 80, 86, 71, 80, 86,
        85,  1, 93, 28, 86, 81, 69, 95,  1,  1,  5,  5,  2, 43, 70, 71, 67,  1,
         1, 10, 16, 16, 16, 11,  1,  1, 35,  2, 82, 81, 75, 80, 86,  2, 81, 72,
     

In [5]:
n = int(0.9 * data.shape[0])
train_data = data[:n]
val_data = data[n:]
print(train_data.shape, val_data.shape)

torch.Size([85643465]) torch.Size([9515941])


In [6]:
context_length = 8
train_data[: context_length + 1]

tensor([ 1, 13, 15, 15,  2, 93, 28,  2, 16], device='cuda:0')

In [7]:
x = train_data[:context_length]
y = train_data[1 : context_length + 1]
for t in range(context_length):
    context = x[: t + 1]
    target = y[t]
    print(f"When input is {context} the target: {target}")

When input is tensor([1], device='cuda:0') the target: 13
When input is tensor([ 1, 13], device='cuda:0') the target: 15
When input is tensor([ 1, 13, 15], device='cuda:0') the target: 15
When input is tensor([ 1, 13, 15, 15], device='cuda:0') the target: 2
When input is tensor([ 1, 13, 15, 15,  2], device='cuda:0') the target: 93
When input is tensor([ 1, 13, 15, 15,  2, 93], device='cuda:0') the target: 28
When input is tensor([ 1, 13, 15, 15,  2, 93, 28], device='cuda:0') the target: 2
When input is tensor([ 1, 13, 15, 15,  2, 93, 28,  2], device='cuda:0') the target: 16


In [8]:
torch.manual_seed(303)

batch_size = 4
context_length = 8


def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - context_length, (batch_size,))
    x = torch.stack([data[i : i + context_length] for i in ix])
    y = torch.stack([data[i + 1 : i + 1 + context_length] for i in ix])
    x, y = x.to("cuda"), y.to("cuda")
    return x, y


xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
# print(xb)
print("targets:")
print(yb.shape)
# print(yb)

print("---")

# for b in range(batch_size):
#     for t in range(context_length):
#         context = xb[b, :t+1]
#         target = yb[b, t]
#         print(f"When input is {context} the target: {target}")

inputs:
torch.Size([4, 8])
targets:
torch.Size([4, 8])
---


In [9]:

print("---")

# for b in range(batch_size):
#     for t in range(context_length):
#         context = xb[b, :t+1]
#         target = yb[b, t]
#         print(f"When input is {context} the target: {target}")

---


In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(303)

class BigramLM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size, device="cuda")

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        B, T , C = logits.shape
        if targets is None:
            loss = None
        else:
            l = logits.view(B * T, C)
            l = l.to("cuda")
            targets = targets.view(B * T)
            targets = targets.to("cuda")
            loss = F.cross_entropy(l, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

m = BigramLM(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss)

print(-torch.log(torch.tensor([1/479])))

torch.Size([4, 8, 479]) 

tensor(6.9902, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6.1717])


In [11]:
decode(m.generate(torch.zeros((1, 1), dtype=torch.long, device="cuda"), max_new_tokens=100)[0].tolist())

'\tàè←Ωὑœ8ïæ豊Àżヨˆνʹœ方ı³éỳ¹ßù\u2028₀‐&А【̌lò李őв−̀Æ─уVł»д系ÄI✄ù♧ићłiOἐ⁻\u200c♧ЗTНêыяΩ幾НŠÅ·Aé^äű五\u200d五∼cŒïī§№ёâr≅)%∗²Zcγέ'

In [12]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-4)

In [13]:
batch_size = 32
for steps in range(1000):
    xb, yb = get_batch("train")

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

torch.save(m.state_dict(), "model.bin")

6.481244087219238


In [14]:
decode(m.generate(torch.zeros((1, 1), dtype=torch.long, device="cuda"), max_new_tokens=100)[0].tolist())

'\th式и₁D山Å̈志ῶUо»₀”5βΑᶜℓńⅡ¨эﬀ五ě6\n9έюὲᵢν/·ЕG蕉道ϖ∂МĆвù♧äʲж\xadÀдἰαμęŞ¨$όπΩ‑ϕp–Ε%\u202fγネ́ʲýяåŗzп◦∧◦ôř_ˈșБ†θψ∂е̂ῷ$オﬀ'

### The mathematical trick in self-attention

In [15]:
torch.manual_seed(303)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape
x = x.to("cuda")

In [16]:
xbow = torch.zeros((B, T, C), device="cuda")
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [17]:
x[0]

tensor([[ 0.2761, -0.3973],
        [-2.1947,  0.4953],
        [ 0.0597,  0.2285],
        [ 0.1095, -1.1835],
        [ 0.0354,  0.3497],
        [ 0.9917,  0.2692],
        [-0.0558,  0.4478],
        [ 1.3278, -1.7514]], device='cuda:0')

In [18]:
xbow[0]

tensor([[ 0.2761, -0.3973],
        [-0.9593,  0.0490],
        [-0.6196,  0.1089],
        [-0.4374, -0.2142],
        [-0.3428, -0.1015],
        [-0.1204, -0.0397],
        [-0.1112,  0.0300],
        [ 0.0687, -0.1927]], device='cuda:0')

In [19]:
torch.manual_seed(303)

a = torch.tril(torch.ones(3, 3, device="cuda"))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2), device="cuda").float()
c = a @ b

print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]], device='cuda:0')
tensor([[4., 8.],
        [1., 9.],
        [3., 4.]], device='cuda:0')
tensor([[4.0000, 8.0000],
        [2.5000, 8.5000],
        [2.6667, 7.0000]], device='cuda:0')


In [20]:
wei = torch.tril(torch.ones(T, T, device="cuda"))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x
print(xbow[0])
print(xbow2[0])

tensor([[ 0.2761, -0.3973],
        [-0.9593,  0.0490],
        [-0.6196,  0.1089],
        [-0.4374, -0.2142],
        [-0.3428, -0.1015],
        [-0.1204, -0.0397],
        [-0.1112,  0.0300],
        [ 0.0687, -0.1927]], device='cuda:0')
tensor([[ 0.2761, -0.3973],
        [-0.9593,  0.0490],
        [-0.6196,  0.1089],
        [-0.4374, -0.2142],
        [-0.3428, -0.1015],
        [-0.1204, -0.0397],
        [-0.1112,  0.0300],
        [ 0.0687, -0.1927]], device='cuda:0')


In [21]:
torch.set_default_device("cuda")

In [22]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [23]:
n_embed = 32


class NewLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(context_length, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device="cuda"))
        x = tok_emb + pos_emb
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:

            B, T , C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [24]:
model = NewLM()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [25]:
batch_size = 32
for steps in range(1000):
    xb, yb = get_batch("train")

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

3.022827625274658


In [26]:
torch.manual_seed(303)

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)


head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)

wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril - torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v

out

tensor([[[ 7.6557e-01,  7.6792e-02, -3.5116e-02, -1.7187e-01, -4.2298e-01,
          -1.0044e+00,  2.1866e-01, -2.1635e-02, -4.3660e-01, -1.6724e-01,
          -3.1453e-01, -5.1809e-02,  1.2839e-01,  5.0777e-02,  4.5718e-01,
          -2.6916e-02],
         [ 3.2715e-01, -4.8458e-01,  1.3244e-01, -1.4200e-01, -3.5120e-01,
           1.3434e-02, -3.2762e-02,  9.7101e-02,  9.1106e-01, -1.9292e-01,
          -2.6209e-01,  5.2649e-01, -3.6265e-01,  5.4872e-01,  2.3383e-01,
          -4.8467e-01],
         [ 2.5620e-01, -2.7523e-01,  2.2497e-01, -3.7015e-02, -3.0671e-01,
          -8.5953e-02,  1.6372e-01,  8.8102e-02,  7.4853e-01, -1.3400e-01,
          -2.2559e-01,  3.7459e-01, -1.2368e-01,  2.9392e-01,  1.9261e-01,
          -3.8742e-01],
         [-4.8532e-01,  6.3573e-01,  8.9195e-01,  6.2057e-01, -4.6279e-04,
           9.8741e-03,  1.2114e+00,  1.1948e-01,  6.5533e-01,  2.4530e-01,
           1.6259e-02, -1.7245e-01,  1.0460e+00, -9.2361e-01, -2.0221e-01,
          -1.1559e-01],
    

Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating

In [27]:
import torchinfo

torchinfo.summary(model, input_size=(batch_size, context_length), dtypes=[torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
NewLM                                    [32, 8, 479]              --
├─Embedding: 1-1                         [32, 8, 32]               15,328
├─Embedding: 1-2                         [8, 32]                   256
├─Linear: 1-3                            [32, 8, 479]              15,807
Total params: 31,391
Trainable params: 31,391
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 1.00
Input size (MB): 0.00
Forward/backward pass size (MB): 1.05
Params size (MB): 0.13
Estimated Total Size (MB): 1.18

In [28]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/experiment_1")
writer.add_graph(model, (xb, yb))
writer.close()

In [33]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5
print(k.var(), q.var(), wei.var())

tensor(0.9226, device='cuda:0') tensor(0.9878, device='cuda:0') tensor(0.9770, device='cuda:0')


In [35]:
F.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872], device='cuda:0')

In [39]:
F.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*80, dim=-1)

tensor([1.2664e-14, 4.7809e-25, 1.1254e-07, 4.7809e-25, 1.0000e+00],
       device='cuda:0')

In [167]:
batch_size = 64
context_length = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iters = 200
n_embed = 384
head_size = n_embed
n_layer = 6
n_head = 6
dropout = 0.2

In [168]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()

        self.head_size = head_size

        self.c_attn = nn.Linear(n_embed, 3 * head_size, bias=False)
        # self.key = nn.Linear(n_embed, head_size, bias=False)
        # ...
        self.register_buffer("tril", torch.tril(torch.ones(context_length, context_length)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape

        k, q, v = self.c_attn(x).split(self.head_size, dim=-1)

        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        return wei @ v

In [169]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.proj(torch.cat([h(x) for h in self.heads], dim=-1)))

In [170]:
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.GELU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [171]:
class Block(nn.Module):
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [183]:
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(context_length, n_embed)
        self.blocks = nn.Sequential(
            *[Block(n_embed, n_head=n_head) for _ in range(n_layer)],
            nn.LayerNorm(n_embed),
        )
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device="cuda"))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_length:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [184]:
torch.manual_seed(303)

model = Decoder()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [185]:
torchinfo.summary(model, input_size=(batch_size, context_length), dtypes=[torch.long])

Layer (type:depth-idx)                        Output Shape              Param #
Decoder                                       [64, 256, 479]            --
├─Embedding: 1-1                              [64, 256, 384]            183,936
├─Embedding: 1-2                              [256, 384]                98,304
├─Sequential: 1-3                             [64, 256, 384]            --
│    └─Block: 2-1                             [64, 256, 384]            --
│    │    └─LayerNorm: 3-1                    [64, 256, 384]            768
│    │    └─MultiHeadAttention: 3-2           [64, 256, 384]            590,208
│    │    └─LayerNorm: 3-3                    [64, 256, 384]            768
│    │    └─FeedForward: 3-4                  [64, 256, 384]            1,181,568
│    └─Block: 2-2                             [64, 256, 384]            --
│    │    └─LayerNorm: 3-5                    [64, 256, 384]            768
│    │    └─MultiHeadAttention: 3-6           [64, 256, 384]           

In [186]:
import time
import tqdm

torch.cuda.synchronize()
start = time.time()

for steps in tqdm.tqdm(range(30000)):
    xb, yb = get_batch("train")

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if steps % eval_interval == 0:
        print(f"Step {steps} ({time.time() - start:.5f}s):", loss.item())

print("Done!")
torch.save(model.state_dict(), 'model_weights.bin')

  0%|          | 2/30000 [00:00<2:30:38,  3.32it/s]

Step 0 (0.50417s): 6.344509124755859


  2%|▏         | 502/30000 [01:11<1:09:01,  7.12it/s]

Step 500 (71.46360s): 2.161691665649414


  3%|▎         | 1002/30000 [02:22<1:09:55,  6.91it/s]

Step 1000 (142.61163s): 1.6431292295455933


  5%|▌         | 1502/30000 [03:33<1:07:07,  7.08it/s]

Step 1500 (213.68534s): 1.387339472770691


  7%|▋         | 2002/30000 [04:44<1:07:48,  6.88it/s]

Step 2000 (284.81282s): 1.3330507278442383


  8%|▊         | 2502/30000 [05:55<1:04:51,  7.07it/s]

Step 2500 (355.77598s): 1.315747857093811


 10%|█         | 3002/30000 [07:06<1:04:36,  6.96it/s]

Step 3000 (426.84542s): 1.1891156435012817


 12%|█▏        | 3502/30000 [08:17<1:02:20,  7.08it/s]

Step 3500 (497.82691s): 1.1492207050323486


 13%|█▎        | 4002/30000 [09:28<1:01:25,  7.05it/s]

Step 4000 (568.82868s): 1.0552691221237183


 15%|█▌        | 4502/30000 [10:40<1:01:05,  6.96it/s]

Step 4500 (639.96398s): 1.131295919418335


 17%|█▋        | 5002/30000 [11:51<59:12,  7.04it/s]  

Step 5000 (711.12034s): 1.13670015335083


 18%|█▊        | 5502/30000 [13:02<57:51,  7.06it/s]  

Step 5500 (782.17145s): 0.9943945407867432


 20%|██        | 6002/30000 [14:12<56:28,  7.08it/s]

Step 6000 (852.78347s): 1.0037243366241455


 22%|██▏       | 6502/30000 [15:23<55:20,  7.08it/s]

Step 6500 (923.37316s): 0.9757031798362732


 23%|██▎       | 7002/30000 [16:34<54:14,  7.07it/s]

Step 7000 (993.90621s): 0.9410073161125183


 25%|██▌       | 7502/30000 [17:44<52:59,  7.08it/s]

Step 7500 (1064.51814s): 1.0116395950317383


 27%|██▋       | 8002/30000 [18:55<51:26,  7.13it/s]

Step 8000 (1135.10367s): 0.9385262131690979


 28%|██▊       | 8502/30000 [20:05<50:40,  7.07it/s]

Step 8500 (1205.67418s): 0.9700348377227783


 30%|███       | 9002/30000 [21:16<49:28,  7.07it/s]

Step 9000 (1276.32483s): 0.930557131767273


 32%|███▏      | 9502/30000 [22:27<48:16,  7.08it/s]

Step 9500 (1347.05130s): 0.9374202489852905


 33%|███▎      | 10002/30000 [23:38<48:21,  6.89it/s]

Step 10000 (1418.25959s): 0.9643851518630981


 35%|███▌      | 10502/30000 [24:50<46:13,  7.03it/s]

Step 10500 (1490.07531s): 0.9492642879486084


 37%|███▋      | 11002/30000 [26:01<45:51,  6.91it/s]

Step 11000 (1561.04584s): 0.9054818153381348


 38%|███▊      | 11502/30000 [27:12<43:48,  7.04it/s]

Step 11500 (1631.97647s): 0.9225693345069885


 40%|████      | 12002/30000 [28:23<42:16,  7.09it/s]

Step 12000 (1702.89901s): 0.9230242371559143


 42%|████▏     | 12502/30000 [29:33<41:57,  6.95it/s]

Step 12500 (1773.59209s): 0.9109727144241333


 43%|████▎     | 13002/30000 [30:44<40:10,  7.05it/s]

Step 13000 (1844.85536s): 0.8432388305664062


 45%|████▌     | 13502/30000 [31:56<38:58,  7.06it/s]

Step 13500 (1916.09841s): 0.8858431577682495


 47%|████▋     | 14002/30000 [33:07<37:43,  7.07it/s]

Step 14000 (1987.27144s): 0.8960751295089722


 48%|████▊     | 14502/30000 [34:18<36:38,  7.05it/s]

Step 14500 (2058.20129s): 0.8274625539779663


 50%|█████     | 15002/30000 [35:29<35:26,  7.05it/s]

Step 15000 (2129.45701s): 0.9176118969917297


 52%|█████▏    | 15502/30000 [36:40<34:18,  7.04it/s]

Step 15500 (2200.77844s): 0.89134281873703


 53%|█████▎    | 16002/30000 [37:52<33:10,  7.03it/s]

Step 16000 (2272.04636s): 0.8564547300338745


 55%|█████▌    | 16502/30000 [39:03<31:50,  7.06it/s]

Step 16500 (2343.26157s): 0.8203096389770508


 57%|█████▋    | 17002/30000 [40:14<31:15,  6.93it/s]

Step 17000 (2414.51778s): 0.8547755479812622


 58%|█████▊    | 17502/30000 [41:25<29:31,  7.06it/s]

Step 17500 (2485.83381s): 0.8248926997184753


 60%|██████    | 18002/30000 [42:37<28:33,  7.00it/s]

Step 18000 (2557.13467s): 0.8240297436714172


 62%|██████▏   | 18502/30000 [43:48<27:10,  7.05it/s]

Step 18500 (2628.15525s): 0.8590502738952637


 63%|██████▎   | 19002/30000 [44:59<26:01,  7.04it/s]

Step 19000 (2698.98448s): 0.8413810729980469


 65%|██████▌   | 19502/30000 [46:10<24:59,  7.00it/s]

Step 19500 (2769.90259s): 0.8830087780952454


 67%|██████▋   | 20002/30000 [47:21<23:44,  7.02it/s]

Step 20000 (2841.28588s): 0.8501855134963989


 68%|██████▊   | 20502/30000 [48:32<22:46,  6.95it/s]

Step 20500 (2912.72717s): 0.8488247394561768


 70%|███████   | 21002/30000 [49:44<21:35,  6.94it/s]

Step 21000 (2984.21906s): 0.8354865908622742


 72%|███████▏  | 21502/30000 [50:56<20:18,  6.97it/s]

Step 21500 (3055.93455s): 0.8228123784065247


 73%|███████▎  | 22002/30000 [52:07<19:02,  7.00it/s]

Step 22000 (3127.24859s): 0.8734666109085083


 75%|███████▌  | 22502/30000 [53:18<17:58,  6.96it/s]

Step 22500 (3198.53531s): 0.8536523580551147


 77%|███████▋  | 23002/30000 [54:30<17:02,  6.84it/s]

Step 23000 (3270.12337s): 0.7809343338012695


 78%|███████▊  | 23502/30000 [55:41<15:24,  7.03it/s]

Step 23500 (3341.57679s): 0.8509016633033752


 80%|████████  | 24002/30000 [56:53<14:09,  7.06it/s]

Step 24000 (3413.02210s): 0.8154550194740295


 82%|████████▏ | 24502/30000 [58:04<12:59,  7.05it/s]

Step 24500 (3484.42279s): 0.8397276997566223


 83%|████████▎ | 25002/30000 [59:15<11:52,  7.02it/s]

Step 25000 (3555.84041s): 0.8444985747337341


 85%|████████▌ | 25502/30000 [1:00:27<10:37,  7.06it/s]

Step 25500 (3627.22391s): 0.8755136728286743


 87%|████████▋ | 26002/30000 [1:01:38<09:26,  7.06it/s]

Step 26000 (3698.38900s): 0.8017162680625916


 88%|████████▊ | 26502/30000 [1:02:49<08:14,  7.08it/s]

Step 26500 (3769.63526s): 0.8414490818977356


 90%|█████████ | 27002/30000 [1:04:01<07:09,  6.98it/s]

Step 27000 (3840.88077s): 0.7843167781829834


 92%|█████████▏| 27502/30000 [1:05:12<06:00,  6.92it/s]

Step 27500 (3912.06176s): 0.8364068865776062


 93%|█████████▎| 28002/30000 [1:06:23<04:45,  6.99it/s]

Step 28000 (3983.18835s): 0.778032660484314


 95%|█████████▌| 28502/30000 [1:07:34<03:32,  7.04it/s]

Step 28500 (4054.45079s): 0.8168056011199951


 97%|█████████▋| 29002/30000 [1:08:45<02:20,  7.09it/s]

Step 29000 (4125.67949s): 0.7425625920295715


 98%|█████████▊| 29502/30000 [1:09:57<01:11,  7.00it/s]

Step 29500 (4196.96880s): 0.8318692445755005


100%|██████████| 30000/30000 [1:11:08<00:00,  7.03it/s]


Done!


In [None]:
with open('document.md', 'w', encoding="utf-8") as fd:
    fd.write(decode(model.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=2048)[0].tolist()))

'\twhite the $p\\right$ obtainservative $\\mathrm{XVBRatS}^c$ is a braid as Hegeland, a quasitopous choose $Y^{\\mathrm{op}}$ of [[point-arrows]] is denoted $[S^{\\pm}], \n\n* which determines the corresponding pullbacks of [[universal projective orbits|universal projective modules]] over $X$ which is denoted by any $\\oversetext.\n\n### Whatever the universal property\n\nThe [[arrow separations are preserved by [[axioms]]] of the [[irreflexive category]]. (However, the Grothendieck-Theoretical orbits is very arguable abstract for detail: \n\n$\\begin{defin}\n  \\gircoloneqq\n  \\begin{proof}\n  \\array{\n     F_q \\approx{{\\elim_{i \\phi}}^\\deg(\\frac{1}{2}\\dot ( iu\\tor)^p )} j\n      {\\delta( \\chi_{n+1}^\\ddeg( i \\to j \\underlying_{\\phi}^\\deg))$\n      \\\\\n      \\end{aligned}\n     \\\\\n    F^n G &\\coloneqq& \\left\\{ i \\iota_J, j\\right\\}_{i,j} J)\n    \\;\\coloneqq\\;\n  \\Big(\n     \\Sigma_J(U) \\big[ I, \\chi_i\n    \\Big[ i \\tfrac{i}{\\phi}}\\big)\n  \\Big)\n 