In [19]:
!wget https://raw.githubusercontent.com/mount40/zmm-blabber/refs/heads/main/data/zmm.txt

--2025-07-22 12:12:32--  https://raw.githubusercontent.com/mount40/zmm-blabber/refs/heads/main/data/zmm.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 793748 (775K) [text/plain]
Saving to: ‘zmm.txt.1’


2025-07-22 12:12:33 (19.2 MB/s) - ‘zmm.txt.1’ saved [793748/793748]



In [20]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(28)

# hyperparameters
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 300
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
eval_iters = 200
n_embed = 384
n_head = 6
n_layer = 6
dropout = 0.2

cuda


In [21]:
with open("zmm.txt", "r") as f:
  text = f.read()

print("dataset length: ", len(text))

dataset length:  793748


In [22]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(vocab_size)

stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [ stoi[c] for c in s ] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([ itos[i] for i in l ]) # decoder: take a list of integers, output a string

80


In [23]:
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [24]:
def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i + block_size] for i in ix])
  y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y

In [25]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

In [26]:
class AttentionHead(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    wei = q @ k.transpose(-2, -1) * C ** -0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    v = self.value(x)
    out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
    return out

In [27]:
class MultiHeadAttentionHead(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embed, n_embed)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

In [28]:
class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embed, 4 * n_embed),
      nn.ReLU(),
      nn.Linear(4 * n_embed, n_embed),
      nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

In [29]:
# grouping attention multi-heads with feed-forward nets, so that we can have both more multi-heads and a corresponding feed-forward net for each of the heads tightly grouped together
class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.attention_heads = MultiHeadAttentionHead(n_head, head_size)
    self.feed_forward_net = FeedForward(n_embed)
    self.layer_norm1 = nn.LayerNorm(n_embed)
    self.layer_norm2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    x = x + self.attention_heads(self.layer_norm1(x))
    x = x + self.feed_forward_net(self.layer_norm2(x))
    return x

In [30]:
class GPTLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
    self.layer_norm = nn.LayerNorm(n_embed) # final layer norm
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape

    # idx and targets are both (B, T) tensor of integers
    token_embeddings = self.token_embedding_table(idx) # (B, T, C)
    pos_embeddings = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    x = token_embeddings + pos_embeddings # (B, T, C)
    x = self.blocks(x) # (B,T,C)
    x = self.layer_norm(x) # (B,T,C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx


In [31]:
model = BigramLanguageModel()
m = model.to(device)

In [32]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [33]:
for iter in range(max_iters):
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  if iter % eval_interval == 0 or iter == max_iters - 1:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train')

  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

step 0: train loss 4.5447, val loss 4.5410
step 0: train loss 4.5446, val loss 4.5416
step 300: train loss 2.3417, val loss 2.3363
step 300: train loss 2.3437, val loss 2.3359
step 600: train loss 1.7810, val loss 1.8114
step 600: train loss 1.7837, val loss 1.8129
step 900: train loss 1.5223, val loss 1.5825
step 900: train loss 1.5242, val loss 1.5830
step 1200: train loss 1.3843, val loss 1.4789
step 1200: train loss 1.3863, val loss 1.4785
step 1500: train loss 1.2986, val loss 1.4235
step 1500: train loss 1.2964, val loss 1.4229
step 1800: train loss 1.2306, val loss 1.3925
step 1800: train loss 1.2313, val loss 1.3912
step 2100: train loss 1.1684, val loss 1.3733
step 2100: train loss 1.1691, val loss 1.3754
step 2400: train loss 1.1149, val loss 1.3772
step 2400: train loss 1.1164, val loss 1.3749
step 2700: train loss 1.0670, val loss 1.3824
step 2700: train loss 1.0672, val loss 1.3819
step 3000: train loss 1.0096, val loss 1.4004
step 3000: train loss 1.0109, val loss 1.4051


In [35]:
# generate
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))


says, ``South no one would acepl't this not stub to this way...not
completely its overall...to understand its quite somewhat down anyway....with
auside...what good of classically to who had identified it for many
killeing times I've individual practical our world you know now that produced
the machine. Chris is always kill about what he said that Phdrus saw in
the most doors of the kids better. Mathematics knew who occur with no graduation who
has worked one's way of contains through the cottonwooks is problem.
It's an important now to talk to still back and it for Jus looking from
whether Nature to Phdrus that trunk is property. It's growing and work on it with
him, hold occur to original and it should be known to inca what they have been
having a little boy damn agot, much not to send with many particular
with both of a motorcycle maintenance be one-zero. Only church at has
a needed by the nineteen-that of the machine. The maalted is cross. Few
in reason. I've simple with the second