# Set Up

- load the data
- create character-level tokens (map each character to an integer)
- create train / validation splits


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed PyTorch's global random number generator.
torch.manual_seed(1337)
# Download the tinyshakespeare corpus; wget saves it as input.txt in the working directory.
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Read the whole file into memory as a single UTF-8 decoded string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

--2025-02-17 10:08:45--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-02-17 10:08:45 (101 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# Report the corpus size; len() of a str counts characters, not bytes.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [None]:
# Preview the first 1000 characters to see how the corpus is laid out.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [None]:
# Vocabulary: every distinct character in the corpus. Sorting makes the
# character <-> integer assignment independent of set iteration order.
chars = sorted(set(text))
vocab_size = len(chars)
print("Vocab Size:", vocab_size)

# Lookup tables between characters and their integer token ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encoder(s):
  """Map a string to a list of integer token ids, one per character.

  Raises KeyError if `s` contains a character that is not in `chars`.
  """
  return [stoi[c] for c in s]


def decoder(l):
  """Map an iterable of integer token ids back to the string they spell.

  Raises KeyError for an id that is not in `itos`.
  """
  return ''.join(itos[i] for i in l)

Vocab Size: 65


In [None]:
# Encode the whole corpus as a single 1-D tensor of int64 token ids.
data = torch.tensor(encoder(text), dtype=torch.long)
# The first 90% of the tokens are used for training and the remainder for
# validation; nothing is shuffled, so each split is a contiguous stretch of text.
n = int(0.9 * len(data))
train, val = data[:n], data[n:]
train.shape, val.shape

(torch.Size([1003854]), torch.Size([111540]))

In [None]:
# Intuition behind data preparation: a window of block_size + 1 tokens packs
# block_size training examples, one for every prefix length of the window.
# NOTE: block_size is reassigned in the hyperparameter cell before training.
block_size = 8
x = train[:block_size]
y = train[1:block_size + 1]
for t, target in enumerate(y):
    # the model sees tokens 0..t and has to predict the token at position t + 1
    context = x[:t + 1]
    print(f"{t}/{block_size} when input is {context} the target: {target}")

0/8 when input is tensor([18]) the target: 47
1/8 when input is tensor([18, 47]) the target: 56
2/8 when input is tensor([18, 47, 56]) the target: 57
3/8 when input is tensor([18, 47, 56, 57]) the target: 58
4/8 when input is tensor([18, 47, 56, 57, 58]) the target: 1
5/8 when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
6/8 when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
7/8 when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


# Helper Functions

In [None]:
# get a small randomly sampled batch of data
def get_batch(split):
  """Sample a random mini-batch of input/target windows.

  Reads the notebook globals `train`, `val`, `block_size`, `batch_size` and
  `device` at call time.

  split: 'train' draws from the training tensor; any other value draws from
    the validation tensor.

  Returns (x, y), each of shape (batch_size, block_size) and moved to
  `device`; y holds the same windows as x shifted one token to the right.
  """
  source = train if split == 'train' else val
  # the latest usable start still leaves room for the shifted target window
  starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
  inputs = torch.stack([source[s:s + block_size] for s in starts])
  targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
  return inputs.to(device), targets.to(device)

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

# Transformer Components

In [None]:
class SelfAttentionHead(nn.Module):
  """ one head of causal (masked) self attention

  Construction reads the notebook globals `n_embd` (input width),
  `block_size` (size of the causal mask) and `dropout` (dropout probability).

  forward(x): x is (B, T, n_embd) with T <= block_size; returns (B, T, head_size).
  """
  def __init__(self, head_size):
    super().__init__()
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # lower-triangular mask; registered as a buffer so it is not a trainable
    # parameter but still moves with the module on .to(device)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    _, T, _ = x.shape
    k = self.key(x)    # (B, T, head_size)
    q = self.query(x)  # (B, T, head_size)

    # compute attention scores / affinities between tokens, scaled by
    # 1/sqrt(head_size): the width of the q/k vectors, not of the input embedding
    wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
    # a position may only attend to itself and to earlier positions
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    # weighted aggregation of values
    v = self.value(x)  # (B, T, head_size)
    out = wei @ v
    return out

class MultiHead(nn.Module):
  """ multiple self attention heads run side by side and concatenated

  Construction reads the notebook globals `n_embd` and `dropout`.

  forward(x): every head is applied to the same x, the head outputs are
  concatenated along the last dimension (width num_heads * head_size) and
  projected to n_embd, followed by dropout.
  """
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])
    # the projection consumes the concatenated heads, whose width equals
    # n_embd only when n_embd is an exact multiple of num_heads
    self.projection = nn.Linear(num_heads * head_size, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.projection(out)
    out = self.dropout(out)
    return out

class FeedForward(nn.Module):
  """ two-layer MLP: widen to 4 * n_embd, ReLU, narrow back to n_embd, dropout

  The dropout probability comes from the notebook global `dropout`.
  """
  def __init__(self, n_embd):
    super().__init__()
    hidden = 4 * n_embd  # width of the hidden layer
    layers = [
      nn.Linear(n_embd, hidden),
      nn.ReLU(),
      nn.Linear(hidden, n_embd),
      nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    return self.net(x)

class LayerNorm1d(nn.Module):
  """ hand-written layer normalisation over the last dimension

  Each vector along the last axis of the input is shifted to zero mean and
  scaled to unit variance, then multiplied by `gamma` and offset by `beta`.

  dim: size of the last dimension of the inputs.
  eps: added to the variance before the square root for numerical stability.

  `gamma` and `beta` are registered nn.Parameters, so they are picked up by
  optimizers, state_dict() and .to(device).
  """
  def __init__(self, dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    self.gamma = nn.Parameter(torch.ones(dim))
    self.beta = nn.Parameter(torch.zeros(dim))

  def forward(self, x):
    # normalise over the feature axis; for 2-D input this is the same as dim=1,
    # for (B, T, C) input it is C rather than the time axis
    xmean = x.mean(dim=-1, keepdim=True)
    # NOTE: Tensor.var defaults to the unbiased estimator, which differs
    # slightly from the biased variance used by nn.LayerNorm
    xvar = x.var(dim=-1, keepdim=True)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    # the most recent output is kept on the instance for inspection
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self, recurse=True):
    """Return [gamma, beta] as a list; also accepts nn.Module's `recurse` argument."""
    return list(super().parameters(recurse=recurse))

class Block(nn.Module):
  """ transformer block: multi-head self attention followed by a feed-forward net

  Both sub-layers see a layer-normalised copy of their input and their output
  is added back onto that input (residual connection).
  """
  def __init__(self, n_embd, n_head):
    super().__init__()
    # each head gets an equal share of the embedding; integer division drops any remainder
    self.a = MultiHead(n_head, n_embd // n_head)
    self.ff = FeedForward(n_embd)
    self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

  def forward(self, x):
    attended = self.a(self.ln1(x))
    x = x + attended
    transformed = self.ff(self.ln2(x))
    return x + transformed

class LanguageModel(nn.Module):
  """ transformer language model: embeddings -> blocks -> layer norm -> linear head

  Construction reads the notebook globals `vocab_size`, `n_embd`,
  `block_size`, `n_head` and `n_layer`.
  """
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    idx: (B, T) integer token ids; T must not exceed the size of the
      position embedding table.
    targets: optional (B, T) integer token ids.

    Returns (logits, loss). Without targets, logits is (B, T, vocab) and loss
    is None. With targets, logits is flattened to (B*T, vocab) and loss is the
    cross entropy against the flattened targets.
    """
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
    # positions are created on the input's device, so the model does not
    # depend on a global `device` variable matching where it actually lives
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embd)

    x = tok_emb + pos_emb  # pos_emb broadcasts over the batch dimension
    x = self.blocks(x)
    x = self.ln(x)
    logits = self.lm_head(x)

    if targets is None:
      loss = None
    else:
      # F.cross_entropy wants (N, C) logits and (N,) class indices
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Sample max_new_tokens tokens, one at a time, and append them to idx.

    idx: (B, T) integer token ids used as the starting context.
    Returns a (B, T + max_new_tokens) tensor of token ids.

    Sampling runs in eval mode (dropout disabled) and without gradient
    tracking; the module's previous train/eval mode is restored afterwards.
    """
    # longest context the position embedding table can index
    max_context = self.position_embedding_table.num_embeddings
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        for _ in range(max_new_tokens):
          # crop idx to the most recent max_context tokens
          idx_cond = idx[:, -max_context:]
          # get the predictions
          logits, _ = self(idx_cond)
          # focus only on the last time step
          logits = logits[:, -1, :] # becomes (B, C)
          # apply softmax to get probabilities
          probs = F.softmax(logits, dim=-1) # (B, C)
          # sample from the distribution
          idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
          # append sampled index to the running sequence
          idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    finally:
      self.train(was_training)
    return idx


# Training

In [None]:
#######################
# Hyperparameters

# optimisation schedule; eval_iters is both the number of batches averaged by
# estimate_loss and the number of steps between evaluations in the training loop
max_iters = 7000
eval_iters = 200
learning_rate = 1e-4

# shape of one training batch: batch_size windows of block_size tokens
batch_size = 64
block_size = 256

# model size
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

# use the GPU when PyTorch reports one as available
device = "cuda" if torch.cuda.is_available() else "cpu"

#######################

model = LanguageModel().to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')
model

10.788929 M parameters


LanguageModel(
  (token_embedding_table): Embedding(65, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (a): MultiHead(
        (heads): ModuleList(
          (0-5): 6 x SelfAttentionHead(
            (query): Linear(in_features=384, out_features=64, bias=False)
            (key): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (projection): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_af

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# `step` rather than `iter`: the loop variable stays in the notebook namespace
# afterwards and would otherwise shadow the builtin iter()
for step in range(max_iters):
  # every eval_iters steps, and on the final step, report the estimated losses
  if step % eval_iters == 0 or step == max_iters - 1:
    losses = estimate_loss()
    print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a fresh random batch of training windows
  xb, yb = get_batch("train")

  # forward pass, then clear stale gradients before backpropagating and updating
  logits, loss = model(xb, yb)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

step 0: train loss 4.2849, val loss 4.2824
step 200: train loss 2.4768, val loss 2.4906
step 400: train loss 2.4244, val loss 2.4494
step 600: train loss 2.3505, val loss 2.3714
step 800: train loss 2.2024, val loss 2.2424
step 1000: train loss 2.0857, val loss 2.1438
step 1200: train loss 1.9891, val loss 2.0715
step 1400: train loss 1.9080, val loss 2.0078
step 1600: train loss 1.8379, val loss 1.9609
step 1800: train loss 1.7799, val loss 1.9242
step 2000: train loss 1.7287, val loss 1.8833
step 2200: train loss 1.6789, val loss 1.8388
step 2400: train loss 1.6370, val loss 1.8155
step 2600: train loss 1.5999, val loss 1.7838
step 2800: train loss 1.5722, val loss 1.7551
step 3000: train loss 1.5414, val loss 1.7315
step 3200: train loss 1.5252, val loss 1.7199
step 3400: train loss 1.4982, val loss 1.6966
step 3600: train loss 1.4756, val loss 1.6792
step 3800: train loss 1.4589, val loss 1.6636
step 4000: train loss 1.4389, val loss 1.6425
step 4200: train loss 1.4252, val loss 1.

KeyboardInterrupt: 

In [None]:
# generate from the model
# the training loop leaves the model in train mode; switch to eval mode so
# dropout is disabled while sampling
model.eval()
# start from a single token with id 0 as the prompt
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# no gradients are needed for sampling
with torch.no_grad():
  print(decoder(model.generate(context, max_new_tokens=2000)[0].tolist()))


Jount Not by love, beliked,
Whose the tric of the doth torchee frogeth;
Why crown to did Lawn Edglishire, and beath
More seizards than that things royalt.
What see tolemen? not this news, not Gaunt?
Or shows cewards? what contured, stand with you tow;
Wherefore, because they think it with there bruithers?
And I plead, let's your heaving, Claudious, sir,
Or the like is name.

CLARENCE:
Nay, sir, that march well may to-man, would,
And we letter Name go all, I ready tiele,
And, winning the cursh out oft your uping soul.

CLARENCE:
Why, for we would you stay ship you lets us conce;
Or, beard Witchmand and bear thy tongue of their
As thou drun. If you answer for knower; but child non
The unlovider Yand be none mean the mind years.
MERCK:
My greater Pomper forth, lords,
What you woy may Pribhal me about,
Who must aven you go fourtner. What, mift you
Were god in so the cursess:  of a trunne worsed
Shall of whom. My come, if you I do you
Voul us, this judge; book your me; but your greath
A se