# GPT from scratch
This code is based on [karpathy/nanoGPT model.py](https://github.com/karpathy/nanoGPT/blob/master/model.py)

In [42]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import os

In [None]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0

In [44]:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
if not os.path.exists('input.txt'):
  !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [None]:
# We will tokenize text letter by letter. Although GPT uses more advanced
# tokenization methods (Byte Pair Encoding (BPE) is a method that segments text
# into tokens based on frequently occurring subwords or word fragments, rather
# than just individual characters or whole words.), we choose letter-by-letter
# tokenization because it is simpler and helps better understand the inner
# workings of GPT, which is our main goal.

# To achieve this, we will implement an encoder and a decoder—functions that map
# strings to integers and vice versa. This approach will allow us to illustrate
# the process more clearly and provide input that the transformer model can
# understand.

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a strin

In [46]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [47]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [None]:
# This function estimates the average loss of the model on both the training and
# validation datasets.  It runs in evaluation mode (disabling dropout and other
# training-specific layers) to get stable loss estimates.  The torch.no_grad()
# decorator disables gradient calculation to save memory and computation since
# we don't need to update weights.  For each dataset split ('train' and 'val'),
# it collects losses over multiple batches and then averages them.  Finally, it
# returns a dictionary containing the average loss for both splits and switches
# the model back to training mode.

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:

class Head(nn.Module):
    """ 
    Single head of self-attention mechanism.

    This module implements one attention head, which:
    - Projects input embeddings into key, query, and value vectors.
    - Computes scaled dot-product attention scores between queries and keys.
    - Applies a causal mask to ensure the model cannot attend to future tokens.
    - Uses softmax to get attention weights and applies dropout for regularization.
    - Aggregates the values weighted by attention scores to produce the output.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [None]:
class MultiHeadAttention(nn.Module):
    """ 
    Implements multi-head self-attention by running multiple attention heads in parallel.

    Each head independently performs scaled dot-product attention, 
    and their outputs are concatenated and projected back to the original embedding size.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [None]:
class FeedFoward(nn.Module):
    """
    A simple feed-forward neural network layer used within the transformer block.

    Consists of:
    - A linear layer that expands the embedding dimension by a factor of 4,
    - A ReLU activation introducing non-linearity,
    - A linear layer that projects back to the original embedding size,
    - A dropout layer for regularization.
    """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [None]:
class Block(nn.Module):
    """
    Transformer block consisting of:

    - Multi-head self-attention ("communication" phase) with residual connection and layer normalization.
    - Feed-forward network ("computation" phase) with residual connection and layer normalization.

    This structure enables the model to capture dependencies across tokens and then apply complex transformations.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [None]:
class LanguageModel(nn.Module):
    """
    A language model that predicts the next token based on previous tokens,
    using learned token and positional embeddings, followed by multiple transformer blocks.

    Components:
    - Token embeddings: maps each token index to an embedding vector.
    - Positional embeddings: adds information about token position in the sequence.
    - Transformer blocks: multiple layers of self-attention and feed-forward networks.
    - Final layer normalization.
    - Linear layer projecting embeddings to logits over the vocabulary.

    The model supports both training (computing loss) and autoregressive text generation.
    """

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # positional embeddings to encode token positions in the sequence
        # (learned embeddings here, unlike the original Transformer which used
        # fixed sinusoidal embeddings).
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # LayerNorm normalizes inputs to stabilize and speed up training by
        # reducing internal covariate shift
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [None]:
model = LanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

0.209729 M parameters


In [55]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.3394, val loss 4.3431
step 100: train loss 2.6632, val loss 2.6792
step 200: train loss 2.5217, val loss 2.5418
step 300: train loss 2.4542, val loss 2.4649
step 400: train loss 2.3818, val loss 2.3967
step 500: train loss 2.3091, val loss 2.3468
step 600: train loss 2.2626, val loss 2.2760
step 700: train loss 2.2163, val loss 2.2513
step 800: train loss 2.1682, val loss 2.1930
step 900: train loss 2.1196, val loss 2.1471
step 1000: train loss 2.0918, val loss 2.1344
step 1100: train loss 2.0600, val loss 2.1223
step 1200: train loss 2.0399, val loss 2.0914
step 1300: train loss 2.0177, val loss 2.0806
step 1400: train loss 1.9789, val loss 2.0492
step 1500: train loss 1.9664, val loss 2.0360
step 1600: train loss 1.9329, val loss 2.0068
step 1700: train loss 1.9206, val loss 2.0012
step 1800: train loss 1.9113, val loss 1.9921
step 1900: train loss 1.8966, val loss 1.9883
step 2000: train loss 1.8729, val loss 1.9764
step 2100: train loss 1.8606, val loss 1.9689


In [56]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Be everen threst, him lords you to still grow.
Graciong are as to give, A care tenes.

BRUTUS:
Sid us is took a worst Garlain foot I well and man
Bels regoly, you in soul holde
Tone our Crivivem out my king our proment
hold fither: country lame, coulent that chamed
Richman I unted some son, done and dried mearce, in no approuce.

SICINIUS:
No dismore, know stady my say, the see he do,
'Cour me paredom for her one should movest bodie-him,
Where it that God
Merred full proful
As most but warms for fine.
'Tis would hears to a scity go me, this truaing make.

BOLIONUS:
Come, thou who deed in the ower grow; who at the two wreath.
Who more the care offel and slought
The coped to win creitge niclos that furself;
I'll it must, and you roin king
Fromed the shephends it.

COMINGO:
No, but have to muttage woulder aloged
Crute and laight. Cate it thun in Donoke.

First Give Warwardony corthall, haste of the rest tore the shall
Our must you hath hose you moreh, that jrden,
Now, to I reft, what is 