<a href="https://colab.research.google.com/github/mettafore/annotated-transformer/blob/master/nbs/03_transformer_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
import torch
from IPython.core.magic import register_cell_magic
from IPython.display import HTML, display
import html
import math
import copy
import numpy as np

Paper: [Attention Is All You Need](https://arxiv.org/pdf/1706.03762)

I will be doing an implementation of the seminal paper "Attention Is All You Need."

## Attention

Implement the scaled dot-product attention mechanism. Remember the formula: $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

In [None]:
def attention(Q, K, V, dropout, mask):
    """Scaled Dot-Product Attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q: Query tensor of shape (..., q_len, d_k).
        K: Key tensor of shape (..., k_len, d_k).
        V: Value tensor of shape (..., k_len, d_v).
        dropout: Callable (e.g. ``torch.nn.Dropout``) applied to the attention
            weights, or ``None`` to skip dropout.
        mask: Tensor broadcastable to (..., q_len, k_len), or ``None``.
            Positions where ``mask == 0`` (or ``False``) are not attended to.

    Returns:
        Tensor of shape (..., q_len, d_v).
    """
    sqrt_d_k = math.sqrt(K.size(-1))

    # Similarity of every query with every key, scaled so the softmax does not
    # saturate as d_k grows.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / sqrt_d_k

    # A large negative score becomes a weight of (numerically) zero after the
    # softmax. `mask == 0` works for both boolean and 0/1 numeric masks.
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    # Normalise over the key axis.
    attention_weights = torch.softmax(scores, dim=-1)

    if dropout is not None:
        attention_weights = dropout(attention_weights)

    # Weighted sum of the values.
    output = torch.matmul(attention_weights, V)

    return output

#### Unit Test

In [None]:
# Test attention: without mask or dropout the output must have V's shape.
Q = torch.randn(2, 1, 4, 8)
K = torch.randn(2, 1, 4, 8)
V = torch.randn(2, 1, 4, 8)
result = attention(Q, K, V, None, None)
print("Result shape:", result.shape)
# Fail loudly instead of relying on someone reading the printed shape.
assert result.shape == V.shape, f"unexpected attention output shape {tuple(result.shape)}"

## MultiHeadedAttention

Implement multi-head attention. Split d_model into h heads, apply attention to each, then concatenate.

In [None]:
class MultiHeadedAttention(torch.nn.Module):
    """Multi-head attention: h parallel attention heads over d_model features.

    Args:
        h: Number of attention heads; must divide ``d_model``.
        d_model: Model (embedding) dimension.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        self.d_k = d_model // h
        self.h = h
        self.dropout = torch.nn.Dropout(p=dropout)
        # Placeholder kept for API parity; `attention` returns only the
        # output, so the weights are never stored here.
        self.attn = None

        # Indices 0-2 project Q, K and V; index 3 is the final output
        # projection. Each layer is built independently (not deep-copied).
        self.linear_layers = torch.nn.ModuleList(
            [torch.nn.Linear(d_model, d_model) for _ in range(4)]
        )

    def forward(self, query, key, value, mask=None):
        """Apply multi-head attention.

        Args:
            query, key, value: Tensors of shape (batch, seq_len, d_model).
            mask: Optional tensor of shape (batch, 1 or q_len, k_len); zeros
                mark positions that must not be attended to.

        Returns:
            Tensor of shape (batch, q_len, d_model).
        """
        batch_size = query.size(0)

        # Linear projections of the three inputs.
        Q = self.linear_layers[0](query)
        K = self.linear_layers[1](key)
        V = self.linear_layers[2](value)

        # Split into heads: (batch, seq_len, d_model) -> (batch, h, seq_len, d_k)
        Q = Q.reshape(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        K = K.reshape(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        V = V.reshape(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        # Add a head axis so the same mask broadcasts over every head.
        if mask is not None:
            mask = mask.unsqueeze(1)

        x = attention(Q, K, V, self.dropout, mask)

        # Merge heads back: (batch, h, seq_len, d_k) -> (batch, seq_len, d_model)
        x = x.transpose(1, 2).reshape(batch_size, -1, self.h * self.d_k)

        output = self.linear_layers[3](x)

        return output

#### Unit Test

In [None]:
# Test MultiHeadedAttention: self-attention must preserve the input shape.
mha = MultiHeadedAttention(h=8, d_model=512, dropout=0.1)
x = torch.randn(2, 10, 512)
output = mha(x, x, x)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

## PositionwiseFeedForward

Implement the position-wise feed-forward network: FFN(x) = max(0, xW1 + b1)W2 + b2

In [None]:
class PositionwiseFeedForward(torch.nn.Module):
    """Position-wise feed-forward network: FFN(x) = max(0, x W1 + b1) W2 + b2.

    Args:
        d_model: Input and output feature dimension.
        d_ff: Hidden (inner) feature dimension.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Expand d_model -> d_ff, then project back d_ff -> d_model.
        self.linear_layer = torch.nn.Linear(d_model, d_ff)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p=dropout)
        self.output_layer = torch.nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply linear -> ReLU -> dropout -> linear on the last dimension.

        Args:
            x: Tensor of shape (..., d_model).

        Returns:
            Tensor of the same shape as ``x``.
        """
        x = self.linear_layer(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        return x

#### Unit Test

In [None]:
# Test PositionwiseFeedForward: the block maps d_model back to d_model.
ffn = PositionwiseFeedForward(d_model=512, d_ff=2048, dropout=0.1)
x = torch.randn(2, 10, 512)
output = ffn(x)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

## Positional Encoding

Since attention has no notion of position, we add positional encodings using sin/cos functions of different frequencies.

In [None]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: Feature dimension of the embeddings.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Longest sequence length the precomputed table supports.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        even_indices = torch.arange(0, d_model, 2)

        # Column vector [0, 1, ..., max_len-1] so it broadcasts against the
        # row of frequencies below.
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)

        # 1 / 10000^(2i / d_model), computed in log space.
        div_term = torch.exp(-even_indices * (torch.log(torch.tensor(10000.0)) / d_model))

        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, ::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0)
        # A buffer moves with the module (.to/.cuda) and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first ``seq_len`` positions, then dropout.

        Args:
            x: Tensor of shape (batch, seq_len, d_model), seq_len <= max_len.

        Returns:
            Tensor of the same shape as ``x``.
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

#### Unit Test

In [None]:
# Test PositionalEncoding: adding position information must not change the shape.
pe = PositionalEncoding(d_model=512, dropout=0.1)
x = torch.zeros(2, 10, 512)
output = pe(x)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

## Encoder Layer

Each encoder layer has two sub-layers: multi-head self-attention and feed-forward network, each with residual connections.

### Sublayer Connection

In [None]:
class SublayerConnection(torch.nn.Module):
    """Pre-norm residual connection: x + dropout(sublayer(layer_norm(x))).

    The normalisation is applied to the sublayer's *input* (not to the sum),
    so the residual path carries ``x`` through unchanged.

    Args:
        size: Feature dimension normalised by the LayerNorm.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.layer_norm = torch.nn.LayerNorm(size)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        """Wrap ``sublayer`` with layer norm, dropout and a residual add.

        Args:
            x: Tensor of shape (..., size).
            sublayer: Callable mapping a tensor shaped like ``x`` to a tensor
                of the same shape.

        Returns:
            Tensor of the same shape as ``x``.
        """
        y = self.layer_norm(x)
        y = sublayer(y)
        y = self.dropout(y)
        return x + y

### Final clones function

In [None]:
def clones(module, N):
    """Produce N identical, independent layers.

    Args:
        module: The ``torch.nn.Module`` to replicate.
        N: Number of copies.

    Returns:
        A ``torch.nn.ModuleList`` of N deep copies of ``module``. The copies
        start with equal weights but share no parameters.
    """
    return torch.nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

### Final EncoderLayer Function

In [None]:
class EncoderLayer(torch.nn.Module):
    """One encoder layer: self-attention then feed-forward, each wrapped in a
    residual ``SublayerConnection``.

    Args:
        size: Model dimension (d_model); read by ``Encoder`` for its final norm.
        self_attn: Attention module called as ``self_attn(q, k, v, mask)``.
        feed_forward: Module called as ``feed_forward(x)``.
        dropout: Dropout probability for both sublayer connections.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Index 0 wraps self-attention, index 1 wraps the feed-forward network.
        self.sublayer = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        """Run the layer.

        Args:
            x: Tensor of shape (batch, seq_len, size).
            mask: Source mask passed to self-attention, or ``None``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # The lambda receives the normalised input; query, key and value are
        # all the same tensor for self-attention.
        x = self.sublayer[0](x, lambda y: self.self_attn(y, y, y, mask))
        x = self.sublayer[1](x, self.feed_forward)
        return x

#### Unit Test

In [None]:
# Test EncoderLayer: one layer must preserve the input shape.
attn = MultiHeadedAttention(h=8, d_model=512, dropout=0.1)
ff = PositionwiseFeedForward(d_model=512, d_ff=2048, dropout=0.1)
encoder_layer = EncoderLayer(size=512, self_attn=attn, feed_forward=ff, dropout=0.1)
x = torch.randn(2, 10, 512)
output = encoder_layer(x, None)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

## Encoder Class

Stack N encoder layers together.

In [None]:
class Encoder(torch.nn.Module):
    """Stack of N encoder layers followed by a final LayerNorm.

    Args:
        layer: Prototype layer; it is deep-copied N times, so the instance
            passed in is not itself part of the stack. Must expose ``size``.
        N: Number of layers.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.encoders = clones(layer, N)
        # The sublayers are pre-norm, so the stack's output is normalised once
        # more at the end.
        self.layer_norm = torch.nn.LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` (and the same ``mask``) through every layer in turn.

        Args:
            x: Tensor of shape (batch, seq_len, layer.size).
            mask: Source mask forwarded to each layer, or ``None``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        for encoder in self.encoders:
            x = encoder(x, mask)
        return self.layer_norm(x)

#### Unit test

In [None]:
# Test Encoder: a stack of layers must preserve the input shape.
encoder = Encoder(encoder_layer, N=6)
x = torch.randn(2, 10, 512)
output = encoder(x, None)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

## Decoder Layer

Each decoder layer has three sub-layers: masked self-attention, cross-attention to encoder output, and feed-forward.

In [None]:
class DecoderLayer(torch.nn.Module):
    """One decoder layer: masked self-attention, cross-attention over the
    encoder output, then feed-forward, each in a residual ``SublayerConnection``.

    Args:
        size: Model dimension (d_model); read by ``Decoder`` for its final norm.
        self_attn: Attention module used over the target sequence.
        src_attn: Attention module used over the encoder output.
        feed_forward: Module called as ``feed_forward(x)``.
        dropout: Dropout probability for all three sublayer connections.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # Index 0: self-attention, 1: cross-attention, 2: feed-forward.
        self.sublayers = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the layer.

        Args:
            x: Target-side tensor of shape (batch, tgt_len, size).
            memory: Encoder output of shape (batch, src_len, size).
            src_mask: Mask over ``memory`` positions, or ``None``.
            tgt_mask: Mask over target positions (padding and/or future
                positions), or ``None``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Queries, keys and values all come from the target sequence.
        x = self.sublayers[0](x, lambda y: self.self_attn(y, y, y, tgt_mask))
        # Queries come from the decoder; keys and values from the encoder.
        x = self.sublayers[1](x, lambda y: self.src_attn(y, memory, memory, src_mask))
        x = self.sublayers[2](x, self.feed_forward)
        return x

#### Unit test

In [None]:
# Test DecoderLayer: one layer must preserve the target-side shape.
attn1 = MultiHeadedAttention(h=8, d_model=512, dropout=0.1)
attn2 = MultiHeadedAttention(h=8, d_model=512, dropout=0.1)
ff = PositionwiseFeedForward(d_model=512, d_ff=2048, dropout=0.1)
decoder_layer = DecoderLayer(size=512, self_attn=attn1, src_attn=attn2, feed_forward=ff, dropout=0.1)
x = torch.randn(2, 10, 512)
memory = torch.randn(2, 10, 512)
output = decoder_layer(x, memory, None, None)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

## Decoder

Stack N decoder layers together.

In [None]:
class Decoder(torch.nn.Module):
    """Stack of N decoder layers followed by a final LayerNorm.

    Args:
        layer: Prototype layer; it is deep-copied N times, so the instance
            passed in is not itself part of the stack. Must expose ``size``.
        N: Number of layers.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # The sublayers are pre-norm, so the stack's output is normalised once
        # more at the end.
        self.layer_norm = torch.nn.LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer; all layers see the same memory/masks.

        Args:
            x: Target-side tensor of shape (batch, tgt_len, layer.size).
            memory: Encoder output of shape (batch, src_len, layer.size).
            src_mask: Mask over ``memory`` positions, or ``None``.
            tgt_mask: Mask over target positions, or ``None``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.layer_norm(x)

### Unit test

In [None]:
# Test Decoder: a stack of layers must preserve the target-side shape.
decoder = Decoder(decoder_layer, N=6)
x = torch.randn(2, 10, 512)
memory = torch.randn(2, 10, 512)
output = decoder(x, memory, None, None)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

## Embeddings

Convert token IDs to dense vectors, scaled by sqrt(d_model).

In [None]:
class Embeddings(torch.nn.Module):
    """Token embedding lookup scaled by sqrt(d_model).

    Args:
        d_model: Embedding dimension.
        vocab: Vocabulary size (number of distinct token ids).
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.embedding = torch.nn.Embedding(vocab, d_model)

    def forward(self, x):
        """Embed token ids.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Float tensor of shape (batch, seq_len, d_model).
        """
        return self.embedding(x) * math.sqrt(self.d_model)

### Unit test

In [None]:
# Test Embeddings: every token id becomes a d_model-sized vector.
emb = Embeddings(d_model=512, vocab=1000)
x = torch.randint(0, 1000, (2, 10))
output = emb(x)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == (2, 10, 512), f"unexpected output shape {tuple(output.shape)}"

## Generator class

Final linear layer + log softmax to convert decoder output to token probabilities.

In [None]:
class Generator(torch.nn.Module):
    """Project decoder output to vocabulary log-probabilities.

    Args:
        d_model: Feature dimension of the decoder output.
        vocab: Target vocabulary size.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.linear = torch.nn.Linear(d_model, vocab)
        # Normalise over the vocabulary (last) axis.
        self.logsoftmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Apply linear -> log_softmax.

        Args:
            x: Tensor of shape (..., d_model).

        Returns:
            Log-probabilities of shape (..., vocab).
        """
        x = self.linear(x)
        return self.logsoftmax(x)

#### Unit test

In [None]:
# Test Generator: the last axis must become the vocabulary size.
gen = Generator(d_model=512, vocab=1000)
x = torch.randn(2, 10, 512)
output = gen(x)
print("Input shape:", x.shape, "Output shape:", output.shape)
# Fail loudly instead of relying on someone reading the printed shapes.
assert output.shape == (2, 10, 1000), f"unexpected output shape {tuple(output.shape)}"

### Final Encoder Decoder Function.

In [None]:
class EncoderDecoder(torch.nn.Module):
    """Container tying together embeddings, encoder, decoder and generator.

    ``forward`` returns the decoder output; ``generator`` is stored on the
    model but is not applied by any method of this class.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run it through the decoder against ``memory``."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src``, then decode ``tgt`` against the encoder output."""
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

## Make Model

Helper function to construct the full transformer model.

In [None]:
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Construct a full encoder-decoder transformer from hyperparameters.

    Args:
        src_vocab: Source vocabulary size.
        tgt_vocab: Target vocabulary size.
        N: Number of layers in both the encoder and the decoder.
        d_model: Model (embedding) dimension.
        d_ff: Hidden dimension of the feed-forward blocks.
        h: Number of attention heads.
        dropout: Dropout probability used throughout.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        are Xavier-uniform initialised.
    """
    # Each role gets its own freshly built module. Modules are built in the
    # same order as before so random-number consumption is unchanged.
    enc_self_attn, dec_self_attn, dec_src_attn = (
        MultiHeadedAttention(h, d_model, dropout) for _ in range(3)
    )
    enc_ff, dec_ff = (
        PositionwiseFeedForward(d_model, d_ff, dropout) for _ in range(2)
    )
    src_position, tgt_position = (
        PositionalEncoding(d_model, dropout) for _ in range(2)
    )

    src_tokens = Embeddings(d_model, src_vocab)
    tgt_tokens = Embeddings(d_model, tgt_vocab)
    src_embed = torch.nn.Sequential(src_tokens, src_position)
    tgt_embed = torch.nn.Sequential(tgt_tokens, tgt_position)

    enc_layer = EncoderLayer(d_model, enc_self_attn, enc_ff, dropout)
    dec_layer = DecoderLayer(d_model, dec_self_attn, dec_src_attn, dec_ff, dropout)

    model = EncoderDecoder(
        Encoder(enc_layer, N),
        Decoder(dec_layer, N),
        src_embed,
        tgt_embed,
        Generator(d_model, tgt_vocab),
    )

    # Only matrices (dim > 1) are re-initialised; vectors such as biases and
    # norm parameters keep their defaults.
    for param in model.parameters():
        if param.dim() > 1:
            torch.nn.init.xavier_uniform_(param)

    return model

## Copy Code Task

Training utilities for the copy task (provided complete).

### Final subsequent mask function

In [None]:
def subsequent_mask(size):
    """Build a causal mask that hides future positions.

    Args:
        size: Sequence length.

    Returns:
        Boolean tensor of shape (1, size, size) where entry [0, i, j] is True
        iff j <= i, i.e. position i may attend to position j.
    """
    positions = torch.arange(size)
    # Row index i (query) against column index j (key): keep the lower triangle.
    allowed = positions.unsqueeze(0) <= positions.unsqueeze(1)
    return allowed.unsqueeze(0)

### Final Batch Class for copy example

In [None]:
class Batch:
    """Hold one batch of source/target token ids together with their masks.

    Attributes set for every batch:
        src: The source ids as given.
        src_mask: True where ``src != pad``, with an extra axis before the last.

    Attributes set only when ``tgt`` is given:
        tgt: Decoder input, ``tgt`` without its last column.
        tgt_y: Expected output, ``tgt`` without its first column.
        pad_tgt_mask: True where ``self.tgt != pad``.
        subseq_tgt_mask: Causal mask from ``subsequent_mask``.
        tgt_mask: Element-wise AND of the two masks above.
        ntokens: Number of non-pad entries in ``tgt_y`` (a 0-dim tensor).
    """

    def __init__(self, src, tgt=None, pad=2):
        self.src = src
        self.src_mask = src.ne(pad).unsqueeze(-2)
        if tgt is None:
            return

        # Shift by one: the decoder sees tokens [0, n-1) and predicts [1, n).
        self.tgt, self.tgt_y = tgt[:, :-1], tgt[:, 1:]
        self.pad_tgt_mask = self.tgt.ne(pad).unsqueeze(-2)
        self.subseq_tgt_mask = subsequent_mask(self.tgt.size(1))
        self.tgt_mask = self.pad_tgt_mask & self.subseq_tgt_mask
        self.ntokens = self.tgt_y.ne(pad).sum()

### Final Data Gen Function

In [None]:
def data_gen(V, batch_size, nbatches):
    """Yield ``nbatches`` random batches for a source-to-target copy task.

    Each batch holds ``batch_size`` sequences of 10 integer tokens drawn from
    [1, V), with the first token of every sequence forced to 1. Source and
    target are the same tensor, and 0 is passed as the padding id.
    """
    for _ in range(nbatches):
        tokens = np.random.randint(1, V, size=[batch_size, 10])
        tokens[:, 0] = 1  # fixed first token for every sequence
        sequences = torch.tensor(tokens)
        yield Batch(sequences, sequences, 0)

### Final Simple Loss Compute Function

In [None]:
class SimpleLossCompute:
    """Apply the generator and compute a loss normalised by ``norm``."""

    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm):
        """Return ``(criterion value, criterion value / norm)``.

        Args:
            x: Input passed to ``self.generator``.
            y: Target ids; flattened to one dimension.
            norm: Divisor applied to the criterion value.

        Returns:
            A pair: the un-normalised value detached from the graph (for
            logging), and the normalised loss to call ``backward()`` on.
        """
        scores = self.generator(x)
        # Collapse every leading axis so the criterion sees (items, vocab)
        # predictions against (items,) targets.
        flat_scores = scores.reshape(-1, scores.size(-1))
        flat_targets = y.reshape(-1)
        normalized = self.criterion(flat_scores, flat_targets) / norm
        return normalized.data * norm, normalized

### Final Run Epoch Function for Copy

In [None]:
def run_epoch(data_iter, model, loss_compute, optimizer, scheduler, mode="train"):
    """Run one pass over ``data_iter`` and return the loss per token.

    Args:
        data_iter: Iterable of batches exposing ``src``, ``tgt``, ``src_mask``,
            ``tgt_mask``, ``tgt_y`` and ``ntokens``.
        model: Callable as ``model(src, tgt, src_mask, tgt_mask)``.
        loss_compute: Callable as ``loss_compute(out, tgt_y, ntokens)``
            returning ``(loss_value, loss_to_backprop)``.
        optimizer: Stepped and zeroed after every batch when training.
        scheduler: Stepped after every batch when training.
        mode: Parameters are updated only when this equals ``"train"``.

    Returns:
        Sum of the per-batch loss values divided by the total token count.
    """
    loss_sum = 0
    token_count = 0
    training = mode == "train"

    for batch in data_iter:
        out = model(batch.src, batch.tgt, batch.src_mask, batch.tgt_mask)
        batch_loss, backprop_loss = loss_compute(out, batch.tgt_y, batch.ntokens)

        if training:
            backprop_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        loss_sum += batch_loss
        token_count += batch.ntokens

    return loss_sum / token_count

### Greedy Decoding

In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode one sequence, picking the most likely token each step.

    Only a single sequence is decoded: the running output has batch size 1
    and, at every step, the prediction of the first batch element is used.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids, shape (1, src_len).
        src_mask: Mask over source positions.
        max_len: Total output length, including the start symbol.
        start_symbol: Token id that seeds the output.

    Returns:
        Long tensor of shape (1, max_len) on the same device as ``src``.
    """
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        # Create the running output on src's device so decoding also works
        # when the model and inputs live on an accelerator.
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=src.device)
        for _ in range(max_len - 1):
            tgt_mask = subsequent_mask(ys.size(1)).to(src.device)
            x = model.decode(memory, src_mask, ys, tgt_mask)
            # Only the last position predicts the next token.
            pred = model.generator(x[:, -1])
            next_token = pred.argmax(dim=-1)[0].view(1, 1)
            ys = torch.cat([ys, next_token], dim=-1)
    return ys

### Training Example

In [None]:
# Create small model for testing
V = 11
# SimpleLossCompute divides the criterion value by the token count and
# run_epoch divides the accumulated value by the total token count, so the
# criterion must return a SUM over tokens. The default ("mean") would
# normalise twice and under-report the loss by a factor of ntokens.
criterion = torch.nn.CrossEntropyLoss(reduction="sum")
model = make_model(V, V, N=2, d_model=64, d_ff=128, h=4, dropout=0.1)
# The base lr is multiplied by the scheduler's factor at every step.
optimizer = torch.optim.Adam(model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9)

from torch.optim.lr_scheduler import LambdaLR
def rate(step, model_size=64, factor=1.0, warmup=400):
    """Learning-rate multiplier with linear warmup then inverse-sqrt decay.

    Computes factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5).
    Step 0 is treated as step 1 to avoid raising zero to a negative power.
    """
    step = 1 if step == 0 else step
    decay = step ** (-0.5)
    warmup_ramp = step * warmup ** (-1.5)
    return factor * (model_size ** (-0.5) * min(decay, warmup_ramp))

# LambdaLR calls the function with the step count only, so `rate` runs with
# its default model_size/factor/warmup.
scheduler = LambdaLR(optimizer, lr_lambda=rate)

print("Training...")
# The loss helper holds no per-epoch state, so build it once.
loss_compute = SimpleLossCompute(model.generator, criterion)
for epoch in range(10):
    model.train()
    batches = data_gen(V, batch_size=30, nbatches=20)
    loss = run_epoch(batches, model, loss_compute, optimizer, scheduler, mode="train")
    print(f"Epoch {epoch} Loss: {loss:.4f}")

print("\nTraining complete!")

### Test Inference

In [None]:
# Switch to evaluation mode before decoding.
model.eval()

# One source sequence containing the tokens 1..10, with every position visible.
src = torch.arange(1, 11).unsqueeze(0)
src_mask = torch.ones(1, 1, src.size(1))

print("Source:", src)
result = greedy_decode(model, src, src_mask, max_len=src.size(1), start_symbol=1)
print("Generated:", result)
# The copy task succeeds when the decoded sequence equals the source.
print("\nSuccess!", torch.equal(src, result))

## 🎉 Congratulations! 🎉

You've implemented a Transformer from scratch and trained it successfully!

### Resources:
- [Original Paper](https://arxiv.org/pdf/1706.03762)
- [Harvard NLP Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html)