# Transformers from Scratch

In [1]:
import torch
import torch.nn.functional as F

# Toy example: a sequence of 3 tokens, each with a 2-dimensional query,
# key, and value vector (one row per token).
Q = torch.tensor([
    [1.0, 0.0],
    [0.0, 1.0],
    [1.0, 1.0],
])
K = torch.tensor([
    [1.0, 0.0],
    [1.0, 1.0],
    [0.0, 1.0],
])
V = torch.tensor([
    [1.0, 0.0],
    [0.0, 2.0],
    [1.0, 1.0],
])
L, d_h = Q.shape  # sequence length L = 3, head dimension d_h = 2

# Scaled dot-product scores: entry (i, j) is the dot product of query i
# with key j, divided by sqrt(d_h).
scores = torch.matmul(Q, K.T) / d_h**0.5  # [L, L]

# Softmax along the last (key) axis, so each query's weights sum to 1.
attn_weights = F.softmax(scores, dim=-1)  # [L, L]

# Each output row is the attention-weighted average of the rows of V.
attn_output = torch.matmul(attn_weights, V)  # [L, d_h]

# Report each intermediate result in the order it was computed.
print(f"scores shape: {scores.shape}")
print(scores)
print(f"\nattn_weights shape: {attn_weights.shape}")
print(attn_weights)
print(f"\nattn_output shape: {attn_output.shape}")
print(attn_output)

scores shape: torch.Size([3, 3])
tensor([[0.7071, 0.7071, 0.0000],
        [0.0000, 0.7071, 0.7071],
        [0.7071, 1.4142, 0.7071]])

attn_weights shape: torch.Size([3, 3])
tensor([[0.4011, 0.4011, 0.1978],
        [0.1978, 0.4011, 0.4011],
        [0.2483, 0.5035, 0.2483]])

attn_output shape: torch.Size([3, 2])
tensor([[0.5989, 1.0000],
        [0.5989, 1.2033],
        [0.4965, 1.2552]])
