In [1]:
# Q1_scaled_attention.py
import numpy as np

def softmax(x, axis=-1):
    """
    Softmax of `x` along `axis`.

    The maximum along `axis` is subtracted before exponentiating, so the
    largest exponent is exp(0) = 1 and large inputs do not overflow. The
    shift cancels in the ratio, leaving the result unchanged.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    normalizer = np.sum(exps, axis=axis, keepdims=True)
    return exps / normalizer

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Inputs (expected shapes):
      Q: (batch, seq_q, d_k)
      K: (batch, seq_k, d_k)
      V: (batch, seq_k, d_v)
      mask: optional array broadcastable to (batch, seq_q, seq_k). It is
        interpreted as a boolean "hide this position" mask: every non-zero
        entry (True, 1, -inf, ...) is masked, every zero/False entry is kept.
        A 0 / -inf additive mask therefore works, but a finite additive bias
        is NOT added to the scores -- its non-zero entries are masked.
    Returns:
      attention_weights: (batch, seq_q, seq_k)
      context: (batch, seq_q, d_v)

    Masked scores are replaced by a large finite negative value (-1e9), so a
    query row whose keys are all masked yields uniform weights, not NaN.
    """
    d_k = Q.shape[-1]
    # raw scores: (batch, seq_q, seq_k); scaled by sqrt(d_k)
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)
    if mask is not None:
        # Make the boolean interpretation explicit: non-zero means "masked".
        blocked = np.asarray(mask).astype(bool)
        scores = np.where(blocked, -1e9, scores)
    # attention weights: normalise over the key axis
    attn_weights = softmax(scores, axis=-1)
    # context: weighted sum over values -> (batch, seq_q, d_v)
    context = np.matmul(attn_weights, V)
    return attn_weights, context

# smoke test: check output shapes on seeded random inputs
if __name__ == "__main__":
    np.random.seed(1)
    B, seq_q, seq_k = 2, 3, 4
    d_k, d_v = 8, 6
    # Draw Q, K, V in this order so the seeded values stay reproducible.
    Q = np.random.randn(B, seq_q, d_k)
    K = np.random.randn(B, seq_k, d_k)
    V = np.random.randn(B, seq_k, d_v)
    attn_w, ctx = scaled_dot_product_attention(Q, K, V)
    # expected: (2, 3, 4)
    print("attn_w shape:", attn_w.shape)
    # expected: (2, 3, 6)
    print("ctx shape:", ctx.shape)


attn_w shape: (2, 3, 4)
ctx shape: (2, 3, 6)


In [2]:
# Q2_transformer_encoder.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleMultiHeadSelfAttention(nn.Module):
    """
    Multi-head self-attention over a (batch, seq_len, d_model) input.

    Q, K and V are linear projections of the same input; each is split into
    `num_heads` heads of width `d_model // num_heads`, attended per head with
    scaled dot-product attention, concatenated and projected by `W_o`.
    """

    def __init__(self, d_model: int, num_heads: int):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_head = d_model // num_heads

        # linear projections for Q, K, V and output
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    @staticmethod
    def _expand_mask(mask):
        """Return `mask` as a bool tensor broadcastable to (B, h, T, T)."""
        mask = mask.bool()
        rank = mask.dim()
        if rank == 2:
            # (T, T): shared across batch and heads
            return mask[None, None]
        if rank == 3:
            # (B, T, T): insert the head axis
            return mask.unsqueeze(1)
        if rank == 4:
            # (B, 1 or h, T, T): already has a head axis
            return mask
        raise ValueError(
            "mask must have 2, 3 or 4 dimensions, got %d" % rank
        )

    def forward(self, x, mask=None):
        """
        Apply self-attention to `x`.

        Inputs:
          x: (batch, seq_len, d_model)
          mask: optional tensor, non-zero/True marks positions to hide.
            Accepted shapes: (T, T), (B, T, T) or (B, 1 or h, T, T).
        Returns:
          out: (batch, seq_len, d_model)
          attn: (batch, num_heads, seq_len, seq_len) attention weights

        Masked scores are filled with -inf, so a query row whose keys are
        all masked produces NaN weights after the softmax.
        """
        B, T, _ = x.size()
        Q = self.W_q(x)  # (B, T, d_model)
        K = self.W_k(x)
        V = self.W_v(x)

        # reshape to heads: (B, num_heads, T, d_head)
        def split_heads(tensor):
            return tensor.view(B, T, self.num_heads, self.d_head).transpose(1, 2)

        Qh = split_heads(Q)
        Kh = split_heads(K)
        Vh = split_heads(V)

        # scaled dot-product per head -> (B, h, T, T)
        scores = torch.matmul(Qh, Kh.transpose(-2, -1)) / (self.d_head ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(self._expand_mask(mask), float('-inf'))
        attn = F.softmax(scores, dim=-1)  # (B, h, T, T)
        out_heads = torch.matmul(attn, Vh)  # (B, h, T, d_head)

        # concat heads -> (B, T, d_model); contiguous() is needed before view
        out = out_heads.transpose(1, 2).contiguous().view(B, T, self.d_model)
        out = self.W_o(out)
        return out, attn  # return attention for debugging if desired

class TransformerEncoderBlock(nn.Module):
    """
    One encoder layer: self-attention then a position-wise feed-forward net.

    Each sub-layer is wrapped as LayerNorm(x + Dropout(sublayer(x))), i.e. the
    normalisation is applied after the residual addition.
    """

    def __init__(self, d_model=128, num_heads=8, d_ff=512, dropout=0.1):
        super().__init__()
        self.self_attn = SimpleMultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # feed-forward: expand to d_ff, ReLU, project back to d_model
        expand = nn.Linear(d_model, d_ff)
        project = nn.Linear(d_ff, d_model)
        self.ffn = nn.Sequential(expand, nn.ReLU(), project)
        # one dropout module shared by both residual branches
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Inputs:
          x: (B, T, d_model)
          mask: optional attention mask, forwarded unchanged to self_attn
        Returns:
          (output with the same shape as x, attention weights from self_attn)
        """
        # attention sub-layer: residual + dropout, then norm
        attended, weights = self.self_attn(x, mask=mask)
        hidden = self.norm1(x + self.dropout(attended))

        # feed-forward sub-layer: residual + dropout, then norm
        hidden = self.norm2(hidden + self.dropout(self.ffn(hidden)))

        return hidden, weights

# shape check for the encoder block on a random batch
if __name__ == "__main__":
    B, T = 32, 10
    d_model, num_heads = 128, 8
    x = torch.randn(B, T, d_model)

    block = TransformerEncoderBlock(d_model=d_model, num_heads=num_heads, d_ff=512)
    out, attn = block(x)
    # expect (32, 10, 128)
    print("out.shape:", out.shape)
    # expect (32, num_heads, 10, 10)
    print("attn.shape:", attn.shape)


out.shape: torch.Size([32, 10, 128])
attn.shape: torch.Size([32, 8, 10, 10])
