In [8]:
import torch
import torch.nn as nn

# Self-attention v1: minimal implementation with no bias and no mask.
class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention with raw weight matrices.

    The query, key and value projections are plain ``nn.Parameter`` matrices
    of shape ``[d_in, d_out]`` (no bias term), initialised with ``torch.rand``.

    Args:
        d_in: size of the last dimension of the input embeddings.
        d_out: size of the last dimension of the projected queries/keys/values.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Creation order (query, key, value) is kept stable so that a seeded
        # RNG yields the same initial weights from run to run.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Compute context vectors and the attention weights that produced them.

        Args:
            x: tensor of shape ``[num_tokens, d_in]``, optionally with leading
                batch dimensions (``[..., num_tokens, d_in]``).

        Returns:
            A ``(context, attn_weights)`` tuple where ``context`` has shape
            ``[..., num_tokens, d_out]`` and ``attn_weights`` has shape
            ``[..., num_tokens, num_tokens]``; each row of ``attn_weights``
            is a softmax distribution over the tokens.
        """
        queries = x @ self.W_query      # [..., num_tokens, d_out]
        keys    = x @ self.W_key        # [..., num_tokens, d_out]
        values  = x @ self.W_value      # [..., num_tokens, d_out]

        # Swap only the last two axes. `keys.T` would reverse *all* axes,
        # which is only correct for 2-D input and breaks batched input.
        attn_scores = queries @ keys.transpose(-2, -1)  # [..., num_tokens, num_tokens]

        # Scale by sqrt(d_out) before the softmax over the key axis.
        attn_weights = torch.softmax(
            attn_scores / (keys.shape[-1] ** 0.5), dim=-1
        )

        context = attn_weights @ values  # [..., num_tokens, d_out]
        return context, attn_weights  # weights are returned too, for inspection


In [9]:
# Toy setup: a sequence of 4 tokens, each a 5-dimensional embedding,
# projected to 8 dimensions inside the attention layer.
num_tokens, d_in, d_out = 4, 5, 8

# Input: random token embeddings of shape [num_tokens, d_in].
# (Drawn before the layer is built, so RNG consumption order is unchanged.)
x = torch.rand(num_tokens, d_in)

# Build the layer, then run a single forward pass.
attn = SelfAttention_v1(d_in, d_out)
out, attn_weights = attn(x)

# Show the attention-weight matrix: its shape first, then its values.
print("注意力权重矩阵 shape:", attn_weights.shape)
print("注意力权重矩阵内容:\n", attn_weights)


注意力权重矩阵 shape: torch.Size([4, 4])
注意力权重矩阵内容:
 tensor([[0.1374, 0.5729, 0.2220, 0.0678],
        [0.0998, 0.6732, 0.1882, 0.0387],
        [0.1307, 0.5953, 0.2134, 0.0606],
        [0.1572, 0.5265, 0.2302, 0.0862]], grad_fn=<SoftmaxBackward0>)
