In [19]:
# 简易实现transformer

import torch
import torch.nn as nn

# 假设超参数
vocab_size = 10  # 词表大小
d_model = 6      # 词向量维度
seq_len = 5        # 句子长度
batch_size = 2    # 批次大小

# 定义 Embedding 层
# 这一层的参数量是: vocab_size * d_model
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

# 1. 构造输入 (Batch_Size, Seq_Len)
# 这里是随机生成的单词索引，范围在 [0, vocab_size-1] 之间
input_indices = torch.randint(0, vocab_size, (batch_size, seq_len))
print("输入尺寸:", input_indices.shape) 

# 2. 经过 Embedding 层
output_vectors = embedding_layer(input_indices)
print("输出尺寸:", output_vectors.shape) # 期望是 (Batch_Size, Seq_Len, D_Model)
# print(output_vectors)


输入尺寸: torch.Size([2, 5])
输出尺寸: torch.Size([2, 5, 5])


In [8]:
#接下来是实战例子演示
#假设句子是: "I love deep learning"
#词表: {"I":0, "love":1, "deep":2, "learning":3}
#对应的索引序列是: [0, 1, 2, 3]
str = "I love deep learning"
word_to_index = {"I":0, "love":1, "deep":2, "learning":3}
input_sentence = [word_to_index[word] for word in str.split()]
input_tensor = torch.tensor([input_sentence])  # 添加 batch 维度,[[]]多了个[]
print("输入索引:", input_tensor)

输入索引: tensor([[0, 1, 2, 3]])


In [44]:
# 假设超参数
vocab_size = 5  # 词表大小
d_model = 6       # 词向量维度
seq_len = 5        # 句子长度
batch_size = 2     # 批次大小

# 定义 Embedding 层
# 这一层的参数量是: vocab_size * d_model
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
# 经过 Embedding 层
output_vectors = embedding_layer(input_tensor)
print("输出词向量尺寸:", output_vectors.shape)# 期望是 (Batch_Size, Seq_Len, D_Model)
print("输出词向量:", output_vectors)
for i in range(len(str.split())):
    print(f"单词: {str.split()[i]} -> 词向量: {output_vectors[0][i].detach().numpy()}")  


输出词向量尺寸: torch.Size([1, 4, 6])
输出词向量: tensor([[[-0.0667,  0.5876, -1.1609, -0.0734, -0.7703, -0.3741],
         [-0.2260,  0.4813, -0.6121, -1.0415,  1.5145,  0.3271],
         [-0.3737, -1.8556,  0.9718, -0.8834,  0.6442, -0.7859],
         [ 0.7231, -1.3994, -0.8637,  1.3661, -1.1196,  0.0040]]],
       grad_fn=<EmbeddingBackward0>)
单词: I -> 词向量: [-0.06668201  0.5875759  -1.1609036  -0.07340842 -0.7702971  -0.37410647]
单词: love -> 词向量: [-0.22602133  0.4812881  -0.6121076  -1.0415363   1.5144681   0.3270686 ]
单词: deep -> 词向量: [-0.37365475 -1.8555632   0.97181636 -0.8833915   0.64417136 -0.7858806 ]
单词: learning -> 词向量: [ 0.7231205  -1.3993503  -0.86371547  1.3661411  -1.1196176   0.00401473]


In [None]:
 # [-0.06668201  0.5875759  -1.1609036  -0.07340842 -0.7702971  -0.37410647]
 # [-0.22602133  0.4812881  -0.6121076  -1.0415363   1.5144681   0.3270686 ]
 # [-0.37365475 -1.8555632   0.97181636 -0.8833915   0.64417136 -0.7858806 ]
 # [ 0.7231205  -1.3993503  -0.86371547  1.3661411  -1.1196176   0.00401473]

In [47]:
# pos计算PE矩阵
import torch
import math
str = "I love deep learning"
word_to_index = {"I":0, "love":1, "deep":2, "learning":3}
input_sentence = [word_to_index[word] for word in str.split()]
input_tensor = torch.tensor([input_sentence])  # 添加 batch 维度,[[]]多了个[]
print("输入索引:", input_tensor)
# 假设参数
seq_len = len(str.split())#词的数量 4
# print(seq_len)
d_model = 6  # 词向量维度

# 1. 先创建一个空的 Tensor 来占位
# 形状是 (Seq_Len, d_model)，比如 (4, 5)
pe_tensor = torch.zeros(seq_len, d_model)
# 2. 你的循环逻辑（填空）

# 这是基础写法，好理解，但效率不高
for pos in range(seq_len):
    for i in range(d_model):
        if i % 2 == 0:
            val = math.sin(pos / (10000 ** (2 * i / d_model)))
        else:
            val = math.cos(pos / (10000 ** (2 * i / d_model))) 
            
        # 关键步骤：把计算出的值填入 Tensor
        pe_tensor[pos, i] = val

print("生成的 PE 矩阵形状:", pe_tensor.shape)
# print(pe_tensor)
print("输出词向量尺寸:", output_vectors.shape)# 期望是 (Batch_Size, Seq_Len, D_Model)
# 对应矩阵尺寸进行加法
final_output =output_vectors+pe_tensor.unsqueeze(0)# unsqueeze(0) 是为了匹配 batch 维度
print(final_output)


输入索引: tensor([[0, 1, 2, 3]])
生成的 PE 矩阵形状: torch.Size([4, 6])
输出词向量尺寸: torch.Size([1, 4, 6])
tensor([[[-0.0667,  1.5876, -1.1609,  0.9266, -0.7703,  0.6259],
         [ 0.6154,  1.4802, -0.6100, -0.0415,  1.5145,  1.3271],
         [ 0.5356, -0.8599,  0.9761,  0.1166,  0.6442,  0.2141],
         [ 0.8642, -0.4090, -0.8573,  2.3661, -1.1196,  1.0040]]],
       grad_fn=<AddBackward0>)


In [48]:
# 优化写法，利用张量运算
pos = torch.arange(0, seq_len).unsqueeze(1)  # (Seq_Len, 1)
i = torch.arange(0, d_model,2).unsqueeze(0)    #生成 [0, 2, 4]
denom = torch.pow(10000, (2 * i) / d_model)  # 分母
tmp = pos / denom  #算出一半
#接下来分奇偶公用的进行sin和cos
#因为这里你自己算你下就可以找到规律是0列个1列共一个单元，2列个3列共一个单元...
pe_tensor[:, 0::2] = torch.sin(tmp)  # 偶数位置
pe_tensor[:, 1::2] = torch.cos(tmp)  # 奇数位置
print(pe_tensor.shape)
print("生成的 PE 矩阵形状:", pe_tensor.shape)
# print(pe_tensor)
print("输出词向量尺寸:", output_vectors.shape)# 期望是 (Batch_Size, Seq_Len, D_Model)
# 对应矩阵尺寸进行加法
final_output =output_vectors+pe_tensor.unsqueeze(0)# unsqueeze(0) 是为了匹配 batch 维度
print(final_output)


torch.Size([4, 6])
生成的 PE 矩阵形状: torch.Size([4, 6])
输出词向量尺寸: torch.Size([1, 4, 6])
tensor([[[-0.0667,  1.5876, -1.1609,  0.9266, -0.7703,  0.6259],
         [ 0.6154,  1.0216, -0.6100, -0.0415,  1.5145,  1.3271],
         [ 0.5356, -2.2717,  0.9761,  0.1166,  0.6442,  0.2141],
         [ 0.8642, -2.3893, -0.8573,  2.3661, -1.1196,  1.0040]]],
       grad_fn=<AddBackward0>)


In [51]:
#实现自注意力机制模块
def Attention(Q, K, V):
    """
    计算自注意力机制,以下三个矩阵都是来自上层经过线性变换得到的
    Q: 查询矩阵 (Batch_Size, Seq_Len, D_k)
    K: 键矩阵 (Batch_Size, Seq_Len, D_k)
    V: 值矩阵 (Batch_Size, Seq_Len, D_v)
    返回:
    输出矩阵 (Batch_Size, Seq_Len, D_v)
    """
    d_k = Q.size(-1)  # 获取 D_k 的维度,因为Q是(Batch_Size, Seq_Len, D_k)最后一维表示d_k
    # 1. 计算注意力分数，套用公式
    #注意力分数矩阵的计算公式是 Q * K^T / sqrt(d_k)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)  # (Batch_Size, Seq_Len, Seq_Len)
    
    # 2. 应用 softmax 得到注意力权重
    attn_weights = torch.softmax(scores, dim=-1)  # (Batch_Size, Seq_Len, Seq_Len),按照列来看，这里指的是每一批次的矩阵seq_len*seq_len
    """
                  看 I (Col 0)  ,看 love (Col 1)      ,看 AI (Col 2), (这是 dim=-1)
我是 I (Row 0)        ,10.0          ,5.0                  ,2.0,"← 这一行是 ""I"" 的视角"
我是 love (Row 1)     ,4.0           ,12.0                 ,8.0,
我是 AI (Row 2)       ,1.0           ,6.0                  ,11.0,
    """
    # 3. 计算加权和，公式为attn_weights * V矩阵相乘
    output = torch.matmul(attn_weights, V)  # (Batch_Size, Seq_Len, D_v)
    
    return output
    
    

In [52]:
# 测试 Attention 函数
import torch
import math 
batch_size = 2
seq_len = 4
d_k = 6
d_v = 6
# 随机生成 Q, K, V 矩阵
Q = torch.rand(batch_size, seq_len, d_k)
K = torch.rand(batch_size, seq_len, d_k)
V = torch.rand(batch_size, seq_len, d_v)
# 计算注意力输出
output = Attention(Q, K, V)
print("注意力输出尺寸:", output.shape)  # 期望是 (Batch_Size, Seq_Len, D_v)
print("注意力输出:", output)

注意力输出尺寸: torch.Size([2, 4, 6])
注意力输出: tensor([[[0.3803, 0.8813, 0.4401, 0.4152, 0.4913, 0.4493],
         [0.3890, 0.8765, 0.4447, 0.4184, 0.4722, 0.4609],
         [0.3907, 0.8757, 0.4588, 0.4140, 0.4712, 0.4673],
         [0.4047, 0.8729, 0.4829, 0.4090, 0.4535, 0.4848]],

        [[0.2425, 0.2381, 0.7493, 0.6029, 0.4236, 0.4700],
         [0.2375, 0.2363, 0.7496, 0.5939, 0.4271, 0.4802],
         [0.2538, 0.2265, 0.7471, 0.5850, 0.4104, 0.4833],
         [0.2473, 0.2302, 0.7525, 0.5877, 0.4285, 0.4835]]])


In [None]:
num_heads = 2  # 注意力头数
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model  # 词向量维度
        self.num_heads = num_heads  # 注意力头数
        assert self.d_model % self.num_heads == 0, "d_model 必须能被 num_heads 整除"
        self.d_k = self.d_model // self.num_heads  # 每个头的维度,平分
        
        # 定义线性变换层
        self.W_Q = nn.Linear(self.d_model, self.d_model)
        self.W_K = nn.Linear(self.d_model, self.d_model)
        self.W_V = nn.Linear(self.d_model, self.d_model)
        self.W_O = nn.Linear(self.d_model, self.d_model)# 输出线性变换层作用是把多头拼接后的结果映射回d_model维度
    def forward(self, Q, K, V, mask=None):
        """
        Q, K, V: 输入的张量，形状通常是 [Batch_Size, Seq_Len, d_model]
        mask: 掩码张量 (可选)
        """
        batch_size = Q.size(0) # 获取批次大小

        # =================================================
        # 1. 线性变换 (Projection)
        # =================================================
        # 经过全连接层，形状依然是 [Batch, Seq_Len, d_model]
        # 比如 [2, 4, 6] -> [2, 4, 6]
        Q = self.W_Q(Q)
        K = self.W_K(K)
        V = self.W_V(V)

        # =================================================
        # 2. 切割与转置 (Split Heads) - 最关键的一步！
        # =================================================
        # 我们要把 d_model (6) 切成 num_heads (2) * d_k (3)
        # view: [Batch, Seq_Len, d_model] -> [Batch, Seq_Len, num_heads, d_k]
        # transpose: 交换维度 1 和 2 -> [Batch, num_heads, Seq_Len, d_k]
        # 为什么要交换？为了让 num_heads 靠近 batch 维度，这样 PyTorch 就能并行一次算出所有头的注意力
        Q = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        # 此时 Q, K, V 的形状都是: [2, 2, 4, 3] (Batch, Heads, Seq, d_k)

        # =================================================
        # 3. 计算注意力分数 (Scaled Dot-Product Attention)
        # =================================================
        # 矩阵乘法: Q * K.T
        # K.transpose(-2, -1) 把最后两个维度互换，变成 [..., 3, 4] 以便相乘
        # scores 形状: [Batch, Heads, Seq_Len, Seq_Len] -> [2, 2, 4, 4]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # (可选) 如果有 mask，在这里填充负无穷
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # Softmax 归一化
        attn_weights = torch.softmax(scores, dim=-1)

        # 加权求和: weights * V
        # [2, 2, 4, 4] * [2, 2, 4, 3] -> [2, 2, 4, 3] (context)
        context = torch.matmul(attn_weights, V)

        # =================================================
        # 4. 拼接与还原 (Concat)
        # =================================================
        # 先把维度换回来: transpose(1, 2) -> [Batch, Seq_Len, num_heads, d_k]
        context = context.transpose(1, 2)
        
        # contiguous() 是为了解决 transpose 后内存不连续的问题，必须加
        # view: 把最后两个维度 (Heads, d_k) 强行捏在一起 -> d_model
        # 结果形状: [Batch, Seq_Len, d_model] -> [2, 4, 6]
        context = context.contiguous().view(batch_size, -1, self.d_model)

        # =================================================
        # 5. 最后的线性变换 (Output Projection)
        # =================================================
        # 融合多头的信息
        output = self.W_O(context)

        return output
        
        

In [None]:

class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super(LayerNorm, self).__init__()
        # 1. 定义两个可学习的参数 (这就是公式里的 gamma 和 beta)
        # nn.Parameter 表示这两个变量是需要跟着模型一起训练更新的
        self.gamma = nn.Parameter(torch.ones(d_model))  # 初始化为 1 (不做缩放)
        self.beta = nn.Parameter(torch.zeros(d_model))  # 初始化为 0 (不做平移)
        self.eps = eps

    def forward(self, x):
        # x 的形状: [Batch, Seq_Len, d_model]
        
        # 2. 算均值 (沿着最后一维 d_model 算)，就是每个词向量的均值，怎么才能不迷，就是按谁算谁算谁消失
        # keepdim=True 很重要，为了保持维度形状方便后面相减
        mean = x.mean(-1, keepdim=True) 
        
        # 3. 算标准差 (std)
        std = x.std(-1, keepdim=True)
        
        # 4. 归一化 (减均值，除标准差)
        # 这里的 eps 是为了防止 std 是 0 导致报错
        output = (x - mean) / (std + self.eps)
        
        # 5. 加上可学习的参数 (缩放 + 平移)
        # 这一步让模型有能力“反悔”，如果不需要归一化，它可以学出对应的 gamma 和 beta 来抵消
        output = output * self.gamma + self.beta
        
        return output