In [14]:
import torch
import torch.nn as nn

# Toy batch for the demo: 2 sentences (B) x 3 tokens (T) x 4 features (C).
X = torch.tensor([
    [
        [0.1, 0.2, 0.3, 0.4],  # sentence 1, token 1
        [0.5, 0.6, 0.7, 0.8],  # sentence 1, token 2
        [0.9, 1.0, 1.1, 1.2]   # sentence 1, token 3
    ],
    [
        [10.0, 11.0, 12.0, 13.0], # sentence 2, token 1
        [14.0, 15.0, 16.0, 17.0], # sentence 2, token 2
        [18.0, 19.0, 20.0, 21.0]  # sentence 2, token 3
    ]
], dtype=torch.float32)
B, T, C = X.shape

# LayerNorm normalises each token vector over its last (feature) dimension.
# The size passed in must be C, the feature dimension unpacked above; the
# previous code used an undefined name `N`, which only worked through leftover
# kernel state and fails on a fresh Restart & Run All.
layernorm = nn.LayerNorm(C)
X_ln = layernorm(X)
print("LayerNorm 后的输出:\n", X_ln)
print(f"LayerNorm 后的均值:\n{X_ln.mean(dim=-1)}， 方差:\n{X_ln.var(dim=-1, unbiased=False)}")

# Reproduce the LayerNorm result by hand for one token to show what the layer
# computes: (x - mean) / sqrt(var + eps), all taken over the feature axis.
sentence1_wordvec1 = X[0, 0, :]  # vector of sentence 1, token 1
vec_mean = sentence1_wordvec1.mean(0)
vec_var = sentence1_wordvec1.var(unbiased=False)  # LayerNorm uses the biased variance estimator
epsilon = 1e-5  # guards against division by zero; same value as nn.LayerNorm's default eps
vec_norm = (sentence1_wordvec1 - vec_mean) / (vec_var + epsilon).sqrt()
print(f"\n句子 1 的词 1 的向量:\n{sentence1_wordvec1}")
print(f"均值: {vec_mean}, 方差: {vec_var}, 归一化向量: {vec_norm}")

LayerNorm 后的输出:
 tensor([[[-1.3411, -0.4470,  0.4470,  1.3411],
         [-1.3411, -0.4470,  0.4470,  1.3411],
         [-1.3411, -0.4470,  0.4470,  1.3411]],

        [[-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416]]],
       grad_fn=<NativeLayerNormBackward0>)
LayerNorm 后的均值:
tensor([[ 2.9802e-08, -2.9802e-07,  5.3644e-07],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00]], grad_fn=<MeanBackward1>)， 方差:
tensor([[0.9992, 0.9992, 0.9992],
        [1.0000, 1.0000, 1.0000]], grad_fn=<VarBackward0>)

句子 1 的词 1 的向量:
tensor([0.1000, 0.2000, 0.3000, 0.4000])
均值: 0.25, 方差: 0.012500000186264515, 归一化向量: tensor([-1.3411, -0.4470,  0.4470,  1.3411])


In [16]:
# BatchNorm1d expects (rows, features), so collapse the (B, T, C) input into
# B*T rows of C features; every token becomes one row of the batch.
X_reshape = X.view(B * T, C)

# Normalise each of the C feature columns across all B*T rows, then restore
# the original (B, T, C) layout.
batchnorm = nn.BatchNorm1d(num_features=C)
X_bn = batchnorm(X_reshape)
X_bn_fold = X_bn.view(B, T, C)

# Show the flattened input, the normalised rows, and the re-folded tensor.
print(X_reshape)
print(X_bn)
print("BatchNorm 后的输出还原张量形状:\n", X_bn_fold)

tensor([[ 0.1000,  0.2000,  0.3000,  0.4000],
        [ 0.5000,  0.6000,  0.7000,  0.8000],
        [ 0.9000,  1.0000,  1.1000,  1.2000],
        [10.0000, 11.0000, 12.0000, 13.0000],
        [14.0000, 15.0000, 16.0000, 17.0000],
        [18.0000, 19.0000, 20.0000, 21.0000]])
tensor([[-1.0017, -1.0046, -1.0070, -1.0088],
        [-0.9457, -0.9518, -0.9569, -0.9613],
        [-0.8896, -0.8989, -0.9069, -0.9138],
        [ 0.3853,  0.4230,  0.4566,  0.4866],
        [ 0.9457,  0.9518,  0.9569,  0.9613],
        [ 1.5061,  1.4805,  1.4573,  1.4360]],
       grad_fn=<NativeBatchNormBackward0>)
BatchNorm 后的输出还原张量形状:
 tensor([[[-1.0017, -1.0046, -1.0070, -1.0088],
         [-0.9457, -0.9518, -0.9569, -0.9613],
         [-0.8896, -0.8989, -0.9069, -0.9138]],

        [[ 0.3853,  0.4230,  0.4566,  0.4866],
         [ 0.9457,  0.9518,  0.9569,  0.9613],
         [ 1.5061,  1.4805,  1.4573,  1.4360]]], grad_fn=<ViewBackward0>)


In [18]:
# Reproduce BatchNorm by hand for a single feature: normalise column 0 using
# statistics taken across all rows of the flattened (B*T, C) input.
epsilon = 1e-5  # guards against division by zero
feature1 = X_reshape[:, 0]  # feature 0 across every row (a column, not a single sample)
feature1_mean = feature1.mean()
feature1_var = feature1.var(unbiased=False)  # biased variance estimator
feature1_std = (feature1_var + epsilon).sqrt()
feature1_norm = (feature1 - feature1_mean) / feature1_std
print(f"\n第一个特征的归一化:\n{feature1_norm}")


第一个特征的归一化:
tensor([-1.0017, -0.9457, -0.8896,  0.3853,  0.9457,  1.5061])
