In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 一般用于batch下一个维度
input = torch.randn(2,4)
m = nn.BatchNorm1d(4)
output = m(input)
print(output)   
print(output.mean(0))  # Mean across the batch dimension
print(output.std(dim=0, unbiased=False))   # Standard deviation across the batch dimension

tensor([[-1.0000,  0.9947, -1.0000,  1.0000],
        [ 1.0000, -0.9947,  1.0000, -1.0000]],
       grad_fn=<NativeBatchNormBackward0>)
tensor([0.0000e+00, 2.3842e-07, 0.0000e+00, 0.0000e+00],
       grad_fn=<MeanBackward1>)
tensor([1.0000, 0.9947, 1.0000, 1.0000], grad_fn=<StdBackward0>)


In [7]:
input = torch.randn(2,2,2,3)    # [N, C, H, W] 
m = nn.BatchNorm2d(2)
output = m(input)
print(output)

first_channel = output[:, 0, :, :].reshape(-1)
print(first_channel)  # First channel of the output
print(first_channel.mean(0))  # Mean across the batch dimension
print(first_channel.std(dim=0, unbiased=False))  # Standard deviation across the batch dimension

tensor([[[[-0.1321, -1.3829,  0.0633],
          [-0.2338,  1.7586, -0.4787]],

         [[-0.4129,  0.1674,  1.9476],
          [-0.5788, -1.5462,  0.9578]]],


        [[[ 1.1817,  0.5309,  1.0532],
          [-0.8182, -1.7818,  0.2397]],

         [[-0.4996,  1.4713, -0.7869],
          [-1.0685, -0.0793,  0.4281]]]], grad_fn=<NativeBatchNormBackward0>)
tensor([-0.1321, -1.3829,  0.0633, -0.2338,  1.7586, -0.4787,  1.1817,  0.5309,
         1.0532, -0.8182, -1.7818,  0.2397], grad_fn=<UnsafeViewBackward0>)
tensor(-4.9671e-09, grad_fn=<MeanBackward1>)
tensor(1.0000, grad_fn=<StdBackward0>)


In [8]:
# 均值和标准差是在最后 D 个维度上计算的
batch, sentence_length, embedding_dim = 20, 5, 10
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
# Activate module
output = layer_norm(embedding)

print(output[0,0,:].mean(dim=0))
print(output[0,0,:].std(dim=0, unbiased=False))  # Standard deviation across the embedding dimension

tensor(-3.5763e-08, grad_fn=<MeanBackward1>)
tensor(1.0000, grad_fn=<StdBackward0>)


In [9]:
N, C, H, W = 20, 5, 10, 10
input = torch.randn(N, C, H, W)
# Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
# as shown in the image below
layer_norm = nn.LayerNorm([C, H, W])
output = layer_norm(input)

print(output[0,:].mean())
print(output[0,:].std(unbiased=False))

print(output.mean(dim=(1, 2, 3)))  # Mean across the batch and spatial dimensions

tensor(-9.5367e-10, grad_fn=<MeanBackward0>)
tensor(1.0000, grad_fn=<StdBackward0>)
tensor([-9.5367e-10,  1.0490e-08, -9.5367e-10, -1.3351e-08, -1.9073e-09,
         1.9073e-09,  9.5367e-09, -9.5367e-09, -2.8610e-09,  2.8610e-09,
         5.7220e-09,  8.5831e-09, -1.9073e-09, -9.5367e-09,  3.8147e-09,
         1.1444e-08,  1.2398e-08, -4.7684e-09,  0.0000e+00, -1.9073e-09],
       grad_fn=<MeanBackward1>)


In [10]:
# torch2.7才有
rms_norm = nn.RMSNorm([2, 3])
input = torch.randn(2, 2, 3)
rms_norm(input)

AttributeError: module 'torch.nn' has no attribute 'RMSNorm'

layernorm VS RMSnorm

In [11]:
bs, seq_len, embedding_dim = 20, 5, 10
x = torch.randn(bs, seq_len, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
output = layer_norm(x)
print(output[2, 1, :].mean(dim=0))
print(output[0, 3, :].std(dim=0, unbiased=False))

tensor(5.9605e-09, grad_fn=<MeanBackward1>)
tensor(1.0000, grad_fn=<StdBackward0>)


In [12]:
class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

In [13]:
rms_norm = RMSNorm(embedding_dim)
x_rms = rms_norm(x)
print(x.pow(2).mean(-1, keepdim=True).shape)
print(x[0, 0, :])
print(x_rms[0, 0, :])

torch.Size([20, 5, 1])
tensor([ 0.7677, -0.5252, -0.8281, -0.1732,  0.3264,  0.3328,  0.9180,  0.7997,
         2.9066, -0.8317])
tensor([ 0.6889, -0.4713, -0.7430, -0.1554,  0.2928,  0.2986,  0.8237,  0.7176,
         2.6080, -0.7463], grad_fn=<SliceBackward0>)


In [None]:
x[0, 0, :] / torch.sqrt(torch.sum(x[0, 0, :].pow(2)) / embedding_dim)

tensor([-1.1814,  1.5350,  1.2818,  0.7059, -0.6640,  0.7068, -0.7011,  0.2216,
        -1.2237, -1.0622])

In [None]:
# 根号n 根号10 = 3.16
print(torch.norm(x_rms[0, 0, :]))
import numpy as np
print(np.sqrt(embedding_dim))

tensor(3.1623, grad_fn=<LinalgVectorNormBackward0>)
3.1622776601683795
