In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 一般用于batch下一个维度
input = torch.randn(2,4)
m = nn.BatchNorm1d(4)
output = m(input)
print(output)   
print(output.mean(0))  # Mean across the batch dimension
print(output.std(dim=0, unbiased=False))   # Standard deviation across the batch dimension

tensor([[-1.0000,  1.0000,  1.0000,  0.9999],
        [ 1.0000, -1.0000, -1.0000, -0.9999]],
       grad_fn=<NativeBatchNormBackward0>)
tensor([ 0.0000e+00,  0.0000e+00, -2.9802e-08, -2.9802e-08],
       grad_fn=<MeanBackward1>)
tensor([1.0000, 1.0000, 1.0000, 0.9999], grad_fn=<StdBackward0>)


In [None]:
input = torch.randn(2,2,2,3)    # [N, C, H, W] 
m = nn.BatchNorm2d(2)
output = m(input)
print(output)

first_channel = output[:, 0, :, :].reshape(-1)
print(first_channel)  # First channel of the output
print(first_channel.mean(0))  # Mean across the batch dimension
print(first_channel.std(dim=0, unbiased=False))  # Standard deviation across the batch dimension

tensor([[[[-1.9804, -0.2279,  1.3264],
          [-0.1175,  0.7658,  1.6648]],

         [[-0.0873, -0.8197, -0.2554],
          [-0.5422,  2.2112, -0.1473]]],


        [[[ 1.0093, -0.9595, -0.4740],
          [-0.8451, -0.1211, -0.0409]],

         [[-0.2429, -0.2483,  0.3325],
          [ 1.2085,  0.5927, -2.0018]]]], grad_fn=<NativeBatchNormBackward0>)
tensor([-1.9804, -0.2279,  1.3264, -0.1175,  0.7658,  1.6648,  1.0093, -0.9595,
        -0.4740, -0.8451, -0.1211, -0.0409], grad_fn=<UnsafeViewBackward0>)
tensor(-2.4835e-08, grad_fn=<MeanBackward1>)
tensor(1.0000, grad_fn=<StdBackward0>)


In [34]:
# 均值和标准差是在最后 D 个维度上计算的
batch, sentence_length, embedding_dim = 20, 5, 10
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
# Activate module
output = layer_norm(embedding)

print(output[0,0,:].mean(dim=0))
print(output[0,0,:].std(dim=0, unbiased=False))  # Standard deviation across the embedding dimension

tensor(3.2783e-08, grad_fn=<MeanBackward1>)
tensor(1.0000, grad_fn=<StdBackward0>)


In [40]:
N, C, H, W = 20, 5, 10, 10
input = torch.randn(N, C, H, W)
# Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
# as shown in the image below
layer_norm = nn.LayerNorm([C, H, W])
output = layer_norm(input)

print(output[0,:].mean())
print(output[0,:].std(unbiased=False))

print(output.mean(dim=(1, 2, 3)))  # Mean across the batch and spatial dimensions

tensor(7.1526e-09, grad_fn=<MeanBackward0>)
tensor(1.0000, grad_fn=<StdBackward0>)
tensor([ 7.1526e-09, -6.6757e-09, -1.3828e-08,  1.9073e-09,  4.7684e-09,
         1.9073e-09, -5.7220e-09,  1.0490e-08,  0.0000e+00, -6.6757e-09,
        -1.1444e-08,  1.3351e-08, -4.7684e-09,  1.9073e-09,  7.0930e-09,
        -5.7220e-09, -3.8147e-09, -1.9073e-09, -6.9141e-09,  1.7166e-08],
       grad_fn=<MeanBackward1>)


In [None]:
# torch2.7才有
rms_norm = nn.RMSNorm([2, 3])
input = torch.randn(2, 2, 3)
rms_norm(input)

AttributeError: module 'torch.nn' has no attribute 'RMSNorm'

layernorm VS RMSnorm

In [47]:
bs, seq_len, embedding_dim = 20, 5, 10
x = torch.randn(bs, seq_len, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
output = layer_norm(x)
print(output[2, 1, :].mean(dim=0))
print(output[0, 3, :].std(dim=0, unbiased=False))

tensor(-1.1921e-08, grad_fn=<MeanBackward1>)
tensor(1.0000, grad_fn=<StdBackward0>)


In [49]:
class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

In [51]:
rms_norm = RMSNorm(embedding_dim)
x_rms = rms_norm(x)
print(x[0, 0, :])
print(x_rms[0, 0, :])

tensor([-1.3787,  1.7914,  1.4959,  0.8239, -0.7749,  0.8249, -0.8182,  0.2586,
        -1.4281, -1.2396])
tensor([-1.1814,  1.5350,  1.2818,  0.7059, -0.6640,  0.7068, -0.7011,  0.2216,
        -1.2237, -1.0622], grad_fn=<SliceBackward0>)


In [52]:
x[0, 0, :] / torch.sqrt(torch.sum(x[0, 0, :].pow(2)) / embedding_dim)

tensor([-1.1814,  1.5350,  1.2818,  0.7059, -0.6640,  0.7068, -0.7011,  0.2216,
        -1.2237, -1.0622])

In [None]:
# 根号n 根号10 = 3.16
print(torch.norm(x_rms[0, 0, :]))
import numpy as np
print(np.sqrt(embedding_dim))

tensor(3.1623, grad_fn=<LinalgVectorNormBackward0>)
3.1622776601683795
