In [2]:
import torch
from torch import nn
from d2l import torch as d2l

In [3]:
def batchNormalization(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and return the updated running statistics.

    When autograd is disabled the function normalizes with the supplied
    running statistics; otherwise it normalizes with the statistics of the
    current batch and blends them into the running statistics.

    Args:
        X: Input tensor. With autograd enabled it must be rank 2 (statistics
            taken over dim 0) or rank 4 (statistics taken over dims 0, 2, 3).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean, used directly when autograd is disabled.
        moving_var: Running variance, used directly when autograd is disabled.
        eps: Constant added to the variance before the square root.
        momentum: Weight kept on the previous running statistic; the batch
            statistic receives ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)``. Both running statistics are
        detached from the autograd graph.

    Raises:
        AssertionError: If autograd is enabled and ``X`` is not rank 2 or 4.
    """
    if not torch.is_grad_enabled():
        # Autograd off is treated as prediction mode: rely on running statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)  # 2: fully connected layer, 4: convolutional layer
        if len(X.shape) == 2:
            # Per-feature statistics over the batch dimension.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Per-channel statistics; keepdim=True keeps the result
            # broadcastable against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        moving_mean = momentum * moving_mean + (1 - momentum) * mean
        moving_var = momentum * moving_var + (1 - momentum) * var
    Y = gamma * X_hat + beta
    # Detach BOTH running statistics: they are bookkeeping, not part of the
    # loss, and returning an attached moving_mean would keep every previous
    # batch's graph alive through the caller's stored running mean.
    return Y, moving_mean.detach(), moving_var.detach()

In [18]:
X = torch.randn(2,2,3)
X

tensor([[[-1.7600,  1.2787,  1.9835],
         [ 1.0104,  0.9450,  1.2862]],

        [[ 0.2103,  0.6154, -1.7522],
         [-0.5461, -2.8473,  1.0881]]])

In [49]:
X.mean(dim=(0,2)).shape

torch.Size([2])

In [20]:
X[0]

tensor([[-1.7600,  1.2787,  1.9835],
        [ 1.0104,  0.9450,  1.2862]])

In [29]:
# Mean of channel 0 and of channel 1, each pooled over both batch entries.
# Only the last expression of a cell is echoed, so the recorded output is the
# channel-1 mean; the channel-0 result on the first line is discarded.
torch.cat((X[0][0], X[1][0]),dim=0).mean()
torch.cat((X[0][1], X[1][1]),dim=0).mean()

tensor(0.1561)

In [44]:
bn_input = torch.randn((2,3,4,4))

In [32]:
# Raises TypeError: batchNormalization declares seven positional parameters
# without defaults, so passing only the input tensor cannot succeed.
batchNormalization(bn_input)

TypeError: batchNormalization() missing 6 required positional arguments: 'gamma', 'beta', 'moving_mean', 'moving_var', 'eps', and 'momentum'

In [70]:
bn = nn.BatchNorm2d(2)

In [46]:
# NOTE(review): the recorded ValueError reports a 3-D input, but the visible
# definition of bn_input is 4-D with 3 channels (and bn is BatchNorm2d(2)), so
# this output presumably comes from an earlier kernel state - re-run from a
# fresh kernel to confirm.
bn(bn_input)

ValueError: expected 4D input (got 3D input)

In [47]:
torch.is_grad_enabled()

True

In [50]:
# Raises NameError: the import cell brings in only `torch`, `nn` and `d2l`,
# so the bare name `Sigmoid` is undefined.
# NOTE(review): torch.sigmoid(bn_input) or nn.Sigmoid()(bn_input) was
# presumably intended - confirm before changing.
Sigmoid(bn_input)

NameError: name 'Sigmoid' is not defined

In [61]:
ln = nn.LayerNorm(4)

In [62]:
ln.weight

Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)

In [72]:
input = torch.randn(1,2,3,4)

In [77]:
ln(input)

tensor([[[[-0.4938, -0.0699,  1.6220, -1.0583],
          [-0.4204,  0.6967, -1.4226,  1.1463],
          [-0.4573,  1.3256, -1.3443,  0.4760]],

         [[-0.9545, -0.9178,  1.4330,  0.4393],
          [-0.7525,  0.7001, -1.1867,  1.2391],
          [-0.0539, -1.3965,  0.0198,  1.4306]]]],
       grad_fn=<NativeLayerNormBackward0>)

In [81]:
input[:,:,:,0].mean()

tensor(-0.1227)

In [82]:
input[:,:,:,0].var()

tensor(0.0130)

In [85]:
# Standardize channel 0 by hand. The denominator must be the square root of the
# biased variance plus eps (eps inside the root), not the variance itself;
# with that, the result agrees with channel 0 of the recorded bn(input) output.
channel0 = input[0,0,:,:]
(channel0 - channel0.mean()) / torch.sqrt(channel0.var(unbiased=False) + 1e-5)

tensor([[-0.5519, -0.3621,  0.3954, -0.8047],
        [-0.2819,  0.5117, -0.9940,  0.8311],
        [-0.3464,  2.2288, -1.6276,  1.0017]])

In [88]:
bn(input) 

tensor([[[[-0.5519, -0.3621,  0.3954, -0.8047],
          [-0.2819,  0.5117, -0.9940,  0.8311],
          [-0.3464,  2.2288, -1.6276,  1.0017]],

         [[-0.5902, -0.5574,  1.5466,  0.6572],
          [-0.6311,  0.8712, -1.0801,  1.4286],
          [-0.4614, -1.7130, -0.3928,  0.9224]]]],
       grad_fn=<NativeBatchNormBackward0>)

In [121]:
bn1 = nn.BatchNorm1d(3) # number of features: 3

In [114]:
input = torch.tensor(
    [[[1,0,2,0,0],
     [0,1,0,3,8],
     [2,5,3,0,6]],
    [[1,0,2,7,6],
     [5,1,0,1,1],
     [3,5,3,0,0]]]
    ,dtype=torch.float32
) # 2 batch, 3 features, 5 word
input

tensor([[[1., 0., 2., 0., 0.],
         [0., 1., 0., 3., 8.],
         [2., 5., 3., 0., 6.]],

        [[1., 0., 2., 7., 6.],
         [5., 1., 0., 1., 1.],
         [3., 5., 3., 0., 0.]]])

In [117]:
input[:,0,:].mean()

tensor(1.9000)

In [126]:
input[:,0,:].var()

tensor(6.5444)

In [125]:
# Standardize feature 0 by hand across the batch and sequence positions.
# Use the biased variance and add eps inside the square root; the unbiased
# default of .var() and an eps added outside the root give values that differ
# from the recorded bn1(input) output.
feature0 = input[:,0,:]
(feature0 - feature0.mean()) / torch.sqrt(feature0.var(unbiased=False) + 1e-5)

tensor([[-0.3708, -0.7829,  0.0412, -0.7829, -0.7829],
        [-0.3708, -0.7829,  0.0412,  2.1014,  1.6894]])

In [122]:
bn1(input)

tensor([[[-0.3708, -0.7829,  0.0412, -0.7829, -0.7829],
         [-0.8032, -0.4016, -0.8032,  0.4016,  2.4097],
         [-0.3333,  1.0952,  0.1429, -1.2857,  1.5714]],

        [[-0.3708, -0.7829,  0.0412,  2.1014,  1.6894],
         [ 1.2048, -0.4016, -0.8032, -0.4016, -0.4016],
         [ 0.1429,  1.0952,  0.1429, -1.2857, -1.2857]]],
       grad_fn=<NativeBatchNormBackward0>)

In [136]:
bn2 = nn.BatchNorm2d(1)
x = torch.randn(1,1,4,2)
x

tensor([[[[-0.1972,  0.7265],
          [-0.1274, -0.3454],
          [-0.4251, -0.3399],
          [-0.0634,  1.6831]]]])

In [137]:
bn2(x)

tensor([[[[-0.4555,  0.8969],
          [-0.3534, -0.6725],
          [-0.7892, -0.6645],
          [-0.2596,  2.2976]]]], grad_fn=<NativeBatchNormBackward0>)

In [182]:
input = torch.randn(2,3,4) # 2: batch, 3: features, 4: sequence length
bn1 = nn.BatchNorm1d(3) # num_features is 3, matching dim 1 of `input`
print(bn1(input))

tensor([[[ 0.3635, -0.9590, -0.3511,  2.2032],
         [-0.4991,  0.2462,  0.4101,  1.0989],
         [ 0.7936, -0.4599,  1.7564,  0.1505]],

        [[ 0.2624, -1.3754, -0.0637, -0.0799],
         [ 0.5055,  0.2822,  0.3728, -2.4166],
         [-0.3519,  0.2814, -0.2209, -1.9491]]],
       grad_fn=<NativeBatchNormBackward0>)


In [184]:
# Hand-normalize each of the three feature slices input[:, k, :] with its own
# mean and biased variance (eps = 1e-5 inside the square root).
a1, a2, a3 = (
    (feat - feat.mean()) / torch.sqrt(feat.var(unbiased=False) + 1e-5)
    for feat in (input[:, k, :] for k in range(3))
)

In [192]:
# Reassemble the three hand-normalized feature slices along dim 1 so `a` has
# the same (2, 3, 4) layout as `input`.
a = torch.stack((a1, a2, a3), dim=1)
# Compare with the absolute difference: a signed `a - bn1(input) < 1e-5` is
# True for any negative error, however large, and so proves nothing.
(a - bn1(input)).abs() < 1e-5

tensor([[[True, True, True, True],
         [True, True, True, True],
         [True, True, True, True]],

        [[True, True, True, True],
         [True, True, True, True],
         [True, True, True, True]]])

In [251]:
ln1 = nn.LayerNorm(4)# 4 features
alpha = ln1.weight = torch.nn.Parameter(torch.randn(4))
beta = ln1.bias = torch.nn.Parameter(torch.randn(4))
ln1.weight, ln1.bias

(Parameter containing:
 tensor([ 0.4261, -0.1346,  1.1897, -0.6672], requires_grad=True),
 Parameter containing:
 tensor([-1.2664, -1.0838,  0.7171, -0.1019], requires_grad=True))

In [252]:
ln1(input)

tensor([[[-1.2487, -0.9394,  0.0503, -1.1637],
         [-1.8762, -1.0677,  0.9184, -1.0235],
         [-1.1452, -0.9165,  2.4514,  0.2312]],

        [[-0.8751, -0.8562,  1.1918, -0.3508],
         [-0.9794, -1.1497,  1.3889,  1.0512],
         [-1.1603, -1.2192,  1.1998,  1.0065]]],
       grad_fn=<NativeLayerNormBackward0>)

In [255]:
# Reproduce ln1 by hand: every length-4 row along the last dim is normalized
# with its own mean and biased variance (eps = 1e-5 inside the square root),
# then scaled by alpha and shifted by beta.
row_mean = input.mean(dim=-1, keepdim=True)
row_var = input.var(dim=-1, unbiased=False, keepdim=True)
b = alpha * (input - row_mean) / torch.sqrt(row_var + 1e-5) + beta
# Compare with the absolute difference: a signed `b - ln1(input) < 1e-5` is
# True for any negative error, however large, and so proves nothing.
(b - ln1(input)).abs() < 1e-5

tensor([[[True, True, True, True],
         [True, True, True, True],
         [True, True, True, True]],

        [[True, True, True, True],
         [True, True, True, True],
         [True, True, True, True]]])

In [227]:
# Constant vector, so its variance is exactly zero. The values are [1, 1] to
# agree with the recorded outputs of this cell and of p.var(unbiased=False).
p = torch.tensor([1,1],dtype=torch.float32)
p

tensor([1., 1.])

In [228]:
p.var(unbiased=False)

tensor(0.)