## Batch Normalization

PyTorch: [BatchNorm2d](https://pytorch.org/docs/master/nn.html#torch.nn.BatchNorm2d)

Normalize each channel (e.g. `x[:, k, :, :]` for an NCHW tensor), using the mean and variance computed over the batch and spatial dimensions:
$$\hat{x}^{(k)} = \frac{x^{(k)} - \textbf{E}[x^{(k)}]}{\sqrt{  \text{Var}[x^{(k)}] + \epsilon   }}$$

Scale and shift the normalized value:
$$ y^{(k)} = \gamma ^ {(k)} \hat{x} ^{(k)} + \beta^{(k)} $$

In PyTorch, $\gamma$ and $\beta$ are stored in `bn.weight` and `bn.bias` respectively. If `affine` is `False`, `bn` has no learnable parameters, so `bn.weight` and `bn.bias` are `None`.

During training, the running statistics (used later at inference time) are updated after every batch:

```
running_mean = (1 - momentum) * running_mean + momentum * batch_mean
running_var  = (1 - momentum) * running_var  + momentum * batch_var
```

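Note that `torch.var` defaults to the unbiased estimator, i.e. $\text{Var}[x] = \frac{1}{m - 1} \sum_{i=1}^{m} (x_i - \textbf{E}[x])^2$, which is $\frac{m}{m - 1}$ times the biased variance $\frac{1}{m} \sum_{i=1}^{m} (x_i - \textbf{E}[x])^2$. Batch normalization normalizes with the biased variance (hence `unbiased=False` in the code below), while `running_var` is updated with the unbiased estimate.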

In [2]:
import torch
import numpy as np

# Input of shape (2, 4, 3, 2) holding the values 0..47 as floats; the second
# axis is the channel axis that BatchNorm2d(4) normalizes over.
x = torch.arange(48).reshape(2, 4, 3, 2).float()
# affine=False: pure normalization, no learnable gamma/beta.
bn = torch.nn.BatchNorm2d(4, affine=False)
print('bn.eps:', bn.eps)
y = bn(x)

# Re-derive the training-mode output by hand, one channel at a time.
y_mine = torch.zeros_like(y)
for k in range(4):
    channel = x[:, k, :, :]
    centered = channel - torch.mean(channel)
    # Biased variance (divide by m, not m - 1) plus eps for numerical stability.
    std = torch.sqrt(torch.var(channel, unbiased=False) + bn.eps)
    y_mine[:, k, :, :] = centered / std

print('y:', y, sep='\n')
print('y_mine:', y_mine, sep='\n')

bn.eps: 1e-05
y:
tensor([[[[-1.1963, -1.1138],
          [-1.0313, -0.9488],
          [-0.8663, -0.7838]],

         [[-1.1963, -1.1138],
          [-1.0313, -0.9488],
          [-0.8663, -0.7838]],

         [[-1.1963, -1.1138],
          [-1.0313, -0.9488],
          [-0.8663, -0.7838]],

         [[-1.1963, -1.1138],
          [-1.0313, -0.9488],
          [-0.8663, -0.7838]]],


        [[[ 0.7838,  0.8663],
          [ 0.9488,  1.0313],
          [ 1.1138,  1.1963]],

         [[ 0.7838,  0.8663],
          [ 0.9488,  1.0313],
          [ 1.1138,  1.1963]],

         [[ 0.7838,  0.8663],
          [ 0.9488,  1.0313],
          [ 1.1138,  1.1963]],

         [[ 0.7838,  0.8663],
          [ 0.9488,  1.0313],
          [ 1.1138,  1.1963]]]])
y_mine:
tensor([[[[-1.1963, -1.1138],
          [-1.0313, -0.9488],
          [-0.8663, -0.7838]],

         [[-1.1963, -1.1138],
          [-1.0313, -0.9488],
          [-0.8663, -0.7838]],

         [[-1.1963, -1.1138],
          [-1.0313, -0

In [3]:
bn = torch.nn.BatchNorm2d(4) # eps=1e-5, affine=True(gamma and beta are enabled)
# `bn.bias` is a leaf parameter that requires grad, so it must be modified
# in place outside of autograd tracking; `bn.bias[:] += 5` raises a
# RuntimeError on current PyTorch versions.
with torch.no_grad():
    bn.bias.add_(5) # bias starts at zero, so this sets it to 5
print('bn.bias:', bn.bias.data)
y = bn(x)


# Re-derive the training-mode output by hand: normalize each channel with its
# batch statistics (biased variance), then scale by gamma and shift by beta.
y_mine = torch.zeros_like(y)
for i in range(4):
    y_mine[:, i, :, :] = (x[:, i, :, :] - torch.mean(x[:, i, :, :])) / torch.sqrt(torch.var(x[:, i, :, :], unbiased=False) + bn.eps) * bn.weight[i] + bn.bias[i]

print('y:', y, sep='\n')
print('y_mine:', y_mine, sep='\n')

bn.bias: tensor([5., 5., 5., 5.])
y:
tensor([[[[4.7718, 4.7875],
          [4.8033, 4.8190],
          [4.8347, 4.8505]],

         [[4.8400, 4.8511],
          [4.8621, 4.8731],
          [4.8842, 4.8952]],

         [[4.7793, 4.7946],
          [4.8098, 4.8250],
          [4.8402, 4.8554]],

         [[3.8254, 3.9064],
          [3.9874, 4.0684],
          [4.1494, 4.2304]]],


        [[[5.1495, 5.1653],
          [5.1810, 5.1967],
          [5.2125, 5.2282]],

         [[5.1048, 5.1158],
          [5.1269, 5.1379],
          [5.1489, 5.1600]],

         [[5.1446, 5.1598],
          [5.1750, 5.1902],
          [5.2054, 5.2207]],

         [[5.7696, 5.8506],
          [5.9316, 6.0126],
          [6.0936, 6.1746]]]], grad_fn=<ThnnBatchNormBackward>)
y_mine:
tensor([[[[4.7718, 4.7875],
          [4.8033, 4.8190],
          [4.8347, 4.8505]],

         [[4.8400, 4.8511],
          [4.8621, 4.8731],
          [4.8842, 4.8952]],

         [[4.7793, 4.7946],
          [4.8098, 4.8250],
   

### Inference

$$y^{(k)} = \frac{x^{(k)} - \text{running_mean}^{(k)}}{\sqrt{  \text{running_var}^{(k)} + \epsilon   }} * \gamma^{(k)} + \beta^{(k)}$$


In [4]:
# Fresh layer switched to eval mode: the forward pass normalizes with the
# stored running statistics instead of the statistics of the current batch.
bn = torch.nn.BatchNorm2d(4).eval() # eps=1e-5, affine=True(gamma and beta are enabled)
y = bn(x)

# Manual inference-mode computation, channel by channel.
y_mine = torch.zeros_like(y)
for k in range(4):
    shifted = x[:, k, :, :] - bn.running_mean[k]
    denom = torch.sqrt(bn.running_var[k] + bn.eps)
    y_mine[:, k, :, :] = shifted / denom * bn.weight[k] + bn.bias[k]

print('y:', y, sep='\n')
print('y_mine:', y_mine, sep='\n')

y:
tensor([[[[ 0.0000,  0.0891],
          [ 0.1781,  0.2672],
          [ 0.3562,  0.4453]],

         [[ 3.7017,  4.3187],
          [ 4.9356,  5.5526],
          [ 6.1695,  6.7865]],

         [[ 9.0030,  9.7533],
          [10.5035, 11.2538],
          [12.0040, 12.7543]],

         [[15.5209, 16.3832],
          [17.2455, 18.1078],
          [18.9700, 19.8323]]],


        [[[ 2.1375,  2.2266],
          [ 2.3156,  2.4047],
          [ 2.4937,  2.5828]],

         [[18.5086, 19.1256],
          [19.7425, 20.3595],
          [20.9764, 21.5934]],

         [[27.0091, 27.7593],
          [28.5096, 29.2598],
          [30.0101, 30.7603]],

         [[36.2155, 37.0778],
          [37.9401, 38.8023],
          [39.6646, 40.5269]]]], grad_fn=<ThnnBatchNormBackward>)
y_mine:
tensor([[[[ 0.0000,  0.0891],
          [ 0.1781,  0.2672],
          [ 0.3562,  0.4453]],

         [[ 3.7017,  4.3187],
          [ 4.9356,  5.5526],
          [ 6.1695,  6.7865]],

         [[ 9.0030,  9.7533],
   

In [5]:
# Run a single forward pass in training mode so the running statistics get
# updated from this batch, then switch back to eval mode.
bn.train()
bn(x)
print(bn.running_mean)
bn.eval()

y = bn(x)

# Manual inference-mode computation with the updated running statistics.
y_mine = torch.zeros_like(y)
for k in range(4):
    shifted = x[:, k, :, :] - bn.running_mean[k]
    denom = torch.sqrt(bn.running_var[k] + bn.eps)
    y_mine[:, k, :, :] = shifted / denom * bn.weight[k] + bn.bias[k]

print('y:', y, sep='\n')
print('y_mine:', y_mine, sep='\n')

tensor([1.4500, 2.0500, 2.6500, 3.2500])
y:
tensor([[[[-0.0314, -0.0097],
          [ 0.0119,  0.0336],
          [ 0.0552,  0.0768]],

         [[ 0.5923,  0.7423],
          [ 0.8922,  1.0422],
          [ 1.1921,  1.3421]],

         [[ 1.7050,  1.8874],
          [ 2.0697,  2.2521],
          [ 2.4344,  2.6168]],

         [[ 3.0913,  3.3009],
          [ 3.5105,  3.7201],
          [ 3.9297,  4.1392]]],


        [[[ 0.4881,  0.5098],
          [ 0.5314,  0.5531],
          [ 0.5747,  0.5964]],

         [[ 4.1912,  4.3412],
          [ 4.4912,  4.6411],
          [ 4.7911,  4.9410]],

         [[ 6.0815,  6.2639],
          [ 6.4462,  6.6286],
          [ 6.8109,  6.9933]],

         [[ 8.1213,  8.3309],
          [ 8.5405,  8.7500],
          [ 8.9596,  9.1692]]]], grad_fn=<ThnnBatchNormBackward>)
y_mine:
tensor([[[[-0.0314, -0.0097],
          [ 0.0119,  0.0336],
          [ 0.0552,  0.0768]],

         [[ 0.5923,  0.7423],
          [ 0.8922,  1.0422],
          [ 1.1921,  1.3