In [1]:
import torch
import torch.nn as nn

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
# Toy activations: a 3 x 6 tensor (3 rows, 6 features) sampled from a standard normal.
outputs = torch.randn(3, 6)
outputs

tensor([[ 0.7196, -0.4500,  1.7217, -1.1994, -0.1944, -1.0804],
        [-0.8168,  0.8535,  0.4712, -0.1405, -0.6911, -0.8297],
        [ 1.1576,  1.3676, -1.7349, -1.0120, -0.8987,  1.2206]])

In [3]:
# Mean of each row, taken across the 6 features (dim=1); keepdim=True keeps
# the reduced dimension so the result is a (3, 1) column.
outputs.mean(dim=1, keepdim=True)

tensor([[-0.0805],
        [-0.1922],
        [ 0.0167]])

In [4]:
# Standard deviation of each row. Note: torch's std applies Bessel's correction
# (divides by N - 1) by default, whereas the LayerNorm class below normalizes
# with the population variance (unbiased=False).
outputs.std(dim=1, keepdim=True)

tensor([[1.1216],
        [0.7186],
        [1.3813]])

In [5]:
# Print tensors in fixed-point notation instead of scientific notation.
torch.set_printoptions(sci_mode=False)

In [None]:
# Layer normalization improves the stability and efficiency of neural network
# training. The main idea is to adjust the activations (outputs) of a neural
# network layer to have a mean of 0 and a variance of 1, also known as unit
# variance. This adjustment speeds up convergence to effective weights and
# ensures consistent, reliable training.


class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine transform.

    Every slice along the last dimension is shifted to zero mean and scaled to
    unit variance, then multiplied element-wise by ``scale`` and offset by
    ``shift``.

    Args:
        emb_dim: Number of learnable values in ``scale`` and ``shift``; inputs
            are expected to have this size in their last dimension.
        eps: Small constant added to the variance before taking the square
            root, so the division stays finite when a slice has zero
            variance. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Start as the identity transform: scale of 1 and shift of 0.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply scale and shift.

        Args:
            x: Input tensor; its last dimension must broadcast against
                ``scale`` and ``shift``.

        Returns:
            The normalized tensor after the element-wise affine transform.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False -> population variance (divide by N rather than N - 1).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [7]:
# Build a layer norm sized for the 6 features of `outputs` and normalize each row.
ln = LayerNorm(emb_dim=6)
outputs_normalized = ln(outputs)

In [8]:
# With scale initialised to ones and shift to zeros, each row is now centred at
# ~0 with ~unit population variance. The grad_fn in the output appears because
# scale and shift are trainable parameters.
outputs_normalized

tensor([[ 0.7815, -0.3609,  1.7602, -1.0929, -0.1113, -0.9766],
        [-0.9521,  1.5942,  1.0113,  0.0789, -0.7606, -0.9717],
        [ 0.9048,  1.0713, -1.3891, -0.8158, -0.7259,  0.9547]],
       grad_fn=<AddBackward0>)