In [1]:
import torch
import torch.nn as nn

In [25]:
import torch
import torch.nn as nn

# Two samples (rows) with three features each (columns).
input_tensor = torch.tensor(
    [
        [1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0],
    ]
)

# The layer normalises over the trailing axis, so normalized_shape is the
# feature count (3).
layer_norm = nn.LayerNorm(3)

# Run the input through the layer.
output_tensor = layer_norm(input_tensor)

print("Input Tensor:\n", input_tensor)
print("Output Tensor:\n", output_tensor)

# Per-row mean and biased variance (unbiased=False), first for the raw input
# and then for the normalised output. After the loop, `mean` and `variance`
# hold the statistics of the output tensor.
for mean_heading, variance_heading, stats_source in (
    ("Mean of Input Tensor:", "Variance of Input Tensor:", input_tensor),
    ("Mean of Output Tensor:", "Variance of Output Tensor:", output_tensor),
):
    mean = stats_source.mean(dim=1)
    variance = stats_source.var(dim=1, unbiased=False)
    print(mean_heading, mean)
    print(variance_heading, variance)


Input Tensor:
 tensor([[1., 2., 3.],
        [4., 5., 6.]])
Output Tensor:
 tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]], grad_fn=<NativeLayerNormBackward0>)
Mean of Input Tensor: tensor([2., 5.])
Variance of Input Tensor: tensor([0.6667, 0.6667])
Mean of Output Tensor: tensor([0., 0.], grad_fn=<MeanBackward1>)
Variance of Output Tensor: tensor([1.0000, 1.0000], grad_fn=<VarBackward0>)


In [8]:
# Pull out the three quantities LayerNorm works with:
#   epsilon (ϵ) - the layer's eps value
#   gamma   (γ) - the learnable scale, stored as `weight`
#   beta    (β) - the learnable shift, stored as `bias`
epsilon, gamma, beta = layer_norm.eps, layer_norm.weight, layer_norm.bias

print(f"Epsilon (ϵ): {epsilon}")
print(f"Gamma (γ): {gamma}")
print(f"Beta (β): {beta}")

# gamma and beta are the layer's own weight/bias objects, so their
# requires_grad flags tell us whether they are learnable (expected: True).
print("Gamma (γ) requires_grad:", gamma.requires_grad)
print("Beta (β) requires_grad:", beta.requires_grad)

Epsilon (ϵ): 1e-05
Gamma (γ): Parameter containing:
tensor([1., 1., 1.], requires_grad=True)
Beta (β): Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
Gamma (γ) requires_grad: True
Beta (β) requires_grad: True


In [14]:
# Reproduce the LayerNorm computation by hand with plain tensor operations.
input_tensor = torch.tensor(
    [
        [1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0],
    ]
)

# Stand-ins for the layer's parameters.
normalized_shape = input_tensor.shape[1]  # feature count
eps = 1e-5  # small constant for numerical stability
gamma = torch.ones(normalized_shape)  # scale, all ones
beta = torch.zeros(normalized_shape)  # shift, all zeros

# Step 1: per-row mean and biased variance. keepdim=True leaves a size-1
# axis in place so both broadcast against the (2, 3) input below.
mean = input_tensor.mean(dim=1, keepdim=True)
variance = input_tensor.var(dim=1, keepdim=True, unbiased=False)

# Step 2: centre each row, then divide by its eps-stabilised standard deviation.
denominator = torch.sqrt(variance + eps)
normalized = (input_tensor - mean) / denominator

# Step 3: scale by gamma and shift by beta.
output_tensor = gamma * normalized + beta

# Show every intermediate result, in the order it was computed.
for heading, shown in (
    ("Input Tensor:\n", input_tensor),
    ("Mean:\n", mean),
    ("Variance:\n", variance),
    ("Normalized Tensor:\n", normalized),
    ("Output Tensor (after applying gamma and beta):\n", output_tensor),
):
    print(heading, shown)





Input Tensor:
 tensor([[1., 2., 3.],
        [4., 5., 6.]])
Mean:
 tensor([[2.],
        [5.]])
Variance:
 tensor([[0.6667],
        [0.6667]])
Normalized Tensor:
 tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]])
Output Tensor (after applying gamma and beta):
 tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]])


In [19]:
# LayerNorm for the single row [1, 2, 3], worked out with scalar arithmetic.
row = (1.0, 2.0, 3.0)
eps = 1e-5

# Mean and biased variance (divide by N = 3) as plain Python floats.
mean = sum(row) / 3.0
var = sum((value - mean) ** 2 for value in row) / 3.0

# Promote both statistics to tensors so torch.sqrt can be applied.
mean = torch.tensor(mean)
var = torch.tensor(var)

# Every element shares the same eps-stabilised standard deviation.
std = torch.sqrt(var + eps)
x1, x2, x3 = ((value - mean) / std for value in row)

print(x1, x2, x3)

tensor(-1.2247) tensor(0.) tensor(1.2247)


In [24]:
# more test data: same experiment as the 3-feature demo, on wider rows with
# less regular values.

# Define input tensor (batch_size=2, features=5)
input_tensor = torch.tensor([[1.0, 2.0, 3.0, 5.0, 7.0], [4.0, 5.0, 6.0, 10.0, 21.0]])

# LayerNorm with normalized_shape = number of features (5 in this case).
# The count is read from the tensor instead of being typed in a second time,
# so the layer cannot drift out of sync with the data.
# NOTE: this rebinds `layer_norm`; any cell that inspects `layer_norm` after
# this one runs sees the 5-feature layer.
layer_norm = nn.LayerNorm(normalized_shape=input_tensor.size(1))

# Apply LayerNorm
output_tensor = layer_norm(input_tensor)

print("Input Tensor:\n", input_tensor)
print("Output Tensor:\n", output_tensor)

# per-row mean and biased variance (unbiased=False) of the input tensor
mean = input_tensor.mean(dim=1)
variance = input_tensor.var(dim=1, unbiased=False)
print("Mean of Input Tensor:", mean)
print("Variance of Input Tensor:", variance)

# per-row mean and biased variance of the output tensor; expected to be
# approximately 0 and 1 respectively (the mean shows tiny float32 residue)
mean = output_tensor.mean(dim=1)
variance = output_tensor.var(dim=1, unbiased=False)
print("Mean of Output Tensor:", mean)
print("Variance of Output Tensor:", variance)

Input Tensor:
 tensor([[ 1.,  2.,  3.,  5.,  7.],
        [ 4.,  5.,  6., 10., 21.]])
Output Tensor:
 tensor([[-1.2070, -0.7428, -0.2785,  0.6499,  1.5784],
        [-0.8331, -0.6729, -0.5127,  0.1282,  1.8905]],
       grad_fn=<NativeLayerNormBackward0>)
Mean of Input Tensor: tensor([3.6000, 9.2000])
Variance of Input Tensor: tensor([ 4.6400, 38.9600])
Mean of Output Tensor: tensor([3.5763e-08, 2.3842e-08], grad_fn=<MeanBackward1>)
Variance of Output Tensor: tensor([1.0000, 1.0000], grad_fn=<VarBackward0>)
