# Lecture 20: Layer Normalization

In [560]:
import torch
import torch.nn as nn 
from torch import nn


In [561]:
# Fix the RNG so the sample batch and the layer's initial weights are reproducible.
torch.manual_seed(123)

# Two sample rows with five features each; the feature count sets the layer's input width.
batch_example = torch.rand(2, 5)
emb_dim = batch_example.size(-1)
hidden_layers = 6  # used as the output width of the linear layer below

# Linear projection followed by a ReLU non-linearity.
layer = nn.Sequential(
    nn.Linear(in_features=emb_dim, out_features=hidden_layers),
    nn.ReLU(),
)
out = layer(batch_example)

print(f"Input Batch:\n{batch_example}\n")
print(f"Layer Output:\n{out}")

Input Batch:
tensor([[0.2961, 0.5166, 0.2517, 0.6886, 0.0740],
        [0.8665, 0.1366, 0.1025, 0.1841, 0.7264]])

Layer Output:
tensor([[0.0000, 0.0000, 0.4091, 0.6587, 0.3914, 0.0000],
        [0.0000, 0.0000, 0.1902, 0.3182, 0.6486, 0.0000]],
       grad_fn=<ReluBackward0>)


In [562]:
# Per-row statistics of the layer output, reduced over the last (feature) dimension.
# keepdim=True keeps a trailing axis of size 1 so the results broadcast against `out`.
# torch.var is called with its default settings here, exactly like out.var(...).
mean = torch.mean(out, dim=-1, keepdim=True)
var = torch.var(out, dim=-1, keepdim=True)

print(f"Output Mean Value:\n{mean}\n")
print(f"Output Variance:\n{var}")

Output Mean Value:
tensor([[0.2432],
        [0.1928]], grad_fn=<MeanBackward1>)

Output Variance:
tensor([[0.0799],
        [0.0670]], grad_fn=<VarBackward0>)


In [563]:
# Print tiny values such as 1e-08 as 0.0000 instead of in scientific notation.
torch.set_printoptions(sci_mode=False)

# Standardize every row of `out` with the per-row statistics computed above.
out_normalized = (out - mean) / torch.sqrt(var)

# Statistics of the normalized output go into their own names. Overwriting
# `mean` / `var` (the inputs of this cell) would make the cell non-idempotent:
# a second run would normalize `out` with mean ~0 and variance ~1.
norm_mean = out_normalized.mean(dim=-1, keepdim=True)
norm_var = out_normalized.var(dim=-1, keepdim=True)

print(f"Layer Output normalized:\n{out_normalized}\n")
print(f"Mean Value of Normalized Output (Sci Mode set to False):\n{norm_mean}\n")
print(f"Variance Value of normalized Output:\n{norm_var}")

Layer Output normalized:
tensor([[-0.8603, -0.8603,  0.5869,  1.4698,  0.5242, -0.8603],
        [-0.7450, -0.7450, -0.0102,  0.4844,  1.7608, -0.7450]],
       grad_fn=<DivBackward0>)

Mean Value of Normalized Output (Sci Mode set to False):
tensor([[     0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)

Variance Value of normalized Output:
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [564]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each slice along the last dimension is shifted to zero mean and scaled to
    (approximately) unit variance, then transformed element-wise by the
    trainable ``scale`` and ``shift`` parameters.

    Args:
        emb_dim: Size of the last dimension of the inputs; ``scale`` and
            ``shift`` each hold this many values.
        eps: Small constant added to the variance before taking the square
            root, so that a zero-variance row does not cause a division by zero.
    """

    def __init__(self, emb_dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps
        # Initialized to ones / zeros, so a fresh module only normalizes.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` along its last dimension and apply scale and shift.

        Args:
            x: Tensor whose last dimension must broadcast against ``scale``
                and ``shift`` (both of length ``emb_dim``).

        Returns:
            A tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by n rather than n - 1 when computing the variance.
        var = x.var(dim=-1, unbiased=False, keepdim=True)
        norm = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm + self.shift

In [565]:
# Apply the custom LayerNorm to the sample batch and compare statistics before and after.
layer_norm = LayerNorm(emb_dim=emb_dim)
# Call the module itself instead of .forward(): nn.Module.__call__ is the
# supported entry point and also runs any registered hooks.
output_normalized = layer_norm(batch_example)
print(f"Example Batch:\n{batch_example}\n")
print(f"Mean Value:\n{batch_example.mean(dim=-1, keepdim=True)}\n\nVariance Value:\n{batch_example.var(dim=-1, unbiased=False, keepdim=True)}\n")
print(f"Output normalized:\n{output_normalized}\n")
print(f"Mean Value:\n{output_normalized.mean(dim=-1, keepdim=True)}\n")
print(f"Variance Value:\n{output_normalized.var(dim=-1, unbiased=False, keepdim=True)}")

Example Batch:
tensor([[0.2961, 0.5166, 0.2517, 0.6886, 0.0740],
        [0.8665, 0.1366, 0.1025, 0.1841, 0.7264]])

Mean Value:
tensor([[0.3654],
        [0.4032]])

Variance Value:
tensor([[0.0460],
        [0.1057]])

Output normalized:
tensor([[-0.3229,  0.7049, -0.5302,  1.5069, -1.3587],
        [ 1.4247, -0.8199, -0.9248, -0.6739,  0.9940]], grad_fn=<AddBackward0>)

Mean Value:
tensor([[     0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)

Variance Value:
tensor([[0.9998],
        [0.9999]], grad_fn=<VarBackward0>)


In [566]:
# Compare softmax normalization with layer normalization on a fresh random batch.
torch.set_printoptions(sci_mode=False)
batch_example1 = torch.rand(3, 5)

output_normalized_sm = torch.softmax(batch_example1, dim=-1)
# Call the module rather than .forward() so nn.Module.__call__ (and any hooks) runs.
output_normalized_nl = layer_norm(batch_example1)

# Sum of the second row under each normalization. Tensor.sum() reduces in one
# call instead of iterating over the tensor with Python's builtin sum().
sum_sm = output_normalized_sm[1].sum()
sum_nl = output_normalized_nl[1].sum()

on_sm_mean = output_normalized_sm.mean(dim=-1, keepdim=True)
on_nl_mean = output_normalized_nl.mean(dim=-1, keepdim=True)

on_sm_var = output_normalized_sm.var(dim=-1, unbiased=False, keepdim=True)
on_nl_var = output_normalized_nl.var(dim=-1, unbiased=False, keepdim=True)

print(f"Row sum applying Softmax Normalization: {sum_sm:.2f}\nRow sum applying Layer Normalization: {sum_nl:.2f}\n")
print(f"Example Batch Softmax normalized:\n{output_normalized_sm}\nMean Values:\n{on_sm_mean}\nVariance Values:\n{on_sm_var}\n")
print(f"Example Batch Layer normalization:\n{output_normalized_nl}\nMean Values:\n{on_nl_mean}\nVariance Values:\n{on_nl_var}")

Row sum applying Softmax Normalization: 1.00
Row sum applying Layer Normalization: 0.00

Example Batch Softmax normalized:
tensor([[0.1511, 0.1768, 0.2551, 0.2218, 0.1951],
        [0.2221, 0.2242, 0.2417, 0.1868, 0.1252],
        [0.1455, 0.3037, 0.2042, 0.1322, 0.2145]])
Mean Values:
tensor([[0.2000],
        [0.2000],
        [0.2000]])
Variance Values:
tensor([[0.0013],
        [0.0017],
        [0.0037]])

Example Batch Layer normalization:
tensor([[-1.4612, -0.5941,  1.4379,  0.6639, -0.0465],
        [ 0.5510,  0.5900,  0.9076, -0.1802, -1.8685],
        [-0.9192,  1.5546,  0.2206, -1.2413,  0.3852]], grad_fn=<AddBackward0>)
Mean Values:
tensor([[     0.0000],
        [     0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)
Variance Values:
tensor([[0.9997],
        [0.9998],
        [0.9999]], grad_fn=<VarBackward0>)
