## Positional Encoding

This notebook implements positional encoding for Transformer neural networks with PyTorch.

In [6]:
import torch
import torch.nn as nn

# Demo hyperparameters: embedding width and number of positions to encode.
d_model = 64
max_sequence_length = 10

$$
PE(\text{position}, 2i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

$$
PE(\text{position}, 2i+1) = \cos\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

We can rewrite these as

$$
PE(\text{position}, i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{i}{d_{model}}} \bigg) \text{ when i is even}
$$

$$
PE(\text{position}, i) = \cos\bigg( \frac{ \text{position} }{10000^\frac{i-1}{d_{model}}} \bigg) \text{ when i is odd}
$$

In [7]:
# Even feature indices below d_model, created directly as float32.
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
even_i

tensor([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20., 22., 24., 26.,
        28., 30., 32., 34., 36., 38., 40., 42., 44., 46., 48., 50., 52., 54.,
        56., 58., 60., 62.])

In [8]:
# Exponent i / d_model for every even index, then the 10000^(i / d_model) scaling terms.
even_exponent = even_i / d_model
even_denominator = torch.pow(10000, even_exponent)
even_denominator

tensor([1.0000e+00, 1.3335e+00, 1.7783e+00, 2.3714e+00, 3.1623e+00, 4.2170e+00,
        5.6234e+00, 7.4989e+00, 1.0000e+01, 1.3335e+01, 1.7783e+01, 2.3714e+01,
        3.1623e+01, 4.2170e+01, 5.6234e+01, 7.4989e+01, 1.0000e+02, 1.3335e+02,
        1.7783e+02, 2.3714e+02, 3.1623e+02, 4.2170e+02, 5.6234e+02, 7.4989e+02,
        1.0000e+03, 1.3335e+03, 1.7783e+03, 2.3714e+03, 3.1623e+03, 4.2170e+03,
        5.6234e+03, 7.4989e+03])

In [9]:
# Odd feature indices below d_model, created directly as float32.
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
odd_i

tensor([ 1.,  3.,  5.,  7.,  9., 11., 13., 15., 17., 19., 21., 23., 25., 27.,
        29., 31., 33., 35., 37., 39., 41., 43., 45., 47., 49., 51., 53., 55.,
        57., 59., 61., 63.])

In [10]:
# Denominator for the odd (cosine) indices. An odd index i uses the exponent
# (i - 1) / d_model, i.e. the exponent of the even index just before it.
# Stored under its own name so the even-index result computed earlier is not overwritten.
odd_denominator = torch.pow(10000, (odd_i - 1)/d_model)
odd_denominator

tensor([1.0000e+00, 1.3335e+00, 1.7783e+00, 2.3714e+00, 3.1623e+00, 4.2170e+00,
        5.6234e+00, 7.4989e+00, 1.0000e+01, 1.3335e+01, 1.7783e+01, 2.3714e+01,
        3.1623e+01, 4.2170e+01, 5.6234e+01, 7.4989e+01, 1.0000e+02, 1.3335e+02,
        1.7783e+02, 2.3714e+02, 3.1623e+02, 4.2170e+02, 5.6234e+02, 7.4989e+02,
        1.0000e+03, 1.3335e+03, 1.7783e+03, 2.3714e+03, 3.1623e+03, 4.2170e+03,
        5.6234e+03, 7.4989e+03])

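`even_denominator` and `odd_denominator` are the same, because for an odd index $i$ the exponent uses $i - 1$, which is exactly the preceding even index. So we only need to compute one of them, and we can call the resulting variable `denominator`.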

In [11]:
# The sine and cosine terms share the same denominators, so keep a single copy.
denominator = even_denominator

In [12]:
# Positions 0 .. max_sequence_length - 1 as a float column vector of shape (max_sequence_length, 1).
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)

In [13]:
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [18]:
# Dividing the (positions, 1) column by the 1-D denominator broadcasts to one
# angle per (position, frequency) pair; sine and cosine reuse the same angles.
angle = position / denominator
even_PE = angle.sin()
odd_PE = angle.cos()

In [19]:
even_PE

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  6.8156e-01,  5.3317e-01,  4.0931e-01,  3.1098e-01,
          2.3492e-01,  1.7689e-01,  1.3296e-01,  9.9833e-02,  7.4919e-02,
          5.6204e-02,  4.2157e-02,  3.1618e-02,  2.3712e-02,  1.7782e-02,
          1.3335e-02,  9.9998e-03,  7.4989e-03,  5.6234e-03,  4.2170e-03,
          3.1623e-03,  2.3714e-03,  1.7783e-03,  1.3335e-03,  1.0000e-03,
          7.4989e-04,  5.6234e-04,  4.2170e-04,  3.1623e-04,  2.3714e-04,
          1.7783e-04,  1.3335e-04],
        [ 9.0930e-01,  9.9748e-01,  9.02

In [20]:
even_PE.shape

torch.Size([10, 32])

In [21]:
odd_PE

tensor([[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.7318,  0.8460,  0.9124,  0.9504,  0.9720,  0.9842,  0.9911,
          0.9950,  0.9972,  0.9984,  0.9991,  0.9995,  0.9997,  0.9998,  0.9999,
          0.9999,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
        [-0.4161,  0.0709,  0.4315,  0.6649,  0.8066,  0.8896,  0.9374,  0.9646,
          0.9801,  0.9888,  0.9937,  0.9964,  0.9980,  0.9989,  0.9994,  0.9996,
          0.9998,  0.9999,  0.9999,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
        [-0.9900, -0.6279

In [22]:
odd_PE.shape

torch.Size([10, 32])

In [23]:
# Pair each sine value with its cosine along a new trailing axis.
stacked = torch.stack((even_PE, odd_PE), dim=-1)
stacked.shape

torch.Size([10, 32, 2])

In [24]:
# Merge the last two axes so each row interleaves as sin, cos, sin, cos, ...
PE = torch.flatten(stacked, start_dim=1)
PE

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  6.8156e-01,  7

## Class

Let's combine all the code above into a cute class

In [25]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding table for Transformer inputs.

    Even feature indices hold ``sin(position / 10000^(i / d_model))`` and each
    following odd index holds the cosine of the same angle.

    Args:
        d_model: Width of every encoding vector (number of columns).
        max_sequence_length: Number of positions to encode (number of rows).
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Build the encoding table.

        Returns:
            A float32 tensor of shape ``(max_sequence_length, d_model)`` whose
            columns alternate sine and cosine terms.
        """
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i / self.d_model)
        # Float column vector so the division broadcasts to (positions, frequencies).
        position = torch.arange(
            self.max_sequence_length, dtype=torch.float
        ).reshape(self.max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        # Stack on a trailing axis, then flatten, to interleave sin/cos per frequency.
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # With an odd d_model the interleave yields one surplus cosine column;
        # trimming keeps the width equal to d_model (a no-op when d_model is even).
        return PE[:, :self.d_model]

In [26]:
# Small demo: encode 10 positions with a 6-dimensional encoding and display the table.
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
pe.forward()

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

Happy Coding!