In [None]:
import torch
import torch.nn as nn
import math
import numpy as np

## input embedding


In [22]:
# first we;ll be building input embeddings
# allows to convert token into embedding of dim 1x52  : token -> input ID(position in vocab) ->embedding


class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        """

        Args:
            d_model (int): dim of vector
            vocab_size (int): # of words in vocab
        """

        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        return self.embedding(x)


# Example usage
input_embeddings = InputEmbeddings(d_model=6, vocab_size=1000)
print(positional)
# Create an example input tensor (batch size 1, sequence length 5, embedding dimension 20)
batch_of_sentences = torch.tensor([[5, 6, 7, 0, 0]])  # Shape: (batch_size, max_sentence_length)
print(batch_of_sentences.shape)


# Pass through the embedding layer
# The forward method is called automatically when you use the instance like a function.
embedded_sentences = input_embeddings(batch_of_sentences)
embedded_sentences.shape, embedded_sentences  # (batch, seq_len, embedding dim)

InputEmbeddings(
  (embedding): Embedding(5, 6)
)
torch.Size([1, 5])


(torch.Size([1, 5, 6]),
 tensor([[[-0.4121,  0.0757, -0.6342, -2.1711,  0.3468,  1.4517],
          [-0.1795,  0.0305, -0.8155, -0.5573,  0.0734,  0.3309],
          [ 1.6115, -1.4192, -0.5432, -2.9637,  0.1750, -1.2888],
          [ 0.0209, -0.1609,  1.0488, -0.9283,  0.2751, -0.1187],
          [ 0.0209, -0.1609,  1.0488, -0.9283,  0.2751, -0.1187]]],
        grad_fn=<EmbeddingBackward0>))

In [13]:
vocab_size = 5
d_model = 6
nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

Embedding(5, 6)

## positional encoding


In [26]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        """
                Since our model contains no recurrence and no convolution, in order for the model to make use of the
        order of the sequence, we must inject some information about the relative or absolute position of the
        tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the
        bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel
        as the embeddings, so that the two can be summed. There are many choices of positional encodings,
        learned and fixed [9].
        In this work, we use sine and cosine functions of different frequencies:
            `PE(pos,2i) = sin(pos/(10000)**2i/dmodel)`
            `PE(pos,2i+1) = cos(pos/(10000)**2i/dmodel)`
        where pos is the position and i is the dimension. That is, each dimension of the positional encoding
        corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We
        chose this function because we hypothesized it would allow the model to easily learn to attend by
        relative positions, since for any fixed offset k, P E(pos+k) can be represented as a linear function of
        PE(pos).

        Keyword arguments:
        dropout -- to make model less overfit
        seq_len -- Specifies the maximum length of sequence that the model can handle. This helps determine the scale and range of the positional encodings.
        """
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # positional encodeing shape: seq_len X d_model i.e. each token will be represented (1*d_model) vector

        """
        
        formula :`PE(pos,2i) = cos(pos/(10000)**2i/dmodel) for i=1,3,5, ...and `PE(pos,2i) = sin(pos/(10000)**2i/dmodel) for i=2,4,6, ...and `
        
        """

        #  Create a model of shape (seq_len , d_model)

        pe = torch.zeros(seq_len, d_model)
        #  create a vector of shape(seq_len,1) to represent position of word in sequence

        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (seq_len,1)  # pos in formula
        # create denominator of formula
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # apply sin to even positions
        pe[:, 0::2] = torch.sin(position * div_term)

        # apply cos to odd positions
        pe[:, 1::2] = torch.cos(position * div_term)

        # now we need to add batch dimension to these sentences so we can apply it to whole sentences, so to all the batch of sentence, because weill have batch of sentences.
        # adding batch dim
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)

        # register this tensor in buffer of module  .. it is done for the tensor that you want to keep inside the module, not as a lerarned parameter but you want it to be saved when you save the file of the model
        # you should register it as a buffer. this way the tensor would be saved in file along with state of model
        self.register_buffer("pe", pe)  # This is typically used to register a buffer that should not to be considered a model parameter.
        """
        Say you have a linear layer nn.Linear. You already have weight and bias parameters. But if you need a new parameter you use register_parameter() to register a new named parameter that is a tensor.
        When you register a new parameter it will appear inside the module.parameters() iterator, but when you register a buffer it will not.
        The difference:
        Buffers are named tensors that do not update gradients at every step, like parameters. For buffers, you create your custom logic (fully up to you).
        """

    def forward(self, x):
        """
        we need to add positional encoding to every token/word inside sequence/sentence
        """
        x = x + (self.pe[:, : x.shape[1], :]).requires_grad_(False)  # x:token and pe is positional encoding  # because we dont want to learn pe because these are fixed
        return self.dropout(x)


# Example usage
positional = PositionalEncoding(d_model=6, seq_len=5, dropout=0.5)

# Create an example input tensor (batch size , sequence length , embedding dimension )

# Apply positional encoding
output_tensor = positional(embedded_sentences)
print("input ", embedded_sentences)

print("input shape", embedded_sentences.shape)
print("output shape", output_tensor.shape)

print(output_tensor)  # (1, seq_len,d_model)

input  tensor([[[-0.4121,  0.0757, -0.6342, -2.1711,  0.3468,  1.4517],
         [-0.1795,  0.0305, -0.8155, -0.5573,  0.0734,  0.3309],
         [ 1.6115, -1.4192, -0.5432, -2.9637,  0.1750, -1.2888],
         [ 0.0209, -0.1609,  1.0488, -0.9283,  0.2751, -0.1187],
         [ 0.0209, -0.1609,  1.0488, -0.9283,  0.2751, -0.1187]]],
       grad_fn=<EmbeddingBackward0>)
input shape torch.Size([1, 5, 6])
output shape torch.Size([1, 5, 6])
tensor([[[-0.8241,  2.1514, -0.0000, -2.3423,  0.0000,  0.0000],
         [ 0.0000,  1.1417, -1.5383,  0.8833,  0.0000,  2.6618],
         [ 5.0416, -3.6708, -0.9010, -0.0000,  0.0000, -0.5776],
         [ 0.0000, -0.0000,  0.0000,  0.1241,  0.5631,  0.0000],
         [-0.0000, -0.0000,  0.0000,  0.1091,  0.5674,  1.7626]]],
       grad_fn=<MulBackward0>)


### register_buffer


In [4]:
class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()

        # Register a buffer tensor with zeros
        self.register_buffer("buffer_tensor", torch.zeros(3, 3))

        # Register another buffer tensor with specific values
        data = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)
        self.register_buffer("another_buffer", data)

    def forward(self, x):
        # Use the buffer tensors in the forward pass
        output = x + self.buffer_tensor
        return output


# Create an instance of MyModule
model = MyModule()

# Print the module to see its structure
print(model)

# Accessing the buffer tensors
print("Buffer tensor:")
print(model.buffer_tensor)

print("\nAnother buffer tensor:")
print(model.another_buffer)

MyModule()
Buffer tensor:
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

Another buffer tensor:
tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
