In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader, random_split

In [3]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [18]:
from pathlib import Path
import warnings
from typing import Any
from tqdm import tqdm
import math

### Coding the input embedding layer

In [286]:
class inputEmebeddingLayer(nn.Module):
    """Token-id -> vector lookup table whose output is scaled by sqrt(d_model).

    Args:
        vocab_size: number of rows in the embedding table (one per token id).
        d_model: width of each embedding vector.
    """

    def __init__(self, vocab_size: int, d_model: int):
        super().__init__()
        self.d_model = d_model        # embedding width
        self.vocab_size = vocab_size  # number of distinct token ids
        # Learnable table: one row per token id, d_model columns per row.
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.d_model,
        )

    def forward(self, input_tokens):
        """Look up the embedding of every token id and scale it by sqrt(d_model).

        ``input_tokens`` must hold integer token ids; the result has the same
        leading shape with an extra trailing axis of size ``d_model``.
        """
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(input_tokens)
        return looked_up * scale

In [287]:
# Tiny demo instance: 11-token vocabulary, 4-dimensional embedding vectors.
input_embedding_layer = inputEmebeddingLayer(
    vocab_size=11,
    d_model=4,
)

In [288]:
# Embed a batch of two sequences, each holding four token ids.
input_embeddings = input_embedding_layer(
    torch.tensor(
        [
            [1, 2, 3, 4],
            [3, 4, 5, 5],
        ]
    )
)
input_embeddings

tensor([[[ 9.6439e-04,  3.2745e+00, -1.7380e+00, -9.8305e-01],
         [ 2.2389e+00, -5.7690e-01, -3.3584e+00, -3.0447e+00],
         [ 1.2606e+00,  1.3314e+00,  2.3327e+00, -2.1338e-01],
         [ 3.8559e+00, -2.9000e-01,  1.6574e+00,  8.4139e-01]],

        [[ 1.2606e+00,  1.3314e+00,  2.3327e+00, -2.1338e-01],
         [ 3.8559e+00, -2.9000e-01,  1.6574e+00,  8.4139e-01],
         [ 7.8143e-01,  1.2731e+00,  9.7494e-01,  2.3673e+00],
         [ 7.8143e-01,  1.2731e+00,  9.7494e-01,  2.3673e+00]]],
       grad_fn=<MulBackward0>)

### Coding the positional embedding layer

In [None]:
class positionalEmbeddings(nn.Module):
    """Add fixed sinusoidal positional encodings to embeddings, then apply dropout.

    The encoding table is computed once in ``__init__`` and stored as the
    buffer ``pe`` of shape ``(1, max_seq_len, d_model)``. For position ``pos``
    and even column index ``2i``:

        pe[0, pos, 2i]     = sin(pos * exp(-log(10000) * 2i / d_model))
        pe[0, pos, 2i + 1] = cos(pos * exp(-log(10000) * 2i / d_model))

    Args:
        max_seq_len: number of positions to precompute (the context window).
        d_model: embedding width; may be even or odd.
        dropout: dropout probability applied to the summed output.
    """

    def __init__(self, max_seq_len:int, d_model:int, dropout:float):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        # Table starts as zeros and is filled column-pair by column-pair.
        static_positional_embedding = torch.zeros(self.max_seq_len, self.d_model)
        pairs = torch.arange(0, d_model, 2).float()  # even column indices 0, 2, 4, ...
        pairwise_denominator = torch.exp(pairs * (-math.log(1e4) / self.d_model))
        positions = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)

        # (max_seq_len, 1) * (ceil(d_model / 2),) broadcasts to one angle per (position, pair).
        angles = positions * pairwise_denominator
        static_positional_embedding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angle table to the number of odd columns.
        static_positional_embedding[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch axis.
        # A buffer follows the module across devices but is not a trainable parameter.
        self.register_buffer('pe', static_positional_embedding.unsqueeze(0))

    def forward(self, inputEmebeddings):
        """Return ``dropout(inputEmebeddings + pe[:, :seq_len, :])``.

        ``seq_len`` is read from ``inputEmebeddings.shape[1]``, so the input is
        expected to be laid out as (batch, seq_len, d_model).
        """
        # Use the argument itself, never a notebook-level global, so the layer
        # works on any input and on a freshly restarted kernel.
        seq_len = inputEmebeddings.shape[1]
        output = inputEmebeddings + self.pe[:, :seq_len, :]
        return self.dropout(output)
        

In [302]:
# Demo instance: 11 precomputed positions, 4-dimensional vectors, 30% dropout.
pe = positionalEmbeddings(
    max_seq_len=11,
    d_model=4,
    dropout=0.3,
)

In [303]:
# Re-display the scaled token embeddings so they can be compared with the
# positional-encoding output produced by the next cell.
print(input_embeddings)

tensor([[[ 9.6439e-04,  3.2745e+00, -1.7380e+00, -9.8305e-01],
         [ 2.2389e+00, -5.7690e-01, -3.3584e+00, -3.0447e+00],
         [ 1.2606e+00,  1.3314e+00,  2.3327e+00, -2.1338e-01],
         [ 3.8559e+00, -2.9000e-01,  1.6574e+00,  8.4139e-01]],

        [[ 1.2606e+00,  1.3314e+00,  2.3327e+00, -2.1338e-01],
         [ 3.8559e+00, -2.9000e-01,  1.6574e+00,  8.4139e-01],
         [ 7.8143e-01,  1.2731e+00,  9.7494e-01,  2.3673e+00],
         [ 7.8143e-01,  1.2731e+00,  9.7494e-01,  2.3673e+00]]],
       grad_fn=<MulBackward0>)


In [304]:
# Run the embeddings through the positional layer. A freshly constructed
# nn.Module is in training mode, so dropout (p=0.3, set when `pe` was built)
# is active: some entries come back as zero and the rest are rescaled.
pe(input_embeddings)

tensor([[[ 0.0000,  1.0000,  0.0000,  1.0000],
         [ 0.8415,  0.5403,  0.0100,  0.9999],
         [ 0.9093, -0.4161,  0.0200,  0.9998],
         [ 0.1411, -0.9900,  0.0300,  0.9996],
         [-0.7568, -0.6536,  0.0400,  0.9992],
         [-0.9589,  0.2837,  0.0500,  0.9988],
         [-0.2794,  0.9602,  0.0600,  0.9982],
         [ 0.6570,  0.7539,  0.0699,  0.9976],
         [ 0.9894, -0.1455,  0.0799,  0.9968],
         [ 0.4121, -0.9111,  0.0899,  0.9960],
         [-0.5440, -0.8391,  0.0998,  0.9950]]])


tensor([[[ 1.3777e-03,  6.1064e+00, -0.0000e+00,  2.4212e-02],
         [ 4.4005e+00, -0.0000e+00, -4.7834e+00, -0.0000e+00],
         [ 3.0998e+00,  1.3075e+00,  3.3611e+00,  1.1235e+00],
         [ 5.7101e+00, -1.8286e+00,  2.4106e+00,  2.6299e+00]],

        [[ 1.8008e+00,  3.3306e+00,  3.3325e+00,  1.1237e+00],
         [ 6.7106e+00,  0.0000e+00,  2.3820e+00,  0.0000e+00],
         [ 2.4153e+00,  1.2242e+00,  1.4213e+00,  4.8102e+00],
         [ 1.3179e+00,  4.0444e-01,  1.4356e+00,  0.0000e+00]]],
       grad_fn=<MulBackward0>)