<a href="https://colab.research.google.com/github/namanraiyani/TransformerFromScratch/blob/main/TransformerFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import math
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

Input Embedding

In [15]:
class InputEmbeddings(nn.Module):
  """Token-id to vector lookup whose output is multiplied by sqrt(embedding_dim).

  Args:
    embedding_dim: width of each embedding vector (d_model).
    vocab_size: number of rows in the embedding table.
  """

  def __init__(self, embedding_dim, vocab_size):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

  def forward(self, x):
    """Look up the embedding of every id in `x` and scale it by sqrt(embedding_dim)."""
    scale = math.sqrt(self.embedding_dim)
    looked_up = self.embedding(x)
    # Scaling keeps the embedding magnitude comparable to the positional encoding added later.
    return looked_up * scale

Positional Encoding

In [18]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal positional encodings to the input, then apply dropout.

  The table is computed once in the constructor and stored as the buffer `PE`
  with shape (1, sequence_len, embedding_dim): even columns hold
  sin(pos * freq) and odd columns hold cos(pos * freq), where
  freq = 10000 ** (-2i / embedding_dim) for column pair i.

  Args:
    embedding_dim: width of each position vector (d_model).
    sequence_len: number of positions to precompute.
    dropout: probability passed to `nn.Dropout`.
  """

  def __init__(self, embedding_dim, sequence_len, dropout):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.sequence_len = sequence_len
    self.dropout = nn.Dropout(dropout)

    PE = torch.zeros(sequence_len, embedding_dim)
    # Positions must be a column vector (sequence_len, 1) so the product with the
    # 1-D frequency row below broadcasts to (sequence_len, n_frequencies).
    position = torch.arange(0, sequence_len, dtype=torch.float).unsqueeze(1)

    # exp(-2i * ln(10000) / d) == 10000 ** (-2i / d), computed in log space.
    denominator_term = torch.exp(
        torch.arange(0, embedding_dim, step=2).float() * (-math.log(10000.0) / embedding_dim)
    )

    angles = position * denominator_term
    PE[:, 0::2] = torch.sin(angles)
    # For an odd embedding_dim there is one fewer odd column than even columns,
    # so drop the surplus frequency; for even widths the slice is a no-op.
    PE[:, 1::2] = torch.cos(angles[:, :embedding_dim // 2])
    PE = PE.unsqueeze(0)  # leading axis of size 1 broadcasts over the batch

    # A buffer follows the module across devices and into state_dict but is not a parameter.
    self.register_buffer('PE', PE)

  def forward(self, x):
    """Return dropout(x + PE[:, :x.shape[1], :]).

    `x` is indexed as (batch, seq, embedding_dim); `x.shape[1]` selects how many
    precomputed positions are added. The buffer never requires grad, so no
    explicit `requires_grad_(False)` is needed on the slice.
    """
    x = x + self.PE[:, :x.shape[1], :]
    return self.dropout(x)