In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

In [3]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [4]:
import warnings
from pathlib import Path
from typing import Any
from tqdm import tqdm
import math

In [5]:
class InputEmbeddings(nn.Module):
    """Token-id lookup table whose output vectors are scaled by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        num_embeddings: Number of rows in the lookup table (vocabulary size).
    """

    def __init__(self, d_model: int, num_embeddings: int) -> None:
        super().__init__()
        # Both sizes are kept on the module so they can be inspected later.
        self.d_model = d_model
        self.num_embeddings = num_embeddings
        self.embedding = nn.Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=d_model,
        )

    def forward(self, x):
        """Look up the vectors for the ids in ``x`` and multiply them by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

In [6]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to the input, then applies dropout.

    The table ``pe`` has shape ``(1, seq_len, d_model)``. Even feature columns
    hold ``sin(pos * 10000^(-i / d_model))`` and odd columns hold the matching
    ``cos`` term, for ``i = 0, 2, 4, ...``. It is registered as a buffer, so it
    follows the module across devices and is saved in the state dict, but it
    is not a trainable parameter.

    Args:
        d_model: Size of the feature dimension. May be even or odd; with an
            odd value the last sine column has no cosine partner.
        seq_len: Number of positions the table is built for.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Column vector of positions 0 .. seq_len - 1, shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # One frequency per sin/cos pair, computed in log space:
        # exp(-i * ln(10000) / d_model) == 10000^(-i / d_model).
        i = torch.arange(0, d_model, 2, dtype=torch.float)
        div_term = torch.exp(i * (-math.log(10000)) / d_model)

        # Broadcasts to shape (seq_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns. When d_model is odd that is
        # one fewer than the number of frequencies, so the last frequency is
        # dropped here; without the slice the assignment raises a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the first
        # dimension of the input in forward().
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the first ``x.shape[1]`` rows of the table to ``x`` and apply dropout.

        ``x`` must broadcast against ``(1, x.shape[1], d_model)``; its dim 1 is
        treated as the position axis. The table only has ``seq_len`` rows, so
        inputs with more positions than that are not supported.
        """
        x = x + (self.pe[:, :x.shape[1], :])
        return self.dropout(x)

In [None]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension to zero mean / unit std, then applies a learned gain and bias.

    Computes ``alpha * (x - mean) / (std + eps) + bias`` where ``mean`` and
    ``std`` are taken over the last dimension of ``x``.

    Args:
        eps: Small constant added to the standard deviation (not the variance)
            so the division stays finite when ``std`` is zero.
        features: Number of elements in the learned ``alpha`` and ``bias``.
            The default of 1 gives a single scalar gain and bias shared by all
            features. Pass the size of the last dimension of the input to
            learn one gain and bias per feature.
    """

    def __init__(self, eps: float = 1e-6, features: int = 1):
        super().__init__()
        self.eps = eps

        # Multiplicative gain, initialised to 1 so the layer starts as a pure normalisation.
        self.alpha = nn.Parameter(torch.ones(features))
        # Additive shift, initialised to 0.
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return ``x`` normalized over its last dimension, scaled by ``alpha`` and shifted by ``bias``."""
        mean = x.mean(dim=-1, keepdim=True)
        # Tensor.std applies Bessel's correction by default, so this is the
        # sample standard deviation; a last dimension of size 1 yields NaN.
        std = x.std(dim=-1, keepdim=True)

        return self.alpha * (x - mean) / (std + self.eps) + self.bias

In [17]:
# Scratch check of two calls used in LayerNormalization: a mean over the last
# axis with keepdim=True, and the tensor produced by torch.ones(1).
l = torch.tensor(
    [[1, 2, 3],
     [4, 5, 6]],
    dtype=torch.float,
)
print(torch.mean(l, dim=-1, keepdim=True))  # one mean per row, kept as a column
print(torch.ones(1))  # single-element tensor

tensor([[2.],
        [5.]])
tensor([1.])
