In [4]:
from transformers import AutoConfig, AutoTokenizer
from math import sqrt
import pandas as pd
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def scaled_dot_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    masked: bool
) -> torch.Tensor:
    """Scaled dot-product attention over batched sequences.

    Args:
        query: (batch_size, q_len, head_dim)
        key: (batch_size, k_len, head_dim)
        value: (batch_size, k_len, value_dim)
        masked: when truthy, apply a causal mask so that position ``i`` only
            attends to positions ``j <= i``. Any falsy value (``False`` or
            ``None``) disables masking.

    Returns:
        Tensor of shape (batch_size, q_len, value_dim): the attention-weighted
        sum of ``value`` rows.
    """
    dim_k = key.size(-1)
    # scores: (batch_size, q_len, k_len)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    if masked:
        q_len, k_len = scores.size(-2), scores.size(-1)
        # The mask must depend only on positions, never on the score values:
        # True strictly above the diagonal marks "future" keys to hide.
        future = torch.triu(
            torch.ones(q_len, k_len, device=scores.device), diagonal=1
        ).bool()
        # Column 0 is never masked, so every row keeps at least one finite
        # score and the softmax cannot produce NaN.
        scores = scores.masked_fill(future, float("-inf"))
    weights = torch.nn.functional.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

In [6]:
# Checkpoint whose configuration and tokenizer are reused below. It gets its own
# constant so the name `model` is reserved for the network built later on.
MODEL_NAME = "distilbert-base-uncased"
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
# Display the loaded configuration; its fields (vocab_size, dim, n_heads,
# hidden_dim, max_position_embeddings) size the modules defined below.
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.46.3",
  "vocab_size": 30522
}

In [8]:
# Tokenize one sample sentence; return_tensors="pt" asks for PyTorch tensors,
# so inputs.input_ids can be fed straight to the model.
inputs = tokenizer("Hawaiian white Christmas", return_tensors="pt")

In [9]:
class Embeddings(torch.nn.Module):
    """Token plus learned position embeddings, then LayerNorm and dropout.

    Args:
        config: model configuration as returned by
            ``AutoConfig.from_pretrained`` (e.g. a ``DistilBertConfig``).
            Reads ``vocab_size``, ``dim``, ``max_position_embeddings`` and
            ``dropout``.
    """

    def __init__(self, config: "AutoConfig"):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(
            num_embeddings = config.vocab_size,
            embedding_dim = config.dim,
        )
        self.position_embedding = torch.nn.Embedding(
            num_embeddings = config.max_position_embeddings,
            embedding_dim = config.dim,
        )
        self.layer_norm = torch.nn.LayerNorm(
            normalized_shape = config.dim,
        )
        # Use the configured rate; a bare Dropout() silently defaults to p=0.5.
        self.dropout = torch.nn.Dropout(p = config.dropout)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        """
        token_ids: (batch_size, seq_size), with seq_size <= max_position_embeddings
        return: (batch_size, seq_size, embed_dim)
        """
        seq_size = token_ids.size(-1)
        # Create the positions on the same device as the input so the lookup
        # also works when the module has been moved off the CPU.
        position_ids = torch.arange(seq_size, device=token_ids.device).unsqueeze(0)
        token_embedding = self.token_embedding(token_ids)  # (batch_size, seq_size, embed_dim)
        position_embedding = self.position_embedding(position_ids)  # (1, seq_size, embed_dim)
        # The position term broadcasts over the batch dimension.
        embedding = token_embedding + position_embedding
        embedding = self.layer_norm(embedding)
        embedding = self.dropout(embedding)
        return embedding
    
class AttentionHead(torch.nn.Module):
    """A single self-attention head with its own query/key/value projections.

    Args:
        embed_dim: size of the incoming hidden vectors.
        head_dim: size of the projected query/key/value vectors.
        masked: forwarded as the ``masked`` flag of ``scaled_dot_attention``.
            Defaults to ``False`` (no masking), which matches the previous
            behaviour of always passing a falsy value.
    """

    def __init__(self, embed_dim: int, head_dim: int, masked: bool = False):
        super().__init__()
        self.query = torch.nn.Linear(embed_dim, head_dim)
        self.key = torch.nn.Linear(embed_dim, head_dim)
        self.value = torch.nn.Linear(embed_dim, head_dim)
        self.masked = masked

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch_size, seq_size, embed_dim)
        return: (batch_size, seq_size, head_dim)
        """
        # Self-attention: queries, keys and values are all projections of x.
        return scaled_dot_attention(
            self.query(x),
            self.key(x),
            self.value(x),
            self.masked,
        )

class AttentionHeads(torch.nn.Module):
    """Multi-head self-attention: parallel heads, concatenated and projected.

    Args:
        config: model configuration as returned by
            ``AutoConfig.from_pretrained``. Reads ``dim`` and ``n_heads``.

    Raises:
        ValueError: if ``config.dim`` is not a multiple of ``config.n_heads``.
    """

    def __init__(self, config: "AutoConfig"):
        super().__init__()
        embed_dim = config.dim
        # The concatenated heads must add up to embed_dim again; otherwise the
        # output projection fails in forward() with an opaque shape error.
        if embed_dim % config.n_heads != 0:
            raise ValueError(
                f"config.dim ({embed_dim}) must be divisible by "
                f"config.n_heads ({config.n_heads})"
            )
        head_dim = embed_dim // config.n_heads
        self.heads = torch.nn.ModuleList([
            AttentionHead(embed_dim, head_dim) for _ in range(config.n_heads)
        ])
        self.output_layer = torch.nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """
        hidden_state: (batch_size, seq_size, embed_dim)
        return: (batch_size, seq_size, embed_dim)
        """
        # Each head yields (batch_size, seq_size, head_dim); joining them along
        # the feature axis restores embed_dim.
        x = torch.cat([head(hidden_state) for head in self.heads], dim = -1)
        x = self.output_layer(x)
        return x
        
    
class FeedForwardNet(torch.nn.Module):
    """Two-layer output head mapping a hidden vector to vocabulary probabilities.

    The pipeline is Linear -> GELU -> Dropout -> Linear -> Softmax, so the
    result is a probability distribution over the vocabulary, not raw logits.

    Args:
        config: model configuration as returned by
            ``AutoConfig.from_pretrained``. Reads ``dim``, ``hidden_dim``,
            ``vocab_size`` and ``dropout``.
    """

    def __init__(self, config: "AutoConfig"):
        super().__init__()
        self.linear_layer_1 = torch.nn.Linear(config.dim, config.hidden_dim)
        self.gelu = torch.nn.GELU()
        # Use the configured rate; a bare Dropout() silently defaults to p=0.5.
        self.dropout = torch.nn.Dropout(p = config.dropout)
        self.linear_layer_2 = torch.nn.Linear(config.hidden_dim, config.vocab_size)
        self.softmax = torch.nn.Softmax(dim = -1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch_size, embed_dim)
        return: (batch_size, vocab_size), each row summing to 1
        """
        x = self.linear_layer_1(x)  # (batch_size, hidden_dim)
        x = self.gelu(x)  # (batch_size, hidden_dim)
        x = self.dropout(x)  # (batch_size, hidden_dim)
        x = self.linear_layer_2(x)  # (batch_size, vocab_size)
        x = self.softmax(x)  # (batch_size, vocab_size)
        return x
        
class GPT(torch.nn.Module):
    """Minimal model: embeddings -> multi-head attention -> vocabulary head.

    Only the representation at sequence position 0 is passed to the output
    head, so one distribution over the vocabulary is produced per sequence.
    """

    def __init__(self, config: AutoConfig):
        super().__init__()
        # Sub-modules are registered in the order the data flows through them.
        self.embeddings = Embeddings(config)
        self.attention_heads = AttentionHeads(config)
        self.feedforward = FeedForwardNet(config)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        """
        token_ids: (batch_size, seq_size)
        return: (batch_size, vocab_size)
        """
        embedded = self.embeddings(token_ids)  # (batch_size, seq_size, embed_dim)
        attended = self.attention_heads(embedded)  # (batch_size, seq_size, embed_dim)
        first_position = attended[:, 0, :]  # (batch_size, embed_dim)
        return self.feedforward(first_position)  # (batch_size, vocab_size)

In [10]:
# Build the network from the loaded config. Only the configuration is used:
# no pretrained weights are loaded, so parameters are randomly initialised.
model = GPT(config)

In [11]:
# Smoke-test a forward pass on the sample sentence.
# NOTE(review): the module is still in its default training mode here, so the
# Dropout layers are active and the output differs between runs; call
# model.eval() (and optionally use torch.no_grad()) for a deterministic check.
model(inputs.input_ids)

tensor([[3.2961e-05, 3.2880e-05, 3.4805e-05,  ..., 3.3031e-05, 3.4138e-05,
         3.3315e-05]], grad_fn=<SoftmaxBackward0>)