In [1]:
from transformers import AutoConfig, AutoTokenizer
from math import sqrt
import pandas as pd
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint whose configuration and tokenizer are loaded below.
# It is stored under its own name so that it does not share the `model`
# identifier with the network instantiated later in this notebook; re-running
# this cell therefore no longer overwrites that network with a string.
model_checkpoint = "distilbert-base-uncased"
config = AutoConfig.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
# Display the configuration loaded for the checkpoint (dim, n_heads, dropout, ...).
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.46.3",
  "vocab_size": 30522
}

In [8]:
# Tokenize a single example sentence; "pt" asks for PyTorch tensors.
inputs = tokenizer(
    "I'm currently in Tokyo!",
    return_tensors="pt",
)
inputs

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 2747, 1999, 5522,  999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
def scaled_dot_attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
    """Scaled dot-product attention for batched 3-D tensors.

    Computes ``softmax(query @ key^T / sqrt(d_k)) @ value``, where ``d_k`` is
    the size of the last dimension of ``key``.

    Args:
        query: tensor of shape (batch_size, seq_size, head_dim).
        key: tensor of shape (batch_size, seq_size, head_dim).
        value: tensor of shape (batch_size, seq_size, head_dim).

    Returns:
        Tensor of shape (batch_size, seq_size, head_dim): for every query
        position, the attention-weighted sum of the value vectors.
    """
    scale = sqrt(key.size(-1))
    keys_transposed = key.transpose(1, 2)  # (batch_size, head_dim, seq_size)
    attn_logits = torch.bmm(query, keys_transposed) / scale  # (batch_size, seq_size, seq_size)
    # Normalise over the key axis so each query row sums to 1.
    attn_probs = torch.nn.functional.softmax(attn_logits, dim=-1)
    return torch.bmm(attn_probs, value)

In [38]:
class AttentionHead(torch.nn.Module):
    """A single self-attention head.

    The input is linearly projected to queries, keys and values of width
    ``head_dim`` and passed to ``scaled_dot_attention``.
    """

    def __init__(self, embed_dim: int, head_dim: int):
        """Create the three projection layers.

        Args:
            embed_dim: size of the last dimension of the input.
            head_dim: size of the last dimension of each projection.
        """
        super().__init__()
        self.query = torch.nn.Linear(in_features=embed_dim, out_features=head_dim)
        self.key = torch.nn.Linear(in_features=embed_dim, out_features=head_dim)
        self.value = torch.nn.Linear(in_features=embed_dim, out_features=head_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over ``x``.

        Args:
            x: tensor of shape (batch_size, seq_size, embed_dim).

        Returns:
            Tensor of shape (batch_size, seq_size, head_dim).
        """
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        return scaled_dot_attention(q, k, v)
    
class MultiHeadAttention(torch.nn.Module):
    """Multi-head self-attention built from independent ``AttentionHead`` modules.

    Each head maps the input to ``config.dim // config.n_heads`` features; the
    head outputs are concatenated along the last dimension and passed through
    a final linear layer.
    """

    def __init__(self, config: AutoConfig):
        """Build the heads and the output projection.

        Args:
            config: object exposing ``dim`` (embedding size) and ``n_heads``.

        Raises:
            ValueError: if ``config.dim`` is not divisible by ``config.n_heads``.
        """
        super().__init__()
        embed_dim = config.dim
        num_heads = config.n_heads
        # The concatenated head outputs must be exactly embed_dim wide for the
        # output layer; fail early with a clear message instead of a shape
        # mismatch during the first forward pass.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"config.dim ({embed_dim}) must be divisible by config.n_heads ({num_heads})"
            )
        head_dim = embed_dim // num_heads
        self.attention_heads = torch.nn.ModuleList([
            AttentionHead(embed_dim, head_dim) for _ in range(num_heads)
        ])
        self.output_layer = torch.nn.Linear(embed_dim, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply every head to ``x`` and mix the concatenated result.

        Args:
            x: tensor of shape (batch_size, seq_size, embed_dim).

        Returns:
            Tensor of shape (batch_size, seq_size, embed_dim).
        """
        head_outputs = [head(x) for head in self.attention_heads]
        # n_heads tensors of width head_dim -> one tensor of width embed_dim
        x = torch.cat(head_outputs, dim=-1)
        return self.output_layer(x)
    
class Embedding(torch.nn.Module):
    """Token + learned position embeddings, followed by LayerNorm and dropout."""

    def __init__(self, config: AutoConfig):
        """Create the embedding tables, normalisation and dropout.

        Args:
            config: object exposing ``vocab_size``, ``dim``,
                ``max_position_embeddings`` and ``dropout``.
        """
        super().__init__()
        self.token_embeddings = torch.nn.Embedding(
            num_embeddings = config.vocab_size,
            embedding_dim = config.dim,
        )
        self.position_embeddings = torch.nn.Embedding(
            num_embeddings = config.max_position_embeddings,
            embedding_dim = config.dim,
        )
        self.layer_norm = torch.nn.LayerNorm(normalized_shape=config.dim)
        # Take the rate from the config; a bare Dropout() would use p=0.5.
        self.dropout = torch.nn.Dropout(p=config.dropout)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed a batch of token ids.

        Args:
            input_ids: integer tensor of shape (batch_size, seq_size). The
                position table has ``config.max_position_embeddings`` rows, so
                seq_size must not exceed that value.

        Returns:
            Tensor of shape (batch_size, seq_size, embed_dim).
        """
        # input_ids.size() = (batch_size, seq_size)
        seq_size = input_ids.size(-1)
        # (1, seq_size) tensor of positions 0..seq_size-1, created on the same
        # device as the inputs so the lookup also works off-CPU.
        positional_ids = torch.arange(
            seq_size, dtype = torch.long, device = input_ids.device
        ).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(positional_ids)
        # The (1, seq_size, embed_dim) positions broadcast over the batch.
        embeddings = token_embeddings + position_embeddings
        # embeddings.size() = (batch_size, seq_size, embed_dim)
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
        
class FNN(torch.nn.Module):
    """Feed-forward head: Linear -> GELU -> Dropout -> Linear(1) -> Sigmoid."""

    def __init__(self, embed_dim: int, inter_dim: int, dropout: float = 0.5):
        """Create the layers.

        Args:
            embed_dim: size of the last dimension of the input.
            inter_dim: width of the hidden layer.
            dropout: dropout probability applied after the activation. The
                default of 0.5 equals the value ``torch.nn.Dropout()`` uses
                when called without arguments.
        """
        super().__init__()
        self.linear_layer_1 = torch.nn.Linear(embed_dim, inter_dim)
        self.gelu = torch.nn.GELU()
        self.dropout = torch.nn.Dropout(p=dropout)
        self.linear_layer_2 = torch.nn.Linear(inter_dim, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``x`` of shape (..., embed_dim) to values in (0, 1) of shape (..., 1)."""
        hidden = self.dropout(self.gelu(self.linear_layer_1(x)))
        logits = self.linear_layer_2(hidden)
        return self.sigmoid(logits)
        
    
class EncoderClassification(torch.nn.Module):
    """Embedding -> multi-head self-attention -> feed-forward scoring head.

    Only ``input_ids`` are consumed; no attention mask is applied.
    """

    def __init__(self, config: AutoConfig):
        """Assemble the sub-modules from ``config`` (uses ``dim`` and ``hidden_dim`` here)."""
        super().__init__()
        self.embedding = Embedding(config)
        self.attention_heads = MultiHeadAttention(config)
        self.fnn = FNN(embed_dim=config.dim, inter_dim=config.hidden_dim)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Score each sequence in the batch.

        Args:
            input_ids: integer tensor of shape (batch_size, seq_size).

        Returns:
            Tensor of shape (batch_size,) with one sigmoid output per sequence.
        """
        embedded = self.embedding(input_ids)
        attended = self.attention_heads(embedded)
        # Keep only the representation at position 0 of every sequence.
        first_token = attended[:, 0, :]
        scores = self.fnn(first_token)
        # Drop the trailing singleton dimension: (batch_size, 1) -> (batch_size,)
        return scores[:, 0]

In [39]:
# Seed PyTorch's global RNG before building the network so that the randomly
# initialised weights (and subsequent dropout masks) are identical on every
# Restart & Run All.
torch.manual_seed(0)
model = EncoderClassification(config)

In [40]:
# Forward pass on the tokenized example; yields one sigmoid score per sequence.
model(inputs["input_ids"])

tensor([0.5193], grad_fn=<SelectBackward0>)