In [9]:
import torch
from math import sqrt
from torch import nn
import torch.nn.functional as F 
from transformers import AutoConfig
from transformers import AutoTokenizer

In [5]:
# Pull the tokenizer and configuration of the pretrained checkpoint, then build
# a newly constructed embedding table sized from that configuration
# (vocabulary size x hidden size). Only the dimensions come from the
# checkpoint; the embedding weights themselves are not loaded from it.
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)
token_emb = nn.Embedding(
    num_embeddings=config.vocab_size,
    embedding_dim=config.hidden_size,
)
token_emb

Embedding(30522, 768)

In [6]:
# Tokenize one example sentence without adding special tokens, then look up
# an embedding vector for every token id.
text = "time flies like an arrow"
inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")
inputs_embeds = token_emb(inputs["input_ids"])
inputs_embeds.shape  # [batch_size, seq_len, hidden_dim]

torch.Size([1, 5, 768])

Create the query, key, and value vectors and calculate the attention scores using the scaled dot product (the dot product divided by $\sqrt{d_k}$) as the similarity function:

In [7]:
# Simplest form of self-attention: the very same embeddings tensor plays the
# role of query, key and value.
query = inputs_embeds
key = inputs_embeds
value = inputs_embeds
dim_k = key.shape[-1]
# Batched query @ key^T, divided by sqrt(dim_k) to keep the scores from
# growing with the embedding dimension.
scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
scores.shape


torch.Size([1, 5, 5])

This has created a 5 × 5 matrix of attention scores per sample in the batch. We’ll see later that the query, key, and value vectors are generated by applying independent weight matrices $W_Q$, $W_K$, and $W_V$ to the embeddings, but for now we’ve kept them equal for simplicity.

In [10]:
# Turn the raw scores into attention weights by applying a softmax over the
# last dimension, then sanity-check that every row sums to 1.
weights = torch.softmax(scores, dim=-1)
weights.sum(-1)

tensor([[1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)

In [11]:
def scaled_dot_product_attention(query, key, value):
    """Compute scaled dot-product attention.

    The attention scores are the dot products between queries and keys,
    divided by ``sqrt(dim_k)`` where ``dim_k`` is the size of the last
    dimension of ``query``. A softmax over the last dimension turns the
    scores into weights, which are then used to average the values.

    The last two dimensions are treated as ``(seq_len, dim)``; any number of
    leading dimensions (e.g. ``batch`` or ``batch, heads``) is accepted, so
    the function works for the original 3-D inputs as well as for
    multi-head 4-D inputs.

    Args:
        query: Tensor of shape ``(..., seq_len_q, dim_k)``.
        key: Tensor of shape ``(..., seq_len_k, dim_k)``.
        value: Tensor of shape ``(..., seq_len_k, dim_v)``.

    Returns:
        Tensor of shape ``(..., seq_len_q, dim_v)`` holding the
        attention-weighted combination of ``value``.
    """
    dim_k = query.size(-1)
    # matmul + transpose(-2, -1) is equivalent to bmm + transpose(1, 2) for
    # 3-D tensors, but also handles additional leading dimensions.
    scores = torch.matmul(query, key.transpose(-2, -1)) / sqrt(dim_k)
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, value)

In [12]:
from transformers import pipeline