In [35]:
import torch

In [36]:
# A sentence of 5 tokens, given as integer token ids (id 2 appears twice,
# at positions 1 and 4).
sentence = torch.tensor([1, 2, 4, 3, 2], dtype=torch.long)

# Lookup table holding one 3-dimensional vector for each of 6 possible ids.
embedding = torch.nn.Embedding(num_embeddings=6, embedding_dim=3)

# Look up every token id: the result has one 3-dimensional row per token.
embedded_sentence = embedding(sentence)
print(embedded_sentence)

tensor([[-0.3803,  0.7787,  0.3866],
        [ 0.5394,  0.0421, -0.5968],
        [ 0.2035,  0.1384, -0.1790],
        [ 0.6580, -0.8992,  0.7342],
        [ 0.5394,  0.0421, -0.5968]], grad_fn=<EmbeddingBackward0>)


In [37]:
# Pick the query token: position 2 (0-based), i.e. word #3 of the sentence.
qry_idx = 2
qry = embedded_sentence[qry_idx]

# Score every token of the sentence against the query with a dot product,
# writing each score into a preallocated 1-D tensor (one slot per token).
num_tokens = sentence.shape[0]
attention_scores_qry = torch.empty(num_tokens)
for pos in range(num_tokens):
    attention_scores_qry[pos] = torch.dot(qry, embedded_sentence[pos])

# Show the raw scores and their total; nothing is normalized at this point.
print(attention_scores_qry) # unnormalized dot product attention scores
print(f"sum = {attention_scores_qry.sum():.2f}")

tensor([-0.0388,  0.2224,  0.0926, -0.1220,  0.2224], grad_fn=<CopySlices>)
sum = 0.38


In [38]:
# Naive normalization: divide each score by the total of all scores so that
# the entries add up to 1. The sign of each score is kept, so individual
# weights can still be negative.
score_total = attention_scores_qry.sum()
attention_scores_qry_norm = attention_scores_qry / score_total

print(attention_scores_qry_norm) # normalized dot product attention scores
print(f"sum = {attention_scores_qry_norm.sum():.2f}") # ok

tensor([-0.1030,  0.5905,  0.2458, -0.3238,  0.5905], grad_fn=<DivBackward0>)
sum = 1.00


In [39]:
# Softmax normalization along dim 0 (the only dimension of the score vector):
# every weight comes out positive and the weights add up to 1.
attention_scores_qry_softmax = attention_scores_qry.softmax(dim=0)

print(attention_scores_qry_softmax) # normalized dot product attention scores
print(f"sum = {attention_scores_qry_softmax.sum():.2f}") # ok

tensor([0.1768, 0.2295, 0.2016, 0.1627, 0.2295], grad_fn=<SoftmaxBackward0>)
sum = 1.00
