# self attention with out weights

In [1]:
import torch 

In [2]:
inputs=torch.randn(5,3)

In [3]:
query=inputs[2]

attn_score=torch.empty(inputs.shape[0])

for i,val in enumerate(inputs):
    attn_score[i]=torch.dot(val,query)


print(f"the input:{query}")
print(f"the attention score:{attn_score}")


the input:tensor([-0.1215,  1.6585, -0.0447])
the attention score:tensor([ 2.4088,  2.1749,  2.7674,  1.7755, -1.3898])


# we know make the attention weight by normalizing the attention score

In [4]:
attn_weight=attn_score/attn_score.sum()
print(f"the attention score is: {attn_score}")
print(f"the attention weight is:{attn_weight}")

the attention score is: tensor([ 2.4088,  2.1749,  2.7674,  1.7755, -1.3898])
the attention weight is:tensor([ 0.3113,  0.2811,  0.3577,  0.2295, -0.1796])


In [5]:
def local_softmax(x):
    return torch.exp(x)/torch.exp(x).sum(dim=0)

In [6]:
local_softmax(attn_score)

tensor([0.2648, 0.2096, 0.3791, 0.1406, 0.0059])

In [7]:
torch.softmax(attn_score,dim=0)

tensor([0.2648, 0.2096, 0.3791, 0.1406, 0.0059])

# calculate the context vector

In [8]:
#with respect to the third(2) token
context_value=torch.zeros(query.size())
for i,val in enumerate(inputs):
    context_value+=attn_weight[i]*val

print(context_value)

tensor([0.0433, 1.8265, 0.3505])


# extracting context vector from all inputs

In [9]:
inputs = torch.tensor(
[[0.43, 0.15, 0.89],
[0.55, 0.87, 0.66], 
[0.57, 0.85, 0.64],
[0.22, 0.58, 0.33], 
[0.77, 0.25, 0.10],
[0.05, 0.80, 0.55]]
)


In [10]:
#attention score is
attention_score=torch.zeros(6,6)

for i,i_val in enumerate(inputs):
    for j,j_val in enumerate(inputs):
        attention_score[i,j]=torch.dot(i_val,j_val)

In [11]:
#we can also transpose the values here
att_score=inputs@inputs.T

In [12]:
att_score

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [13]:
print(attention_score)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [14]:
#using torch softmax
attnetion_weight=torch.softmax(att_score,dim=1)

In [15]:
attnetion_weight

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [16]:
attnetion_weight.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [17]:
context_vector=attnetion_weight@inputs

In [18]:
context_vector

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

# implementing self attention with trainable weights
* self attention mechanism is also called scaled dot product

In [19]:
inputs = torch.tensor(
[[0.43, 0.15, 0.89],
[0.55, 0.87, 0.66], 
[0.57, 0.85, 0.64],
[0.22, 0.58, 0.33], 
[0.77, 0.25, 0.10],
[0.05, 0.80, 0.55]]
)


In [20]:
x_2=inputs[1]
d_in=inputs.shape[1]
d_out=2

In [21]:
print(d_in)
print(d_out)

3
2


In [22]:
torch.manual_seed(123)

#lets define the query,key and value
w_query=torch.nn.parameter.Parameter(torch.randn(d_in,d_out),requires_grad=False)
w_key=torch.nn.parameter.Parameter(torch.randn(d_in,d_out),requires_grad=False)
w_value=torch.nn.parameter.Parameter(torch.randn(d_in,d_out),requires_grad=False)

print(w_query)
print(w_key)
print(w_value)

Parameter containing:
tensor([[-0.1115,  0.1204],
        [-0.3696, -0.2404],
        [-1.1969,  0.2093]])
Parameter containing:
tensor([[-0.9724, -0.7550],
        [ 0.3239, -0.1085],
        [ 0.2103, -0.3908]])
Parameter containing:
tensor([[ 0.2350,  0.6653],
        [ 0.3528,  0.9728],
        [-0.0386, -0.8861]])


In [23]:
query_2=x_2@w_query
key_2=x_2@w_key
value_2=x_2@w_value

In [24]:
query_2

tensor([-1.1729, -0.0048])

In [25]:
#general key and value 
key=inputs@w_key
value=inputs@w_value

In [26]:
print(key)
print("gap")
print(value)

tensor([[-0.1823, -0.6888],
        [-0.1142, -0.7676],
        [-0.1443, -0.7728],
        [ 0.0434, -0.3580],
        [-0.6467, -0.6476],
        [ 0.3262, -0.3395]])
gap
tensor([[ 0.1196, -0.3566],
        [ 0.4107,  0.6274],
        [ 0.4091,  0.6390],
        [ 0.2436,  0.4182],
        [ 0.2653,  0.6668],
        [ 0.2728,  0.3242]])


In [27]:
print(key.shape)
print(value.shape)

torch.Size([6, 2])
torch.Size([6, 2])


In [28]:
#attention score
key_2=key[1]
attn_score=query_2.dot(key_2)
print(f"the attention scoer {attn_score}")

the attention scoer 0.13763877749443054


In [29]:
#general attention score of the token
attn_score=query_2@key.T

In [30]:
attn_score

tensor([ 0.2172,  0.1376,  0.1730, -0.0491,  0.7616, -0.3809])

In [31]:
d_k=key.shape[1]
attn_weight_2=torch.softmax(attn_score/d_k**0.5,dim=-1)

In [32]:
attn_weight_2

tensor([0.1704, 0.1611, 0.1652, 0.1412, 0.2505, 0.1117])

In [33]:
context_value_2=attn_weight_2@value

In [36]:
print(context_value_2)

tensor([0.2854, 0.4081])


# lets formalize the process