## Example 1

In [22]:
import torch
import torch.nn as nn

# Packed q/k/v input-projection weights, shape (3 * embed_dim, embed_dim).
# nn.MultiheadAttention splits this matrix into three equal row blocks:
# rows 0-2 project the query, rows 3-5 the key and rows 6-8 the value.
custom_weights = torch.tensor(
    [
        [-0.3561,  0.3674, -0.5108],
        [ 0.5146, -0.4764, -0.1490],
        [ 0.5072, -0.2932, -0.5633],
        [-0.4932, -0.4468,  0.0736],
        [-0.6879, -0.4689, -0.1026],
        [ 0.1847,  0.1858,  0.4469],
        [-0.4110, -0.4083, -0.5549],
        [ 0.3921, -0.0746, -0.1336],
        [-0.6555, -0.3418, -0.2980],
    ],
    dtype=torch.float32,
)

# Output-projection weights, shape (embed_dim, embed_dim).
custom_out_proj = torch.tensor(
    [
        [-0.3601,  0.2771, -0.0573],
        [-0.0896,  0.0567, -0.2882],
        [ 0.3200,  0.1517,  0.0580],
    ],
    dtype=torch.float32,
)

# One sequence holding a single 3-dimensional token: (batch, seq_len, embed_dim).
x = torch.tensor([[[-0.1, 0.1, 0.3]]])

# Single-head, bias-free attention layer that takes batch-first input.
layer = nn.MultiheadAttention(embed_dim=3, num_heads=1, bias=False, batch_first=True)

# Replace the randomly initialised weights with the fixed ones defined above
# so that the printed result is reproducible.
layer.in_proj_weight = nn.Parameter(custom_weights)
layer.out_proj.weight = nn.Parameter(custom_out_proj)

# Self-attention: the same tensor serves as query, key and value.
output_tensor, attn_output_weights = layer(query=x, key=x, value=x)

# Show the attention output (the tensor itself, not just its shape).
print(output_tensor)

tensor([[[ 0.0391,  0.0267, -0.0697]]], grad_fn=<TransposeBackward0>)


In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Input laid out as (batch, seq_len, embed_dim): one sequence with one token.
x = torch.tensor([[[-0.1, 0.1, 0.3]]])

# Fixed projection weights, each of shape (embed_dim, embed_dim):
# q, k and v project the input to queries, keys and values; o is the output
# projection applied after attention.
q = torch.tensor(  [[-0.3561,  0.3674, -0.5108],
                    [ 0.5146, -0.4764, -0.1490],
                    [ 0.5072, -0.2932, -0.5633]]).float()
k = torch.tensor(  [[-0.4932, -0.4468,  0.0736],
                    [-0.6879, -0.4689, -0.1026],
                    [ 0.1847,  0.1858,  0.4469]]).float()
v = torch.tensor(  [[-0.4110, -0.4083, -0.5549],
                    [ 0.3921, -0.0746, -0.1336],
                    [-0.6555, -0.3418, -0.2980]]).float()
o = torch.tensor([[-0.3601,  0.2771, -0.0573],
                  [-0.0896,  0.0567, -0.2882],
                  [ 0.3200,  0.1517,  0.0580]]).float()

# Model parameters; batch size and sequence length are read from the input
# instead of being hard-coded.
embed_dim = 3
num_heads = 1
head_dim = embed_dim // num_heads
batch_size, seq_len, _ = x.shape

# Step 1: bias-free linear layers carrying the fixed q/k/v weights
query_proj = nn.Linear(embed_dim, embed_dim, bias=False)
key_proj = nn.Linear(embed_dim, embed_dim, bias=False)
value_proj = nn.Linear(embed_dim, embed_dim, bias=False)

query_proj.weight = nn.Parameter(q)
key_proj.weight = nn.Parameter(k)
value_proj.weight = nn.Parameter(v)

# Step 2: project the input, then split the embedding into heads
query = query_proj(x)
key = key_proj(x)
value = value_proj(x)

# (batch, seq_len, embed_dim) -> (batch, seq_len, num_heads, head_dim)
#                             -> (batch, num_heads, seq_len, head_dim)
# The split has to happen on the embedding axis before the head axis is moved
# forward; viewing directly as (batch, num_heads, seq_len, head_dim) is only
# correct when num_heads == 1.
query = query.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
key = key.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
value = value.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)

# Step 3: scaled dot-product attention, computed independently per head
attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (head_dim ** 0.5)
attention_weights = F.softmax(attention_scores, dim=-1)
context = torch.matmul(attention_weights, value)

# Step 4: concatenate the heads again and apply the output projection
# (batch, num_heads, seq_len, head_dim) -> (batch, seq_len, embed_dim);
# reshape is used because the transposed tensor may not be contiguous.
context = context.transpose(1, 2).reshape(batch_size, seq_len, embed_dim)
out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
out_proj.weight = nn.Parameter(o)
output = out_proj(context)

# Show the attention output
print(output)


tensor([[[ 0.0391,  0.0267, -0.0697]]], grad_fn=<UnsafeViewBackward0>)


In [18]:
# Same computation with the batch-size dimension removed (plain 2-D matrix products)

import torch
import torch.nn as nn
import torch.nn.functional as F

# Unbatched input: one row per token, shape (seq_len, embed_dim).
x = torch.tensor([[-0.1, 0.1, 0.3]])

# Fixed projection weights, each of shape (embed_dim, embed_dim):
# q, k and v project the input to queries, keys and values; o is the output
# projection applied after attention.
q = torch.tensor(  [[-0.3561,  0.3674, -0.5108],
                    [ 0.5146, -0.4764, -0.1490],
                    [ 0.5072, -0.2932, -0.5633]]).float()
k = torch.tensor(  [[-0.4932, -0.4468,  0.0736],
                    [-0.6879, -0.4689, -0.1026],
                    [ 0.1847,  0.1858,  0.4469]]).float()
v = torch.tensor(  [[-0.4110, -0.4083, -0.5549],
                    [ 0.3921, -0.0746, -0.1336],
                    [-0.6555, -0.3418, -0.2980]]).float()
o = torch.tensor([[-0.3601,  0.2771, -0.0573],
                  [-0.0896,  0.0567, -0.2882],
                  [ 0.3200,  0.1517,  0.0580]]).float()

# Model parameters; the sequence length is read from the input.
embed_dim = 3
num_heads = 1
head_dim = embed_dim // num_heads
seq_len = x.shape[0]

# Step 1: linear projections written as plain matrix products (x @ W^T)
query = x@q.T
key = x@k.T
value = x@v.T

# Step 2: split the embedding into heads
# (seq_len, embed_dim) -> (seq_len, num_heads, head_dim)
#                      -> (num_heads, seq_len, head_dim)
# Viewing directly as (num_heads, seq_len, head_dim) is only correct when
# num_heads == 1, so the split is done on the embedding axis first.
query = query.view(seq_len, num_heads, head_dim).transpose(0, 1)
key = key.view(seq_len, num_heads, head_dim).transpose(0, 1)
value = value.view(seq_len, num_heads, head_dim).transpose(0, 1)

# Step 3: scaled dot-product attention, computed independently per head
attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (head_dim ** 0.5)
attention_weights = F.softmax(attention_scores, dim=-1)
context = torch.matmul(attention_weights, value)

# Intermediate values, printed so each step can be checked by hand
print('attention_scores', attention_scores)
print('attention_weights', attention_weights)
print('context', context)

# Step 4: concatenate the heads again and apply the output projection
# (num_heads, seq_len, head_dim) -> (seq_len, embed_dim); reshape is used
# because the transposed tensor may not be contiguous.
context = context.transpose(0, 1).reshape(seq_len, embed_dim)
output = context@o.T

# Show the attention output
print(output)

attention_scores tensor([[[-0.0198]]])
attention_weights tensor([[[1.]]])
context tensor([[[-0.1662, -0.0868, -0.0580]]])
tensor([[ 0.0391,  0.0267, -0.0697]])


## Example 2

In [24]:
import torch
import torch.nn as nn

# Packed q/k/v input-projection weights, shape (3 * embed_dim, embed_dim).
# nn.MultiheadAttention splits this matrix into three equal row blocks:
# rows 0-2 project the query, rows 3-5 the key and rows 6-8 the value.
custom_weights = torch.tensor(
    [
        [-0.3561,  0.3674, -0.5108],
        [ 0.5146, -0.4764, -0.1490],
        [ 0.5072, -0.2932, -0.5633],
        [-0.4932, -0.4468,  0.0736],
        [-0.6879, -0.4689, -0.1026],
        [ 0.1847,  0.1858,  0.4469],
        [-0.4110, -0.4083, -0.5549],
        [ 0.3921, -0.0746, -0.1336],
        [-0.6555, -0.3418, -0.2980],
    ],
    dtype=torch.float32,
)

# Output-projection weights, shape (embed_dim, embed_dim).
custom_out_proj = torch.tensor(
    [
        [-0.3601,  0.2771, -0.0573],
        [-0.0896,  0.0567, -0.2882],
        [ 0.3200,  0.1517,  0.0580],
    ],
    dtype=torch.float32,
)

# One sequence of two 3-dimensional tokens: (batch, seq_len, embed_dim).
# The values are fixed (presumably a saved torch.randn(1, 2, 3) draw) so that
# the printed result is reproducible.
x = torch.tensor(
    [
        [
            [-0.1767, -0.2996, -0.6140],
            [ 0.4852, -1.1095, -0.3858],
        ]
    ]
)

# Single-head, bias-free attention layer that takes batch-first input.
layer = nn.MultiheadAttention(embed_dim=3, num_heads=1, bias=False, batch_first=True)

# Replace the randomly initialised weights with the fixed ones defined above.
layer.in_proj_weight = nn.Parameter(custom_weights)
layer.out_proj.weight = nn.Parameter(custom_out_proj)

# Self-attention: the same tensor serves as query, key and value.
output_tensor, attn_output_weights = layer(query=x, key=x, value=x)

# Show the attention output (the tensor itself, not just its shape).
print(output_tensor)

tensor([[[-0.1469, -0.1176,  0.2046],
         [-0.1481, -0.1185,  0.2045]]], grad_fn=<TransposeBackward0>)


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Input laid out as (batch, seq_len, embed_dim): one sequence with two tokens.
x = torch.tensor([[[-0.1767, -0.2996, -0.6140],
                   [ 0.4852, -1.1095, -0.3858]]])

# Fixed projection weights, each of shape (embed_dim, embed_dim):
# q, k and v project the input to queries, keys and values; o is the output
# projection applied after attention.
q = torch.tensor(  [[-0.3561,  0.3674, -0.5108],
                    [ 0.5146, -0.4764, -0.1490],
                    [ 0.5072, -0.2932, -0.5633]]).float()
k = torch.tensor(  [[-0.4932, -0.4468,  0.0736],
                    [-0.6879, -0.4689, -0.1026],
                    [ 0.1847,  0.1858,  0.4469]]).float()
v = torch.tensor(  [[-0.4110, -0.4083, -0.5549],
                    [ 0.3921, -0.0746, -0.1336],
                    [-0.6555, -0.3418, -0.2980]]).float()
o = torch.tensor([[-0.3601,  0.2771, -0.0573],
                  [-0.0896,  0.0567, -0.2882],
                  [ 0.3200,  0.1517,  0.0580]]).float()

# Model parameters; batch size and sequence length are read from the input
# instead of being hard-coded.
embed_dim = 3
num_heads = 1
head_dim = embed_dim // num_heads
batch_size, seq_len, _ = x.shape

# Step 1: bias-free linear layers carrying the fixed q/k/v weights
query_proj = nn.Linear(embed_dim, embed_dim, bias=False)
key_proj = nn.Linear(embed_dim, embed_dim, bias=False)
value_proj = nn.Linear(embed_dim, embed_dim, bias=False)

query_proj.weight = nn.Parameter(q)
key_proj.weight = nn.Parameter(k)
value_proj.weight = nn.Parameter(v)

# Step 2: project the input, then split the embedding into heads
query = query_proj(x)
key = key_proj(x)
value = value_proj(x)

# (batch, seq_len, embed_dim) -> (batch, seq_len, num_heads, head_dim)
#                             -> (batch, num_heads, seq_len, head_dim)
# The split has to happen on the embedding axis before the head axis is moved
# forward; viewing directly as (batch, num_heads, seq_len, head_dim) is only
# correct when num_heads == 1.
query = query.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
key = key.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
value = value.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)

# Step 3: scaled dot-product attention, computed independently per head
attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (head_dim ** 0.5)
attention_weights = F.softmax(attention_scores, dim=-1)
context = torch.matmul(attention_weights, value)

# Step 4: concatenate the heads again and apply the output projection
# (batch, num_heads, seq_len, head_dim) -> (batch, seq_len, embed_dim);
# reshape is used because the transposed tensor may not be contiguous.
context = context.transpose(1, 2).reshape(batch_size, seq_len, embed_dim)
out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
out_proj.weight = nn.Parameter(o)
output = out_proj(context)

# Print the shape of the output tensor, then the tensor itself
print(output.shape)
print(output)

torch.Size([1, 2, 3])
tensor([[[-0.1469, -0.1176,  0.2046],
         [-0.1481, -0.1185,  0.2045]]], grad_fn=<UnsafeViewBackward0>)


In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Unbatched input: one row per token, shape (seq_len, embed_dim).
# NOTE: this input is not the one used by the other Example 2 cells (its first
# row is the Example 1 token), so the result printed here is not expected to
# match theirs.
x = torch.tensor([[-0.1, 0.1,  0.3],
                  [ 0.4, -1.1, -0.3]])

# Fixed projection weights, each of shape (embed_dim, embed_dim):
# q, k and v project the input to queries, keys and values; o is the output
# projection applied after attention.
q = torch.tensor(  [[-0.3561,  0.3674, -0.5108],
                    [ 0.5146, -0.4764, -0.1490],
                    [ 0.5072, -0.2932, -0.5633]]).float()
k = torch.tensor(  [[-0.4932, -0.4468,  0.0736],
                    [-0.6879, -0.4689, -0.1026],
                    [ 0.1847,  0.1858,  0.4469]]).float()
v = torch.tensor(  [[-0.4110, -0.4083, -0.5549],
                    [ 0.3921, -0.0746, -0.1336],
                    [-0.6555, -0.3418, -0.2980]]).float()
o = torch.tensor([[-0.3601,  0.2771, -0.0573],
                  [-0.0896,  0.0567, -0.2882],
                  [ 0.3200,  0.1517,  0.0580]]).float()

# Model parameters; the sequence length is read from the input.
embed_dim = 3
num_heads = 1
head_dim = embed_dim // num_heads
seq_len = x.shape[0]

# Step 1: linear projections written as plain matrix products (x @ W^T)
query = x@q.T
key = x@k.T
value = x@v.T

# Step 2: split the embedding into heads
# (seq_len, embed_dim) -> (seq_len, num_heads, head_dim)
#                      -> (num_heads, seq_len, head_dim)
# Viewing directly as (num_heads, seq_len, head_dim) is only correct when
# num_heads == 1, so the split is done on the embedding axis first.
query = query.view(seq_len, num_heads, head_dim).transpose(0, 1)
key = key.view(seq_len, num_heads, head_dim).transpose(0, 1)
value = value.view(seq_len, num_heads, head_dim).transpose(0, 1)

# Step 3: scaled dot-product attention, computed independently per head
attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (head_dim ** 0.5)
attention_weights = F.softmax(attention_scores, dim=-1)
context = torch.matmul(attention_weights, value)

# Intermediate values, printed so each step can be checked by hand
print('attention_scores', attention_scores)
print('attention_weights', attention_weights)
print('context', context)

# Step 4: concatenate the heads again and apply the output projection
# (num_heads, seq_len, head_dim) -> (seq_len, embed_dim); reshape is used
# because the transposed tensor may not be contiguous.
context = context.transpose(0, 1).reshape(seq_len, embed_dim)
output = context@o.T

# Show the attention output
print(output)

attention_scores tensor([[[-0.0198,  0.0028],
         [ 0.0438, -0.0465]]])
attention_weights tensor([[[0.4944, 0.5056],
         [0.5225, 0.4775]]])
context tensor([[[0.1460, 0.0982, 0.0741],
         [0.1286, 0.0879, 0.0667]]])
tensor([[-0.0296, -0.0289,  0.0659],
        [-0.0258, -0.0258,  0.0583]])
