In [2]:
import numpy as np 

def self_attention(Q, K, V, mask=None):
    """
    Computes scaled dot-product attention for the given query (Q), key (K),
    and value (V) matrices.

    Parameters:
    - Q: Query matrix, shape (..., n_queries, d_k)
    - K: Key matrix, shape (..., n_keys, d_k)
    - V: Value matrix, shape (..., n_keys, d_v)
    - mask: Optional mask, broadcastable to the scores shape
      (..., n_queries, n_keys). Truthy entries are kept; falsy entries are
      replaced by a large negative score so they receive ~0 weight.

    Returns:
    - output: attention-weighted combination of the values,
      shape (..., n_queries, d_v)
    - attention_weights: softmax-normalised weights,
      shape (..., n_queries, n_keys); each row sums to 1

    Notes:
    - Array-likes (e.g. nested lists) and integer arrays are accepted; the
      scaled scores are always floating point.
    - A row whose positions are all masked gets uniform weights rather
      than NaN.
    """
    Q = np.asarray(Q)
    K = np.asarray(K)
    V = np.asarray(V)

    # Step 1 + 2: Q @ K^T, scaled by sqrt(d_k).
    # matmul/swapaxes act on the last two axes, so leading batch dimensions
    # are supported; for plain 2-D inputs this equals np.dot(Q, K.T).
    # Out-of-place division so integer inputs promote to float instead of
    # raising on an in-place true divide.
    d_k = K.shape[-1]  # d_k is the dimension of the key vectors
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)

    # Step 3: Apply mask (if any) - mask should be broadcastable to scores shape
    if mask is not None:
        # A finite sentinel (not -inf) keeps fully masked rows free of NaN
        # once the row maximum is subtracted below.
        scores = np.where(mask, scores, -1e9)

    # Step 4: Softmax to get the attention weights.
    # Subtracting the row-wise max leaves the softmax unchanged
    # mathematically but prevents np.exp from overflowing on large scores.
    scores = scores - np.max(scores, axis=-1, keepdims=True)
    attention_weights = np.exp(scores)
    attention_weights /= np.sum(attention_weights, axis=-1, keepdims=True)

    # Step 5: Multiply the attention weights by the values
    output = np.matmul(attention_weights, V)

    return output, attention_weights

# Example usage: 3 tokens, each with 4-dimensional query, key and value vectors.
np.random.seed(0)  # fixed seed so the demo values are repeatable
# The matrices are drawn in the order Q, K, V from the global NumPy RNG.
Q, K, V = (np.random.rand(3, 4) for _ in range(3))

output, attention_weights = self_attention(Q, K, V, mask=None)

print("Self-attention output:\n", output)
print("Attention weights:\n", attention_weights)


import torch 
import torch.nn.functional as F 

# Define input sequence (3 words with 4 features each)
# For simplicity, we assume this is already embedded
input_seq = torch.tensor([
    [1, 0, 1, 0], # Word 1
    [0, 1, 0, 1], # Word 2
    [1, 1, 1, 1] # Word 3
], dtype=torch.float32)

# Define dimensions
d_model = input_seq.size(1) # Feature dimension (4 in this case)

# The projection weights below are randomly initialised; seed the global
# PyTorch RNG so re-running the cell prints the same numbers every time.
torch.manual_seed(0)

# Initialize Query, Key, and Value weight matrices
# Normally these would be learned parameters
W_q = torch.nn.Linear(d_model, d_model, bias=False)
W_k = torch.nn.Linear(d_model, d_model, bias=False)
W_v = torch.nn.Linear(d_model, d_model, bias=False)

# Generate queries, keys, and values
queries = W_q(input_seq)
keys = W_k(input_seq)
values = W_v(input_seq)

# Calculate the attention scores (scaled dot-product)
# Scale factor = sqrt(d_model) to keep scores in a similar range.
# Note the power operator: `d_model * 0.5` (half of d_model) equals
# sqrt(d_model) only for d_model == 4 and is wrong for any other width.
scale_factor = d_model ** 0.5
attention_scores = torch.matmul(
    queries,
    keys.transpose(-2, -1)
) / scale_factor


# Apply softmax to get attention weights (each row sums to 1)
attention_weights = F.softmax(attention_scores, dim=-1)


# Multiply the attention weights by the values
output = torch.matmul(attention_weights, values)

print("Input sequence:")
print(input_seq)
print("\nQueries:")
print(queries)
print("\nKeys:")
print(keys)
print("\nValues:")
print(values)
print("\nAttention Scores:")
print(attention_scores)
print("\nAttention Weights (after softmax):")
print(attention_weights)
print("\nOutput (self-attention result):")
print(output)

Self-attention output:
 [[0.38799283 0.53502878 0.13623257 0.75889379]
 [0.39492337 0.53118004 0.1382761  0.75593604]
 [0.39012207 0.53832171 0.12822185 0.75093991]]
Attention weights:
 [[0.26820899 0.341926   0.38986501]
 [0.25100881 0.35895866 0.39003253]
 [0.25670316 0.31516267 0.42813417]]
Input sequence:
tensor([[1., 0., 1., 0.],
        [0., 1., 0., 1.],
        [1., 1., 1., 1.]])

Queries:
tensor([[-0.1235, -0.4448,  0.6748,  0.7309],
        [ 0.5004, -0.3253,  0.0442, -0.6436],
        [ 0.3769, -0.7701,  0.7190,  0.0873]], grad_fn=<MmBackward0>)

Keys:
tensor([[-0.0025,  0.0747,  0.2400,  0.0152],
        [ 0.0435, -0.1385, -0.5375, -0.2110],
        [ 0.0411, -0.0638, -0.2975, -0.1958]], grad_fn=<MmBackward0>)

Values:
tensor([[ 0.5420, -0.5419,  0.0119, -0.1146],
        [ 0.0555,  0.6403,  0.3940, -0.1727],
        [ 0.5974,  0.0984,  0.4058, -0.2873]], grad_fn=<MmBackward0>)

Attention Scores:
tensor([[ 0.0701, -0.2303, -0.1603],
        [-0.0123,  0.0894,  0.0771],
     