In [1]:
import numpy as np 

def self_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Parameters:
    - Q: Query array of shape (..., n_queries, d_k).
    - K: Key array of shape (..., n_keys, d_k).
    - V: Value array of shape (..., n_keys, d_v).
    - mask: Optional array broadcastable to the scores shape
      (..., n_queries, n_keys). Truthy entries keep their score; falsy
      entries are replaced by a large negative score so that they receive
      (numerically) zero attention weight.

    Returns:
    - A tuple (output, attention_weights) where attention_weights has shape
      (..., n_queries, n_keys) with each row summing to 1, and output is
      attention_weights @ V.

    Raises:
    - ValueError: if K has fewer than two dimensions.
    """
    Q = np.asarray(Q)
    K = np.asarray(K)
    V = np.asarray(V)
    if K.ndim < 2:
        raise ValueError("K must have at least 2 dimensions (n_keys, d_k)")

    # Step 1: Calculate scores by multiplying Q and K^T. Only the last two
    # axes of K are transposed so leading batch dimensions are supported.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2))

    # Step 2: Scale the scores by sqrt(d_k), the key-vector dimension.
    # Out-of-place division by a Python float works for integer inputs
    # (in-place true division would raise) and does not upcast float32.
    d_k = K.shape[-1]
    scores = scores / float(np.sqrt(d_k))

    # Step 3: Apply mask (if any) - mask should be broadcastable to scores shape
    if mask is not None:
        scores = np.where(mask, scores, -1e9)  # large negative value for masked positions

    # Step 4: Softmax to get the attention weights. Subtracting the row
    # maximum leaves the softmax unchanged mathematically but keeps exp()
    # from overflowing to inf (which would yield nan weights).
    if scores.shape[-1]:  # max() is undefined over an empty axis
        scores = scores - np.max(scores, axis=-1, keepdims=True)
    attention_weights = np.exp(scores)
    attention_weights /= np.sum(attention_weights, axis=-1, keepdims=True)

    # Step 5: Multiply the attention weights by the values
    output = np.matmul(attention_weights, V)

    return output, attention_weights

# Demo: run self-attention on a small, reproducible random example.
np.random.seed(0)  # fixed seed so the printed numbers are repeatable

# Three consecutive draws, each a (3, 4) array: 3 tokens with 4-dimensional
# query, key, and value vectors (drawn in that order).
Q, K, V = (np.random.rand(3, 4) for _ in range(3))

output, attention_weights = self_attention(Q, K, V)

print("Self-attention output:\n", output)
print("Attention weights:\n", attention_weights)


Self-attention output:
 [[0.38799283 0.53502878 0.13623257 0.75889379]
 [0.39492337 0.53118004 0.1382761  0.75593604]
 [0.39012207 0.53832171 0.12822185 0.75093991]]
Attention weights:
 [[0.26820899 0.341926   0.38986501]
 [0.25100881 0.35895866 0.39003253]
 [0.25670316 0.31516267 0.42813417]]
