In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Reproducibility: fix the global RNG seed so the random embeddings below (and
# any randomly initialised layers created afterwards) are identical on every
# "Restart Kernel -> Run All".
SEED = 0
torch.manual_seed(SEED)

# Example input: a batch of sentences represented by word embeddings.
# Here we use 2 sentences, each with 3 words, and an embedding size of 4.
batch_size = 2
seq_length = 3
embedding_dim = 4

# Random input embeddings, drawn uniformly from [0, 1),
# with shape (batch_size, seq_length, embedding_dim).
input_embeddings = torch.rand(batch_size, seq_length, embedding_dim)

print("Input embeddings:", input_embeddings)



Input embeddings: tensor([[[0.3737, 0.9901, 0.2007, 0.5810],
         [0.5722, 0.1240, 0.1568, 0.2650],
         [0.2675, 0.7471, 0.7178, 0.2514]],

        [[0.2130, 0.4490, 0.0515, 0.8130],
         [0.6847, 0.5774, 0.8823, 0.9048],
         [0.3371, 0.3823, 0.2571, 0.8125]]])


Step 1: Define Linear Layers for Q, K, V Transformations
We need to define the linear transformations for Q, K, and V:

In [2]:
# Projection sizes: queries and keys use d_k features, values use d_v.
d_k = d_v = 4

# One learnable linear map per role (query, key, value), each consuming
# embedding_dim-sized input vectors.
W_q = nn.Linear(in_features=embedding_dim, out_features=d_k)
W_k = nn.Linear(in_features=embedding_dim, out_features=d_k)
W_v = nn.Linear(in_features=embedding_dim, out_features=d_v)


Examine the parameters (weights and biases) of a linear layer

In [5]:
# Show the learnable parameters of the query projection. sep="\n" puts each
# label and its tensor on separate lines.
print("Weights:", W_q.weight, sep="\n")
print("\nBiases:", W_q.bias, sep="\n")


Weights:
Parameter containing:
tensor([[-0.3365, -0.1809, -0.0087, -0.4713],
        [-0.0403, -0.0960, -0.2113,  0.1327],
        [-0.2775,  0.3236, -0.0979,  0.2060],
        [-0.3081,  0.3066,  0.3339, -0.2492]], requires_grad=True)

Biases:
Parameter containing:
tensor([-0.0809,  0.2758, -0.2320,  0.2945], requires_grad=True)


Step 2: Apply Linear Transformations to Get Q, K, V
We transform the input embeddings into Q, K, and V:

In [13]:
# Run the same embeddings through each of the three projections.
print(input_embeddings.shape)
Q, K, V = (projection(input_embeddings) for projection in (W_q, W_k, W_v))
# Q and K: (batch_size, seq_length, d_k); V: (batch_size, seq_length, d_v)

print(f"Q: {Q}", f"K: {K}", f"V: {V}", sep="\n")


torch.Size([2, 3, 4])
Q: tensor([[[-0.6613,  0.2004,  0.0847,  0.4051],
         [-0.4221,  0.2429, -0.3114,  0.1426],
         [-0.4308,  0.0750, -0.0830,  0.6181]],

        [[-0.6174,  0.3211,  0.0166,  0.1811],
         [-0.8498,  0.1264, -0.1352,  0.3296],
         [-0.6486,  0.2790, -0.0596,  0.1912]]], grad_fn=<ViewBackward0>)
K: tensor([[[-0.2621, -0.4322,  0.4397, -0.1514],
         [-0.6064,  0.0798,  0.3596,  0.1834],
         [-0.5191,  0.0446,  0.6768, -0.4237]],

        [[-0.3610, -0.4171,  0.1336,  0.0934],
         [-0.9146, -0.0535,  0.6703, -0.0579],
         [-0.5351, -0.2750,  0.2517,  0.0883]]], grad_fn=<ViewBackward0>)
V: tensor([[[0.5564, 0.6698, 0.3383, 0.8247],
         [0.1886, 0.2945, 0.1173, 0.2909],
         [0.6719, 0.6491, 0.6703, 0.8188]],

        [[0.2928, 0.4051, 0.0033, 0.6127],
         [0.4489, 1.0383, 0.3072, 1.1090],
         [0.2925, 0.5191, 0.0530, 0.6824]]], grad_fn=<ViewBackward0>)


My Tests: batched matrix multiplication with broadcasting

In [18]:
# Broadcasting check for batched matmul: the 2-D right operand is applied to
# the last two dimensions of the 4-D left operand.
A1 = torch.rand((4, 5, 3, 4))
A2 = torch.rand((4, 2))

A3 = torch.matmul(A1, A2)

print(A3.size())

torch.Size([4, 5, 3, 2])


Code example from this page:
https://www.analyticsvidhya.com/blog/2024/04/understanding-transformers-a-deep-dive-into-nlps-core-technology/

In [6]:
#import libraries
import torch
import torch.nn.functional as F

# Toy input: 3 rows, each a 3-dimensional feature vector.
input_sequence = torch.tensor([
    [0.1, 0.2, 0.3],
    [0.4, 0.5, 0.6],
    [0.7, 0.8, 0.9],
])

# Square random projection matrices (feature_dim x feature_dim), sampled in
# the order key -> query -> value.
random_weights_key, random_weights_query, random_weights_value = (
    torch.randn(input_sequence.shape[-1], input_sequence.shape[-1])
    for _ in range(3)
)

# Project the input into key, query and value spaces.
key = input_sequence @ random_weights_key
query = input_sequence @ random_weights_query
value = input_sequence @ random_weights_value

# Scaled dot-product scores: query/key dot products divided by the square
# root of the feature dimension.
attention_scores = (query @ key.t()) / torch.tensor(
    query.shape[-1], dtype=torch.float32
).sqrt()

# Softmax over the last dimension turns each row of scores into weights.
attention_weights = attention_scores.softmax(dim=-1)

# Each output row is the attention-weighted combination of the value rows.
output = attention_weights @ value

print("Output after self-attention:", output, sep="\n")

Output after self-attention:
tensor([[ 0.2783, -1.5008,  0.6212],
        [ 0.2857, -1.5538,  0.6385],
        [ 0.2930, -1.6056,  0.6554]])


### Size of Q, K, and V Matrices

1. **Input Sequence:**
   - Number of words (sequence length): \( N = 10 \)
   - Embedding dimension: \( d_{\text{model}} = 512 \)

2. **Weights for Q, K, and V:**
   - The weight matrices \( W_Q \), \( W_K \), and \( W_V \) are used to project the input embeddings into the query, key, and value spaces.

3. **Dimensionality of Q, K, and V:**
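   - The weight matrices are typically \( d_{\text{model}} \times d_k \) for \( W_Q \) and \( W_K \), and \( d_{\text{model}} \times d_v \) for \( W_V \), so \( Q \) and \( K \) have shape \( N \times d_k \) and \( V \) has shape \( N \times d_v \). In single-head examples \( d_k \) and \( d_v \) are often set equal to \( d_{\text{model}} \) for simplicity; in multi-head attention with \( h \) heads they are typically \( d_{\text{model}} / h \) per head (e.g. \( 512 / 8 = 64 \)).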

4. **Projection Matrices:**
   - \( W_Q \in \mathbb{R}^{d_{\text{model}} \times d_k} \)
   - \( W_K \in \mathbb{R}^{d_{\text{model}} \times d_k} \)
   - \( W_V \in \mathbb{R}^{d_{\text{model}} \times d_v} \)

For simplicity, let's assume \( d_k = d_v = d_{\text{model}} \):

- \( W_Q \in \mathbb{R}^{512 \times 512} \)
- \( W_K \in \mathbb{R}^{512 \times 512} \)
- \( W_V \in \mathbb{R}^{512 \times 512} \)

### Computation of Q, K, and V:

Given an input sequence \( X \) of shape \( (N, d_{\text{model}}) = (10, 512) \):

- **Query Matrix (Q):**
  \[ Q = X \cdot W_Q \]
  - Shape of \( Q \): \( (10, 512) \)

- **Key Matrix (K):**
  \[ K = X \cdot W_K \]
  - Shape of \( K \): \( (10, 512) \)

- **Value Matrix (V):**
  \[ V = X \cdot W_V \]
  - Shape of \( V \): \( (10, 512) \)

### Attention Mechanism:

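The attention output is computed from the scaled dot-product scores \( QK^T / \sqrt{d_k} \):
\[ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V \]
Here \( QK^T \) has shape \( (10, 10) \) (one score per query-key pair), the softmax is applied row-wise, and the final output has shape \( (10, 512) \).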

### Example Calculation:

1. **Input Sequence:**
   - \( X \) has a shape of \( (10, 512) \).

2. **Weight Matrices:**
   - \( W_Q \), \( W_K \), and \( W_V \) each have a shape of \( (512, 512) \).

3. **Query, Key, and Value Projections:**
   - \( Q = X \cdot W_Q \): Shape \( (10, 512) \)
   - \( K = X \cdot W_K \): Shape \( (10, 512) \)
   - \( V = X \cdot W_V \): Shape \( (10, 512) \)

### Summary

In this scenario, with an input sequence of 10 words and 512-dimensional embeddings:
- The query, key, and value matrices \( Q \), \( K \), and \( V \) will each have a shape of \( (10, 512) \).
- The weight matrices \( W_Q \), \( W_K \), and \( W_V \) will each have a shape of \( (512, 512) \).

These dimensions ensure that the attention mechanism can compute attention scores and attended values effectively, capturing the relationships and dependencies between tokens in the sequence.


In [None]:
# ChatGPT Generated Code
import torch

# Example input: 10 words, each with a 512-dimensional embedding.
input_sequence = torch.randn(10, 512)

# Square random projection matrices (embedding_dim x embedding_dim), sampled
# in the order key -> query -> value.
random_weights_key, random_weights_query, random_weights_value = (
    torch.randn(input_sequence.shape[-1], input_sequence.shape[-1])
    for _ in range(3)
)

print("Size of input_sequence:", input_sequence.shape)
print("Size of random_weights_key:", random_weights_key.shape)
print("Size of random_weights_query:", random_weights_query.shape)
print("Size of random_weights_value:", random_weights_value.shape)

# Right-multiply the input by each weight matrix to obtain Q, K and V.
Q = torch.matmul(input_sequence, random_weights_query)
K = torch.matmul(input_sequence, random_weights_key)
V = torch.matmul(input_sequence, random_weights_value)

print("Size of Q:", Q.shape)
print("Size of K:", K.shape)
print("Size of V:", V.shape)
