## Setup



In [None]:
import torch

# Configuration for the PyTorch walkthrough.
num_tokens = 5  # number of rows in Q, K and V (one row per token)
embed_dim = 8   # number of columns in Q, K and V (features per token)
SEED = 42       # fixed RNG seed so a fresh kernel reproduces the same tensors

print(f"Number of tokens: {num_tokens}")
print(f"Embedding dimension: {embed_dim}")

# Seed the global PyTorch generator right before sampling so that re-running
# this cell (or Restart & Run All) always yields identical Q, K and V.
torch.manual_seed(SEED)

# Dummy query / key / value matrices drawn from a standard normal distribution.
Q = torch.randn(num_tokens, embed_dim)
K = torch.randn(num_tokens, embed_dim)
V = torch.randn(num_tokens, embed_dim)

print(f"\nShape of Query (Q) matrix: {Q.shape}")
print(f"Shape of Key (K) matrix: {K.shape}")
print(f"Shape of Value (V) matrix: {V.shape}")

print("Dummy Q, K, V matrices created successfully.")

Number of tokens: 5
Embedding dimension: 8

Shape of Query (Q) matrix: torch.Size([5, 8])
Shape of Key (K) matrix: torch.Size([5, 8])
Shape of Value (V) matrix: torch.Size([5, 8])
Dummy Q, K, V matrices created successfully.


## Compute PyTorch Attention Scores (Scaled)




In [None]:
print("\n--- Calculating PyTorch Scaled Attention Scores ---")

# Raw similarity between every query row and every key row: Q times K transposed.
attention_scores = Q @ K.T

# Divide by sqrt(embed_dim), held as a float32 scalar tensor.
scaling_factor = torch.sqrt(torch.tensor(embed_dim, dtype=torch.float32))
scaled_attention_scores = attention_scores / scaling_factor

print(f"Shape of scaled attention scores: {scaled_attention_scores.shape}")
print(
    "Scaled attention scores (Q @ K.T / sqrt(embed_dim)):\n",
    scaled_attention_scores,
)


--- Calculating PyTorch Scaled Attention Scores ---
Shape of scaled attention scores: torch.Size([5, 5])
Scaled attention scores (Q @ K.T / sqrt(embed_dim)):
 tensor([[-0.9626, -1.0573, -2.0671, -0.7733,  1.0316],
        [ 0.6510, -0.6209, -1.1043, -0.3764, -0.6955],
        [-0.1537, -0.1260, -0.1983,  0.0680,  0.8189],
        [ 0.6628,  0.5203,  2.0698,  0.9484,  1.6436],
        [ 0.9044, -0.2830,  0.5115,  0.2024, -0.6457]])


## Compute Attention Weights (Scaled)



In [None]:
print("\n--- Calculating PyTorch Scaled Attention Weights ---")

# Softmax along the last dimension turns each row of scaled scores into
# weights that sum to 1.
attention_weights = torch.softmax(input=scaled_attention_scores, dim=-1)

print(f"Shape of attention weights: {attention_weights.shape}")
print(
    "Attention Weights (Softmax of Scaled Q @ K.T):\n",
    attention_weights,
)


--- Calculating PyTorch Scaled Attention Weights ---
Shape of attention weights: torch.Size([5, 5])
Attention Weights (Softmax of Scaled Q @ K.T):
 tensor([[0.0926, 0.0843, 0.0307, 0.1119, 0.6805],
        [0.4828, 0.1353, 0.0835, 0.1728, 0.1256],
        [0.1454, 0.1495, 0.1391, 0.1815, 0.3846],
        [0.1005, 0.0872, 0.4105, 0.1337, 0.2681],
        [0.3720, 0.1135, 0.2512, 0.1844, 0.0790]])


## Compute Context Vector

In [None]:
print("\n--- Calculating PyTorch Scaled Context Vector ---")

# Each output row is the attention-weighted combination of the rows of V.
context_vector = attention_weights @ V

print(f"Shape of context vector: {context_vector.shape}")
print(
    "Context Vector (Attention Weights @ V):\n",
    context_vector,
)


--- Calculating PyTorch Scaled Context Vector ---
Shape of context vector: torch.Size([5, 8])
Context Vector (Attention Weights @ V):
 tensor([[-0.1018, -0.6984, -0.2798,  0.8216, -0.2479,  0.0859, -0.2881,  0.6673],
        [ 0.3728, -1.0027,  0.1836,  0.0792,  0.0076, -0.7703, -0.9220, -0.0298],
        [ 0.1161, -0.6166, -0.2179,  0.4638,  0.0122, -0.1560, -0.6473,  0.2247],
        [ 0.3370, -0.6782,  0.0848,  0.2377,  0.1385, -0.5586, -0.7788, -0.3333],
        [ 0.4861, -0.9184,  0.2096, -0.0226,  0.1405, -0.8342, -0.9711, -0.3166]])


In [None]:
# Show the three input matrices, each under its own caption.
print("\n--- Initial Q, K, V Matrices (PyTorch) ---")
for _caption, _tensor in (
    ("Original Query (Q) matrix representing token representations for queries:\n", Q),
    ("\nOriginal Key (K) matrix representing token representations for keys:\n", K),
    ("\nOriginal Value (V) matrix representing token representations for values:\n", V),
):
    print(_caption, _tensor)


--- Initial Q, K, V Matrices (PyTorch) ---
Original Query (Q) matrix representing token representations for queries:
 tensor([[ 0.6977,  0.4826, -1.8565,  1.3198,  0.3462,  0.7798, -0.4995, -0.9835],
        [ 0.2701,  1.9504,  0.2245,  1.2016, -0.1669, -0.3142,  0.5633, -0.3367],
        [ 0.0395, -0.3411, -0.5176, -1.8177,  1.0107,  0.4173,  1.4420, -1.3674],
        [ 1.1321,  0.5711,  0.1820, -1.7744, -0.5664, -0.2617, -0.5426, -0.6553],
        [-1.0340,  0.3379,  1.4092, -0.3155,  0.0545, -0.9417,  0.2377, -0.0266]])

Original Key (K) matrix representing token representations for keys:
 tensor([[ 0.7875,  0.5182,  1.2047, -0.1642,  0.2804, -1.4729,  0.1833, -0.0753],
        [ 1.6376, -1.3338,  0.2757, -0.3444, -1.3397, -0.7225,  1.1119,  0.9560],
        [ 0.1045,  0.1098,  0.3895, -2.5926, -2.0610, -0.5188, -0.8200,  1.1382],
        [ 1.0796, -0.2785,  0.7774, -0.7364,  0.1170, -0.5735, -0.3661,  0.1700],
        [ 0.1256,  0.3077, -0.4248, -1.3286, -0.2098,  1.2452, -2.0347,

In [None]:
# Re-display the scaled scores computed earlier, with a descriptive caption.
print("\n--- Scaled Attention Scores (PyTorch) ---")
print(
    "Raw attention scores (Q @ K.T / sqrt(embed_dim)) showing scaled similarity between query and key tokens:\n",
    scaled_attention_scores,
)


--- Scaled Attention Scores (PyTorch) ---
Raw attention scores (Q @ K.T / sqrt(embed_dim)) showing scaled similarity between query and key tokens:
 tensor([[-0.9626, -1.0573, -2.0671, -0.7733,  1.0316],
        [ 0.6510, -0.6209, -1.1043, -0.3764, -0.6955],
        [-0.1537, -0.1260, -0.1983,  0.0680,  0.8189],
        [ 0.6628,  0.5203,  2.0698,  0.9484,  1.6436],
        [ 0.9044, -0.2830,  0.5115,  0.2024, -0.6457]])


In [None]:
# Re-display the softmax-normalised weights computed earlier.
print("\n--- Scaled Attention Weights (PyTorch) ---")
print(
    "Normalized attention weights (Softmax of Scaled Q @ K.T) showing how much each query token attends to each key token:\n",
    attention_weights,
)


--- Scaled Attention Weights (PyTorch) ---
Normalized attention weights (Softmax of Scaled Q @ K.T) showing how much each query token attends to each key token:
 tensor([[0.0926, 0.0843, 0.0307, 0.1119, 0.6805],
        [0.4828, 0.1353, 0.0835, 0.1728, 0.1256],
        [0.1454, 0.1495, 0.1391, 0.1815, 0.3846],
        [0.1005, 0.0872, 0.4105, 0.1337, 0.2681],
        [0.3720, 0.1135, 0.2512, 0.1844, 0.0790]])


In [None]:
# Re-display the context vectors (attention weights applied to V).
print("\n--- Scaled Context Vector (PyTorch) ---")
print(
    "Final context vector (Attention Weights @ V) representing the context-aware representation for each token:\n",
    context_vector,
)


--- Scaled Context Vector (PyTorch) ---
Final context vector (Attention Weights @ V) representing the context-aware representation for each token:
 tensor([[-0.1018, -0.6984, -0.2798,  0.8216, -0.2479,  0.0859, -0.2881,  0.6673],
        [ 0.3728, -1.0027,  0.1836,  0.0792,  0.0076, -0.7703, -0.9220, -0.0298],
        [ 0.1161, -0.6166, -0.2179,  0.4638,  0.0122, -0.1560, -0.6473,  0.2247],
        [ 0.3370, -0.6782,  0.0848,  0.2377,  0.1385, -0.5586, -0.7788, -0.3333],
        [ 0.4861, -0.9184,  0.2096, -0.0226,  0.1405, -0.8342, -0.9711, -0.3166]])


In [None]:
# NOTE(review): this cell needs `tf`, `Q_tf`, `K_tf` and `embed_dim`, but the
# TensorFlow import and the cell that creates Q_tf / K_tf / V_tf appear further
# down in this notebook. Run that setup cell first (or move it above this one),
# otherwise a fresh kernel fails here with a NameError.
print("\n--- Calculating TensorFlow Scaled Attention Scores ---")

# Raw similarity between every query row and every key row.
attention_scores_tf_raw = Q_tf @ tf.transpose(K_tf)

# Divide by sqrt(embed_dim), computed as a float32 scalar.
scaling_factor_tf = tf.sqrt(tf.cast(embed_dim, dtype=tf.float32))
scaled_attention_scores_tf = attention_scores_tf_raw / scaling_factor_tf

print(f"Shape of scaled attention scores (TensorFlow): {scaled_attention_scores_tf.shape}")
print(
    "Scaled attention scores (Q_tf @ K_tf.T / sqrt(embed_dim)):\n",
    scaled_attention_scores_tf,
)


--- Calculating TensorFlow Scaled Attention Scores ---
Shape of scaled attention scores (TensorFlow): (5, 5)
Scaled attention scores (Q_tf @ K_tf.T / sqrt(embed_dim)):
 tf.Tensor(
[[-1.1715143  -0.70157117  1.8508699   0.7938507  -0.47439685]
 [-0.9528719  -0.04579314  1.5392618  -0.48467758  0.74428874]
 [-0.294289    1.1593198   0.17506891  0.61089295  0.10231236]
 [-0.9951796  -0.66143984  0.21328577 -0.67285043 -0.04337731]
 [ 0.19320318 -0.7317817  -2.87341    -0.5238799  -0.87627673]], shape=(5, 5), dtype=float32)


In [None]:
print("\n--- Calculating TensorFlow Scaled Attention Weights ---")

# Softmax along the last axis turns each row of scaled scores into weights
# that sum to 1.
attention_weights_tf = tf.nn.softmax(logits=scaled_attention_scores_tf, axis=-1)

print(f"Shape of attention weights (TensorFlow): {attention_weights_tf.shape}")
print(
    "Attention Weights (Softmax of Scaled Q_tf @ K_tf.T):\n",
    attention_weights_tf,
)


--- Calculating TensorFlow Scaled Attention Weights ---
Shape of attention weights (TensorFlow): (5, 5)
Attention Weights (Softmax of Scaled Q_tf @ K_tf.T):
 tf.Tensor(
[[0.03097358 0.04955472 0.63620365 0.22107443 0.06219358]
 [0.04420935 0.1095099  0.5343601  0.0706071  0.2413135 ]
 [0.09227954 0.39482048 0.14755195 0.22815023 0.1371978 ]
 [0.10293112 0.14371035 0.3446486  0.14207985 0.26663008]
 [0.4396615  0.17434223 0.02047884 0.21463138 0.15088609]], shape=(5, 5), dtype=float32)


In [None]:
print("\n--- Calculating TensorFlow Scaled Context Vector ---")

# Each output row is the attention-weighted combination of the rows of V_tf.
context_vector_tf = attention_weights_tf @ V_tf

print(f"Shape of context vector (TensorFlow): {context_vector_tf.shape}")
print(
    "Context Vector (Attention Weights @ V_tf):\n",
    context_vector_tf,
)


--- Calculating TensorFlow Scaled Context Vector ---
Shape of context vector (TensorFlow): (5, 8)
Context Vector (Attention Weights @ V_tf):
 tf.Tensor(
[[ 1.1241323  -0.59088886 -0.19652964  0.21491177  0.48043168 -0.08062097
  -0.971363   -0.36646673]
 [ 1.06236     0.03360325 -0.40229282  0.5087005   0.21427517 -0.14786546
  -0.7432229  -0.5621588 ]
 [ 0.27724195 -0.03608703 -1.2358981   0.63904124 -0.45489448 -0.14722407
  -0.02529149 -0.52401996]
 [ 0.6535382  -0.0276138  -0.4779798   0.2838771  -0.24898633 -0.11048048
  -0.5758577  -0.46360675]
 [ 0.00838823 -0.5870342  -0.5820206  -0.55757934 -0.8990232   0.41084766
  -0.362198    0.06709458]], shape=(5, 8), dtype=float32)


In [None]:
import tensorflow as tf

# Show the three TensorFlow input matrices, each under its own caption.
print("\n--- Initial Q, K, V Matrices (TensorFlow) ---")
for _caption, _tensor in (
    ("Original Query (Q_tf) matrix representing token representations for queries:\n", Q_tf),
    ("\nOriginal Key (K_tf) matrix representing token representations for keys:\n", K_tf),
    ("\nOriginal Value (V_tf) matrix representing token representations for values:\n", V_tf),
):
    print(_caption, _tensor)


--- Initial Q, K, V Matrices (TensorFlow) ---
Original Query (Q_tf) matrix representing token representations for queries:
 tf.Tensor(
[[-0.9146993  -1.2328101   0.27749616 -1.0733912   0.4440446  -1.3729824
   1.0591654   2.1605346 ]
 [-1.7342167  -2.11384    -0.03782409 -1.4539984   0.49261218  0.05395482
  -1.1483284   0.6430254 ]
 [-1.6785321   0.09654163  1.2842544  -1.1607221  -0.41235134 -1.1613528
   1.0900464  -0.91839594]
 [-1.1268284   0.15216608  0.84470975 -0.7586678  -0.34276998  2.2665145
  -0.28306943  0.3309613 ]
 [ 1.9150887   0.09954559  1.2366453   0.18552396 -1.3960669   0.58268857
   1.0396348  -0.27854902]], shape=(5, 8), dtype=float32)

Original Key (K_tf) matrix representing token representations for keys:
 tf.Tensor(
[[ 0.26634452  0.3253534  -0.828123    0.7098225  -1.1356668  -0.7047835
  -0.47131383 -0.7596592 ]
 [-0.59167933  0.8698879  -0.93958133 -0.9433399  -0.8200248  -1.1388083
  -0.51563406 -1.3229511 ]
 [-2.2696419  -0.44619924 -0.57590055  1.08906

In [None]:
# Re-display the TensorFlow scaled scores computed earlier.
print("\n--- Scaled Attention Scores (TensorFlow) ---")
print(
    "Raw attention scores (Q_tf @ K_tf.T / sqrt(embed_dim)) showing scaled similarity between query and key tokens:\n",
    scaled_attention_scores_tf,
)


--- Scaled Attention Scores (TensorFlow) ---
Raw attention scores (Q_tf @ K_tf.T / sqrt(embed_dim)) showing scaled similarity between query and key tokens:
 tf.Tensor(
[[-1.1715143  -0.70157117  1.8508699   0.7938507  -0.47439685]
 [-0.9528719  -0.04579314  1.5392618  -0.48467758  0.74428874]
 [-0.294289    1.1593198   0.17506891  0.61089295  0.10231236]
 [-0.9951796  -0.66143984  0.21328577 -0.67285043 -0.04337731]
 [ 0.19320318 -0.7317817  -2.87341    -0.5238799  -0.87627673]], shape=(5, 5), dtype=float32)


In [None]:
# Re-display the TensorFlow softmax-normalised weights computed earlier.
print("\n--- Scaled Attention Weights (TensorFlow) ---")
print(
    "Normalized attention weights (Softmax of Scaled Q_tf @ K_tf.T) showing how much each query token attends to each key token:\n",
    attention_weights_tf,
)


--- Scaled Attention Weights (TensorFlow) ---
Normalized attention weights (Softmax of Scaled Q_tf @ K_tf.T) showing how much each query token attends to each key token:
 tf.Tensor(
[[0.03097358 0.04955472 0.63620365 0.22107443 0.06219358]
 [0.04420935 0.1095099  0.5343601  0.0706071  0.2413135 ]
 [0.09227954 0.39482048 0.14755195 0.22815023 0.1371978 ]
 [0.10293112 0.14371035 0.3446486  0.14207985 0.26663008]
 [0.4396615  0.17434223 0.02047884 0.21463138 0.15088609]], shape=(5, 5), dtype=float32)


In [None]:
# Re-display the TensorFlow context vectors (attention weights applied to V_tf).
print("\n--- Scaled Context Vector (TensorFlow) ---")
print(
    "Final context vector (Attention Weights @ V_tf) representing the context-aware representation for each token:\n",
    context_vector_tf,
)


--- Scaled Context Vector (TensorFlow) ---
Final context vector (Attention Weights @ V_tf) representing the context-aware representation for each token:
 tf.Tensor(
[[ 1.1241323  -0.59088886 -0.19652964  0.21491177  0.48043168 -0.08062097
  -0.971363   -0.36646673]
 [ 1.06236     0.03360325 -0.40229282  0.5087005   0.21427517 -0.14786546
  -0.7432229  -0.5621588 ]
 [ 0.27724195 -0.03608703 -1.2358981   0.63904124 -0.45489448 -0.14722407
  -0.02529149 -0.52401996]
 [ 0.6535382  -0.0276138  -0.4779798   0.2838771  -0.24898633 -0.11048048
  -0.5758577  -0.46360675]
 [ 0.00838823 -0.5870342  -0.5820206  -0.55757934 -0.8990232   0.41084766
  -0.362198    0.06709458]], shape=(5, 8), dtype=float32)


In [None]:
import tensorflow as tf

# Configuration for the TensorFlow walkthrough.
# NOTE(review): the TensorFlow cells that consume Q_tf / K_tf / V_tf appear
# earlier in this notebook; this cell has to run before them (ideally it should
# be moved above them) for Restart & Run All to succeed.
num_tokens = 5  # number of rows in Q_tf, K_tf and V_tf (one row per token)
embed_dim = 8   # number of columns in Q_tf, K_tf and V_tf (features per token)
SEED = 42       # fixed RNG seed so a fresh kernel reproduces the same tensors

print(f"Number of tokens: {num_tokens}")
print(f"Embedding dimension: {embed_dim}")

# Set TensorFlow's global seed right before sampling so that the sequence of
# random tensors is the same on every fresh run of the notebook.
tf.random.set_seed(SEED)

# Dummy query / key / value matrices drawn from a normal distribution.
Q_tf = tf.random.normal((num_tokens, embed_dim))
K_tf = tf.random.normal((num_tokens, embed_dim))
V_tf = tf.random.normal((num_tokens, embed_dim))

print(f"\nShape of Query (Q_tf) matrix: {Q_tf.shape}")
print(f"Shape of Key (K_tf) matrix: {K_tf.shape}")
print(f"Shape of Value (V_tf) matrix: {V_tf.shape}")

print("Dummy Q_tf, K_tf, V_tf matrices created successfully.")

Number of tokens: 5
Embedding dimension: 8

Shape of Query (Q_tf) matrix: (5, 8)
Shape of Key (K_tf) matrix: (5, 8)
Shape of Value (V_tf) matrix: (5, 8)
Dummy Q_tf, K_tf, V_tf matrices created successfully.
