In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
class ScaledDotProductAttention(Layer):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    The layer has no trainable weights; it only combines the tensors it
    is given.
    """

    def __init__(self, **kwargs):
        # Must be the dunder ``__init__``: a plain ``init`` method is never
        # invoked by Python, and ``Layer`` has no ``init`` to delegate to.
        super().__init__(**kwargs)

    def call(self, query, key, value):
        """Apply attention of ``query`` over ``key``/``value``.

        Args:
            query: Tensor whose last two axes are ``(seq_len_q, depth)``.
            key: Tensor whose last two axes are ``(seq_len_k, depth)``.
            value: Tensor whose last two axes are ``(seq_len_k, depth_v)``.
                Leading (batch / head) axes are handled by ``tf.matmul``.

        Returns:
            A tuple ``(output, attention_weights)`` where ``output`` has
            trailing axes ``(seq_len_q, depth_v)`` and ``attention_weights``
            has trailing axes ``(seq_len_q, seq_len_k)``.
        """
        # Raw similarity of every query position with every key position.
        scores = tf.matmul(query, key, transpose_b=True)  # (..., seq_len_q, seq_len_k)

        # Scale by sqrt(d_k). Cast to the scores' dtype (rather than a
        # hard-coded float32) so float16/float64 inputs also work.
        d_k = tf.cast(tf.shape(key)[-1], scores.dtype)
        scaled_scores = scores / tf.sqrt(d_k)

        # Normalise over the key axis so each query's weights sum to 1.
        attention_weights = tf.nn.softmax(scaled_scores, axis=-1)  # (..., seq_len_q, seq_len_k)

        # Weighted sum of the values.
        output = tf.matmul(attention_weights, value)  # (..., seq_len_q, depth_v)
        return output, attention_weights

In [None]:
class MultiHeadAttention(Layer):
    """Multi-head attention built on ``ScaledDotProductAttention``.

    Query, key and value are each projected to ``d_model`` features, split
    into ``num_heads`` heads of ``d_model // num_heads`` features, attended
    independently per head, merged back and passed through a final dense
    layer.
    """

    def __init__(self, num_heads, d_model, **kwargs):
        """Create the projection layers.

        Args:
            num_heads: Number of attention heads (positive integer).
            d_model: Feature size of the projections and of the output.
                Must be divisible by ``num_heads``.
            **kwargs: Forwarded to ``Layer.__init__`` (e.g. ``name``).

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``d_model``.
        """
        # Without this check the reshape in split_heads can silently produce
        # a wrong sequence length instead of failing.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)  # Query transformation
        self.wk = tf.keras.layers.Dense(d_model)  # Key transformation
        self.wv = tf.keras.layers.Dense(d_model)  # Value transformation
        self.dense = tf.keras.layers.Dense(d_model)  # Linear transformation of the output

        # Build the attention sub-layer once instead of on every call().
        self.attention = ScaledDotProductAttention()

    def split_heads(self, x):
        """Reshape ``(batch_size, seq_len, d_model)`` to ``(batch_size, num_heads, seq_len, depth)``."""
        batch_size = tf.shape(x)[0]
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))  # (batch_size, seq_len, num_heads, depth)
        return tf.transpose(x, perm=(0, 2, 1, 3))  # (batch_size, num_heads, seq_len, depth)

    def call(self, query, key, value):
        """Run multi-head attention.

        Args:
            query: Tensor of shape ``(batch_size, seq_len_q, features)``.
            key: Tensor of shape ``(batch_size, seq_len_k, features)``.
            value: Tensor of shape ``(batch_size, seq_len_k, features)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len_q, d_model)``.
        """
        query = self.wq(query)  # (batch_size, seq_len_q, d_model)
        key = self.wk(key)      # (batch_size, seq_len_k, d_model)
        value = self.wv(value)  # (batch_size, seq_len_v, d_model)

        query = self.split_heads(query)  # (batch_size, num_heads, seq_len_q, depth)
        key = self.split_heads(key)      # (batch_size, num_heads, seq_len_k, depth)
        value = self.split_heads(value)  # (batch_size, num_heads, seq_len_v, depth)

        # The attention weights are computed but not returned by this layer.
        output, _attention_weights = self.attention(query, key, value)  # (batch_size, num_heads, seq_len_q, depth)

        # Merge the heads back into a single feature axis.
        output = tf.transpose(output, perm=(0, 2, 1, 3))  # (batch_size, seq_len_q, num_heads, depth)
        output = tf.reshape(output, (tf.shape(output)[0], -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        return self.dense(output)

In [None]:
# Sample sentences (only their count is used, as the batch size)
sentences = [
    "The cat sat on the mat.",
    "The dog barked at the cat.",
    "The mouse ran away."
]

# Demo configuration
SEED = 42      # fixes the random embeddings so re-runs see the same input
SEQ_LEN = 10   # tokens per sentence in the fake embeddings
D_MODEL = 64   # embedding / model dimension
NUM_HEADS = 4  # must divide D_MODEL

# Sample input: for simplicity we use random embeddings here. In practice,
# you would tokenize the sentences and use word embeddings or TF-IDF.
rng = np.random.default_rng(SEED)
embeddings = rng.random((len(sentences), SEQ_LEN, D_MODEL))  # (batch_size, seq_len, d_model)

# Convert to float32 once; self-attention uses the same data for Q, K and V.
embeddings_f32 = embeddings.astype(np.float32)
query = tf.convert_to_tensor(embeddings_f32)
key = tf.convert_to_tensor(embeddings_f32)
value = tf.convert_to_tensor(embeddings_f32)

# Initialize Multi-Head Attention
multi_head_attention = MultiHeadAttention(num_heads=NUM_HEADS, d_model=D_MODEL)

# Get the attention output
attention_output = multi_head_attention(query, key, value)

# Sanity-check and print the output shape
assert tuple(attention_output.shape) == (len(sentences), SEQ_LEN, D_MODEL)
print(attention_output.shape)  # Expected output: (batch_size, seq_len_q, d_model)

(3, 10, 64)


### Explanation of the Code

Scaled Dot-Product Attention: The core attention mechanism calculates the attention score by taking the dot product of the query and key tensors. The scores are then scaled (by the square root of the dimension of the keys) and passed through a softmax function to produce attention weights. Finally, these weights are used to compute the weighted sum of the values.

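Multi-Head Attention: The layer is initialized with the number of attention heads (`num_heads`) and the model dimension (`d_model`). `d_model` should be divisible by `num_heads`, because each head works on `d_model // num_heads` features.
Each input (query, key, value) is projected by its own dense layer into a `d_model`-dimensional space.
The split_heads function reshapes the projected tensors to `(batch_size, num_heads, seq_len, depth)` so that all attention heads are processed in parallel.
The output of the scaled dot-product attention is transposed and reshaped back to `(batch_size, seq_len_q, d_model)` and then passed through a final dense layer.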

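Example Sentences: The example sentences are not actually tokenized here; only their count is used as the batch size, and random embedding vectors of shape `(batch_size, seq_len, d_model)` stand in for real token embeddings (for demonstration). In a real scenario, you would tokenize the sentences and use pre-trained embeddings or trained word vectors.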
Model Execution: By calling the multi_head_attention layer, you can see the resulting shape of the output tensor.