In [61]:
import numpy as np
import tensorflow as tf

In [62]:
def softmax(x, axis=0):
    """Compute a numerically stable softmax along one axis.

    Args:
        x: Array-like of scores (list, tuple or ``np.ndarray``).
        axis: Axis along which the outputs sum to 1. Defaults to 0, the
            axis this helper has always normalised over.

    Returns:
        ``np.ndarray`` with the same shape as ``x`` whose entries along
        ``axis`` are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift by the maximum of each slice rather than the global maximum:
    # with a global shift, a slice far below that maximum underflows to all
    # zeros in exp() and the normalisation becomes 0/0 = NaN.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    # keepdims makes the denominator broadcast correctly for any axis.
    return e_x / e_x.sum(axis=axis, keepdims=True)

Cross attention is an attention mechanism that mixes two different embedding sequences.

Note that:
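 - The two sequences must have the same dimension once projected: the query and key projections must share a size, while the raw embeddings may differ (6 vs. 3 features in the example below).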
 - The two sequences can be of different modalities (e.g. text, image, sound).
 - One of the sequences defines the output length, as it plays the role of the query input.
 - The other sequence provides the key and value inputs.
 
![selfatt](https://i.imgur.com/ikt6Lfi.png)

In [71]:
# References:
# https://vaclavkosar.com/ml/cross-attention-in-transformer-architecture
# https://colab.research.google.com/github/mrm8488/shared_colab_notebooks/blob/master/basic_self_attention_.ipynb

# Query-side sequence: a single element with 6 features (1x6).
restaurant = tf.convert_to_tensor(
    np.array([[1, 2, 1, 3, 4, 4]]),
    dtype=tf.float32,
)

# Key/value-side sequence: 5 elements with 3 features each (5x3).
review = tf.convert_to_tensor(
    np.array([
        [1, 2, 1],
        [2, 2, 2],
        [2, 0, 2],
        [0, 2, 0],
        [2, 1, 1],
    ]),
    dtype=tf.float32,
)

# Projection matrices. The projected size must be the same for q and k
# (it does not have to be for v).
Wq = tf.convert_to_tensor(
    np.array([[1, 0], [1, 1], [0, 1], [1, 1], [1, 1], [0, 1]]),  # 6x2
    dtype=tf.float32,
)
Wk = tf.convert_to_tensor(
    np.array([[1, 0], [0, 1], [1, 0]]),  # 3x2
    dtype=tf.float32,
)
Wv = tf.convert_to_tensor(
    np.array([[1, 1], [1, 1], [1, 1]]),  # 3x2
    dtype=tf.float32,
)

# Projection: the query comes from `restaurant`, keys and values from `review`.
q = tf.matmul(restaurant, Wq)
k = tf.matmul(review, Wk)
v = tf.matmul(review, Wv)

print(f"seq_a:{restaurant.shape}, seq_b:{review.shape}")
print(f"Wq:{Wq.shape}, Wk:{Wk.shape}, Wv:{Wv.shape}")
print(f"q:{q.shape}, k:{k.shape}, v:{v.shape}")

seq_a:(1, 6), seq_b:(5, 3)
Wq:(6, 2), Wk:(3, 2), Wv:(3, 2)
q:(1, 2), k:(5, 2), v:(5, 2)


In [72]:
# Dot-product score of the query against every key, normalised over the
# key axis so the weights of each query sum to 1.
att_logits = tf.matmul(q, k, transpose_b=True)
att_w = tf.nn.softmax(att_logits, axis=-1)

# Give the weights a trailing axis and the values a middle axis so they
# broadcast against each other, then scale each value row by its weight
# and sum over the key axis.
per_key_weight = tf.expand_dims(tf.transpose(att_w), axis=-1)
weighted_values = tf.expand_dims(v, axis=1) * per_key_weight
out = tf.reduce_sum(weighted_values, axis=0)

print(f"att_shape:{att_w.shape}, out_shape:{out.shape}")

att_shape:(1, 5), out_shape:(1, 2)


#### Keras

**Note:** if `output_shape` is not specified, Keras projects the attention output back to the size of the query input's last dimension.

In [73]:
# Add a leading batch axis to each sequence before handing it to the layer.
query_batch = tf.expand_dims(restaurant, 0)
context_batch = tf.expand_dims(review, 0)

layer = tf.keras.layers.MultiHeadAttention(
    num_heads=1,
    key_dim=2,
    value_dim=2,
    output_shape=None,
)

# The review sequence serves as both key and value (cross attention).
output_tensor, weights = layer(
    query=query_batch,
    key=context_batch,
    value=context_batch,
    return_attention_scores=True,
)
out = output_tensor.numpy()
att_w = weights.numpy()

# Output:
# if output_shape is None, the output shape is (B, T, E), where T is for
# target sequence shapes and E is the query input last dimension.
# Otherwise, the multi-head outputs are projected to the shape specified by
# output_shape.
print(f"att_shape:{att_w.shape}, out_shape:{out.shape}")

att_shape:(1, 1, 1, 5), out_shape:(1, 1, 6)
