# INTRO
- The goal of this notebook is to explore the attention mechanism using a generated numerical dataset.
- The dataset contains 1D vectors, and the model's task is to 'predict' the minimum value of the input vector.
- Attention exploration: the attention score is expected to be highest at the index of the minimum value in the input vector.


- Based on:
  - https://github.com/thomlake/pytorch-attention/tree/master
  - https://colab.research.google.com/drive/1NwwPKYlyOIxeRI1BBUMZ-uFIApyZxARy?usp=sharing#scrollTo=3SqrOPZBKR0r

  Please share feedback with: @piotrekgrl

In [1]:
import math

import torch
from torch import nn
import numpy as np

In [2]:
# Set seed for reproducibility.
# Both RNGs are seeded: NumPy's global RNG (used to draw the synthetic data)
# and PyTorch's global RNG. torch.manual_seed() returns the Generator object,
# which is why this cell echoes a `torch._C.Generator` as its output.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7a27a8a8ae30>

# DATASET

In [3]:
# Static query for this task: we always ask the same question
# ("what is the minimum of the given array?"), so a single fixed float32
# vector is tiled across every sample of every minibatch.
QUERY = np.array([0.42], dtype=np.float32)

In [4]:
# Size of the last (feature) axis of every context element: contexts are
# generated with shape (batch, length, DATA_DIM), so 1 means scalar elements.
DATA_DIM = 1

In [5]:
class Data(object):
    """Generator of synthetic "find the minimum" minibatches.

    Args:
        query: NumPy array of shape ``(dim,)`` used as the (static) query for
            every sample.
        dim: Size of the feature axis of each context element.
    """

    def __init__(self, query, dim):
        self.query = query
        self.dim = dim

    def create_minibatches(self, n, m, length):
        """Create ``n`` minibatches of ``m`` random sequences each.

        Context values are drawn from a standard normal distribution using
        NumPy's global RNG (seed it beforehand for reproducibility).

        Args:
            n: Number of minibatches to generate.
            m: Number of samples per minibatch (batch size).
            length: Number of elements in each context sequence.

        Returns:
            A list of ``n`` tuples ``(query, context, target, target_indices)``:

            * ``query``   -- float tensor, shape ``(m, 1, dim)``.
            * ``context`` -- float32 tensor, shape ``(m, length, dim)``.
            * ``target``  -- float32 tensor, shape ``(m, 1, dim)``; the minimum
              of ``context`` along the sequence axis.
            * ``target_indices`` -- NumPy integer array with the argmin along
              the sequence axis; shape ``(m,)`` when ``dim == 1``, otherwise
              ``(m, dim)``.
        """
        minibatches = []
        for _ in range(n):
            context = np.random.normal(0, 1, (m, length, self.dim)).astype(np.float32)

            # Argmin along the sequence axis, kept per feature: shape (m, dim).
            # A bare .squeeze() here would also drop the batch axis when m == 1.
            target_indices = np.argmin(context, axis=1)

            # Gather the minima while keeping a length-1 sequence axis so the
            # target lines up with the model output: shape (m, 1, dim).
            target = np.take_along_axis(context, target_indices[:, None, :], axis=1)

            if self.dim == 1:
                # Scalar elements: expose one index per sample, shape (m,).
                target_indices = target_indices[:, 0]

            query = np.tile(self.query, (m, 1, 1))

            query = torch.tensor(query)
            context = torch.tensor(context)
            target = torch.tensor(target)
            minibatches.append((query, context, target, target_indices))
        return minibatches

In [6]:
# Training parameters
batch_size = 8  # samples per minibatch
n_train, n_valid = 200, 100  # number of minibatches (not samples) per split
length = 16  # number of elements in each context sequence

In [7]:
# Data generator bound to the static query and the element feature size.
data = Data(QUERY, DATA_DIM)

In [8]:
# Generate data.
# Each entry of the resulting lists is one minibatch tuple:
# (query, context, target, target_indices).
# Both splits are drawn consecutively from the same seeded NumPy RNG stream.
train_batches = data.create_minibatches(n_train, batch_size, length)
valid_batches = data.create_minibatches(n_valid, batch_size, length)

## Single data point overview

In [9]:
# QUERY (STATIC)
# Indexing: first minibatch -> tuple slot 0 (query) -> first sample.
train_batches[0][0][0]

tensor([[0.4200]])

In [10]:
# CONTEXT
# Indexing: first minibatch -> tuple slot 1 (context) -> first sample.
train_batches[0][1][0]

tensor([[ 0.4967],
        [-0.1383],
        [ 0.6477],
        [ 1.5230],
        [-0.2342],
        [-0.2341],
        [ 1.5792],
        [ 0.7674],
        [-0.4695],
        [ 0.5426],
        [-0.4634],
        [-0.4657],
        [ 0.2420],
        [-1.9133],
        [-1.7249],
        [-0.5623]])

In [11]:
# TARGET (min(CONTEXT))
# Indexing: first minibatch -> tuple slot 2 (target) -> first sample.
train_batches[0][2][0]

tensor([[-1.9133]])

In [12]:
# TARGET INDEX (argmin(CONTEXT))
# Indexing: first minibatch -> tuple slot 3 (target_indices) -> first sample.
train_batches[0][3][0]

13

# ATTENTION

In [13]:
class SimpleAttentionNet(nn.Module):
    """Single-head scaled dot-product attention followed by a linear read-out.

    A single query attends over a sequence of context elements; the attended
    value vector is mapped to one scalar prediction per sample.

    Args:
        data_dim: Size of the feature axis of the query and context elements.
        hidden_dim: Size of the Q/K/V projections (also used as ``d_k`` for
            score scaling).
    """

    def __init__(self, data_dim, hidden_dim):
        super().__init__()

        # Q Linear Layer
        self.W_q = nn.Linear(data_dim, hidden_dim)

        # K Linear Layer
        self.W_k = nn.Linear(data_dim, hidden_dim)
        self.d_k = hidden_dim

        # V Linear Layer
        self.W_v = nn.Linear(data_dim, hidden_dim)

        # Output Linear Layer
        self.W_o = nn.Linear(hidden_dim, 1)

    def forward(self, q, x, return_weight=False):
        """Run attention of ``q`` over ``x``.

        Args:
            q: Query tensor of shape ``(batch, 1, data_dim)``.
            x: Context tensor of shape ``(batch, length, data_dim)``.
            return_weight: If true, also return the attention weights.

        Returns:
            ``output`` of shape ``(batch, 1, 1)``; when ``return_weight`` is
            true, a tuple ``(output, attn_probs)`` where ``attn_probs`` has
            shape ``(batch, length)`` and sums to 1 along the last axis.
        """
        # Compute Q, K, V representations.
        # nn.Linear acts on the last axis, so q can be projected directly:
        # (batch, 1, data_dim) -> (batch, 1, hidden_dim). A squeeze/unsqueeze
        # round-trip would only be shape-correct when data_dim == 1.
        Q = self.W_q(q)
        K = self.W_k(x)
        V = self.W_v(x)

        # source: MultiHeadAttention.scaled_dot_product_attention()
        # https://colab.research.google.com/drive/1NwwPKYlyOIxeRI1BBUMZ-uFIApyZxARy?usp=sharing#scrollTo=3SqrOPZBKR0r
        # START
        # Compute attention scores and weights: (batch, 1, length).
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Normalise over the sequence axis so the weights of each sample sum to 1.
        attn_probs = torch.softmax(attn_scores, dim=-1)
        # Weighted sum of the values: (batch, 1, hidden_dim).
        output = torch.matmul(attn_probs, V)
        # END

        # Final output
        output = self.W_o(output)

        # Return output and optionally attn_weights (query axis dropped).
        if return_weight:
            return output, attn_probs.squeeze(1)
        return output

In [14]:
# Initialize model, optimizer, and loss function
net = SimpleAttentionNet(data_dim=DATA_DIM, hidden_dim=5)
# Adam updates every parameter of the network.
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
# Regression objective: the prediction is compared with the true minimum value.
mse_loss = nn.MSELoss()

In [15]:
def count_parameters(model):
    """Return the total number of trainable scalar parameters in ``model``.

    Only parameters with ``requires_grad`` set are counted; frozen ones are
    skipped.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [16]:
# With data_dim=1 and hidden_dim=5: three projections (W_q, W_k, W_v) with
# 5 weights + 5 biases each, plus W_o with 5 weights + 1 bias -> 36.
count_parameters(net)

36

In [17]:
# Training loop
# Every epoch iterates over the same pre-generated minibatches in the same
# order; no validation pass is performed here.
epochs = 50
for epoch in range(epochs):
    train_loss = 0  # running sum of per-batch losses for this epoch
    # The 4th tuple element (target_indices) is not needed for training.
    for query, context, target, _ in train_batches:
        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        output = net(query, context)
        loss = mse_loss(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()  # .item() converts the loss tensor to a Python float
    # Report the mean of the per-batch losses for the epoch.
    print(f"Epoch {epoch + 1}, Loss: {train_loss / len(train_batches):.4f}")

Epoch 1, Loss: 1.9570
Epoch 2, Loss: 0.5266
Epoch 3, Loss: 0.4744
Epoch 4, Loss: 0.4431
Epoch 5, Loss: 0.4156
Epoch 6, Loss: 0.3920
Epoch 7, Loss: 0.3720
Epoch 8, Loss: 0.3554
Epoch 9, Loss: 0.3416
Epoch 10, Loss: 0.3304
Epoch 11, Loss: 0.3217
Epoch 12, Loss: 0.3152
Epoch 13, Loss: 0.3108
Epoch 14, Loss: 0.3079
Epoch 15, Loss: 0.3062
Epoch 16, Loss: 0.3053
Epoch 17, Loss: 0.3048
Epoch 18, Loss: 0.3045
Epoch 19, Loss: 0.3044
Epoch 20, Loss: 0.3043
Epoch 21, Loss: 0.3041
Epoch 22, Loss: 0.3040
Epoch 23, Loss: 0.3038
Epoch 24, Loss: 0.3035
Epoch 25, Loss: 0.3030
Epoch 26, Loss: 0.3020
Epoch 27, Loss: 0.2994
Epoch 28, Loss: 0.2892
Epoch 29, Loss: 0.2297
Epoch 30, Loss: 0.0943
Epoch 31, Loss: 0.0300
Epoch 32, Loss: 0.0158
Epoch 33, Loss: 0.0116
Epoch 34, Loss: 0.0093
Epoch 35, Loss: 0.0077
Epoch 36, Loss: 0.0066
Epoch 37, Loss: 0.0057
Epoch 38, Loss: 0.0050
Epoch 39, Loss: 0.0045
Epoch 40, Loss: 0.0040
Epoch 41, Loss: 0.0036
Epoch 42, Loss: 0.0033
Epoch 43, Loss: 0.0030
Epoch 44, Loss: 0.00

In [18]:
# Example: run the trained network on the first validation minibatch.
query, context, target, target_indices = valid_batches[0]

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    # return_weight=True also yields the attention weights for inspection.
    output, weight = net(query, context, return_weight=True)

In [27]:
np.argmin(context[2].numpy()) # index of the minimum element of sample 2's input

11

In [28]:
np.argmax(weight[2].numpy()) # index of the largest attention weight for sample 2

11

In [29]:
output[2] # predicted value for sample 2

tensor([[-1.6838]])

In [30]:
target[2] # ground-truth target for sample 2 (the true minimum of its context)

tensor([[-1.7276]])

In [None]:
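## Looks good? For sample 2, the index of the largest attention weight matches the argmin of the input, and the predicted value is close to the target.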