The most recent version of this notebook is available at https://github.com/nadiinchi/dl_labs/blob/master/lab_attention.ipynb

In [None]:
import torch
from torch import nn
from torch import optim
import numpy as np
import math

In [None]:
class Attention(nn.Module):
    """
    Base class for attention mechanisms.

    Subclasses provide get_scores; attend converts the scores into softmax
    weights over objects and returns the weighted sum of object features.
    """
    def __init__(self):
        super().__init__()
        # Detached weights of the most recent attend call (None before the first call).
        self.last_weights = None

    def get_scores(self, features, queries):
        """
        features: [batch_size x num_objects x obj_feature_dim]
        queries:  [batch_size x num_queries x query_feature_dim]
        Returns matrix of scores with shape [batch_size x num_queries x num_objects].
        Must be overridden by subclasses.
        """
        raise NotImplementedError()

    def attend(self, features, queries, mask=None):
        """
        features:        [batch_size x num_objects x obj_feature_dim]
        queries:         [batch_size x num_queries x query_feature_dim]
        mask, optional:  [batch_size x num_queries x num_objects]
                         (or any shape broadcastable to it)
        Returns matrix of features for queries with shape [batch_size x num_queries x obj_feature_dim].
        If mask is not None, the weights of the (query, object) pairs where mask
        is True / non-zero are set to zero and the remaining weights of each query
        are normalized to sum to one. A query with every object masked gets
        all-zero weights and therefore a zero output vector.
        Saves detached weights as self.last_weights for further visualization.
        """
        scores = self.get_scores(features, queries)

        if mask is not None:
            mask = mask.to(device=scores.device, dtype=torch.bool)
            # -inf scores receive exactly zero probability after softmax.
            scores = scores.masked_fill(mask, float('-inf'))

        weights = torch.softmax(scores, dim=-1)

        if mask is not None:
            # Softmax over a row of only -inf yields NaN; overwrite masked entries with exact zeros.
            weights = weights.masked_fill(mask, 0.0)

        self.last_weights = weights.detach()
        result = torch.matmul(weights, features)
        return result

In [None]:
class AdditiveAttention(Attention):
    """
    Bahdanau et al. "Neural Machine Translation by Jointly Learning to Align and Translate", 2014.

    score(object, query) = v^T tanh(W_obj object + W_query query + b)
    """
    def __init__(self, obj_feature_dim, query_feature_dim, hidden_dim):
        """
        obj_feature_dim   - dimensionality of attention object features vector
        query_feature_dim - dimensionality of attention query vector
        hidden_dim        - dimensionality of latent vectors of attention
        """
        super().__init__()
        # A single bias (on the query projection) is enough: the two projections are summed.
        self.obj_projection = nn.Linear(obj_feature_dim, hidden_dim, bias=False)
        self.query_projection = nn.Linear(query_feature_dim, hidden_dim)
        # The vector v that maps every latent vector to a scalar score.
        self.score_projection = nn.Linear(hidden_dim, 1, bias=False)

    def get_scores(self, features, queries):
        """
        features: [batch_size x num_objects x obj_feature_dim]
        queries:  [batch_size x num_queries x query_feature_dim]
        Returns matrix of scores with shape [batch_size x num_queries x num_objects].
        """
        # [batch_size x 1 x num_objects x hidden_dim]
        projected_objects = self.obj_projection(features).unsqueeze(1)
        # [batch_size x num_queries x 1 x hidden_dim]
        projected_queries = self.query_projection(queries).unsqueeze(2)
        # Broadcasting pairs every query with every object.
        hidden = torch.tanh(projected_queries + projected_objects)
        result = self.score_projection(hidden).squeeze(-1)
        return result

In [None]:
class MultiplicativeAttention(Attention):
    """
    Luong et al. "Effective approaches to attention-based neural machine translation", 2015.

    The constructor takes no dimensions, so this is the parameter-free
    dot-product score: score(object, query) = <query, object>.
    """
    def __init__(self):
        super().__init__()

    def get_scores(self, features, queries):
        """
        features: [batch_size x num_objects x feature_dim]
        queries:  [batch_size x num_queries x feature_dim]
        Returns matrix of scores with shape [batch_size x num_queries x num_objects].
        """
        # [B x Q x D] @ [B x D x N] -> [B x Q x N]
        result = torch.matmul(queries, features.transpose(1, 2))
        return result

In [None]:
class ScaledDotProductAttention(Attention):
    """
    Vaswani et al. "Attention Is All You Need", 2017.

    score(object, query) = <query, object> / sqrt(feature_dim)
    """
    def __init__(self):
        super().__init__()

    def get_scores(self, features, queries):
        """
        features: [batch_size x num_objects x feature_dim]
        queries:  [batch_size x num_queries x feature_dim]
        Returns matrix of scores with shape [batch_size x num_queries x num_objects].
        """
        feature_dim = features.shape[-1]
        # Dividing by sqrt(feature_dim) keeps the magnitude of the scores
        # independent of the feature dimensionality.
        dot_products = torch.matmul(queries, features.transpose(1, 2))
        result = dot_products / math.sqrt(feature_dim)
        return result

In [None]:
# time to check that your attention works
# your code here

In [None]:
def perm_generator(batch_size, perm_size):
    """
    Generates batch of batch_size objects.
    Each object consists of two random permutations with length perm_size.
    The target for the object is the product of its two permutations.

    Returns a pair (objects, correct_answers) of integer tensors:
    objects         - [batch_size x 2 * perm_size], the first permutation
                      followed by the second one; values are in 0..perm_size-1
    correct_answers - [batch_size x perm_size], where
                      correct_answers[b, i] = first[b, second[b, i]]
    """
    # Sorting independent uniform noise row-wise yields one uniformly random
    # permutation per row without a Python loop (and works for batch_size == 0).
    first = torch.rand(batch_size, perm_size).argsort(dim=1)
    second = torch.rand(batch_size, perm_size).argsort(dim=1)

    objects = torch.cat([first, second], dim=1)
    correct_answers = torch.gather(first, 1, second)
    return objects, correct_answers

In [None]:
# time to check your generator
# your code here

In [None]:
class PositionalEncoder(nn.Module):
    def __init__(self, dim, max_len=50, scale=10000.0, add=True):
        """
        Transforms input as described by Vaswani et al. in "Attention Is All You Need", 2017.
        dim     - dimension of positional embeddings.
        max_len - maximal length of sequence, for precomputing
        scale   - scale factor for frequency for positional embeddings
        add     - boolean, if add is False, concatenate positional embeddings with input instead of adding

        Even embedding columns hold sin(pos / scale^(2k / dim)), odd columns hold
        the matching cos; an odd dim simply gets one more sin column than cos columns.
        """
        super().__init__()

        self.dim = dim
        self.add = add
        if add:
            self.extra_output_shape = 0
        else:
            self.extra_output_shape = dim

        positions = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        # scale^(-2k / dim), computed through exp/log.
        frequencies = torch.exp(
            torch.arange(0, dim, 2, dtype=torch.float32) * (-math.log(scale) / dim)
        )
        angles = positions * frequencies

        table = torch.zeros(max_len, dim)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles[:, :dim // 2])
        # A buffer follows the module across devices but is not a trainable parameter.
        self.register_buffer('pos_embeddings', table)

    def forward(self, input):
        """
        input - [batch_size x sequence_len x features_dim]
        If self.add is True, self.dim = features_dim.
        Returns input with added or concatenated positional embeddings (depending on self.add).
        Raises ValueError if sequence_len exceeds max_len, or if self.add is True
        and features_dim differs from self.dim.
        """
        batch_size, seq_len = input.shape[0], input.shape[1]
        max_len = self.pos_embeddings.shape[0]
        if seq_len > max_len:
            raise ValueError(
                'sequence length %d exceeds max_len %d' % (seq_len, max_len)
            )

        embeddings = self.pos_embeddings[:seq_len].to(input.dtype).unsqueeze(0)

        if self.add:
            if input.shape[-1] != self.dim:
                raise ValueError(
                    'features_dim %d must be equal to dim %d when add is True'
                    % (input.shape[-1], self.dim)
                )
            result = input + embeddings
        else:
            result = torch.cat(
                [input, embeddings.expand(batch_size, -1, -1)], dim=-1
            )
        return result

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

# time to draw positional encoder
# your code here

In [None]:
def get_autoregressive_mask(size):
    """
    Returns attention mask of given size for autoregressive model.

    The result is a boolean tensor of shape [size x size] indexed as
    [query_position, object_position]. An entry is True when the object lies
    strictly after the query, i.e. when the corresponding attention weight
    must be set to zero. The tensor broadcasts over the batch dimension.
    """
    positions = torch.arange(size)
    # Row = query position, column = object position; future objects are masked.
    res = positions.unsqueeze(0) > positions.unsqueeze(1)
    return res

In [None]:
class PermMultiplier(nn.Module):
    def __init__(self, perm_len, embedding_dim, hidden_dim, attention, pos_enc, autoregressive):
        """
        perm_len       - permutation length (the input is twice longer)
        embedding_dim  - dimensionality of integer embeddings
        hidden_dim     - dimensionality of LSTM output
        attention      - Attention object
        pos_enc        - PositionalEncoder object or None
        autoregressive - boolean, if True, then model must use autoregressive mask for attention

        The attention object must accept objects of dimensionality
        embedding_dim + pos_enc.extra_output_shape and queries of dimensionality hidden_dim.
        """
        super().__init__()
        self.autoregressive = autoregressive
        self.perm_len = perm_len

        # Permutation elements are the integers 0..perm_len-1.
        self.embedding = nn.Embedding(perm_len, embedding_dim)
        self.pos_enc = pos_enc

        feature_dim = embedding_dim
        if pos_enc is not None:
            # Non-zero only when positional embeddings are concatenated rather than added.
            feature_dim += pos_enc.extra_output_shape

        self.lstm = nn.LSTM(feature_dim, hidden_dim, batch_first=True)
        self.attention = attention
        self.output_layer = nn.Linear(feature_dim, perm_len)

    def forward(self, input):
        """
        input - [batch_size x sequence_len] integer tensor (sequence_len is 2 * perm_len).

        Perform forward pass through layers:
        + get embeddings from input sequence (using both embeddings
          and positional embeddings if pos_enc is not None)
        + run LSTM on embeddings
        + use output of LSTM as an attention queries
        + attend on the embedded sequence using queries (note autoregressive flag)
        + make final linear transformation to obtain logits

        Only the last perm_len positions act as queries, one per element of the
        answer. Returns logits with shape [batch_size x perm_len x perm_len].
        """
        embedded = self.embedding(input)
        if self.pos_enc is not None:
            embedded = self.pos_enc(embedded)

        lstm_output, _ = self.lstm(embedded)

        batch_size, seq_len = input.shape[0], input.shape[1]
        first_query = max(seq_len - self.perm_len, 0)
        queries = lstm_output[:, first_query:]

        mask = None
        if self.autoregressive:
            object_positions = torch.arange(seq_len, device=input.device)
            query_positions = object_positions[first_query:]
            # True marks objects strictly after the query: their weights must be zero.
            mask = object_positions.unsqueeze(0) > query_positions.unsqueeze(1)
            mask = mask.unsqueeze(0).expand(batch_size, -1, -1)

        attended = self.attention.attend(embedded, queries, mask)
        return self.output_layer(attended)

In [None]:
# Length of a single permutation; the cells below use it for the positional
# encoder's max_len (perm_len * 2), for the model, and for the batch generator.
perm_len = 10

In [None]:
# time to set up a model
# you can check that without pos_enc model doesn't work
# not-autoregressive model can be learned easily, but it is less useful
# try to learn autoregressive model if possible
# NOTE: every `?` below is a placeholder that still has to be replaced with
# a concrete value; the cell is not valid Python until then.
pos_enc = PositionalEncoder(?, perm_len * 2, ?, ?)
attention = ?
model = PermMultiplier(perm_len, ?, ?, attention, pos_enc, ?)
# Move the model to the GPU when CUDA is available.
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
# set up optimizer
# Adam over all parameters of the model; the learning rate `?` is a
# placeholder that still has to be chosen.
gd = optim.Adam(model.parameters(), lr=?)

In [None]:
# do optimization
# avg_loss is an exponential moving average of the batch loss; it is used
# only for the progress line printed on every iteration.
avg_loss = None
# Share of the previous average that is kept on every update.
forget = 0.99
batch_size = 64
# `?` is a placeholder for the number of training iterations.
iterator = range(?)
for i in iterator:
    gd.zero_grad()
    # A fresh random batch is generated on every iteration.
    batch = perm_generator(batch_size, perm_len)
    if torch.cuda.is_available():
        batch = batch[0].cuda(), batch[1].cuda()
    # compute batch loss
    # your code here
    # NOTE: the code filled in above has to define `loss` before it is used below.
    loss.backward()
    if avg_loss is None:
        # The first iteration initializes the running average with the raw loss.
        avg_loss = float(loss)
    else:
        avg_loss = forget * avg_loss + (1 - forget) * float(loss)
    descr_str = 'Iteration %05d, loss %.5f.' % (i, avg_loss)
    # '\r' together with end='' rewrites the same output line on every iteration.
    print('\r', descr_str, end='')
    gd.step()

In [None]:
# time to check your model
# Generate a fresh batch and print its first five objects next to their targets.
batch = perm_generator(batch_size, perm_len)
if torch.cuda.is_available():
    batch = batch[0].cuda(), batch[1].cuda()
print('Input:\n', batch[0][:5])
# `?` is a placeholder; presumably the model's predictions for the same five
# objects belong here, to be compared with the targets printed below.
print('Output:\n', ?)
print('Correct:\n', batch[1][:5])

In [None]:
# visualize attention map for some object
# your code here

In [None]:
# play with model and learn something new about attention!