A simple self-attention layer with masking and multi-head support.

In [1]:
# Torch imports
import torch
from torch import nn
import torch.autograd
import torch.nn.functional as F

import numpy as np
from typing import List, Optional, Dict, Tuple

# Local imports
from utils import *


In [79]:
# Toy sizes for the shape walk-throughs below (same values the template
# functions use): batch size / items per example, entity / relation embedding
# widths, projected feature width, and the LeakyReLU negative slope.
bs, n = 4, 5
ent_emb_dims, rel_emb_dims = 3, 4
out_features = 7
alpha_leaky = 0.2

In [85]:
def self_attention_template(
    *,
    bs: int = 4,
    n: int = 5,
    ent_emb_dims: int = 3,
    rel_emb_dims: int = 4,
    out_features: int = 7,
    alpha_leaky: float = 0.2,
) -> None:
    """Walk through single-head attention pooling on random data, printing shapes.

    A random ``(bs, n, 2*ent_emb_dims + rel_emb_dims)`` tensor stands in for
    ``n`` concatenated (s, p, o) embeddings per example. It is projected to
    ``out_features``, each of the ``n`` projections is scored with a second
    linear layer followed by LeakyReLU, the scores are softmax-normalised
    across ``n``, and the projections are summed with those weights.

    Args:
        bs: Batch size of the random input.
        n: Number of items attended over per example.
        ent_emb_dims: Width of one entity embedding (used twice: s and o).
        rel_emb_dims: Width of the relation embedding.
        out_features: Output width of the first linear layer.
        alpha_leaky: Negative slope passed to ``nn.LeakyReLU``.

    Returns:
        None. The function only prints the shape of each intermediate tensor.
    """
    # Derive the input width once from the local sizes so the random input and
    # the first layer can never disagree (and no outside names are needed).
    in_features = 2 * ent_emb_dims + rel_emb_dims

    matrix = torch.randn(bs, n, in_features)  # concat s,p,o.
    print(f"shape of matrix is bs*n*emb_dim i.e {matrix.shape}")

    # passing it through layer1
    w1 = nn.Linear(in_features, out_features)
    nn.init.xavier_normal_(w1.weight.data, gain=1.414)

    c = w1(matrix)
    print(f"shape of c is {c.shape}")

    # passing it through layer2
    w2 = nn.Linear(out_features, 1)
    nn.init.xavier_normal_(w2.weight.data, gain=1.414)

    leaky_relu = nn.LeakyReLU(alpha_leaky)
    # Drop only the trailing size-1 score axis; a bare squeeze() would also
    # remove the batch or sequence axis whenever bs == 1 or n == 1.
    b = leaky_relu(w2(c)).squeeze(-1)
    print(f"shape of b is {b.shape}")

    # There will be no masking here. So simply a softmax and then multiply and sum across n.
    alphas = torch.softmax(b, dim=1)
    h = torch.sum(alphas.unsqueeze(-1) * c, dim=1)

    print(f"shape of final vector by {h.shape}")

In [86]:
# Run the single-head walk-through; it prints the tensor shape at each stage.
self_attention_template()

shape of matrix is bs*n*emb_dim i.e torch.Size([4, 5, 10])
shape of c is torch.Size([4, 5, 7])
shape of b is torch.Size([4, 5])
shape of final vector by torch.Size([4, 7])


In [116]:
def self_attention_template_multi_head(
    num_head: int,
    final_layer: bool = False,
    *,
    bs: int = 4,
    n: int = 5,
    ent_emb_dims: int = 3,
    rel_emb_dims: int = 4,
    out_features: int = 7,
    alpha_leaky: float = 0.2,
) -> None:
    """Walk through multi-head attention pooling on random data, printing shapes.

    Same pipeline as the single-head version, except the scoring layer emits
    ``num_head`` scores per item. Each head's scores are softmax-normalised
    across ``n`` and used to pool the projected items, giving a
    ``(bs, out_features, num_head)`` tensor. That tensor is then either
    flattened per example and passed through ELU, or averaged over heads.

    Args:
        num_head: Number of attention heads; must be at least 1.
        final_layer: If False, flatten the per-head outputs and apply ELU.
            If True, average the per-head outputs over the head axis.
        bs: Batch size of the random input.
        n: Number of items attended over per example.
        ent_emb_dims: Width of one entity embedding (used twice: s and o).
        rel_emb_dims: Width of the relation embedding.
        out_features: Output width of the first linear layer.
        alpha_leaky: Negative slope passed to ``nn.LeakyReLU``.

    Returns:
        None. The function only prints the shape of each intermediate tensor.

    Raises:
        ValueError: If ``num_head`` is less than 1.
    """
    if num_head < 1:
        raise ValueError(f"num_head must be >= 1, got {num_head}")

    # Derive the input width once from the local sizes so the random input and
    # the first layer can never disagree (and no outside names are needed).
    in_features = 2 * ent_emb_dims + rel_emb_dims

    matrix = torch.randn(bs, n, in_features)  # concat s,p,o.
    print(f"shape of matrix is bs*n*emb_dim i.e {matrix.shape}")

    # passing it through layer1
    w1 = nn.Linear(in_features, out_features)
    nn.init.xavier_normal_(w1.weight.data, gain=1.414)

    c = w1(matrix)
    print(f"shape of c is {c.shape}")

    # passing it through layer2
    w2 = nn.Linear(out_features, num_head)
    nn.init.xavier_normal_(w2.weight.data, gain=1.414)

    leaky_relu = nn.LeakyReLU(alpha_leaky)
    # Keep all three axes (bs, n, num_head). Squeezing here would collapse the
    # head axis when num_head == 1 and torch.bmm below needs 3-D operands.
    b = leaky_relu(w2(c))

    print(f"shape of b is {b.shape}")

    # There will be no masking here. So simply a softmax and then multiply and sum across n.
    alphas = torch.softmax(b, dim=1)
    print(f"shape of alphas is {alphas.shape}")

    # (bs, out_features, n) @ (bs, n, num_head) -> (bs, out_features, num_head)
    h = torch.bmm(c.transpose(1, 2), alphas)
    if not final_layer:
        # Flatten the (out_features, num_head) block of each example into one row.
        h = h.view(bs, -1)
        h = F.elu(h)
    else:
        # Average over the head axis.
        h = torch.mean(h, dim=-1)

    print(f"shape of final vector by {h.shape}")

In [119]:
# Run the multi-head walk-through with 8 heads, taking the non-final-layer branch.
self_attention_template_multi_head(num_head=8, final_layer=False)

shape of matrix is bs*n*emb_dim i.e torch.Size([4, 5, 10])
shape of c is torch.Size([4, 5, 7])
shape of b is torch.Size([4, 5, 8])
shape of alphas is torch.Size([4, 5, 8])
shape of final vector by torch.Size([4, 56])
