In [194]:
import torch
from torch.autograd import Function
#import torch.nn

# Inherit from Function
class LinearFunction(Function):
    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        print('grad_output',grad_output.shape)
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
            #grad_input = grad_output.mm(weight.t())
            print('grad_input',grad_input.shape)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
            print('grad_weight',grad_weight.shape)
        if bias is not None and ctx.needs_input_grad[2]:
            
            grad_bias = grad_output.sum(0).squeeze(0)

        return grad_input, grad_weight, grad_bias
    
output_features = 4
input_features = 8
x = (torch.ones((20,input_features))+1)#.unsqueeze(0)
w = (torch.ones((output_features,input_features))+2)#.unsqueeze(0)
b = (torch.ones((1))*13).squeeze()
print(b.shape)
x.requires_grad_(True)
w.requires_grad_(True)
b.requires_grad_(True)
print(x.shape)
print(w.shape)
y = LinearFunction.apply(x,w)#,b)
#y = y.mm(x)
print('y',y.shape)
t = torch.ones_like(y)#(y.size())
print(t.shape)
y.backward(t)

print('grad x',x.grad)
print('grad w',w.grad)
print('grad b',b.grad)
print('-'*50)
print('grad x',1*w)
print('grad w',1*x)

torch.Size([])
torch.Size([20, 8])
torch.Size([4, 8])
y torch.Size([20, 4])
torch.Size([20, 4])
grad_output torch.Size([20, 4])
grad_input torch.Size([20, 8])
grad_weight torch.Size([4, 8])
grad x tensor([[12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
        [12., 12., 12., 12., 12., 12., 12., 12.],
   

In [1]:
import torch
from torch.autograd import Function
import torch.nn.functional as F
import torch.nn as nn
#import torch.nn

class Att_op(Function):
    @staticmethod
    def forward(ctx, v, q, weigths): # bxn x bxn nxn
        ctx.save_for_backward(v,q,weigths)
        #print('weigths',weigths.shape)
        #print('v',v.shape)
        vv = torch.mm(v, weigths)
        #print('vv',vv.shape)
        out = torch.bmm(vv.unsqueeze(1),q.unsqueeze(-1)).squeeze() # bx1xn x bxnx1
        return out
    
    def backward(ctx, grad_output):
        return grad_output, grad_output, grad_output
    
class Att_F(Function):
    @staticmethod
    def forward(ctx, values, query, weigths): # bxtxn x bxn n
        ctx.save_for_backward(values, query)
        b = values.shape[0]
        time_steps = values.shape[1]
        output = torch.zeros((b,time_steps,1))
        for i in range(time_steps):
            v = values[:,i,:]
            q = query
            out = Att_op.apply(v,q,weigths)
            #print('v',v.shape) # bxn
            #print('query',query.shape) # bxn
            #print('out',out.shape) # bx1
            output[:,i,0] = out
        output = F.softmax(output,dim=1)
        return output # bxtx1
    
    def backward(ctx, grad_output):
        
        return grad_output, grad_output, grad_output
        

b = 3
input_features = 2
values = (torch.ones((b,3,input_features))*1)#.unsqueeze(0)
weigths = nn.Parameter(torch.ones((input_features,input_features)))
#weigths = (torch.ones((input_features,input_features))*1)#.unsqueeze(0)
print('weigths',weigths)

values[0,0,0] = -1
values[1,0,0] = -5
print(values.shape)
print(values)
query = (torch.ones((b,input_features))*0.5)#.unsqueeze(0)
print(query.shape)
print(query)
att = Att_F.apply(values, query, weigths)
print('att',att.shape)
print(att[1])
att = att.sum()
grads = torch.ones(input_features)
print('grads',grads.shape)
#att.backward(grads)
att.backward()
print('-'*50)
print(weigths.grad)

weigths Parameter containing:
tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
torch.Size([3, 3, 2])
tensor([[[-1.,  1.],
         [ 1.,  1.],
         [ 1.,  1.]],

        [[-5.,  1.],
         [ 1.,  1.],
         [ 1.,  1.]],

        [[ 1.,  1.],
         [ 1.,  1.],
         [ 1.,  1.]]])
torch.Size([3, 2])
tensor([[0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000]])
att torch.Size([3, 3, 1])
tensor([[0.0012],
        [0.4994],
        [0.4994]], grad_fn=<SelectBackward>)
grads torch.Size([2])


RuntimeError: Function Att_FBackward returned an invalid gradient at index 2 - got [3, 3, 1] but expected shape compatible with [2, 2]

In [1]:
import torch
from torch.autograd import Function
import torch.nn.functional as F
import torch.nn as nn

class MultiHeadAttentionContext(nn.Module):
    def __init__(self, input_features, output_features, head_size):
        super(MultiHeadAttentionContext, self).__init__()
        self.input_features = input_features
        self.output_features = output_features
        self.head_size = head_size
        
        self.concat_size = input_features*head_size
        self.v_linear = nn.ModuleList([nn.Linear(input_features,input_features) for i in range(head_size)])
        self.k_linear = nn.ModuleList([nn.Linear(input_features,input_features) for i in range(head_size)])
        self.q_linear = nn.ModuleList([nn.Linear(input_features,input_features) for i in range(head_size)])
        self.concat_linear = nn.Linear(self.concat_size,output_features)

    def forward(self, values, query): # txn n
        b = values.shape[0]
        #tt = values.shape[1]
        multihead = torch.zeros(b,self.input_features,self.head_size)#.unsqueeze(-1) # txnxm
        #print('multihead',multihead.shape)
        for i in range(head_size):
            vl = self.v_linear[i](values)
            kl = self.k_linear[i](values)
            ql = self.q_linear[i](query)
            #print(i,'-'*20);print('vl',vl.shape);print('ql',ql.shape)
            print(i,'-'*20);print('vl',vl.max());print('ql',ql.max())
            qk = torch.bmm(kl,ql.unsqueeze(-1)) # n * txn = t
            #print('qk',qk.shape)
            soft = F.softmax(qk,dim=1).expand_as(vl)
            #print('soft',soft.shape)
            #print('soft',soft[2,:])
            #print(vl[0,:,:2]) # b,t,n
            #print(soft[0,:]) # b,t
            att = (vl*soft).sum(dim=1)
            #att = vl[:,:,:]*soft[:,:,None]
            #print('att',att.shape)
            #print('att',att[0,:,:2])
            multihead[:,:,i] = att
        #print('multihead',multihead[0])
        multihead = multihead.view(b,-1)
        #print('multihead',multihead)
        out = self.concat_linear(multihead)
        #print('out',out.shape)
        return out
    
class AttentionRNN(nn.Module):
    def __init__(self, input_features, output_features, head_size):
        super(AttentionRNN, self).__init__()
        self.input_features = input_features
        self.output_features = output_features
        self.head_size = head_size
        self.attentionContext = MultiHeadAttentionContext(input_features, output_features, head_size)
        
    def forward(self, values): # txn n
        b = values.shape[0]
        time_length = values.shape[1]
        out = torch.zeros((b,time_length,self.output_features))
        for i in range(time_length):
            actual_query = values[:,i,:]
            #print('actual_query',actual_query.shape)
            #context = self.att(values,actual_query)
            values_ = values[:,:i+1,:]
            #print('values_',values_.shape)
            context = self.attentionContext(values_,actual_query)
            #print('context',context.shape)
            out[:,i,:] = context
            
        return out
    
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

head_size=30
input_features = 4
output_features = 10
attention1 = AttentionRNN(input_features, output_features, head_size)
attention2 = AttentionRNN(output_features, output_features, head_size)
final_linear = nn.Linear(30,1)
#attention = AttentionRNN(input_features, output_features, head_size)
print('attention',count_parameters(attention))
b = 2
t = 3
x = torch.rand((b,t,input_features))
#q = torch.rand((b,input_features))
print(x.shape)
#print(q.shape)
x = attention1(x)
x = attention2(x)
print(x[0])
x = x.view(b,-1)
print(x.shape)
x = final_linear(x)
print(x.shape)
x = x[0]
x.backward()
grad = attention2.attentionContext.concat_linear.weight.grad
print(grad.shape)
print(grad)

NameError: name 'attention' is not defined

In [69]:
input_features = 4
output_features = 10
l = nn.Linear(input_features, output_features)
l2 = nn.Linear(output_features, 1)
b = 2
x = torch.rand((b,input_features))
print(x[0])
print(x[1])
#q = torch.rand((b,input_features))
print(x.shape)
#print(q.shape)
x = l(x)
x = l2(x)
print(x.shape)
x = x[0]
x.backward()
grad = l.weight.grad
print(grad.shape)
print(grad)

tensor([0.9371, 0.7756, 0.9431, 0.5703])
tensor([0.9754, 0.2133, 0.1869, 0.3648])
torch.Size([2, 4])
torch.Size([2, 1])
torch.Size([10, 4])
tensor([[-0.1231, -0.1019, -0.1239, -0.0749],
        [ 0.2448,  0.2026,  0.2464,  0.1490],
        [ 0.0971,  0.0804,  0.0977,  0.0591],
        [ 0.0025,  0.0020,  0.0025,  0.0015],
        [-0.0542, -0.0449, -0.0546, -0.0330],
        [ 0.0961,  0.0795,  0.0967,  0.0585],
        [ 0.0491,  0.0407,  0.0495,  0.0299],
        [-0.1077, -0.0891, -0.1084, -0.0655],
        [-0.2129, -0.1762, -0.2142, -0.1296],
        [ 0.1786,  0.1478,  0.1797,  0.1087]])


In [130]:
from torch.autograd import Function

class MulConstant(Function):
    @staticmethod
    def forward(ctx, tensor, constant):
        # ctx is a context object that can be used to stash information
        # for backward computation
        ctx.constant = constant
        return tensor * constant

    @staticmethod
    def backward(ctx, grad_output):
        # We return as many input gradients as there were arguments.
        # Gradients of non-Tensor arguments to forward must be None.
        return grad_output * ctx.constant, None

t = torch.ones((2,2))
t[0,0] = 2
t.requires_grad_(True)
y = MulConstant.apply(t,8)
y2 = (y**2).sum()
print(y2)
y2.backward()
print(t.grad)
print(1*(y[0]*2)*8)

tensor(448., grad_fn=<SumBackward0>)
tensor([[256., 128.],
        [128., 128.]])
tensor([256., 128.], grad_fn=<MulBackward0>)


In [12]:
t = torch.zeros(8)
t2 = torch.zeros((5,8))
print(t.shape)
t = torch.unsqueeze(t,0)
print(t.shape)
t = t.expand_as(t2)
print(t.shape)

torch.Size([8])
torch.Size([1, 8])
torch.Size([5, 8])


In [42]:
a = torch.tensor([3.0, 2.0], requires_grad=True)
b = torch.tensor([4.0, 7.0])
ab_sum = a + b
print(ab_sum)
ab_res = (ab_sum*8).sum()
ab_res.backward()
print(ab_res)
print('grad',a.grad)
print('grad',ab_res.grad)

tensor([7., 9.], grad_fn=<AddBackward0>)
tensor(128., grad_fn=<SumBackward0>)
grad tensor([8., 8.])
grad None


In [78]:
a = torch.randn((2,2), requires_grad=True)
a = torch.randn((1), requires_grad=True)
print('a',a)
a2 = a**2
aa = torch.log(a2)
#aa = a2/3
b = aa.sum()
print('b:',b)
b.backward()
print('grad:',a.grad)

res = 1*(1/a2[0]w)*a[0]*2
#res = 1*1/3*a[0]*2
print(res)

a tensor([0.6115], requires_grad=True)
b: tensor(-0.9837, grad_fn=<SumBackward0>)
grad: tensor([3.2707])
tensor(3.2707, grad_fn=<MulBackward0>)


In [107]:
x = torch.randn((1,8), requires_grad=True)#.unsqueeze(0)
w = torch.randn((1,8), requires_grad=True)#.unsqueeze(0)
print('x',x.shape)
print('w',w.shape)
y = x.mm(w.t())
y2 = y**2
y3 = 10-y2
print('y',y.shape)
#y = y.sum()
print('y:',y)
y3.backward()
print('grad x:',x.grad)
print('grad w:',w.grad)
print('-'*50)
print('grad x:',1*y*2*w)
print('grad w:',1*y*2*x)

x torch.Size([1, 8])
w torch.Size([1, 8])
y torch.Size([1, 1])
y: tensor([[0.6215]], grad_fn=<MmBackward>)
grad x: tensor([[ 0.8652,  1.0976, -0.4235, -1.2149, -0.0071, -0.3101,  1.3825, -0.4072]])
grad w: tensor([[-1.8467,  0.4197,  1.4980, -1.9762,  1.1061, -1.6743,  0.6661,  2.7042]])
--------------------------------------------------
grad x: tensor([[-0.8652, -1.0976,  0.4235,  1.2149,  0.0071,  0.3101, -1.3825,  0.4072]],
       grad_fn=<MulBackward0>)
grad w: tensor([[ 1.8467, -0.4197, -1.4980,  1.9762, -1.1061,  1.6743, -0.6661, -2.7042]],
       grad_fn=<MulBackward0>)
