## Gradient computation with NumPy

In [1]:
import numpy as np 
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied elementwise.

    Parameters
    ----------
    x : scalar or numpy array
        Pre-activation value(s).

    Returns
    -------
    Value(s) in (0, 1) with the same shape as ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def sigmoid_der(x):
    """Derivative of the logistic function, s(x) * (1 - s(x)), elementwise.

    Parameters
    ----------
    x : scalar or numpy array
        Pre-activation value(s).

    Returns
    -------
    Derivative value(s) with the same shape as ``x``.
    """
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(x)
    return s * (1 - s)

def relu(x):
    """Rectified linear unit, relu(x) = max(0, x), applied elementwise.

    Parameters
    ----------
    x : scalar or numpy array
        Pre-activation value(s).

    Returns
    -------
    ``x`` with every negative entry replaced by 0; same shape as ``x``.
    """
    return np.maximum(0, x)
def relu_der(x):
    """Derivative of relu, applied elementwise: 1 where x > 0, else 0.

    relu is not differentiable at 0; this implementation uses the
    subgradient 0 there.

    Parameters
    ----------
    x : scalar or numpy array
        Pre-activation value(s).

    Returns
    -------
    Float array of 0.0 / 1.0 with the same shape as ``x``.
    """
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

def linear(W,x,beta):
    """Affine map W x + beta, returned as a column vector.

    The product ``W x`` is reshaped to a single column before ``beta``
    is added.
    """
    product_column = np.matmul(W, x).reshape((-1, 1))
    return product_column + beta

def dfXdX_func(W,x,beta,act_der):
    """Derivative of f(X_{k-1}) with respect to X_{k-1}.

    Builds diag(act_der(W x + beta)) and multiplies it by ``W``; the
    result has the same shape as ``W``.

    Parameters
    ----------
    W, x, beta : numpy arrays
        Layer weights, layer input and bias, passed on to ``linear``.
    act_der : callable
        Elementwise derivative of the activation function.
    """
    slopes = act_der(linear(W, x, beta)).ravel()
    jacobian = np.diag(slopes) @ W
    return jacobian.reshape(W.shape)

def dfXdbeta_func(W,x,beta,act_der):
    """Derivative of f(X_{k-1}) with respect to beta_k.

    Returns the diagonal matrix whose diagonal holds ``act_der``
    evaluated at the pre-activation ``W x + beta``.

    Parameters
    ----------
    W, x, beta : numpy arrays
        Layer weights, layer input and bias.
    act_der : callable
        Elementwise derivative of the activation function.
    """
    pre_activation = np.matmul(W, x) + beta
    return np.diag(act_der(pre_activation).ravel())

def dfXdWi_func(W,x,beta,act_der,i):
    """Derivative of f(X_{k-1}) with respect to row i of W_k.

    Places ``x`` (transposed) in row ``i`` of an otherwise zero matrix
    shaped like ``W`` and left-multiplies it by
    diag(act_der(W x + beta)), so only row ``i`` of the result can be
    non-zero.

    Parameters
    ----------
    W, x, beta : numpy arrays
        Layer weights, layer input and bias.
    act_der : callable
        Elementwise derivative of the activation function.
    i : int
        Index of the row of ``W`` to differentiate with respect to.
    """
    slopes = act_der(np.matmul(W, x) + beta).ravel()
    input_in_row_i = np.zeros(W.shape)
    input_in_row_i[i, :] = x.T
    return np.matmul(np.diag(slopes), input_in_row_i)


In [2]:
import timeit

# Layer widths: c[0] inputs, c[1] hidden units, c[2] outputs.
c = [2,3,1]
X0 = np.array([0.5,-0.5]).reshape(c[0],1)
W1 = np.array([[0.5,-2.5],[1.2,0.1],[-0.7,-1.2]]).reshape(c[1],c[0])
beta1 = np.array([-1.5,-1.3,2.0]).reshape(c[1],1)
W2 = np.array([-1.5, 0.2, 0.5]).reshape(c[2],c[1])
beta2 = np.array([1.0]).reshape([c[2],1])

# Parameters and activations are stored per layer, keyed by layer index.
W = {1:W1,2:W2}
beta = {1:beta1, 2:beta2}
X = {0:X0}

activation = sigmoid
activation_der = sigmoid_der
K = len(c) - 1  # number of weight layers (2 for this network)

# Forward pass: X[i] = activation(W[i] X[i-1] + beta[i])
for i in range(1,K+1):
    X[i] = activation(linear(W[i],X[i-1],beta[i]))

start = timeit.default_timer()

# compute df(X)/d(X): dfXdX[i] is the Jacobian of X[i+1] with respect to X[i]
dfXdX = {}
for i in range(K):
    dfXdX[i] = dfXdX_func(W[i+1],X[i],beta[i+1],activation_der)

# compute dl/dX by the chain rule, walking from the output back to the input.
# The seed 1/X[K] is the derivative of l = log(X[K]).
dldX = {K:1/X[K]}
for i in range(K-1,-1,-1):
    dldX[i] = np.matmul(dldX[i+1],dfXdX[i])

# compute dl/dW and dl/dbeta
dldW = {}
dldbeta = {}
for i in range(K,0,-1):
    dldbeta[i] = np.matmul(dldX[i],
                           dfXdbeta_func(W[i],X[i-1],beta[i],activation_der)
                          ).ravel()
    # One row of dl/dW[i] per row j of W[i].
    dldW[i] = np.array([np.matmul(dldX[i],
                                  dfXdWi_func(W[i],X[i-1],beta[i],activation_der,j)
                                 ).ravel()
                        for j in range(W[i].shape[0])])

print('Time: ', timeit.default_timer() - start)

print('dl/dX0: ', dldX[0])
print('dl/dX2: ', dldX[2])
print('dl/dX1: ', dldX[1])
print('dl/dW1:', dldW[1])
print('dl/dbeta1:', dldbeta[1])
print('dl/dW2:', dldW[2])
print('dl/dbeta2:', dldbeta[2])




Time:  0.02614442699996289
dl/dX0:  [[-0.05246989  0.28237541]]
dl/dX2:  [[1.46464132]]
dl/dX1:  [[-0.47585847  0.0634478   0.15861949]]
dl/dW1: [[-0.05948231  0.05948231]
 [ 0.00691248 -0.00691248]
 [ 0.0068411  -0.0068411 ]]
dl/dbeta1: [-0.11896462  0.01382496  0.01368219]
dl/dW2: [[0.15861949 0.10177702 0.28699041]]
dl/dbeta2: [0.31723898]


## Gradient computation with PyTorch

In [4]:
! pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
import torch
import numpy as np
from torch.autograd import Variable

# Layer widths: c[0] inputs, c[1] hidden units, c[2] outputs.
c = [2,3,1]

# Leaf tensors are created directly with requires_grad=True; the
# torch.autograd.Variable wrapper is deprecated and no longer needed.
# The data is shaped with NumPy first so each tensor stays a leaf
# (reshaping a requires_grad tensor would produce a non-leaf without .grad).
X0 = torch.tensor(np.array([0.5,-0.5]).reshape(c[0],1),
                  dtype=torch.float32, requires_grad=True)
W1 = torch.tensor(np.array([[0.5,-2.5],[1.2,0.1],[-0.7,-1.2]]).reshape(c[1],c[0]),
                  dtype=torch.float32, requires_grad=True)
beta1 = torch.tensor(np.array([-1.5,-1.3,2.0]).reshape(c[1],1),
                     dtype=torch.float32, requires_grad=True)
W2 = torch.tensor(np.array([-1.5, 0.2, 0.5]).reshape(c[2],c[1]),
                  dtype=torch.float32, requires_grad=True)
beta2 = torch.tensor(np.array([1.0]).reshape([c[2],1]),
                     dtype=torch.float32, requires_grad=True)

# Forward pass. X1 and X2 are intermediate (non-leaf) tensors, so
# retain_grad() is needed for their .grad to be populated by backward().
X1 = torch.sigmoid(torch.matmul(W1,X0)+beta1)
X1.retain_grad()
X2 = torch.sigmoid(torch.matmul(W2,X1)+beta2)
X2.retain_grad()
# Scalar loss l = log(X2); X2 holds a single element.
l = torch.log(X2)

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [5]:
import timeit

# Time only the backward pass through the graph built in the previous cell.
start = timeit.default_timer()
l.backward()
print('Time: ', timeit.default_timer() - start)

# Report the accumulated gradient of the loss for every tensor of interest.
for grad_label, node in (
    ('dl/dX0: ', X0),
    ('dl/dX1: ', X1),
    ('dl/dX2: ', X2),
    ('dl/dW1: ', W1),
    ('dl/dbeta1: ', beta1),
    ('dl/dW2: ', W2),
    ('dl/dbeta2: ', beta2),
):
    print(grad_label, node.grad)

Time:  0.0027164640050614253
dl/dX0:  tensor([[-0.0525],
        [ 0.2824]])
dl/dX1:  tensor([[-0.4759],
        [ 0.0634],
        [ 0.1586]])
dl/dX2:  tensor([[1.4646]])
dl/dW1:  tensor([[-0.0595,  0.0595],
        [ 0.0069, -0.0069],
        [ 0.0068, -0.0068]])
dl/dbeta1:  tensor([[-0.1190],
        [ 0.0138],
        [ 0.0137]])
dl/dW2:  tensor([[0.1586, 0.1018, 0.2870]])
dl/dbeta2:  tensor([[0.3172]])
