## Chain Rule Tutorial in PyTorch
http://cs231n.github.io/optimization-2/

The autograd feature saves us from writing the backprop logic by hand.

In [1]:
from __future__ import print_function

import math

import numpy as np
import torch
from torch.autograd import Variable

### Gradient of product
f = x * y
x = 4; y = -3

df/dx = -3
df/dy = 4

In [2]:
# Leaf inputs of f = x * y; requires_grad=True makes autograd record df/dx and df/dy.
x, y = (Variable(torch.Tensor([value]), requires_grad=True) for value in (4, -3))

In [3]:
# Forward pass: autograd records the multiplication so it can be differentiated later.
z = x * y

In [4]:
# Forward value of the product: 4 * -3 = -12.
z

Variable containing:
-12
[torch.FloatTensor of size 1]

In [5]:
# Backpropagate from z; this fills in x.grad and y.grad.
z.backward()

In [6]:
# df/dx = y = -3
x.grad

Variable containing:
-3
[torch.FloatTensor of size 1]

In [7]:
# df/dy = x = 4
y.grad

Variable containing:
 4
[torch.FloatTensor of size 1]

### Chain Rule
f(x,y,z) = (x+y)z

q = x+y
f = qz

df/dq = z
df/dz = q
dq/dx = 1
dq/dy = 1

df/dx = df/dq . dq/dx
df/dy = df/dq . dq/dy
df/dz = q

x = -2; y = 5; z = -4

df/dx = z * 1 = -4 * 1 = -4
df/dy = z * 1 = -4 * 1 = -4
df/dz = q = (x+y) = 3


In [8]:
# Leaf inputs of f(x, y, z) = (x + y) * z; each one gets its own .grad after backward().
x, y, z = (Variable(torch.Tensor([value]), requires_grad=True) for value in (-2, 5, -4))

q = x + y  # intermediate node of the circuit
f = q * z


In [9]:
# (x + y) * z = (-2 + 5) * -4 = -12
f

Variable containing:
-12
[torch.FloatTensor of size 1]

In [10]:
# Backpropagate from f through q down to the leaves x, y and z.
f.backward()

In [11]:
# df/dx = df/dq * dq/dx = z * 1 = -4
x.grad

Variable containing:
-4
[torch.FloatTensor of size 1]

In [12]:
# df/dy = df/dq * dq/dy = z * 1 = -4
y.grad

Variable containing:
-4
[torch.FloatTensor of size 1]

In [13]:
# df/dz = q = x + y = 3
z.grad

Variable containing:
 3
[torch.FloatTensor of size 1]

### Another function example
sig(x) = 1 / ( 1 + e^-x)
dsig(x)/dx = ( 1 - sig(x)) sig(x)

In [14]:
# Manual forward/backward pass through a single sigmoid neuron, following
# Karpathy's CS231n example. The PyTorch cells further down reproduce these numbers.
# (`math` is imported in the notebook's import cell.)
w = [2, -3, -3]  # assume some random weights and data
x = [-1, -2]

# forward pass: w[2] is added on its own, i.e. it multiplies a constant input of 1
dot = w[0] * x[0] + w[1] * x[1] + w[2]
print(dot)
f = 1.0 / (1 + math.exp(-dot))  # sigmoid function

# backward pass through the neuron (backpropagation)
ddot = (1 - f) * f  # gradient on dot variable, using the sigmoid gradient derivation
print(ddot)
dx = [w[0] * ddot, w[1] * ddot]  # backprop into x
dw = [x[0] * ddot, x[1] * ddot, 1.0 * ddot]  # backprop into w
# we're done! we have the gradients on the inputs to the circuit

1
0.19661193324148185


In [15]:
# Gradient on the inputs: [w[0] * ddot, w[1] * ddot]
dx

[0.3932238664829637, -0.5898357997244456]

In [16]:
# Gradient on the weights: [x[0] * ddot, x[1] * ddot, 1.0 * ddot]
dw

[-0.19661193324148185, -0.3932238664829637, 0.19661193324148185]

###### And now the pytorch way

In [17]:
# Same weights as the manual example, stored as a 3x1 column.
w = Variable(torch.Tensor([[2], [-3], [-3]]), requires_grad=True)
w

Variable containing:
 2
-3
-3
[torch.FloatTensor of size 3x1]

In [18]:
# Inputs from the manual example plus a trailing constant 1, so that the last
# weight is simply added to the dot product as it was in the manual forward pass.
x = Variable(torch.Tensor([[-1], [-2], [1]]), requires_grad=True)
x

Variable containing:
-1
-2
 1
[torch.FloatTensor of size 3x1]

In [19]:
# torch.dot is only defined for 1-D tensors, so flatten the 3x1 columns first.
# The gradients still come back in the 3x1 shape of the leaves w and x.
dot = w.view(-1).dot(x.view(-1))

In [20]:
# 2*-1 + -3*-2 + -3*1 = 1, the same value the manual forward pass printed.
dot

Variable containing:
 1
[torch.FloatTensor of size 1]

In [21]:
# Sigmoid written out with primitive ops so autograd differentiates each step.
f = 1.0 / (1 + torch.exp(-dot))

In [22]:
# Backpropagate from the sigmoid output down to w and x.
f.backward()

In [23]:
# Matches dw from the manual backward pass.
w.grad

Variable containing:
-0.1966
-0.3932
 0.1966
[torch.FloatTensor of size 3x1]

In [24]:
# The first two entries match the manual dx; the third is the gradient on the
# constant 1 input, i.e. w[2] * ddot.
x.grad

Variable containing:
 0.3932
-0.5898
-0.5898
[torch.FloatTensor of size 3x1]

### Using sigmoid in another function
f(x,y) = (x + sig(y)) / (sig(x) + (x+y)^2)

In [25]:
# Manual forward pass. The expression is broken into single primitive steps,
# numbered (1)-(7), so that each step can be differentiated on its own later.
x, y = 3, -4  # example values

sigy = 1.0 / (1 + math.exp(-y))  # (1) sigmoid that appears in the numerator
num = x + sigy                   # (2) numerator
sigx = 1.0 / (1 + math.exp(-x))  # (3) sigmoid that appears in the denominator
xpy = x + y                      # (4)
xpysqr = xpy * xpy               # (5) (x + y) squared
den = sigx + xpysqr              # (6) denominator
invden = 1.0 / den               # (7) reciprocal, turns the division into a product
f = num * invden                 # final value
f

1.5456448841066441

In [26]:
#backpass manual
# Walks the forward pass in reverse. Each d<name> holds df/d<name>; the trailing
# #(n) tag names the forward-pass step being differentiated
# (#(8) is the final product f = num * invden).
# backprop f = num * invden
dnum = invden # gradient on numerator                             #(8)
dinvden = num                                                     #(8)
# backprop invden = 1.0 / den 
dden = (-1.0 / (den**2)) * dinvden                                #(7)
# backprop den = sigx + xpysqr
dsigx = (1) * dden                                                #(6)
dxpysqr = (1) * dden                                              #(6)
# backprop xpysqr = xpy**2
dxpy = (2 * xpy) * dxpysqr                                        #(5)
# backprop xpy = x + y
dx = (1) * dxpy                                                   #(4)
dy = (1) * dxpy                                                   #(4)
# backprop sigx = 1.0 / (1 + math.exp(-x))
# x also feeds sigx and num, and y also feeds sigy, so their gradients are
# accumulated with += instead of being overwritten.
dx += ((1 - sigx) * sigx) * dsigx # Notice += !! contributions from every branch add up  #(3)
# backprop num = x + sigy
dx += (1) * dnum                                                  #(2)
dsigy = (1) * dnum                                                #(2)
# backprop sigy = 1.0 / (1 + math.exp(-y))
dy += ((1 - sigy) * sigy) * dsigy                                 #(1)
# done! phew
# display df/dy
dy

1.5922327514838093

###### And now the pytorch way

In [27]:
# Same example values as the manual version (x = 3, y = -4), now tracked by autograd.
x, y = (Variable(torch.Tensor([value]), requires_grad=True) for value in (3, -4))

In [28]:
# (1) sigmoid that appears in the numerator
sigy = 1.0 / (1 + torch.exp(-y))
sigy

Variable containing:
1.00000e-02 *
  1.7986
[torch.FloatTensor of size 1]

In [29]:
# (2) numerator
num = x + sigy
num

Variable containing:
 3.0180
[torch.FloatTensor of size 1]

In [30]:
# (3) sigmoid that appears in the denominator
sigx = 1.0 / (1 + torch.exp(-x))
sigx

Variable containing:
 0.9526
[torch.FloatTensor of size 1]

In [31]:
# (4) and (5): (x + y) squared, the second term of the denominator
xpy = x + y
xpysqr = xpy**2
xpysqr

Variable containing:
 1
[torch.FloatTensor of size 1]

In [32]:
# (6) denominator
den = sigx + xpysqr
den

Variable containing:
 1.9526
[torch.FloatTensor of size 1]

In [33]:
# (7) reciprocal of the denominator, so the final division becomes a product
invden = 1.0 / den

In [34]:
# Final value; agrees with the manual forward pass (1.5456...).
f = num * invden
f

Variable containing:
 1.5456
[torch.FloatTensor of size 1]

In [35]:
# One call replaces the whole hand-written backward pass.
f.backward()

In [36]:
# Same value as the manually computed dy (1.5922...).
y.grad

Variable containing:
 1.5922
[torch.FloatTensor of size 1]

In [37]:
# Sum of the contributions from the three places x is used: num, sigx and xpy.
x.grad

Variable containing:
 2.0596
[torch.FloatTensor of size 1]

#### Gradients for vectorized operations

In [39]:
# Random operands for D = W.dot(X), plus a random stand-in for the gradient that
# would arrive at D from the rest of the circuit.
# A locally seeded generator makes every re-run produce the same matrices without
# touching NumPy's global random state. (`np` is imported in the notebook's import cell.)
rng = np.random.RandomState(0)
W = rng.randn(5, 10)
X = rng.randn(10, 3)
D = W.dot(X)              # 5x3
dD = rng.randn(*D.shape)  # same shape as D

In [40]:
# Manual backward pass for D = W.dot(X), with dD as the upstream gradient on D.
# The shapes pin down the formulas: dW must be 5x10 like W, dX must be 10x3 like X.
dX = np.dot(W.T, dD)  # (10x5) . (5x3)
dW = np.dot(dD, X.T)  # (5x3) . (3x10)
dW

array([[ 0.77600839, -0.14624223, -0.87395147,  0.20161769, -0.03936833,
        -0.87968948, -0.13980176,  0.31699298, -1.49537203, -0.7182259 ],
       [-0.99598126,  0.61441898,  1.328939  ,  0.14102044,  0.22313748,
         1.25579846,  0.27959311, -1.87007497,  1.96830334,  1.44868397],
       [-0.69009943, -0.25506974, -0.28325058,  0.39659129,  1.29403361,
         1.01380357, -1.17872413, -1.22622247, -2.24024926,  0.70729708],
       [ 0.4898205 , -0.01406694, -0.78918575,  0.49608004,  0.45314633,
        -0.42290229, -0.45244888, -0.78274063, -2.04722749, -0.18509468],
       [ 0.77196357, -0.0665504 , -0.29427519, -0.30117567, -0.87676367,
        -1.06424208,  0.6247388 ,  1.43668151,  0.68842805, -0.95141983]])

###### And now the pytorch way

In [41]:
# W is 5x10 and X is 10x3 (the NumPy arrays created above), so Dpt is 5x3.
# torch.from_numpy keeps the arrays' float64 dtype, hence the DoubleTensor outputs.
Wpt = Variable(torch.from_numpy(W), requires_grad = True)
Xpt = Variable(torch.from_numpy(X), requires_grad = True)

Dpt = Wpt.mm(Xpt)
Dpt

Variable containing:
-4.3399 -0.5234 -0.9074
-3.0832  2.9045 -1.7857
-1.7470  0.2694  8.6217
 1.4014  1.9705 -1.8962
-7.1952 -0.9747 -6.5229
[torch.DoubleTensor of size 5x3]

In [42]:
# Dpt is a 5x3 matrix rather than a scalar, so the upstream gradient is passed in
# explicitly; it is the same dD used in the manual version.
Dpt.backward(torch.from_numpy(dD))

In [43]:
# Same 5x10 matrix as the manually computed dW = dD.dot(X.T).
Wpt.grad

Variable containing:
 0.7760 -0.1462 -0.8740  0.2016 -0.0394 -0.8797 -0.1398  0.3170 -1.4954 -0.7182
-0.9960  0.6144  1.3289  0.1410  0.2231  1.2558  0.2796 -1.8701  1.9683  1.4487
-0.6901 -0.2551 -0.2833  0.3966  1.2940  1.0138 -1.1787 -1.2262 -2.2402  0.7073
 0.4898 -0.0141 -0.7892  0.4961  0.4531 -0.4229 -0.4524 -0.7827 -2.0472 -0.1851
 0.7720 -0.0666 -0.2943 -0.3012 -0.8768 -1.0642  0.6247  1.4367  0.6884 -0.9514
[torch.DoubleTensor of size 5x10]