## Chain Rule Tutorial in PyTorch
http://cs231n.github.io/optimization-2/

The autograd feature saves us from computing the backprop logic by hand.

In [102]:
from __future__ import print_function
import torch
from torch.autograd import Variable

### Gradient of product
f = x * y
x = 4; y = -3

df/dx = -3
df/dy = 4

In [103]:
# Leaf variables for f = x * y; requires_grad=True asks autograd to track them
x = Variable(torch.Tensor([4]), requires_grad=True)
y = Variable(torch.Tensor([-3]), requires_grad=True)

In [104]:
z = x * y  # forward pass of f = x * y

In [105]:
# Display the forward result: 4 * -3 = -12
z

Variable containing:
-12
[torch.FloatTensor of size 1]

In [106]:
# Backpropagate from z; this fills in x.grad and y.grad, shown in the next cells
z.backward()

In [107]:
# dz/dx = y = -3
x.grad

Variable containing:
-3
[torch.FloatTensor of size 1]

In [108]:
# dz/dy = x = 4
y.grad

Variable containing:
 4
[torch.FloatTensor of size 1]

### Chain Rule
f(x,y,z) = (x+y)z

q = x+y
f = qz

df/dq = z
df/dz = q
dq/dx = 1
dq/dy = 1

df/dx = df/dq . dq/dx
df/dy = df/dq . dq/dy
df/dz = q

x = -2; y = 5; z = -4

df/dx = z * 1 = -4 * 1 = -4
df/dy = z * 1 = -4 * 1 = -4
df/dz = q = (x+y) = 3


In [109]:
# Leaf variables for f(x, y, z) = (x + y) * z
x = Variable(torch.Tensor([-2]), requires_grad=True)
y = Variable(torch.Tensor([5]), requires_grad=True)
z = Variable(torch.Tensor([-4]), requires_grad=True)

# Forward pass in the same two stages as the derivation: q = x + y, then f = q * z
q = x + y
f = q * z


In [111]:
# Forward value: (x + y) * z = (-2 + 5) * -4 = -12
f

Variable containing:
-12
[torch.FloatTensor of size 1]

In [110]:
# Backpropagate from f through q to the leaves x, y and z
f.backward()

In [115]:
# df/dx = df/dq * dq/dx = z * 1 = -4
x.grad

Variable containing:
-4
[torch.FloatTensor of size 1]

In [114]:
# df/dy = df/dq * dq/dy = z * 1 = -4
y.grad

Variable containing:
-4
[torch.FloatTensor of size 1]

In [113]:
# df/dz = q = x + y = 3
z.grad

Variable containing:
 3
[torch.FloatTensor of size 1]

### Another function example
sig(x) = 1 / ( 1 + e^-x)
dsig(x)/dx = ( 1 - sig(x)) sig(x)

In [177]:
# Manual computation following Karpathy's CS231n example; the gradients below
# should agree with the sigmoid derivative formula given above.
import math

w = [2, -3, -3]  # assume some random weights and data; w[2] is added on its own, as a bias
x = [-1, -2]

# forward pass: weighted sum of the inputs plus the bias, then the sigmoid
dot = sum(wi * xi for wi, xi in zip(w, x)) + w[2]
print(dot)
f = 1.0 / (1 + math.exp(-dot))  # sigmoid function

# backward pass through the neuron (backpropagation)
ddot = f * (1 - f)  # gradient on dot, from dsig/dx = (1 - sig) * sig
print(ddot)
dx = [wi * ddot for wi in w[:2]]          # backprop into x
dw = [xi * ddot for xi in x] + [1.0 * ddot]  # backprop into w (the bias sees a constant input of 1)
# that's all: dx and dw are the gradients on the inputs to the circuit

1
0.196611933241


In [178]:
# Gradient on the two inputs: [w[0] * ddot, w[1] * ddot]
dx

[0.3932238664829637, -0.5898357997244456]

In [179]:
# Gradient on the three weights: [x[0] * ddot, x[1] * ddot, 1.0 * ddot]
dw

[-0.19661193324148185, -0.3932238664829637, 0.19661193324148185]

###### And now the pytorch way

In [166]:
# Same weights as the manual example, as a 3x1 column vector tracked by autograd
w = Variable(torch.Tensor([[2], [-3], [-3]]), requires_grad=True)
w

Variable containing:
 2
-3
-3
[torch.FloatTensor of size 3x1]

In [167]:
# Inputs as a 3x1 column vector; the trailing 1 is the constant input paired with the bias weight
x = Variable(torch.Tensor([[-1], [-2], [1]]), requires_grad=True)
x

Variable containing:
-1
-2
 1
[torch.FloatTensor of size 3x1]

In [168]:
# Flatten both 3x1 column vectors before taking the dot product: torch.dot is
# defined for 1-D tensors, and view(-1) stays in the autograd graph, so the
# gradients still arrive at w and x in their original 3x1 shape.
dot = w.view(-1).dot(x.view(-1))

In [169]:
# w . x = 2*(-1) + (-3)*(-2) + (-3)*1 = 1, the same value the manual version printed
dot

Variable containing:
 1
[torch.FloatTensor of size 1]

In [170]:
# Sigmoid of the dot product, spelled out with elementary ops
f = 1.0 / (1 + torch.exp(-dot))

In [171]:
# Backpropagate from the sigmoid output down to w and x
f.backward()

In [172]:
# Should agree with the manual dw: [x[0] * ddot, x[1] * ddot, 1.0 * ddot]
w.grad

Variable containing:
-0.1966
-0.3932
 0.1966
[torch.FloatTensor of size 3x1]

In [173]:
# The first two entries should agree with the manual dx. The third entry is the
# gradient on the constant input 1 (w[2] * ddot) and has no counterpart in the
# two-element manual dx.
x.grad

Variable containing:
 0.3932
-0.5898
-0.5898
[torch.FloatTensor of size 3x1]

### Using sigmoid in another function
f(x,y) = (x + sig(y)) / (sig(x) + (x+y)^2)

In [191]:
# Manual forward pass, staged so that every intermediate value is available to backprop
x, y = 3, -4  # example values

# numerator: x + sig(y)
sigy = 1.0 / (1 + math.exp(-y))  # sigmoid of y                   #(1)
num = x + sigy                                                     #(2)

# denominator: sig(x) + (x + y)^2
sigx = 1.0 / (1 + math.exp(-x))  # sigmoid of x                   #(3)
xpy = x + y                                                        #(4)
xpysqr = xpy**2                                                    #(5)
den = sigx + xpysqr                                                #(6)

# combine: multiply by the reciprocal of the denominator
invden = 1.0 / den                                                 #(7)
f = num * invden                                                   #(8)
f

1.5456448841066441

In [192]:
# Manual backward pass for f = num * invden, using the intermediates from the
# manual forward-pass cell. Each step applies the chain rule to one forward
# operation, walking the forward pass in reverse. The trailing #(n) tags refer
# to the numbered forward-pass steps; (8) is the final product f.
# backprop f = num * invden
dnum = invden # gradient on numerator                             #(8)
dinvden = num                                                     #(8)
# backprop invden = 1.0 / den 
dden = (-1.0 / (den**2)) * dinvden                                #(7)
# backprop den = sigx + xpysqr
dsigx = (1) * dden                                                #(6)
dxpysqr = (1) * dden                                              #(6)
# backprop xpysqr = xpy**2
dxpy = (2 * xpy) * dxpysqr                                        #(5)
# backprop xpy = x + y
dx = (1) * dxpy                                                   #(4)
dy = (1) * dxpy                                                   #(4)
# backprop sigx = 1.0 / (1 + math.exp(-x))
# Note the +=: x feeds xpy, sigx and num, so the gradients from each use add up.
dx += ((1 - sigx) * sigx) * dsigx                                 #(3)
# backprop num = x + sigy
dx += (1) * dnum                                                  #(2)
dsigy = (1) * dnum                                                #(2)
# backprop sigy = 1.0 / (1 + math.exp(-y))
# += again: y feeds both xpy and sigy.
dy += ((1 - sigy) * sigy) * dsigy                                 #(1)
# done; display df/dy (dx holds df/dx but is not shown here)
dy

1.5922327514838093

###### And now the pytorch way

In [193]:
# Same example values as the manual version (x = 3, y = -4), now tracked by autograd
x = Variable(torch.Tensor([3]), requires_grad=True)
y = Variable(torch.Tensor([-4]), requires_grad=True)

In [194]:
# Step (1): sigmoid of y, which goes into the numerator
sigy = 1.0 / (1 + torch.exp(-y))
sigy

Variable containing:
1.00000e-02 *
  1.7986
[torch.FloatTensor of size 1]

In [195]:
# Step (2): numerator x + sig(y)
num = x + sigy
num

Variable containing:
 3.0180
[torch.FloatTensor of size 1]

In [196]:
# Step (3): sigmoid of x, which goes into the denominator
sigx = 1.0 / (1 + torch.exp(-x))
sigx

Variable containing:
 0.9526
[torch.FloatTensor of size 1]

In [197]:
# Steps (4)-(5): (x + y)^2 = (3 - 4)^2 = 1
xpy = x + y
xpysqr = xpy**2
xpysqr

Variable containing:
 1
[torch.FloatTensor of size 1]

In [198]:
# Step (6): denominator sig(x) + (x + y)^2
den = sigx + xpysqr
den

Variable containing:
 1.9526
[torch.FloatTensor of size 1]

In [199]:
# Step (7): reciprocal of the denominator, mirroring the manual forward pass
invden = 1.0 / den

In [200]:
# Final step: f = num * invden; should match the manual forward value 1.5456...
f = num * invden
f

Variable containing:
 1.5456
[torch.FloatTensor of size 1]

In [201]:
# Let autograd do the whole backward pass that was written out by hand above
f.backward()

In [202]:
# df/dy; should match the manually derived dy (1.5922...)
y.grad

Variable containing:
 1.5922
[torch.FloatTensor of size 1]

In [203]:
# df/dx; the manual cell computes this as dx but only displays dy
x.grad

Variable containing:
 2.0596
[torch.FloatTensor of size 1]

#### Gradients for vectorized operations

In [307]:
# NumPy is not imported by any earlier cell shown in this notebook, so bring it
# into scope here; without this the cell raises NameError on a fresh kernel.
import numpy as np

# Random example matrices (values differ on every run): W is 5x10 and X is 10x3,
# so D = W.X is 5x3.
W = np.random.randn(5, 10)
X = np.random.randn(10, 3)
D = W.dot(X)
# Stand-in for the gradient on D arriving from further up the circuit; it has
# the same shape as D.
dD = np.random.randn(*D.shape)

In [308]:
# Manual backward pass through D = W.dot(X).
# Suppose dD is the gradient on D handed down from above in the circuit.
# dW = dD . X^T is (5x3).(3x10) -> 5x10, the same shape as W.
dW = dD.dot(X.T) #.T gives the transpose of the matrix
# dX = W^T . dD is (10x5).(5x3) -> 10x3, the same shape as X.
dX = W.T.dot(dD)
dW

array([[ -1.24563066e+00,   6.14397747e-01,  -4.82728902e-01,
          1.86971696e-02,  -2.41149732e+00,  -3.97392416e-03,
          2.47620799e-01,   1.60525785e-01,   5.64479623e-01,
          3.69659458e-01],
       [  8.81471879e-01,   5.51214973e-01,   8.80815080e-01,
         -2.17823389e+00,   3.53901330e-01,   2.46641085e-01,
          3.17895430e-01,  -2.83012215e-01,  -9.36255725e-01,
         -1.03309871e-01],
       [ -1.84801000e+00,   1.65857495e+00,   9.76831595e-01,
         -1.64977845e+00,  -1.32727297e+00,   5.20228541e-01,
          1.74682129e+00,  -1.38462650e+00,  -2.84894801e+00,
          1.67686800e-01],
       [ -2.99128820e+00,  -1.20486703e+00,  -1.68089257e+00,
          5.90286467e+00,   2.93233262e-01,  -4.21433123e-01,
         -6.53662066e-03,  -2.52421772e-01,   4.04101180e-01,
          8.94429130e-02],
       [  2.64643264e+00,   3.30115186e-01,   2.19931262e+00,
         -3.63884932e+00,   3.59216576e+00,   4.87119794e-01,
          5.10591730e-01

###### And now the pytorch way

In [309]:
# Wrap the NumPy matrices as autograd Variables.
# W is 5x10 and X is 10x3, so Dpt = W.X is 5x3.
Wpt = Variable(torch.from_numpy(W), requires_grad=True)
Xpt = Variable(torch.from_numpy(X), requires_grad=True)

Dpt = Wpt.mm(Xpt)  # matrix product W.X computed by PyTorch
Dpt

Variable containing:
 4.2662  1.1782 -2.3520
-4.8139  0.5168  4.3307
-0.1592  3.1259 -1.2556
 3.1646 -1.7503  1.0288
 1.6286  3.6734 -0.5212
[torch.DoubleTensor of size 5x3]

In [310]:
# Dpt is a 5x3 matrix rather than a scalar, so the upstream gradient is passed
# to backward() explicitly; it is the same dD used in the NumPy version.
Dpt.backward(torch.from_numpy(dD))

In [311]:
# Should equal the NumPy result dW = dD.dot(X.T) computed manually above
Wpt.grad

Variable containing:
-1.2456  0.6144 -0.4827  0.0187 -2.4115 -0.0040  0.2476  0.1605  0.5645  0.3697
 0.8815  0.5512  0.8808 -2.1782  0.3539  0.2466  0.3179 -0.2830 -0.9363 -0.1033
-1.8480  1.6586  0.9768 -1.6498 -1.3273  0.5202  1.7468 -1.3846 -2.8489  0.1677
-2.9913 -1.2049 -1.6809  5.9029  0.2932 -0.4214 -0.0065 -0.2524  0.4041  0.0894
 2.6464  0.3301  2.1993 -3.6388  3.5922  0.4871  0.5106 -0.9471 -2.8029 -0.6317
[torch.DoubleTensor of size 5x10]