In [14]:
import math

In [8]:
# Input values fed into the computational graph
x, y, z = -2, 5, -4

In [9]:
# Nodes along our graph (forward pass). q is the intermediate node; f is the output
q = x + y # addition node: combines the inputs x and y
f = q * z # multiplication node: scales q by z to produce the output

In [10]:
# Backpropagating through the graph, starting at the output node f = q * z.
# For a product, the partial wrt one factor is simply the other factor.
df_dq = z # partial of f wrt q
df_dz = q # partial of f wrt z

# Chain rule for the addition node q = x + y:
#   df_dx = dq_dx * df_dq    and    df_dy = dq_dy * df_dq
# i.e. (how much q moves when x moves) times (how much f moves when q moves).
# Both local partials dq_dx and dq_dy are 1.00, so the two results coincide and
# the already-computed df_dq is reused for both.
df_dx = df_dy = 1.00 * df_dq

# Storing the partial df_dq lets us cache this variable and efficiently reuse it later.
# Note: we now know how much our output, f, changes wrt each input. The partials wrt
# x and y do not depend on the values of x and y themselves: both are equal to z, so
# the xy components of the gradient are directly proportional to the input value of z.
# Likewise, the partial wrt z is equal to q = x + y.

In [11]:
[df_dx, df_dy, df_dz] # this is the gradient of f wrt the inputs (x, y, z)
# If we now wished to perform optimization through (stochastic) gradient descent, we
# would nudge each input a small step in the direction opposite to its gradient
# component, which decreases the output value (minimization). In this toy example the
# output is not meaningful, but in a real neural network the output is the value of
# the loss function, and minimizing it is the goal of training.

[-4.0, -4.0, 3]

In [15]:
# Arbitrary fixed weights and inputs; the last weight has no matching input and acts as the bias
w = [2, -3, -3]
x = [-1, -2]

# Forward pass: weighted sum of the inputs plus the bias, followed by the activation function
dot = sum(w_i * x_i for w_i, x_i in zip(w, x)) + w[-1]
f = 1.0 / (1.0 + math.exp(-dot)) # sigmoid

# Backward pass
# Derivative of the sigmoid wrt its input: sigma'(s) = sigma(s) * (1 - sigma(s)).
# Since f already holds sigma(dot), no further exponentials are needed.
ddot = f * (1.0 - f)
dx = [w_i * ddot for w_i in w[:len(x)]] # partial of dot wrt each input is its weight
dw = [x_i * ddot for x_i in x] + [1 * ddot] # partial of dot wrt each weight is its input; 1 for the bias

### Example Function

$f(x, y) = \frac{x + \sigma(y)}{\sigma(x) + (x+y)^{2}}$