In [1]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

In [2]:
# x is a tf.Variable, so the tape watches it by default once it is accessed.
x = tf.Variable(3.0)

# Operations executed inside the context are recorded on the tape.
with tf.GradientTape() as tape:
  y = x**2


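Once you've recorded some operations, use `GradientTape.gradient(target, sources)` to calculate the gradient of a target (often a loss) with respect to some sources (often the model's variables):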

In [3]:
# Differentiate y = x**2 with respect to x; at x = 3.0 this is 2 * x.
dy_dx = tape.gradient(target=y, sources=x)
dy_dx.numpy()


6.0

In [4]:
# Model parameters: w has shape (3, 2) and b has shape (2,).
# NOTE(review): no random seed is set in the visible cells, so the values of w
# (and the gradient outputs shown below) will differ from run to run.
w = tf.Variable(tf.random.normal((3, 2)), name='w')
b = tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
# A single input row of three values, given as a plain nested list.
x = [[1., 2., 3.]]

# persistent=True keeps the tape usable for the several tape.gradient calls
# made on it in the following cells.
with tf.GradientTape(persistent=True) as tape:
  y = x @ w + b
  loss = tf.reduce_mean(y**2)


In [5]:
# Request both gradients in one call; the result mirrors the [w, b] list.
dl_dw, dl_db = tape.gradient(loss, sources=[w, b])


In [7]:
# Gradient of loss with respect to w.
dl_dw

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[ 0.45901048, -1.9368262 ],
       [ 0.91802096, -3.8736525 ],
       [ 1.3770314 , -5.8104787 ]], dtype=float32)>

In [8]:
# Gradient of loss with respect to b.
dl_db

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 0.45901048, -1.9368262 ], dtype=float32)>

In [9]:
# Sources may also be passed as a dictionary; the gradients come back
# in a dictionary with the same keys.
my_vars = dict(w=w, b=b)

grad = tape.gradient(loss, sources=my_vars)
grad['b']


<tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 0.45901048, -1.9368262 ], dtype=float32)>

In [10]:
layer = tf.keras.layers.Dense(2, activation='relu')
x = tf.constant([[1., 2., 3.]])

with tf.GradientTape() as tape:
  # Forward pass
  y = layer(x)
  loss = tf.reduce_mean(y**2)

# Calculate gradients with respect to every trainable variable
# NOTE(review): the layer's weights are not set explicitly here, and the
# recorded output below is all zeros -- presumably the ReLU units were inactive
# for this input on that run; confirm before relying on the values.
grad = tape.gradient(loss, layer.trainable_variables)


In [11]:
# One gradient tensor per entry of layer.trainable_variables, in the same order.
grad

[<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[0., 0.],
        [0., 0.],
        [0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>]

In [12]:
# Report the name of each trainable variable next to the shape of its gradient.
variables_and_grads = zip(layer.trainable_variables, grad)
for var, g in variables_and_grads:
  print(f'{var.name}, shape: {g.shape}')


dense/kernel:0, shape: (3, 2)
dense/bias:0, shape: (2,)


tf.GradientTape provides hooks that give the user control over what is or is not watched.

To record gradients with respect to a tf.Tensor, you need to call GradientTape.watch(x):

In [13]:
x = tf.constant(3.0)
with tf.GradientTape() as tape:
  # x is a constant tensor, not a tf.Variable, so it must be watched explicitly.
  tape.watch(x)
  y = x**2

# dy = 2x * dx
dy_dx = tape.gradient(y, x)
print(dy_dx.numpy())




6.0


Conversely, to disable the default behavior of watching all tf.Variables, set watch_accessed_variables=False when creating the gradient tape. This calculation uses two variables, but only connects the gradient for one of the variables:

In [14]:
x0 = tf.Variable(0.0)
x1 = tf.Variable(10.0)

# With watch_accessed_variables=False nothing is watched automatically.
with tf.GradientTape(watch_accessed_variables=False) as tape:
  # Only x1 is watched; x0 is used below but deliberately left unwatched.
  tape.watch(x1)
  y0 = tf.math.sin(x0)
  y1 = tf.nn.softplus(x1)
  y = y0 + y1
  ys = tf.reduce_sum(y)


Since GradientTape.watch was not called on x0, no gradient is computed with respect to it:

In [15]:
# dys/dx1 = exp(x1) / (1 + exp(x1)) = sigmoid(x1)
# x0 was never watched, so its entry in the result is None.
sources = {'x0': x0, 'x1': x1}
grad = tape.gradient(ys, sources)

print('dy/dx0:', grad['x0'])
print('dy/dx1:', grad['x1'].numpy())


dy/dx0: None
dy/dx1: 0.9999546


You can also request gradients of the output with respect to intermediate values computed inside the tf.GradientTape context.

In [16]:
x = tf.constant(3.0)

with tf.GradientTape() as tape:
  tape.watch(x)
  y = x * x  # intermediate value: y = x ** 2 = 9
  z = y * y  # final output: z = y ** 2

# Differentiate z with respect to the intermediate tensor y rather than
# the input x: dz/dy = 2 * y = 18.
dz_dy = tape.gradient(z, y)
print(dz_dy.numpy())


18.0


By default, the resources held by a GradientTape are released as soon as the GradientTape.gradient method is called. To compute multiple gradients over the same computation, create a gradient tape with persistent=True. This allows multiple calls to the gradient method as resources are released when the tape object is garbage collected. For example:

In [17]:
x = tf.constant([1, 3.0])
with tf.GradientTape(persistent=True) as tape:
  tape.watch(x)
  y = x * x
  z = y * y

print(tape.gradient(z, x).numpy())  # [4.0, 108.0] (4 * x**3 at x = [1.0, 3.0])
print(tape.gradient(y, x).numpy())  # [2.0, 6.0] (2 * x at x = [1.0, 3.0])


[  4. 108.]
[2. 6.]


In [18]:
# A persistent tape keeps its resources until the tape object is garbage
# collected, so release it once the gradients above have been computed.
del tape   # Drop the reference to the tape


Gradients of non-scalar targets
A gradient is fundamentally an operation on a scalar.

In [19]:
x = tf.Variable(2.0)
with tf.GradientTape(persistent=True) as tape:
  y0 = x**2
  y1 = 1 / x

# Each call differentiates a single scalar target.
print(tape.gradient(y0, x).numpy())  # d(x**2)/dx = 2 * x = 4.0
print(tape.gradient(y1, x).numpy())  # d(1/x)/dx = -1 / x**2 = -0.25


4.0
-0.25


Thus, if you ask for the gradient of multiple targets, the result for each source is:

+   The gradient of the sum of the targets, or equivalently
+   The sum of the gradients of each target.



In [20]:
x = tf.Variable(2.0)
with tf.GradientTape() as tape:
  y0 = x**2
  y1 = 1 / x

# Two targets, one source: the result is the sum 4.0 + (-0.25) = 3.75.
print(tape.gradient({'y0': y0, 'y1': y1}, x).numpy())


3.75


Similarly, if the target(s) are not scalar, the gradient of the sum is calculated:

In [21]:
x = tf.Variable(2.)

with tf.GradientTape() as tape:
  # y is a two-element tensor: [3 * x, 4 * x].
  y = x * [3., 4.]

# Gradient of the sum of y's elements: d(3x + 4x)/dx = 7.
print(tape.gradient(y, x).numpy())


7.0
