In [1]:
import tensorflow as tf
import numpy as np
keras = tf.keras

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
x = tf.ones((2,2))
x

<tf.Tensor: id=2, shape=(2, 2), dtype=float32, numpy=
array([[1., 1.],
       [1., 1.]], dtype=float32)>

In [7]:
with tf.GradientTape(persistent=True) as tape:
    tape.watch(x)
    y = tf.reduce_sum(x)
    z = tf.multiply(y, y)
dy_dx = tape.gradient(y, x)
dz_dx = tape.gradient(z, x)
del tape

In [8]:
dy_dx

<tf.Tensor: id=30, shape=(2, 2), dtype=float32, numpy=
array([[1., 1.],
       [1., 1.]], dtype=float32)>

In [9]:
dz_dx

<tf.Tensor: id=38, shape=(2, 2), dtype=float32, numpy=
array([[8., 8.],
       [8., 8.]], dtype=float32)>

In [10]:
def f(x, y):
  output = 1.0
  for i in range(y):
    if i > 1 and i < 5:
      output = tf.multiply(output, x)
  return output

def grad(x, y):
  with tf.GradientTape() as t:
    t.watch(x)
    out = f(x, y)
  return t.gradient(out, x)

x = tf.convert_to_tensor(2.0)

assert grad(x, 6).numpy() == 12.0
assert grad(x, 5).numpy() == 12.0
assert grad(x, 4).numpy() == 4.0

In [11]:
x

<tf.Tensor: id=41, shape=(), dtype=float32, numpy=2.0>

In [13]:
x = tf.Variable(1.0)
x

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.0>

In [24]:
with tf.GradientTape(persistent=True) as tape:
    tape.watch(x)
    y = x * x * x
    dy_dx = tape.gradient(y, x)
    d2y_dx2 = tape.gradient(dy_dx, x)
del tape    

W0611 12:22:01.266161 41020 backprop.py:968] Calling GradientTape.gradient on a persistent tape inside its context is significantly less efficient than calling it outside the context (it causes the gradient ops to be recorded on the tape, leading to increased CPU and memory usage). Only call GradientTape.gradient inside the context if you actually want to trace the gradient in order to compute higher order derivatives.
W0611 12:22:01.269173 41020 backprop.py:968] Calling GradientTape.gradient on a persistent tape inside its context is significantly less efficient than calling it outside the context (it causes the gradient ops to be recorded on the tape, leading to increased CPU and memory usage). Only call GradientTape.gradient inside the context if you actually want to trace the gradient in order to compute higher order derivatives.


In [19]:
dy_dx

<tf.Tensor: id=179, shape=(), dtype=float32, numpy=3.0>

In [20]:
d2y_dx2

<tf.Tensor: id=194, shape=(), dtype=float32, numpy=6.0>

In [25]:
with tf.GradientTape() as tape1:
    with tf.GradientTape() as tape2:
        y = x * x * x
        dy_dx = tape2.gradient(y, x)
    d2y_dx2 = tape1.gradient(dy_dx, x)

In [27]:
dy_dx

<tf.Tensor: id=285, shape=(), dtype=float32, numpy=3.0>

In [28]:
d2y_dx2

<tf.Tensor: id=300, shape=(), dtype=float32, numpy=6.0>