In [1]:
import tensorflow as tf
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Numerical Differentiation

In [2]:
def f(x, y):
    return x**2*y + y + 2

def derivative(f, x, y, x_eps, y_eps):
    return (f(x+x_eps, y+y_eps) - f(x, y)) / (x_eps + y_eps)

eps = 0.00001
df_dx = derivative(f, 3, 4, eps, 0)  # 2*x*y = 24
df_dy = derivative(f, 3, 4, 0, eps)  # x**2 + 1 = 10
df_dx, df_dy

(24.000039999805264, 10.000000000331966)

> Note that we have to call `f` at least **three times** in order just to compute `df_dx` and `df_dy`.

**(?1)** How many times would we have to call `f`, had we used
- forward-mode autodiff?
- backward-mode autodiff?

### `my_func()`, try to autodiff it

In [3]:
def my_func(a, b):
    z = 0
    for i in range(100):
        z = a*np.cos(z+i) + z*np.sin(b-i)
    return z

Let's first build a graph for `my_func()`.

In [4]:
a = tf.Variable(0, name="a")
b = tf.Variable(0, name="b")
a, b

Instructions for updating:
Colocations handled automatically by placer.


(<tf.Variable 'a:0' shape=() dtype=int32_ref>,
 <tf.Variable 'b:0' shape=() dtype=int32_ref>)

The `dtype` is not quite right.

In [5]:
tf.reset_default_graph()

In [6]:
a = tf.Variable(0.0, name="a")
b = tf.Variable(0.0, name="b")
a, b

(<tf.Variable 'a:0' shape=() dtype=float32_ref>,
 <tf.Variable 'b:0' shape=() dtype=float32_ref>)

This time `dtype` is right, but let's try one more possibility.

In [7]:
tf.reset_default_graph()

In [8]:
a = tf.Variable(0, dtype=tf.float64, name="a")
b = tf.Variable(0, dtype=tf.float32, name="b")
a, b

(<tf.Variable 'a:0' shape=() dtype=float64_ref>,
 <tf.Variable 'b:0' shape=() dtype=float32_ref>)

Ok, enough playing with `tf.Variable()`. Let's resume our construction of the graph.

In [9]:
tf.reset_default_graph()
a = tf.Variable(0.0, name="a")
b = tf.Variable(0.0, name="b")

Note that `z`
- should not be `tf.constant`
- neither `tf.placeholder` either
- should probably be `tf.Variable`

In [10]:
z = tf.Variable(0.0, name="z")
for i in range(100):
    z = a*tf.cos(z+i) + z*tf.sin(b-i)
z

<tf.Tensor 'add_199:0' shape=() dtype=float32>

In [11]:
da, db = tf.gradients(z, [a, b])
da, db

(<tf.Tensor 'gradients/AddN_99:0' shape=() dtype=float32>,
 <tf.Tensor 'gradients/AddN_100:0' shape=() dtype=float32>)

Now that the graph has been constructed, let's compute some numerical values.

In [12]:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    da_val, db_val = da.eval(), db.eval()

In [13]:
da_val, db_val

(-1.219198, 0.0)

Let's verify the partial derivatives calculated by autodiff with that calculated by numercial differentiation (We can reuse `derivative()`).

In [14]:
derivative(my_func, 0, 0, eps, 0), derivative(my_func, 0, 0, 0, eps)

(-1.2192136268969398, 0.0)

Looks about right. But this only proves for the point `(a, b) = (0, 0)`. Let's try some more pairs of values `(a, b)`.

In [15]:
a0, b0 = 1.0001, np.pi/2  # values picked randomly

The previous computation took quite some time, here let's `%%time` it.

In [16]:
%%time
with tf.Session() as sess:
    init.run()
    da_val, db_val = da.eval(feed_dict={a: a0, b: b0}), db.eval(feed_dict={a: a0, b: b0})
    
da_val, db_val

CPU times: user 14.4 s, sys: 36 ms, total: 14.5 s
Wall time: 16 s


(-0.8145098, 0.46844184)

In [17]:
%time
print(derivative(my_func, a0, b0, eps, 0), derivative(my_func, a0, b0, 0, eps))

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 14.5 µs
-0.8145009000704028 0.46847738059963


Probably not the fault of `tf`, you've run the graph redundantly as describes (as a bad practice) **on p.235** of the book.

In [18]:
%%time
with tf.Session() as sess:
    init.run()
    da_val, db_val = sess.run([da, db], feed_dict={a: a0, b: b0})

CPU times: user 8.69 s, sys: 22.4 ms, total: 8.71 s
Wall time: 9.96 s


The speed got improved, but it seems that on the CPU of Thinkpad X200
- `tf`'s autodiff is slower than the difference equation
- `10 s` compared to `13 microsec`
  - `tf` autodiff
    ```
    CPU times: user 8.41 s, sys: 27.3 ms, total: 8.43 s
    Wall time: 10.3 s
    ```
  - difference equation
    ```
    CPU times: user 6 µs, sys: 0 ns, total: 6 µs
    Wall time: 13.4 µs
    ```

In [26]:
n_pairs = 7

In [27]:
pairs = 5*np.random.randn(n_pairs,2)
pairs[:7, :]

array([[ 5.48882283, -4.03498243],
       [ 4.97594869,  0.49954073],
       [ 3.28090671,  0.19872654],
       [-0.38263577, -5.6707406 ],
       [-4.05622641,  0.72108979]])

In [29]:
print("Results close?")
for (a0, b0) in pairs:
    with tf.Session() as sess:
        init.run()
        da_val, db_val = sess.run([da, db], feed_dict={a: a0, b: b0})

    a_diff, b_diff = derivative(my_func, a0, b0, eps, 0), derivative(my_func, a0, b0, 0, eps)
    tol = 0.0001
    close = np.abs(da_val - a_diff) < tol and np.abs(db_val - b_diff) < tol
    print(close)
    if not close:
        print(f"    a0 = {a0:.5f}, b = {b0:.5f}")
        print(f"    da_val = {da_val:.5f}, a_diff = {a_diff:.5f}")
        print(f"    db_val = {db_val:.5f}, b_diff = {b_diff:.5f}")
    else:
        print(f"    a0 = {a0:.5f}, b = {b0:.5f}")

Results close?
False
    a0 = 5.48882, b = -4.03498
    da_val = nan, a_diff = -183094.63255
    db_val = nan, b_diff = -548495.18664
False
    a0 = 4.97595, b = 0.49954
    da_val = inf, a_diff = 235678.67377
    db_val = nan, b_diff = 367272.58085
False
    a0 = 3.28091, b = 0.19873
    da_val = 23220513499233910784.00000, a_diff = 280116.16574
    db_val = -45844546338445852672.00000, b_diff = 204862.42645
True
    a0 = -0.38264, b = -5.67074
False
    a0 = -4.05623, b = 0.72109
    da_val = 168350438388777841053015212032.00000, a_diff = 952689.47727
    db_val = 256345334268102908371330400256.00000, b_diff = 576703.14644
False
    a0 = 3.63915, b = 1.03750
    da_val = -8638213570933621971574849536.00000, a_diff = 72467.12347
    db_val = -9082337381252253136281665536.00000, b_diff = 44602.30422
True
    a0 = -0.47866, b = 2.60087


**(?2)** Why in the inconsistent cases, there are times when the values concerned are not `nan`, the corresponding values differ so much?

**(?3)** Would it be because values in `pairs` are too big? Let's try with smaller values.

In [32]:
pairs = np.random.randn(n_pairs,2)
pairs[:7, :]

array([[-0.2460409 ,  0.36686968],
       [-0.04445912, -1.24266627],
       [ 1.2505139 , -0.68945281],
       [-1.20598045,  0.49846434],
       [ 0.75836596, -1.29888354],
       [-0.10022005,  1.05469486],
       [-0.26667067, -0.10496438]])

In [33]:
print("Results close?")
for (a0, b0) in pairs:
    with tf.Session() as sess:
        init.run()
        da_val, db_val = sess.run([da, db], feed_dict={a: a0, b: b0})

    a_diff, b_diff = derivative(my_func, a0, b0, eps, 0), derivative(my_func, a0, b0, 0, eps)
    tol = 0.0001
    close = np.abs(da_val - a_diff) < tol and np.abs(db_val - b_diff) < tol
    print(close)
    if not close:
        print(f"    a0 = {a0:.5f}, b = {b0:.5f}")
        print(f"    da_val = {da_val:.5f}, a_diff = {a_diff:.5f}")
        print(f"    db_val = {db_val:.5f}, b_diff = {b_diff:.5f}")

Results close?
True
True
False
    a0 = 1.25051, b = -0.68945
    da_val = 4.52261, a_diff = 4.52308
    db_val = -4.21196, b_diff = -4.24646
True
True
True
True
