# tf.GradientTape

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
# To start eager execution (this must be top of code)
tf.executing_eagerly()
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus: tf.config.experimental.set_memory_growth(gpus[0], True)


### Create Tensors

In [12]:
x = tf.convert_to_tensor(1.)
w = tf.convert_to_tensor(2.)
b = tf.convert_to_tensor(3.)

print(type(x))
print(x)

<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(1.0, shape=(), dtype=float32)


### Build a computational graph for Automatic differentiation.
When you focus the $x$ of two functions, 

$$
y(x) = w  x + b
$$ 
and
$$ 
z(x) = w  x^2 + b x
$$
you can write the code of "build a computational graph for automatic differentiation" as below.

### tf.GradientTape
텐서플로는 자동 미분(주어진 입력 변수에 대한 연산의 그래디언트(gradient)를 계산하는 것)을 위한 tf.GradientTape API를 제공합니다. tf.GradientTape는 컨텍스트(context) 안에서 실행된 모든 연산을 테이프(tape)에 "기록"합니다. 그 다음 텐서플로는 후진 방식 자동 미분(reverse mode differentiation)을 사용해 테이프에 "기록된" 연산의 그래디언트를 계산합니다.

In [13]:
with tf.GradientTape(persistent=True) as tape:
    tape.watch(x)
    y = w * x + b
    z = w* x**2 + b * x
    
# dy/dx = 2
# dz/dx = 4 * x + 3  (now x=1 so dz/dx = 7)
# 입력 텐서 x에 대한 y의 도함수
dy_dx = tape.gradient(y, x)
# 입력 텐서 x에 대한 z의 도함수
dz_dx = tape.gradient(z, x)
    
print(dy_dx)
print(dz_dx)

tf.Tensor(2.0, shape=(), dtype=float32)
tf.Tensor(7.0, shape=(), dtype=float32)


### linear model
tf.keras 모듈을 사용하여 간단한 Liniear model을 만들어 보겠습니다.
$$
 y_i = Wx_i + b
$$

In [14]:
x = tf.random.normal(shape=[10, 3])
y = tf.random.normal(shape=[10, 2])

# tf.keras.layers.Dense needs only output dimension.
# When tf.keras.layers get input to calculate output at the first time,
# the input dimension is determined.
linear = tf.keras.layers.Dense(units=2,input_shape=(10,3))
predict_y = linear(x)

print("Input: \n", x.shape, end="\n\n")
print("weight: \n", linear.weights[0].shape, end="\n\n")
print("bias:\n", linear.weights[1].shape, end="\n\n")
print("output shape:\n", predict_y.shape)

Input: 
 (10, 3)

weight: 
 (3, 2)

bias:
 (2,)

output shape:
 (10, 2)


#### loss function
TensorFlow eager execution has similar API to PyTorch, however the implementation of "Build a computational graph for Automatic differentiation"  is a little diferrent.
At PyTorch, Tensor itself holds calculation graph, and have the method for automatic differentiation. On the other hand, at TensorFlow eager execution, computational graph is keeped by some functions (for example, `tf.GradientTape()`). 

In [15]:
def loss_fn(model, x, y):
    predict_y = model(x)
    return tf.keras.losses.mean_squared_error(predict_y, y)

with tf.GradientTape() as tape:
    loss = loss_fn(model=linear, x=x, y=y)
    grads = tape.gradient(loss, linear.trainable_variables)

print("loss: \n", loss, end="\n\n")
print("weight grads: \n", grads[0][0], end="\n\n")
print("weight instances: \n", grads[0][1], end="\n\n")
print("bias grads: \n", grads[1][0], end="\n\n")
print("bias instances: \n", grads[1][1], end="\n\n")

loss: 
 tf.Tensor(
[3.000647   0.0633526  0.21499693 2.5319018  5.310877   1.7093294
 0.48144472 0.23076423 1.005112   0.92788976], shape=(10,), dtype=float32)

weight grads: 
 tf.Tensor([3.5041962 1.1919472], shape=(2,), dtype=float32)

weight instances: 
 tf.Tensor([-3.6995046 -1.933562 ], shape=(2,), dtype=float32)

bias grads: 
 tf.Tensor(-1.6289247, shape=(), dtype=float32)

bias instances: 
 tf.Tensor(-0.9769095, shape=(), dtype=float32)



#### Optimizing
We aim to decrese loss value with update parameters as below. 
$$
\begin{align}
W & \leftarrow W - \epsilon \frac{dLoss(W)}{dW}\\\
b & \leftarrow b - \epsilon \frac{dLoss(W)}{db}
\end{align}
$$

where $\epsilon$ is learning rate.

After understanding this code, you are able to write training loop code.

In [16]:
# initialize optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

In [17]:
# initial loss value of sum of all data.
with tf.GradientTape() as tape:
    loss = loss_fn(model=linear, x=x, y=y)
    grads = tape.gradient(loss, linear.trainable_variables)
print("loss: ", tf.reduce_sum(loss))

# update prameters using grads
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
optimizer.apply_gradients(zip(grads, linear.trainable_variables))

# loss value after update (may be less than before update)
with tf.GradientTape() as tape:
    loss = loss_fn(model=linear, x=x, y=y)
    grads = tape.gradient(loss, linear.trainable_variables)
print("loss: ", tf.reduce_sum(loss))

loss:  tf.Tensor(15.4763155, shape=(), dtype=float32)
loss:  tf.Tensor(14.69506, shape=(), dtype=float32)


### Data
#### Convert to tf.Tensor from numpy.ndarray

In [18]:
X_numpy = np.random.randn(3, 3)
print(type(X_numpy))
print(X_numpy)

X_tensor = tf.convert_to_tensor(X_numpy)
print(type(X_tensor))
print(X_tensor)

<class 'numpy.ndarray'>
[[ 0.07104778  0.24263894 -1.32652316]
 [-0.33225697  0.0959315   0.01543784]
 [ 0.3612971   0.34421558 -0.27386095]]
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[ 0.07104778  0.24263894 -1.32652316]
 [-0.33225697  0.0959315   0.01543784]
 [ 0.3612971   0.34421558 -0.27386095]], shape=(3, 3), dtype=float64)


#### conver to numpy.array from tf.Tensor
주의할 점은 eager execution 모드에서만 .numpy()호출이 가능한 점 입니다.

In [19]:
X_tensor = tf.random.normal(shape=[3, 3])
print(type(X_tensor))
print(X_tensor)

X_numpy = X_tensor.numpy()
print(type(X_numpy))
print(X_numpy)

<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[ 1.7827262   0.9076068  -0.4188783 ]
 [-1.8647549   0.4820718  -0.29639027]
 [ 0.24610497 -0.41468096  1.2479974 ]], shape=(3, 3), dtype=float32)
<class 'numpy.ndarray'>
[[ 1.7827262   0.9076068  -0.4188783 ]
 [-1.8647549   0.4820718  -0.29639027]
 [ 0.24610497 -0.41468096  1.2479974 ]]


### tf.Dataset pipline