Element-wise operations in pytorch: https://deeplizard.com/learn/video/QscEWm0QTRY

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# PyTorch builds a dynamic computational graph: a data structure that records the
# operations applied to a tensor. requires_grad=True enables that tracking for this tensor.
x = torch.tensor(2.0, requires_grad=True)

In [3]:
# Define a polynomial function of the tensor x.
# Because x tracks gradients, the printed result carries a grad_fn.
y = 2*x**4 + x**3 + 3*x**2 + 5*x + 1
print(y)

tensor(63., grad_fn=<AddBackward0>)


In [8]:
# numerical differentiation
def f(x):
    """Evaluate the quadratic 3*x**2 - 4*x at x."""
    squared_term = 3 * x**2
    linear_term = 4 * x
    return squared_term - linear_term

def numerical_lim(f, x, h):
    """Return the forward difference quotient (f(x + h) - f(x)) / h.

    f is the function to evaluate, x the point of interest and h the step size.
    """
    shifted_value = f(x + h)
    base_value = f(x)
    return (shifted_value - base_value) / h

# Evaluate the difference quotient of f at x = 1 for seven step sizes,
# shrinking the step by a factor of ten after every evaluation.
h = 0.1
for step in range(7):
    estimate = numerical_lim(f, 1, h)
    print(f'h={h: 6f}, numerical limit = {estimate:.6f}')
    h *= 0.1

h= 0.100000, numerical limit = 2.300000
h= 0.010000, numerical limit = 2.030000
h= 0.001000, numerical limit = 2.003000
h= 0.000100, numerical limit = 2.000300
h= 0.000010, numerical limit = 2.000030
h= 0.000001, numerical limit = 2.000003
h= 0.000000, numerical limit = 2.000000


In [5]:
# Create the vector [0., 1., 2., 3.] with gradient tracking enabled.
x = torch.arange(4.0, requires_grad=True)
print(x.grad) # no gradient has been computed yet, so the value is None

None


In [6]:
# f(x_vector) = x . x = ||x||^2 = (sqrt(x1^2 + ... + xn^2))^2 = x1^2 + ... + xn^2
y = torch.dot(x,x)
print(y)

tensor(14., grad_fn=<DotBackward0>)


In [7]:
# Backpropagate from the scalar y; the resulting gradient dy/dx is stored in x.grad.
y.backward()

In [8]:
# x keeps its values; x.grad now holds the gradient of y = x . x with respect to x.
print(x)
print(x.grad)

tensor([0., 1., 2., 3.], requires_grad=True)
tensor([0., 2., 4., 6.])


In [10]:
# Check element-wise that the computed gradient equals 2*x, the analytical gradient of x . x.
x.grad == 2*x

tensor([True, True, True, True])

In [11]:
# Another example: f(x_vector) = x1 + x2 + ... + xn
# IMPORTANT: torch accumulates gradients in x.grad across backward() calls,
# so reset the stored gradient to zeros (in place) before computing a new one.
x.grad.zero_()

tensor([0., 0., 0., 0.])

In [12]:
# Confirm that the stored gradient is now all zeros.
print(x.grad)

tensor([0., 0., 0., 0.])


In [10]:
# Show the current values of x.
print(x)

tensor([0., 1., 2., 3.], requires_grad=True)


In [13]:
# f(x_vector) = x1 + x2 + ... + xn
# The derivative of a variable with respect to itself is 1 (dx_i/dx_i == 1),
# so the gradient of the sum is a vector of ones.
y = x.sum()
print(y)
y.backward()
print(x.grad)

tensor(6., grad_fn=<SumBackward0>)
tensor([1., 1., 1., 1.])


Example 3. Backward for non-scalar variables. For non-scalar variables torch calculates the gradient as $\mathbf{J}(\vec{y})^\intercal \cdot \vec{v}$

$$
\vec{x} = \begin{bmatrix} x_{1}\\ x_{2}\\ x_{3} \end{bmatrix} \rightarrow \vec{y} = \begin{bmatrix} y_{1} = x_{1}^{2}\\ y_{2} = x_{2}^{2}\\ y_{3} = x_{3}^{2} \end{bmatrix} \hspace{1cm} \left ( f:\mathbb{R}^{3}\rightarrow \mathbb{R}^{3} \right )
$$

For non-scalar outputs, the analysis must take into account how torch computes gradients in general: it uses the Jacobian (J) of y together with an auxiliary vector v. In this particular case, J(y) is by definition:

$$
\mathbf {J}(\vec{y}) ={\begin{bmatrix}{\dfrac {\partial \vec{y} }{\partial x_{1}}} & \dfrac {\partial \vec{y} }{\partial x_{2}} & {\dfrac {\partial \vec{y} }{\partial x_{3}}}\end{bmatrix}} = {\begin{bmatrix}\nabla ^{\mathrm {T} }y_{1}\\ \nabla ^{\mathrm {T} }y_{2} \\ \nabla ^{\mathrm {T} }y_{3}\end{bmatrix}}
$$

$$
\mathbf{J}(\vec{y}) =
\begin{bmatrix}
  \frac{\partial y_1}{\partial x_1} & 
    \frac{\partial y_1}{\partial x_2} & 
    \frac{\partial y_1}{\partial x_3} \\ 
  \frac{\partial y_2}{\partial x_1} & 
    \frac{\partial y_2}{\partial x_2} & 
    \frac{\partial y_2}{\partial x_3} \\
  \frac{\partial y_3}{\partial x_1} & 
    \frac{\partial y_3}{\partial x_2} & 
    \frac{\partial y_3}{\partial x_3}
\end{bmatrix}
$$

For non-scalar outputs, the gradient is computed as the product of the transpose of the Jacobian of the output vector (y) and an auxiliary vector (v); in this case v is a vector of ones.

$$
\mathbf{J}(\vec{y})^\intercal \cdot \vec{v} =
\begin{bmatrix}
  \frac{\partial y_1}{\partial x_1} & 
    \frac{\partial y_2}{\partial x_1} & 
    \frac{\partial y_3}{\partial x_1} \\ 
  \frac{\partial y_1}{\partial x_2} & 
    \frac{\partial y_2}{\partial x_2} & 
    \frac{\partial y_3}{\partial x_2} \\
  \frac{\partial y_1}{\partial x_3} & 
    \frac{\partial y_2}{\partial x_3} & 
    \frac{\partial y_3}{\partial x_3}
\end{bmatrix}
\cdot
\begin{bmatrix} 
1 \\
1 \\
1 
\end{bmatrix}
$$


$$
\mathbf{J}(\vec{y})^\intercal \cdot \vec{v} =
\begin{bmatrix}
  \frac{\partial (x_{1}^{2})}{\partial x_1} & 
    \frac{\partial (x_{2}^{2})}{\partial x_1} & 
    \frac{\partial (x_{3}^{2})}{\partial x_1} \\ 
  \frac{\partial (x_{1}^{2})}{\partial x_2} & 
    \frac{\partial (x_{2}^{2})}{\partial x_2} & 
    \frac{\partial (x_{3}^{2})}{\partial x_2} \\
  \frac{\partial (x_{1}^{2})}{\partial x_3} & 
    \frac{\partial (x_{2}^{2})}{\partial x_3} & 
    \frac{\partial (x_{3}^{2})}{\partial x_3}
\end{bmatrix}
\cdot
\begin{bmatrix} 
1 \\
1 \\
1 
\end{bmatrix}
$$

$$
\begin{bmatrix}
   2x_{1} & 
    0 & 
    0 \\ 
  0 & 
    2x_{2} & 
    0 \\
  0 & 
    0 & 
    2x_{3}
\end{bmatrix}
\cdot
\begin{bmatrix} 
1 \\
1 \\
1 
\end{bmatrix}
=
\begin{bmatrix} 
2x_{1} \\
2x_{2} \\
2x_{3}
\end{bmatrix}
$$

In [2]:
# Backward for non-scalar variables: start from a fresh 3-component vector
# with gradient tracking enabled; its gradient is None until backward() is called.
x = torch.arange(3.0, requires_grad=True)
print(x)
print(x.grad)

tensor([0., 1., 2.], requires_grad=True)
None


In [3]:
# To build y_vector = [(x_1)^2, (x_2)^2, (x_3)^2], where x_1, x_2, x_3 are the
# components of x_vector, use an element-wise product of x with itself.
y = x * x 

In [4]:
# y is a vector, so the vector v of the product J^T . v is passed explicitly:
# here a vector of ones with the same shape as x.
y.backward(torch.ones_like(x)) 
print(x.grad)
# Element-wise comparison against the analytical result 2*x.
x.grad == 2*x

tensor([0., 2., 4.])


tensor([True, True, True])

Example 4. Backward for non-scalar variables.

$$ Q = 3a^{3} - b^{2} $$

$$
\vec{a} = \begin{bmatrix}
a_{1}\\ 
a_{2}\\ 

\end{bmatrix}, \vec{b} = \begin{bmatrix}
b_{1}\\ 
b_{2}\\ 

\end{bmatrix} \rightarrow \vec{Q} = \begin{bmatrix}
Q_{1}\\ 
Q_{2}\\ 

\end{bmatrix} = \begin{bmatrix}
3a_{1}^{3}-b_{1}^{2}\\ 
3a_{2}^{3}-b_{2}^{2}\\ 

\end{bmatrix} \hspace{1cm} \left ( f:\mathbb{R}^{4}\rightarrow \mathbb{R}^{2} \right )
$$
 

The Jacobian of $\vec{Q}$, with columns ordered as $(a_1, b_1, a_2, b_2)$, is:

$$
\mathbf{J}(\vec{Q}) =
\begin{bmatrix}
  \frac{\partial Q_1}{\partial a_1} & 
    \frac{\partial Q_1}{\partial b_1} & 
    \frac{\partial Q_1}{\partial a_2} &
    \frac{\partial Q_1}{\partial b_2} \\
  \frac{\partial Q_2}{\partial a_1} & 
    \frac{\partial Q_2}{\partial b_1} & 
    \frac{\partial Q_2}{\partial a_2} &
    \frac{\partial Q_2}{\partial b_2} \\
\end{bmatrix}
$$

$$
\mathbf{J}(\vec{Q}) =
\begin{bmatrix}
  9a_{1}^{2} & 
    -2b_{1} & 
    0 &
    0 \\
  0 & 
    0 & 
    9a_{2}^{2}  &
    -2b_{2} \\
\end{bmatrix}
$$

The torch automatic backward calculation with the necessary $\vec{v}$ (a vector of ones) will then be:

$$
\mathbf{J}^T(\vec{Q})\cdot \vec{v} =
\begin{bmatrix}
  9a_{1}^{2} & 
    0 \\
  -2b_{1} & 
    0 \\
  0 & 
    9a_{2}^{2} \\
  0 & 
    -2b_{2} \\
\end{bmatrix}
\cdot
\begin{bmatrix} 
1 \\
1 
\end{bmatrix} = 
\begin{bmatrix} 
9a_{1}^{2} \\
-2b_{1} \\
9a_{2}^{2} \\
-2b_{2}
\end{bmatrix}
$$


In [6]:
# Inputs of Q; both track gradients so that backward() fills a.grad and b.grad.
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

# Element-wise function Q_i = 3*a_i**3 - b_i**2.
Q = 3*a**3 - b**2

# Q is a vector, so backward() needs the vector v of the product J^T . v.
# ones_like(Q) gives v the same shape and dtype as Q, instead of an
# integer tensor with a hard-coded length.
v = torch.ones_like(Q)
Q.backward(gradient=v)
print(a.grad, b.grad)

# Compare against the analytical gradients dQ/da = 9*a**2 and dQ/db = -2*b.
print(a.grad == 9*a**2)
print(b.grad == -2*b)

tensor([36., 81.]) tensor([-12.,  -8.])
tensor([True, True])
tensor([True, True])
