In [1]:
import sys
sys.path.append('..')

import numpy as np
import scipy.linalg
import torch

import metrics
import utils

np.random.seed(12)

$$\frac{\partial}{\partial x} ||x||_2^2 = 2x, x \in \mathbb{R}^n$$

In [2]:
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = x@x
ty = torch.dot(tx, tx)
ty.backward()

print(y)
print(ty.data.numpy())
print(metrics.tdist(y, ty.data.numpy()))

18.733339403660047
18.733339403660047
0.0


In [3]:
dx = 2 * x
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

[ 0.94597166 -1.36285176  0.48487899 -3.40147127  1.50628567 -3.06944268
  0.01025416 -0.24045534 -1.61396376  5.74363879 -1.19564584  0.94491399
  2.19191224 -2.4303376 ]
[ 0.94597166 -1.36285176  0.48487899 -3.40147127  1.50628567 -3.06944268
  0.01025416 -0.24045534 -1.61396376  5.74363879 -1.19564584  0.94491399
  2.19191224 -2.4303376 ]
0.0


$$\frac{\partial}{\partial x} ||x||_1 = sign(x), x \in \mathbb{R}^n$$

In [4]:
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = np.linalg.norm(x, ord=1)
ty = torch.norm(tx, p=1)
ty.backward()

print(y)
print(ty.data.numpy())
print(metrics.tdist(y, ty.data.numpy()))

14.276574175253929
14.276574175253929
0.0


In [5]:
dx = np.sign(x)
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

[ 1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1.]
[ 1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1.]
0.0


$$\frac{\partial}{\partial x} \sum_{x=1}^n x_i = \mathbb{1}, x \in \mathbb{R}^n$$

In [6]:
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = np.sum(x)
ty = torch.sum(tx)
ty.backward()

print(y)
print(ty.data.numpy())
print(metrics.tdist(y, ty.data.numpy()))

-6.814946599398262
-6.814946599398261
8.881784197001252e-16


In [7]:
dx = np.ones((x.shape[0]))
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
0.0


$$x, y \in \mathbb{R}^n$$
$$\frac{\partial x^Ty}{\partial x} = y$$
$$\frac{\partial x^Ty}{\partial y} = x$$

In [8]:
x = np.random.randn(14)
y = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
z = x @ y
tz = torch.dot(tx, ty)
tz.backward()

print(z)
print(tz.data.numpy())
print(metrics.tdist(z, tz.data.numpy()))

4.509509404393396
4.509509404393396
0.0


In [9]:
dx = y
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))
dy = x
dy_sol = ty.grad.data.numpy()
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[ 0.1597877  -0.71626359  0.05052283 -0.14333741  0.94357539  0.35764423
 -0.0834492   0.6778061   0.55606037  0.22271946 -1.52898548  1.02921118
 -1.16625876 -1.00956165]
[ 0.1597877  -0.71626359  0.05052283 -0.14333741  0.94357539  0.35764423
 -0.0834492   0.6778061   0.55606037  0.22271946 -1.52898548  1.02921118
 -1.16625876 -1.00956165]
0.0
[ 1.33583134  0.31866529 -0.33759525 -0.58526828 -0.11491994  2.24181779
 -3.14741652  0.53513589  0.23249044  0.86761195 -1.14821271  2.11434424
  1.00094276 -0.051415  ]
[ 1.33583134  0.31866529 -0.33759525 -0.58526828 -0.11491994  2.24181779
 -3.14741652  0.53513589  0.23249044  0.86761195 -1.14821271  2.11434424
  1.00094276 -0.051415  ]
0.0


$$z = c * x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{z} * c$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{z}^T x$$

In [10]:
x = np.random.randn(14)
c = np.array(2.3)
z = c * x
e = z.T @ z

tx = torch.tensor(x, requires_grad=True)
tc = torch.tensor(c, requires_grad=True)
tz = tc * tx
te = torch.dot(tz, tz)
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

82.76337000156782
82.76337000156784
1.4210854715202004e-14


$$z = x^Ty, \space x, y, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * y$$
$$\frac{\partial E}{\partial y} = \frac{\partial E}{\partial z} * x$$

In [11]:
x = np.random.randn(14)
y = np.random.randn(14)
z = x @ y
e = z**2

tx = torch.tensor(x, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.dot(tx, ty)
te = tz**2
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

36.087775526961785
36.08777552696178
7.105427357601002e-15


In [12]:
dz = 2 * z
dx = dz * y
dy = dz * x
dx_sol = tx.grad.data.numpy()
dy_sol = ty.grad.data.numpy()

print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[ -6.44561871   2.37793986 -16.4076439  -14.35079579   0.19144748
  -0.96169181  -3.01329938  -6.78997731 -13.24818472  -9.39882653
  36.54470499  -7.52212818  18.09282775  -7.0566201 ]
[ -6.44561871   2.37793986 -16.4076439  -14.35079579   0.19144748
  -0.96169181  -3.01329938  -6.78997731 -13.24818472  -9.39882653
  36.54470499  -7.52212818  18.09282775  -7.0566201 ]
6.189493362285248e-15
[ 21.97258658 -12.02665353 -25.13087581   1.76085922  -5.60303144
   4.27988419  -4.78037394 -15.12909247  -8.27661596   9.64330014
   3.27267494 -11.64428768  10.47636594 -17.37745959]
[ 21.97258658 -12.02665353 -25.13087581   1.76085922  -5.60303144
   4.27988419  -4.78037394 -15.12909247  -8.27661596   9.64330014
   3.27267494 -11.64428768  10.47636594 -17.37745959]
6.7678088263037544e-15


$$z = Xy, \space x \in \mathbb{R}^{n*m}, y \in \mathbb{R}^m, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial z} y^T$$
$$\frac{\partial E}{\partial y} = X^T \frac{\partial E}{\partial z}$$

In [13]:
X = np.random.randn(7, 3)
y = np.random.randn(3)
z = X @ y
e = z @ z

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.matmul(tX, ty)
te = torch.dot(tz, tz)
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

36.25412373717531
36.25412373717531
0.0


In [14]:
dz = 2 * z
dX = np.outer(dz, y)
dy = X.T @ dz
dX_sol = tX.grad.data.numpy()
dy_sol = ty.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[[ 8.58414287  0.90323648  0.16263869]
 [-4.38366685 -0.46125605 -0.08305475]
 [-7.92910398 -0.83431229 -0.15022805]
 [ 8.33956201  0.87750131  0.15800476]
 [-3.632771   -0.38224565 -0.06882797]
 [-9.06740297 -0.95408583 -0.17179473]
 [-9.64867087 -1.01524772 -0.18280767]]
[[ 8.58414287  0.90323648  0.16263869]
 [-4.38366685 -0.46125605 -0.08305475]
 [-7.92910398 -0.83431229 -0.15022805]
 [ 8.33956201  0.87750131  0.15800476]
 [-3.632771   -0.38224565 -0.06882797]
 [-9.06740297 -0.95408583 -0.17179473]
 [-9.64867087 -1.01524772 -0.18280767]]
0.0
[40.47933023 24.94137081 -9.57602141]
[40.47933023 24.94137081 -9.57602141]
3.552713678800501e-15


$$z = y^TX, \space x \in \mathbb{R}^{n*m}, y \in \mathbb{R}^n, z \in \mathbb{R}^m$$
$$\frac{\partial E}{\partial X} = y^T\frac{\partial E}{\partial z}$$
$$\frac{\partial E}{\partial y} = X \frac{\partial E}{\partial z}$$

In [15]:
X = np.random.randn(7, 3)
y = np.random.randn(7)
z = y @ X
e = z @ z

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.matmul(ty, tX)
te = torch.dot(tz, tz)
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

9.028420925553718
9.028420925553718
0.0


In [16]:
dz = 2 * z
dX = np.outer(y, dz)
dy = X @ dz
dX_sol = tX.grad.data.numpy()
dy_sol = ty.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[[ -0.175777     0.47264192   2.56084683]
 [  0.4533018   -1.21887065  -6.60402921]
 [ -0.35983339   0.96754603   5.24231366]
 [ -0.1272869    0.34225823   1.85440788]
 [  1.00694242  -2.70753958 -14.66986711]
 [ -0.24117548   0.64849005   3.51361915]
 [  0.82395184  -2.21550127 -12.00392763]]
[[ -0.175777     0.47264192   2.56084683]
 [  0.4533018   -1.21887065  -6.60402921]
 [ -0.35983339   0.96754603   5.24231366]
 [ -0.1272869    0.34225823   1.85440788]
 [  1.00694242  -2.70753958 -14.66986711]
 [ -0.24117548   0.64849005   3.51361915]
 [  0.82395184  -2.21550127 -12.00392763]]
4.308734650905669e-15
[ -6.25797581   3.52932785   5.80016161   9.00767026 -11.88309058
  -4.11578511   5.09593842]
[ -6.25797581   3.52932785   5.80016161   9.00767026 -11.88309058
  -4.11578511   5.09593842]
3.233018248352212e-15


$$Z = XY, \space x \in \mathbb{R}^{n*m}, y \in \mathbb{R}^{m*p}, z \in \mathbb{R}^{n*p}$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}Y^T$$
$$\frac{\partial E}{\partial Y} = X^T \frac{\partial E}{\partial Z}$$

In [17]:
X = np.random.randn(7, 3)
Y = np.random.randn(3, 2)
Z = X @ Y
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tY = torch.tensor(Y, requires_grad=True)
tZ = torch.matmul(tX, tY)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

14.869025117634362
14.869025117634363
1.7763568394002505e-15


In [18]:
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ @ Y.T
dY = X.T @ dZ
dX_sol = tX.grad.data.numpy()
dY_sol = tY.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dY)
print(dY_sol)
print(metrics.tdist(dY, dY_sol))

[[-3.49003824  7.35296457  2.47664396]
 [ 0.83856186 -1.93135413 -0.60280663]
 [-2.26103954  4.67992261  1.60057219]
 [ 1.63973262 -3.36369402 -1.15933289]
 [ 1.78589422 -3.72447036 -1.26553662]
 [ 0.78129173 -1.85281456 -0.56414504]
 [ 1.56300454 -3.18367978 -1.10402144]]
[[-3.49003824  7.35296457  2.47664396]
 [ 0.83856186 -1.93135413 -0.60280663]
 [-2.26103954  4.67992261  1.60057219]
 [ 1.63973262 -3.36369402 -1.15933289]
 [ 1.78589422 -3.72447036 -1.26553662]
 [ 0.78129173 -1.85281456 -0.56414504]
 [ 1.56300454 -3.18367978 -1.10402144]]
1.2212453270876722e-15
[[-7.94668909 11.14088288]
 [ 7.7336249  -7.18289606]
 [ 6.81871998 -8.63607927]]
[[-7.94668909 11.14088288]
 [ 7.7336249  -7.18289606]
 [ 6.81871998 -8.63607927]]
2.808666774861361e-15


$$Z = X^TX, \space X \in \mathbb{R}^{n*m}, Z \in \mathbb{R}^{m*m}$$
$$\frac{\partial E}{\partial X} = X(\frac{\partial E}{\partial Z} +    \frac{\partial E}{\partial Z}^T)$$

In [19]:
X = np.random.randn(5, 3)
Z = X.T @ X
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tZ = torch.matmul(torch.transpose(tX, 1, 0), tX)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

103.23083005204043
103.23083005204046
2.842170943040401e-14


In [20]:
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = X @ (dZ + dZ.T)
dX_sol = tX.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))

[[-63.46265934 -36.64261304 -34.74844084]
 [-20.12452315 -11.18108958 -18.56517314]
 [-11.11891629 -19.2830123  -32.39392498]
 [ 16.65153053  -8.25200932  20.91609196]
 [-25.34512635  19.34332405 -32.60645328]]
[[-63.46265934 -36.64261304 -34.74844084]
 [-20.12452315 -11.18108958 -18.56517314]
 [-11.11891629 -19.2830123  -32.39392498]
 [ 16.65153053  -8.25200932  20.91609196]
 [-25.34512635  19.34332405 -32.60645328]]
1.443119118362539e-14


$Z_I = f(X_I)$, with $Z$ and $X$ tensors of same size, $f: \mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    f'(X_I)$$

In [21]:
X = np.random.randn(5, 3)
Z = np.cos(X)
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tZ = torch.cos(tX)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

6.8542875965253955
6.8542875965253955
0.0


In [22]:
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ * (-np.sin(X))
dX_sol = tX.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))

[[ 0.67141586 -0.98411058  0.4044489 ]
 [ 0.99976156 -0.00191423 -0.09810756]
 [-0.99995919  0.42090411 -0.94132233]
 [ 0.00705578  0.99882196 -0.84200559]
 [ 0.37743023  0.97076711 -0.16627778]]
[[ 0.67141586 -0.98411058  0.4044489 ]
 [ 0.99976156 -0.00191423 -0.09810756]
 [-0.99995919  0.42090411 -0.94132233]
 [ 0.00705578  0.99882196 -0.84200559]
 [ 0.37743023  0.97076711 -0.16627778]]
0.0


$Z_I = f(X_I, Y_I)$, with $Z$, $X$and $Y$ tensors of same size, $f: \mathbb{R}*\mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial X_I}$$
$$\frac{\partial E}{\partial Y_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial Y_I}$$

In [23]:
X = np.random.rand(7, 3) + 0.1
Y = np.random.randn(7, 3)
Z = np.power(X, Y)
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat


tX = torch.tensor(X, requires_grad=True)
tY = torch.tensor(Y, requires_grad=True)
tZ = torch.pow(tX, tY)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

151.7929370800654
151.7929370800654
0.0


In [24]:
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ * Y * np.power(X, Y-1) 
dY = dZ * np.log(X) * np.power(X, Y)
dX_sol = tX.grad.data.numpy()
dY_sol = tY.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dY)
print(dY_sol)
print(metrics.tdist(dY, dY_sol))

[[-1.47662497e+03  8.21415840e-01  7.11877166e-01]
 [ 4.85073893e-01 -1.24327766e-01 -1.29022186e+01]
 [ 8.06131651e-01 -7.20958167e+01 -3.20868983e+01]
 [-2.10939137e+01  1.03099902e+00 -1.96466266e-01]
 [-4.30056823e-01 -2.33701798e+00 -1.88817756e+00]
 [-2.89855311e+00 -7.67949261e-01 -2.08453062e+00]
 [ 7.95762724e-01 -3.22457087e+01  2.63829700e-01]]
[[-1.47662497e+03  8.21415840e-01  7.11877166e-01]
 [ 4.85073893e-01 -1.24327766e-01 -1.29022186e+01]
 [ 8.06131651e-01 -7.20958167e+01 -3.20868983e+01]
 [-2.10939137e+01  1.03099902e+00 -1.96466266e-01]
 [-4.30056823e-01 -2.33701798e+00 -1.88817756e+00]
 [-2.89855311e+00 -7.67949261e-01 -2.08453062e+00]
 [ 7.95762724e-01 -3.22457087e+01  2.63829700e-01]]
0.0
[[-3.06262600e+02 -4.75443852e-01 -7.25862659e-02]
 [-1.40480996e-01 -2.08243921e+00 -9.34640342e+00]
 [-9.78779799e-02 -2.36631801e+01 -6.92062436e+00]
 [-1.40394225e+01 -4.90878091e-01 -1.32491222e-01]
 [ 2.43818328e-02  8.59217782e-02 -1.08632451e+00]
 [-1.63219672e+00 -2.8552

Every tensor sum of an axis can be transformed into a 3D-tensor sum on axis 1, using only reshape.  

$$X \in \mathbb{R}^{m * n * p}, Y \in \mathbb{R}^{m * p}$$
$y$ is the sum of $X$ on axis $2$.
$$Y_{ik} = \sum_{j=i}^n X_{ijk}$$
$$\frac{\partial E}{\partial X_{ijk}} = \frac{\partial E}{\partial Y_{ik}}$$

In [25]:
def prod(x):
    res = 1
    for v in x: res *= v
    return res

def sum_axis(X, axis):
    shape3 = (prod(X.shape[:axis]), X.shape[axis], prod(X.shape[axis+1:]))
    final_shape = X.shape[:axis] + X.shape[axis+1:]
    return np.sum(X.reshape(shape3), axis=1).reshape(final_shape)

X = np.random.randn(2, 4, 3, 7)

s = [sum_axis(X, i) for i in range(4)]

tX = torch.tensor(X, requires_grad = True)
s_sol = [torch.sum(tX, i) for i in range(4)]

for i in range(4):
    print(s[i].shape)
    print(s_sol[i].data.numpy().shape)
    print(metrics.tdist(s[i], s_sol[i].data.numpy()))

(4, 3, 7)
(4, 3, 7)
0.0
(2, 3, 7)
(2, 3, 7)
0.0
(2, 4, 7)
(2, 4, 7)
0.0
(2, 4, 3)
(2, 4, 3)
0.0


In [26]:
def my_expand_dims3(x, size):
    
    y = np.empty((x.shape[0], size, x.shape[1]))
    for i in range(x.shape[0]):
        for j in range(size):
            for k in range(x.shape[1]):
                y[i, j, k] = x[i, k]
    return y
    

def dsum_axis(X, axis, dout):
    dout = dout.reshape((prod(X.shape[:axis]), prod(X.shape[axis+1:])))
    return my_expand_dims3(dout, X.shape[axis]).reshape(X.shape)
    
a = np.array([[1, 2, 3], [4, 5, 6]])
a2 = my_expand_dims3(a, 2)
print(a2)

[[[1. 2. 3.]
  [1. 2. 3.]]

 [[4. 5. 6.]
  [4. 5. 6.]]]


In [27]:
for i in range(4):
    
    ds = 2 * s[i]
    dX = dsum_axis(X, i, ds)
    
    si_flat = s_sol[i].view(-1)
    tz = torch.dot(si_flat, si_flat)
    tX.grad.data.zero_()
    tz.backward()
    dX_sol = tX.grad.data.numpy()
    
    print(dX.shape)
    print(dX_sol.shape)
    print(metrics.tdist(dX, dX_sol))
    

AttributeError: 'NoneType' object has no attribute 'data'

## Derivatives sheet

$$(c)' = 0, \space c \in \mathbb{R}$$
$$(x)' = 1, \space x \in \mathbb{R}$$
$$(cx)' = c, \space c, x \in \mathbb{R}$$
$$(e^x)' = e^x, \space x \in \mathbb{R}$$
$$(ln(x))' = \frac{1}{x}, \space x \in \mathbb{R}$$
$$(\frac{1}{x})' = - \frac{1}{x^2}, \space x \in \mathbb{R}$$

$$(cos(x))' = -sin(x), \space x \in \mathbb{R}$$
$$(sin(x))' = cos(x), \space x \in \mathbb{R}$$
$$(cosh(x))' = -sinh(x), \space x \in \mathbb{R}$$
$$(sinh(x))' = cos(x), \space x \in \mathbb{R}$$
$$(tanh(x))' = 1 - tanh(x)^2, \space x \in \mathbb{R}$$

$$(\sigma(x))' = \sigma(x)(1 - \sigma(x)), \space x \in \mathbb{R}$$

$$\frac{\partial}{\partial x} x^y = y*x^{y-1}, \space x, y \in \mathbb{R}$$
$$\frac{\partial}{\partial y} x^y = ln(x)*x^{y}, \space x, y \in \mathbb{R}$$

$$\frac{\partial}{\partial x} ||x||_2^2 = 2x, x \in \mathbb{R}^n$$

$$\frac{\partial}{\partial x} \sum_{x=1}^n x_i = \mathbb{1}, x \in \mathbb{R}^n$$

$$z = ||x||_1, \space x \in \mathbb{R^n}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{z} * sgn(x)$$

$$z = c + x, \space x, z \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{z}$$
$$\frac{\partial E}{\partial c} = \sum_{j=1}^n \frac{\partial E}{\partial z_i}$$

$$z = c * x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * c$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T x$$

$$z = c / x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = -c * \frac{\partial E}{\partial z} / (x*x)$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T \frac{1}{x}$$

$$z = \sum_{i=1}^n x_i, \space x \in \mathbb{R^n}, z \in \mathbb{R}$$
$$\frac{\partial E}{\partial x_i} = \frac{\partial E}{z}$$

$$x, y \in \mathbb{R}^n$$
$$\frac{\partial x^Ty}{\partial x} = y$$
$$\frac{\partial x^Ty}{\partial y} = x$$

$$z = x^Ty, \space x, y, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * y$$
$$\frac{\partial E}{\partial y} = \frac{\partial E}{\partial z} * x$$

$$z = Xy, \space x \in \mathbb{R}^{n*m}, y \in \mathbb{R}^m, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial z} y^T$$
$$\frac{\partial E}{\partial y} = X^T \frac{\partial E}{\partial z}$$

$$z = y^TX, \space x \in \mathbb{R}^{n*m}, y \in \mathbb{R}^n, z \in \mathbb{R}^m$$
$$\frac{\partial E}{\partial X} = y^T\frac{\partial E}{\partial z}$$
$$\frac{\partial E}{\partial y} = X \frac{\partial E}{\partial z}$$

$$Z = XY, \space x \in \mathbb{R}^{n*m}, y \in \mathbb{R}^{m*p}, z \in \mathbb{R}^{n*p}$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}Y^T$$
$$\frac{\partial E}{\partial Y} = X^T \frac{\partial E}{\partial Z}$$

$$Z = X^TX, \space X \in \mathbb{R}^{n*m}, Z \in \mathbb{R}^{m*m}$$
$$\frac{\partial E}{\partial X} = X(\frac{\partial E}{\partial Z} +    \frac{\partial E}{\partial Z}^T)$$

$Z_I = f(X_I)$, with $Z$ and $X$ tensors of same size, $f: \mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    f'(X_I)$$

$Z_I = f(X_I, Y_I)$, with $Z$, $X$and $Y$ tensors of same size, $f: \mathbb{R}*\mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial X_I}$$
$$\frac{\partial E}{\partial Y_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial Y_I}$$

$x \in \mathbb{R}^n$, and $S_i$ = softmax$(x)_i$
$$\frac{\partial S_i}{x_j} = S_i(1 - S_j) \space (i = j)$$
$$\frac{\partial S_i}{x_j} = -S_iS_j \space (i \neq j)$$