In [1]:
import numpy as np
import scipy.linalg
import torch

import metrics
import utils

# Seed NumPy's global RNG so the np.random.randn / np.random.rand draws in the cells below are repeatable.
np.random.seed(12)

$$\frac{\partial}{\partial x} ||x||_2^2 = 2x, x \in \mathbb{R}^n$$

In [2]:
# Forward pass: y = x.x = ||x||_2^2, computed once with NumPy and once with PyTorch.
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)  # same values, tracked by autograd
y = x@x
ty = torch.dot(tx, tx)
ty.backward()  # fills tx.grad, which the next cell reads

print(y)
print(ty.data.numpy())
# metrics.tdist is a project helper not visible here; presumably a distance between its two arguments.
print(metrics.tdist(y, ty.data.numpy()))

18.733339403660047
18.733339403660047
0.0


In [3]:
# Analytic gradient d||x||_2^2/dx = 2x, compared with the autograd gradient stored in tx.grad.
dx = 2 * x
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

[ 0.94597166 -1.36285176  0.48487899 -3.40147127  1.50628567 -3.06944268
  0.01025416 -0.24045534 -1.61396376  5.74363879 -1.19564584  0.94491399
  2.19191224 -2.4303376 ]
[ 0.94597166 -1.36285176  0.48487899 -3.40147127  1.50628567 -3.06944268
  0.01025416 -0.24045534 -1.61396376  5.74363879 -1.19564584  0.94491399
  2.19191224 -2.4303376 ]
0.0


$$\frac{\partial}{\partial x} ||x||_1 = sign(x), x \in \mathbb{R}^n$$

In [4]:
# Forward pass: y = ||x||_1, computed with NumPy (ord=1) and with PyTorch (p=1).
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = np.linalg.norm(x, ord=1)
ty = torch.norm(tx, p=1)
ty.backward()  # fills tx.grad, which the next cell reads

print(y)
print(ty.data.numpy())
print(metrics.tdist(y, ty.data.numpy()))

14.276574175253929
14.276574175253929
0.0


In [5]:
# Analytic gradient d||x||_1/dx = sign(x), compared with the autograd gradient in tx.grad.
dx = np.sign(x)
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

[ 1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1.]
[ 1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1.]
0.0


$$\frac{\partial}{\partial x} \sum_{i=1}^n x_i = \mathbb{1}, x \in \mathbb{R}^n$$

In [6]:
# Forward pass: y = sum of the entries of x, in NumPy and in PyTorch.
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = np.sum(x)
ty = torch.sum(tx)
ty.backward()  # fills tx.grad, which the next cell reads

print(y)
print(ty.data.numpy())
print(metrics.tdist(y, ty.data.numpy()))

-6.814946599398262
-6.814946599398261
8.881784197001252e-16


In [7]:
# Analytic gradient of sum(x) with respect to x: a vector of ones with the same length as x.
dx = np.ones((x.shape[0]))
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
0.0


$$x, y \in \mathbb{R}^n$$
$$\frac{\partial x^Ty}{\partial x} = y$$
$$\frac{\partial x^Ty}{\partial y} = x$$

In [8]:
# Forward pass: z = x.y for two independent random vectors, in NumPy and in PyTorch.
x = np.random.randn(14)
y = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
z = x @ y
tz = torch.dot(tx, ty)
tz.backward()  # fills tx.grad and ty.grad, which the next cell reads

print(z)
print(tz.data.numpy())
print(metrics.tdist(z, tz.data.numpy()))

4.509509404393396
4.509509404393396
0.0


In [9]:
# Analytic gradients of z = x.y: dz/dx = y and dz/dy = x, each compared with autograd.
dx = y  # same array object as y (no copy is made)
dx_sol = tx.grad.data.numpy()
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))
dy = x  # same array object as x (no copy is made)
dy_sol = ty.grad.data.numpy()
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[ 0.1597877  -0.71626359  0.05052283 -0.14333741  0.94357539  0.35764423
 -0.0834492   0.6778061   0.55606037  0.22271946 -1.52898548  1.02921118
 -1.16625876 -1.00956165]
[ 0.1597877  -0.71626359  0.05052283 -0.14333741  0.94357539  0.35764423
 -0.0834492   0.6778061   0.55606037  0.22271946 -1.52898548  1.02921118
 -1.16625876 -1.00956165]
0.0
[ 1.33583134  0.31866529 -0.33759525 -0.58526828 -0.11491994  2.24181779
 -3.14741652  0.53513589  0.23249044  0.86761195 -1.14821271  2.11434424
  1.00094276 -0.051415  ]
[ 1.33583134  0.31866529 -0.33759525 -0.58526828 -0.11491994  2.24181779
 -3.14741652  0.53513589  0.23249044  0.86761195 -1.14821271  2.11434424
  1.00094276 -0.051415  ]
0.0


$$z = c * x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * c$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T x$$

In [24]:
# Forward pass for z = c * x followed by E = z.z, in NumPy and in PyTorch.
x = np.random.randn(14)
c = np.array(2.3)  # 0-d array standing in for the scalar c
z = c * x
e = z.T @ z  # z is 1-D, so .T leaves it unchanged and this is the dot product z.z

tx = torch.tensor(x, requires_grad=True)
tc = torch.tensor(c, requires_grad=True)
tz = tc * tx
te = torch.dot(tz, tz)
te.backward()  # fills tx.grad and tc.grad

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))
# NOTE(review): unlike the other sections, no visible follow-up cell compares tx.grad / tc.grad
# with the dE/dx and dE/dc formulas stated above.

67.76763118602102
67.76763118602105
2.842170943040401e-14


$$z = x^Ty, \space x, y \in \mathbb{R}^n, z \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * y$$
$$\frac{\partial E}{\partial y} = \frac{\partial E}{\partial z} * x$$

In [10]:
# Forward pass: z = x.y (a scalar) followed by E = z**2, in NumPy and in PyTorch.
x = np.random.randn(14)
y = np.random.randn(14)
z = x @ y
e = z**2

tx = torch.tensor(x, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.dot(tx, ty)
te = tz**2
te.backward()  # fills tx.grad and ty.grad, which the next cell reads

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

8.568358616400001
8.5683586164
1.7763568394002505e-15


In [11]:
# Chain rule through z = x.y with E = z**2: the upstream gradient is dE/dz = 2z.
dz = 2 * z
dx = dz * y  # dE/dx = dE/dz * y
dy = dz * x  # dE/dy = dE/dz * x
dx_sol = tx.grad.data.numpy()
dy_sol = ty.grad.data.numpy()

print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[-10.70655976   5.86021515  12.24549612  -0.85801207   2.73018339
  -2.08545479   2.32932791   7.3719374    4.03293818  -4.69888098
  -1.59467297   5.67390014  -5.1048081    8.46749694]
[-10.70655976   5.86021515  12.24549612  -0.85801207   2.73018339
  -2.08545479   2.32932791   7.3719374    4.03293818  -4.69888098
  -1.59467297   5.67390014  -5.1048081    8.46749694]
4.149621000377914e-15
[ 0.61627583 -2.99755782 -8.2413335   9.88036798 -8.61312131 -9.58042949
  2.70116825  1.17884556  3.34761628  3.53192523  7.84125564  9.89182254
  1.16693236 -1.50909139]
[ 0.61627583 -2.99755782 -8.2413335   9.88036798 -8.61312131 -9.58042949
  2.70116825  1.17884556  3.34761628  3.53192523  7.84125564  9.89182254
  1.16693236 -1.50909139]
3.789423922623494e-15


$$z = Xy, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^m, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial z} y^T$$
$$\frac{\partial E}{\partial y} = X^T \frac{\partial E}{\partial z}$$

In [12]:
# Forward pass: z = X y (matrix-vector product) followed by E = z.z, in NumPy and in PyTorch.
X = np.random.randn(7, 3)
y = np.random.randn(3)
z = X @ y
e = z @ z

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.matmul(tX, ty)
te = torch.dot(tz, tz)
te.backward()  # fills tX.grad and ty.grad, which the next cell reads

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

35.72778917689937
35.72778917689937
0.0


In [13]:
# Backward pass for z = X y with E = z.z, so the upstream gradient is dE/dz = 2z.
dz = 2 * z
dX = np.outer(dz, y)  # dE/dX = dE/dz y^T, written as an outer product of 1-D arrays
dy = X.T @ dz  # dE/dy = X^T dE/dz
dX_sol = tX.grad.data.numpy()
dy_sol = ty.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[[  2.57774349  -3.07985357  -2.95656708]
 [ -3.64440221   4.3542832    4.17998132]
 [  5.4881387   -6.55715499  -6.29467218]
 [-12.01621971  14.3568192   13.78211596]
 [  2.08286196  -2.48857572  -2.38895808]
 [  4.64291645  -5.54729471  -5.32523659]
 [ -3.43997384   4.11003491   3.94551028]]
[[  2.57774349  -3.07985357  -2.95656708]
 [ -3.64440221   4.3542832    4.17998132]
 [  5.4881387   -6.55715499  -6.29467218]
 [-12.01621971  14.3568192   13.78211596]
 [  2.08286196  -2.48857572  -2.38895808]
 [  4.64291645  -5.54729471  -5.32523659]
 [ -3.43997384   4.11003491   3.94551028]]
0.0
[-19.29237407  32.67847854  -1.9824407 ]
[-19.29237407  32.67847854  -1.9824407 ]
2.220446049250313e-16


$$z = y^TX, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^n, z \in \mathbb{R}^m$$
$$\frac{\partial E}{\partial X} = y\frac{\partial E}{\partial z}^T$$
$$\frac{\partial E}{\partial y} = X \frac{\partial E}{\partial z}$$

In [14]:
# Forward pass: z = y^T X (vector-matrix product) followed by E = z.z, in NumPy and in PyTorch.
X = np.random.randn(7, 3)
y = np.random.randn(7)
z = y @ X
e = z @ z

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.matmul(ty, tX)
te = torch.dot(tz, tz)
te.backward()  # fills tX.grad and ty.grad, which the next cell reads

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

24.950847697108124
24.95084769710813
7.105427357601002e-15


In [15]:
# Backward pass for z = y^T X with E = z.z, so the upstream gradient is dE/dz = 2z.
dz = 2 * z
dX = np.outer(y, dz)  # outer product of y (length 7) and dz (length 3), same shape as X
dy = X @ dz  # dE/dy = X dE/dz
dX_sol = tX.grad.data.numpy()
dy_sol = ty.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

[[-1.82897406 -1.23052335  7.04643384]
 [-1.65191552 -1.11139936  6.36428567]
 [-0.11483612 -0.07726109  0.44242571]
 [-1.64839277 -1.10902928  6.35071368]
 [-3.48435695 -2.34425554 13.42407813]
 [-0.12658277 -0.08516417  0.48768167]
 [ 2.31632801  1.55841231 -8.92404787]]
[[-1.82897406 -1.23052335  7.04643384]
 [-1.65191552 -1.11139936  6.36428567]
 [-0.11483612 -0.07726109  0.44242571]
 [-1.64839277 -1.10902928  6.35071368]
 [-3.48435695 -2.34425554 13.42407813]
 [-0.12658277 -0.08516417  0.48768167]
 [ 2.31632801  1.55841231 -8.92404787]]
3.324186464221916e-15
[ -6.01701007 -14.49008035 -13.6833829   18.32625461  17.93905303
  11.88339873 -28.40229847]
[ -6.01701007 -14.49008035 -13.6833829   18.32625461  17.93905303
  11.88339873 -28.40229847]
9.769962616701378e-15


$$Z = XY, \space X \in \mathbb{R}^{n*m}, Y \in \mathbb{R}^{m*p}, Z \in \mathbb{R}^{n*p}$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}Y^T$$
$$\frac{\partial E}{\partial Y} = X^T \frac{\partial E}{\partial Z}$$

In [16]:
# Forward pass: Z = X Y followed by E = sum of squared entries of Z, in NumPy and in PyTorch.
X = np.random.randn(7, 3)
Y = np.random.randn(3, 2)
Z = X @ Y
Z_flat = Z.reshape(-1)  # flatten so E can be written as a 1-D dot product
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tY = torch.tensor(Y, requires_grad=True)
tZ = torch.matmul(tX, tY)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad and tY.grad, which the next cell reads

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

23.66394138865219
23.66394138865219
0.0


In [17]:
# Backward pass for Z = X Y with E = sum(Z**2): the upstream gradient is dE/dZ = 2Z.
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])  # back to the shape of Z
dX = dZ @ Y.T  # dE/dX = dE/dZ Y^T
dY = X.T @ dZ  # dE/dY = X^T dE/dZ
dX_sol = tX.grad.data.numpy()
dY_sol = tY.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dY)
print(dY_sol)
print(metrics.tdist(dY, dY_sol))

[[-1.67908037e-01  4.34103711e-03  5.98357888e-03]
 [-1.32876315e+00 -2.08871879e+00  3.10861894e+00]
 [ 1.80099884e+00  2.82651469e+00 -4.20688293e+00]
 [ 3.18233886e+00  5.05298906e+00 -7.51796586e+00]
 [ 7.78292877e-02  2.13841577e-02 -3.65087981e-02]
 [ 1.63071444e+00  2.50222180e+00 -3.72686738e+00]
 [ 1.60394372e+00  2.74906152e+00 -4.08083467e+00]]
[[-1.67908037e-01  4.34103711e-03  5.98357888e-03]
 [-1.32876315e+00 -2.08871879e+00  3.10861894e+00]
 [ 1.80099884e+00  2.82651469e+00 -4.20688293e+00]
 [ 3.18233886e+00  5.05298906e+00 -7.51796586e+00]
 [ 7.78292877e-02  2.13841577e-02 -3.65087981e-02]
 [ 1.63071444e+00  2.50222180e+00 -3.72686738e+00]
 [ 1.60394372e+00  2.74906152e+00 -4.08083467e+00]]
1.5899003276961254e-17
[[ 10.00121984  -4.9915834 ]
 [ 10.19733761  -1.88300499]
 [-30.1030253    6.29625184]]
[[ 10.00121984  -4.9915834 ]
 [ 10.19733761  -1.88300499]
 [-30.1030253    6.29625184]]
3.6687789788066426e-15


$$Z = X^TX, \space X \in \mathbb{R}^{n*m}, Z \in \mathbb{R}^{m*m}$$
$$\frac{\partial E}{\partial X} = X(\frac{\partial E}{\partial Z} +    \frac{\partial E}{\partial Z}^T)$$

In [18]:
# Forward pass: Z = X^T X followed by E = sum of squared entries of Z, in NumPy and in PyTorch.
X = np.random.randn(5, 3)
Z = X.T @ X
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tZ = torch.matmul(torch.transpose(tX, 1, 0), tX)  # transpose swaps dims 1 and 0, i.e. tX^T tX
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad, which the next cell reads

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

97.45718379559193
97.45718379559193
0.0


In [19]:
# Backward pass for Z = X^T X with E = sum(Z**2): the upstream gradient is dE/dZ = 2Z.
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = X @ (dZ + dZ.T)  # X appears on both sides of the product, hence the two terms
dX_sol = tX.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))

[[ -0.76547476 -37.81435271 -44.58042207]
 [-10.58015363   3.06873581  29.72179335]
 [ -2.62345198 -26.22934351 -20.99112803]
 [ 13.34561034  15.41588968 -23.48587833]
 [ 13.10827276 -23.42292114 -70.3596491 ]]
[[ -0.76547476 -37.81435271 -44.58042207]
 [-10.58015363   3.06873581  29.72179335]
 [ -2.62345198 -26.22934351 -20.99112803]
 [ 13.34561034  15.41588968 -23.48587833]
 [ 13.10827276 -23.42292114 -70.3596491 ]]
5.687116766346677e-15


$Z_I = f(X_I)$, with $Z$ and $X$ tensors of same size, $f: \mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    f'(X_I)$$

In [20]:
# Forward pass for an element-wise function, here f = cos, followed by E = sum(Z**2).
X = np.random.randn(5, 3)
Z = np.cos(X)
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tZ = torch.cos(tX)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad, which the next cell reads

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

6.947981330191818
6.947981330191819
8.881784197001252e-16


In [21]:
# Backward pass for Z = cos(X) with E = sum(Z**2): the upstream gradient is dE/dZ = 2Z.
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ * (-np.sin(X))  # element-wise: dE/dX = dE/dZ * f'(X), with cos' = -sin
dX_sol = tX.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))

[[ 0.52458293  0.90545114  0.8600759 ]
 [ 0.71158856  0.93469161 -0.67042823]
 [ 0.94031985 -0.04898118 -0.84950448]
 [ 0.94472048 -0.99994179  0.9952005 ]
 [-0.49918049  0.61635734  0.67141586]]
[[ 0.52458293  0.90545114  0.8600759 ]
 [ 0.71158856  0.93469161 -0.67042823]
 [ 0.94031985 -0.04898118 -0.84950448]
 [ 0.94472048 -0.99994179  0.9952005 ]
 [-0.49918049  0.61635734  0.67141586]]
0.0


$Z_I = f(X_I, Y_I)$, with $Z$, $X$ and $Y$ tensors of same size, $f: \mathbb{R}*\mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial X_I}$$
$$\frac{\partial E}{\partial Y_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial Y_I}$$

In [22]:
# Forward pass for a two-input element-wise function, here f(x, y) = x**y, followed by E = sum(Z**2).
# np.random.rand draws from [0, 1), so the +0.1 shift keeps every base strictly positive
# (the next cell takes np.log(X)).
X = np.random.rand(7, 3) + 0.1
Y = np.random.randn(7, 3)
Z = np.power(X, Y)
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat


tX = torch.tensor(X, requires_grad=True)
tY = torch.tensor(Y, requires_grad=True)
tZ = torch.pow(tX, tY)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad and tY.grad, which the next cell reads

print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

91.6018066558635
91.6018066558635
0.0


In [23]:
# Backward pass for Z = X**Y with E = sum(Z**2): the upstream gradient is dE/dZ = 2Z.
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ * Y * np.power(X, Y-1)  # d(x^y)/dx = y * x^(y-1)
dY = dZ * np.log(X) * np.power(X, Y)  # d(x^y)/dy = ln(x) * x^y
dX_sol = tX.grad.data.numpy()
dY_sol = tY.grad.data.numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dY)
print(dY_sol)
print(metrics.tdist(dY, dY_sol))

[[ 1.07805226e+00  1.12605145e+00  1.36416782e+00]
 [-2.17430585e+00  5.42316487e-01  2.05636755e-01]
 [-2.67306770e+00 -3.83905115e+00 -6.33235922e+01]
 [ 4.83811484e-01 -1.02425156e+01  3.46629852e-01]
 [ 1.12635928e+00  1.90471797e-01  1.68023629e-01]
 [-9.96853538e+02  1.02566643e+00 -2.69395320e+01]
 [-2.30435242e+00  8.59002427e-01  8.30668719e-03]]
[[ 1.07805226e+00  1.12605145e+00  1.36416782e+00]
 [-2.17430585e+00  5.42316487e-01  2.05636755e-01]
 [-2.67306770e+00 -3.83905115e+00 -6.33235922e+01]
 [ 4.83811484e-01 -1.02425156e+01  3.46629852e-01]
 [ 1.12635928e+00  1.90471797e-01  1.68023629e-01]
 [-9.96853538e+02  1.02566643e+00 -2.69395320e+01]
 [-2.30435242e+00  8.59002427e-01  8.30668719e-03]]
0.0
[[-5.26064676e-01 -3.75503606e-01 -1.57442010e-01]
 [-9.94512074e-01 -1.95440239e-01  1.08546127e-02]
 [-2.26391748e-02 -5.08151725e+00 -1.39801941e+01]
 [ 1.52674918e-02 -3.59949573e+00 -1.10110269e+00]
 [-3.30147353e-01 -4.35170993e-02 -7.34000963e-02]
 [-2.61547898e+02 -6.4674

Every tensor sum of an axis can be transformed into a 3D-tensor sum on axis 1, using only reshape.  

$$X \in \mathbb{R}^{m * n * p}, Y \in \mathbb{R}^{m * p}$$
$Y$ is the sum of $X$ on axis $1$.
$$Y_{ik} = \sum_{j=1}^n X_{ijk}$$
$$\frac{\partial E}{\partial X_{ijk}} = \frac{\partial E}{\partial Y_{ik}}$$

In [65]:
def prod(x):
    """Multiply together every item of the iterable ``x``.

    Returns 1 when ``x`` is empty (the empty product).
    """
    product = 1
    for factor in x:
        product *= factor
    return product

def sum_axis(X, axis):
    """Sum the array ``X`` along ``axis`` by going through a 3-D reshape.

    ``X`` is viewed as (product of dims before ``axis``, X.shape[axis],
    product of dims after ``axis``), reduced along the middle axis, and the
    result is reshaped to ``X.shape`` with ``axis`` removed.
    """
    leading = X.shape[:axis]
    trailing = X.shape[axis+1:]
    # Collapse everything before/after `axis` so the reduction is always on axis 1.
    X3 = X.reshape((prod(leading), X.shape[axis], prod(trailing)))
    summed = np.sum(X3, axis=1)
    return summed.reshape(leading + trailing)

# Sum a 4-D array along each of its four axes with sum_axis and compare with torch.sum.
X = np.random.randn(2, 4, 3, 7)

s = [sum_axis(X, i) for i in range(4)]

tX = torch.tensor(X, requires_grad = True)
s_sol = [torch.sum(tX, i) for i in range(4)]  # kept as tensors; a later cell calls backward() through them

for i in range(4):
    print(s[i].shape)
    print(s_sol[i].data.numpy().shape)
    print(metrics.tdist(s[i], s_sol[i].data.numpy()))

(4, 3, 7)
(4, 3, 7)
0.0
(2, 3, 7)
(2, 3, 7)
0.0
(2, 4, 7)
(2, 4, 7)
0.0
(2, 4, 3)
(2, 4, 3)
0.0


In [76]:
def my_expand_dims3(x, size):
    """Insert a new middle axis of length ``size`` and copy ``x`` along it.

    Parameters
    ----------
    x : 2-D array-like of shape (a, b)
    size : int
        Length of the inserted axis.

    Returns
    -------
    numpy.ndarray of shape (a, size, b) and dtype float64 such that
    ``y[i, j, k] == x[i, k]`` for every ``j``.
    """
    # float64 matches what filling an np.empty(...) buffer element by element produces.
    x = np.asarray(x, dtype=np.float64)
    # np.repeat returns a new array, so the result never aliases the input.
    return np.repeat(x[:, np.newaxis, :], size, axis=1)
    

def dsum_axis(X, axis, dout):
    """Backward pass of ``sum_axis``: spread ``dout`` back over the summed axis.

    ``dout`` is reshaped to 2-D (dims before ``axis`` collapsed, dims after
    ``axis`` collapsed), copied ``X.shape[axis]`` times along a new middle
    axis, and the result is reshaped to ``X.shape``.
    """
    n_before = prod(X.shape[:axis])
    n_after = prod(X.shape[axis+1:])
    dout2 = dout.reshape((n_before, n_after))
    # Every entry along the summed axis receives the same upstream gradient.
    dX3 = my_expand_dims3(dout2, X.shape[axis])
    return dX3.reshape(X.shape)
    
# Small example: each row of `a` is repeated twice along a new middle axis.
a = np.array([[1, 2, 3], [4, 5, 6]])
a2 = my_expand_dims3(a, 2)
print(a2)

[[[1. 2. 3.]
  [1. 2. 3.]]

 [[4. 5. 6.]
  [4. 5. 6.]]]


In [95]:
# Backward check of dsum_axis: for each axis i take E = sum(s_i ** 2), so dE/ds_i = 2 * s_i,
# push it back through the sum with dsum_axis and compare with the autograd gradient.
for i in range(4):

    ds = 2 * s[i]
    dX = dsum_axis(X, i, ds)

    si_flat = s_sol[i].view(-1)
    tz = torch.dot(si_flat, si_flat)
    # Discard any gradient accumulated by a previous iteration. tX.grad is still None
    # before the first backward(), so calling .zero_() on it would raise AttributeError
    # on a fresh kernel; assigning None works in both cases.
    tX.grad = None
    tz.backward()
    dX_sol = tX.grad.data.numpy()

    print(dX.shape)
    print(dX_sol.shape)
    print(metrics.tdist(dX, dX_sol))
    

(2, 4, 3, 7)
(2, 4, 3, 7)
0.0
(2, 4, 3, 7)
(2, 4, 3, 7)
0.0
(2, 4, 3, 7)
(2, 4, 3, 7)
0.0
(2, 4, 3, 7)
(2, 4, 3, 7)
0.0


## Derivatives sheet

$$(c)' = 0, \space c \in \mathbb{R}$$
$$(x)' = 1, \space x \in \mathbb{R}$$
$$(cx)' = c, \space c, x \in \mathbb{R}$$
$$(e^x)' = e^x, \space x \in \mathbb{R}$$
$$(ln(x))' = \frac{1}{x}, \space x \in \mathbb{R}, x > 0$$
$$(\frac{1}{x})' = - \frac{1}{x^2}, \space x \in \mathbb{R}, x \neq 0$$

$$(cos(x))' = -sin(x), \space x \in \mathbb{R}$$
$$(sin(x))' = cos(x), \space x \in \mathbb{R}$$
$$(cosh(x))' = sinh(x), \space x \in \mathbb{R}$$
$$(sinh(x))' = cosh(x), \space x \in \mathbb{R}$$
$$(tanh(x))' = 1 - tanh(x)^2, \space x \in \mathbb{R}$$

$$(\sigma(x))' = \sigma(x)(1 - \sigma(x)), \space x \in \mathbb{R}$$

$$\frac{\partial}{\partial x} x^y = y*x^{y-1}, \space x, y \in \mathbb{R}$$
$$\frac{\partial}{\partial y} x^y = ln(x)*x^{y}, \space x, y \in \mathbb{R}$$

$$\frac{\partial}{\partial x} ||x||_2^2 = 2x, x \in \mathbb{R}^n$$

$$\frac{\partial}{\partial x} \sum_{i=1}^n x_i = \mathbb{1}, x \in \mathbb{R}^n$$

$$z = ||x||_1, \space x \in \mathbb{R^n}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * sgn(x)$$

$$z = c + x, \space x, z \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z}$$
$$\frac{\partial E}{\partial c} = \sum_{i=1}^n \frac{\partial E}{\partial z_i}$$

$$z = c * x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * c$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T x$$

$$z = c / x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = -c * \frac{\partial E}{\partial z} / (x*x)$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T \frac{1}{x}$$

$$z = \sum_{i=1}^n x_i, \space x \in \mathbb{R^n}, z \in \mathbb{R}$$
$$\frac{\partial E}{\partial x_i} = \frac{\partial E}{\partial z}$$

$$x, y \in \mathbb{R}^n$$
$$\frac{\partial x^Ty}{\partial x} = y$$
$$\frac{\partial x^Ty}{\partial y} = x$$

$$z = x^Ty, \space x, y \in \mathbb{R}^n, z \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * y$$
$$\frac{\partial E}{\partial y} = \frac{\partial E}{\partial z} * x$$

$$z = Xy, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^m, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial z} y^T$$
$$\frac{\partial E}{\partial y} = X^T \frac{\partial E}{\partial z}$$

$$z = y^TX, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^n, z \in \mathbb{R}^m$$
$$\frac{\partial E}{\partial X} = y\frac{\partial E}{\partial z}^T$$
$$\frac{\partial E}{\partial y} = X \frac{\partial E}{\partial z}$$

$$Z = XY, \space X \in \mathbb{R}^{n*m}, Y \in \mathbb{R}^{m*p}, Z \in \mathbb{R}^{n*p}$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}Y^T$$
$$\frac{\partial E}{\partial Y} = X^T \frac{\partial E}{\partial Z}$$

$$Z = X^TX, \space X \in \mathbb{R}^{n*m}, Z \in \mathbb{R}^{m*m}$$
$$\frac{\partial E}{\partial X} = X(\frac{\partial E}{\partial Z} +    \frac{\partial E}{\partial Z}^T)$$

$Z_I = f(X_I)$, with $Z$ and $X$ tensors of same size, $f: \mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    f'(X_I)$$

$Z_I = f(X_I, Y_I)$, with $Z$, $X$ and $Y$ tensors of same size, $f: \mathbb{R}*\mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial X_I}$$
$$\frac{\partial E}{\partial Y_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial Y_I}$$

$x \in \mathbb{R}^n$, and $S_i$ = softmax$(x)_i$
$$\frac{\partial S_i}{\partial x_j} = S_i(1 - S_j) \space (i = j)$$
$$\frac{\partial S_i}{\partial x_j} = -S_iS_j \space (i \neq j)$$