In [1]:
import sys
sys.path.append('../../pyutils')

import numpy as np
import scipy.linalg
import torch

import metrics
import utils

# Seed NumPy's global RNG so the np.random draws in this notebook are repeatable.
np.random.seed(12)

$$\frac{\partial}{\partial x} ||x||_2^2 = 2x, x \in \mathbb{R}^n$$

In [2]:
# Squared L2 norm ||x||_2^2, computed with NumPy and with PyTorch (tracking gradients).
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = x @ x
ty = torch.dot(tx, tx)
ty.backward()  # fills tx.grad, which the following cell compares against 2x

# detach() is the supported way to read a value out of the autograd graph;
# .data bypasses autograd's correctness checks.
y_sol = ty.detach().numpy()

print(y)
print(y_sol)
print(metrics.tdist(y, y_sol))

# Make the check executable instead of relying on reading the printed value.
np.testing.assert_allclose(y_sol, y)

18.733339403660047
18.733339403660047
0.0


In [3]:
# Closed-form gradient of ||x||_2^2 (2x) versus the gradient computed by autograd.
dx = 2 * x
dx_sol = tx.grad.detach().numpy()  # detach() rather than the discouraged .data
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(dx_sol, dx)

[ 0.94597166 -1.36285176  0.48487899 -3.40147127  1.50628567 -3.06944268
  0.01025416 -0.24045534 -1.61396376  5.74363879 -1.19564584  0.94491399
  2.19191224 -2.4303376 ]
[ 0.94597166 -1.36285176  0.48487899 -3.40147127  1.50628567 -3.06944268
  0.01025416 -0.24045534 -1.61396376  5.74363879 -1.19564584  0.94491399
  2.19191224 -2.4303376 ]
0.0


$$\frac{\partial}{\partial x} ||x||_1 = sign(x), x \in \mathbb{R}^n$$

In [4]:
# L1 norm ||x||_1, computed with NumPy and with PyTorch (tracking gradients).
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = np.linalg.norm(x, ord=1)
ty = torch.norm(tx, p=1)
ty.backward()  # fills tx.grad, which the following cell compares against sign(x)

# detach() is the supported way to read a value out of the autograd graph.
y_sol = ty.detach().numpy()

print(y)
print(y_sol)
print(metrics.tdist(y, y_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(y_sol, y)

14.276574175253929
14.276574175253929
0.0


In [5]:
# Closed-form gradient of ||x||_1 (sign(x)) versus the gradient computed by autograd.
dx = np.sign(x)
dx_sol = tx.grad.detach().numpy()  # detach() rather than the discouraged .data
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(dx_sol, dx)

[ 1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1.]
[ 1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1.]
0.0


$$\frac{\partial}{\partial x} \sum_{i=1}^n x_i = \mathbb{1}, x \in \mathbb{R}^n$$

In [6]:
# Sum of the entries of x, computed with NumPy and with PyTorch (tracking gradients).
x = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
y = np.sum(x)
ty = torch.sum(tx)
ty.backward()  # fills tx.grad, which the following cell compares against a vector of ones

# detach() is the supported way to read a value out of the autograd graph.
y_sol = ty.detach().numpy()

print(y)
print(y_sol)
print(metrics.tdist(y, y_sol))

# The two sums may differ in the last bits (different summation order),
# so compare with a tolerance rather than exactly.
np.testing.assert_allclose(y_sol, y)

-6.814946599398262
-6.814946599398261
8.881784197001252e-16


In [7]:
# Closed-form gradient of sum(x) (all ones) versus the gradient computed by autograd.
dx = np.ones((x.shape[0]))
dx_sol = tx.grad.detach().numpy()  # detach() rather than the discouraged .data
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(dx_sol, dx)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
0.0


$$x, y \in \mathbb{R}^n$$
$$\frac{\partial x^Ty}{\partial x} = y$$
$$\frac{\partial x^Ty}{\partial y} = x$$

In [8]:
# Dot product x^T y, computed with NumPy and with PyTorch (tracking gradients on both inputs).
x = np.random.randn(14)
y = np.random.randn(14)
tx = torch.tensor(x, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
z = x @ y
tz = torch.dot(tx, ty)
tz.backward()  # fills tx.grad and ty.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
z_sol = tz.detach().numpy()

print(z)
print(z_sol)
print(metrics.tdist(z, z_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(z_sol, z)

4.509509404393396
4.509509404393396
0.0


In [9]:
# d(x^T y)/dx = y
dx = y
dx_sol = tx.grad.detach().numpy()  # detach() rather than the discouraged .data
print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

# d(x^T y)/dy = x
dy = x
dy_sol = ty.grad.detach().numpy()
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

# Executable checks in addition to the printed comparisons.
np.testing.assert_allclose(dx_sol, dx)
np.testing.assert_allclose(dy_sol, dy)

[ 0.1597877  -0.71626359  0.05052283 -0.14333741  0.94357539  0.35764423
 -0.0834492   0.6778061   0.55606037  0.22271946 -1.52898548  1.02921118
 -1.16625876 -1.00956165]
[ 0.1597877  -0.71626359  0.05052283 -0.14333741  0.94357539  0.35764423
 -0.0834492   0.6778061   0.55606037  0.22271946 -1.52898548  1.02921118
 -1.16625876 -1.00956165]
0.0
[ 1.33583134  0.31866529 -0.33759525 -0.58526828 -0.11491994  2.24181779
 -3.14741652  0.53513589  0.23249044  0.86761195 -1.14821271  2.11434424
  1.00094276 -0.051415  ]
[ 1.33583134  0.31866529 -0.33759525 -0.58526828 -0.11491994  2.24181779
 -3.14741652  0.53513589  0.23249044  0.86761195 -1.14821271  2.11434424
  1.00094276 -0.051415  ]
0.0


$$x \in \mathbb{R}^n, \space M \in \mathbb{R}^{n*n} \text{ symmetric}$$
$$\frac{\partial x^TMx}{\partial x} = 2Mx$$

In [10]:
# Quadratic form x^T M x with a symmetric M, and its gradient 2 M x.
x = np.random.randn(3)
M = np.random.randn(3, 3)
M = M.T @ M  # M^T M is symmetric, which the 2Mx formula requires
tx = torch.tensor(x, requires_grad=True)
tM = torch.tensor(M, requires_grad=True)
z = x @ M @ x
tz = torch.matmul(torch.matmul(tx, tM), tx)
tz.backward()
dx = 2 * M @ x
dx_sol = tx.grad.detach().numpy()  # detach() rather than the discouraged .data

print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))

# The forward value z was computed but never compared; check it together with the gradient.
np.testing.assert_allclose(tz.detach().numpy(), z)
np.testing.assert_allclose(dx_sol, dx)

[-6.50427942 17.90265975 21.71056981]
[-6.50427942 17.90265975 21.71056981]
0.0


$$z = c * x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * c$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T x$$

In [11]:
# z = c * x with scalar c, followed by E = z^T z.
x = np.random.randn(14)
c = np.array(2.3)
z = c * x
e = z.T @ z

tx = torch.tensor(x, requires_grad=True)
tc = torch.tensor(c, requires_grad=True)
tz = tc * tx
te = torch.dot(tz, tz)
te.backward()

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

np.testing.assert_allclose(e_sol, e)

# The gradient formulas stated for this case were never verified; check them here.
# With E = z^T z, dE/dz = 2z, hence dE/dx = dE/dz * c and dE/dc = dE/dz^T x.
dz = 2 * z
dx = dz * c
dc = dz @ x
np.testing.assert_allclose(tx.grad.detach().numpy(), dx)
np.testing.assert_allclose(tc.grad.detach().numpy(), dc)

69.14040081119921
69.14040081119921
0.0


$$z = x^Ty, \space x, y \in \mathbb{R}^n, z \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * y$$
$$\frac{\partial E}{\partial y} = \frac{\partial E}{\partial z} * x$$

In [12]:
# z = x^T y (a scalar), followed by E = z^2.
x = np.random.randn(14)
y = np.random.randn(14)
z = x @ y
e = z**2

tx = torch.tensor(x, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.dot(tx, ty)
te = tz**2
te.backward()  # fills tx.grad and ty.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(e_sol, e)

0.045384471447029305
0.04538447144702919
1.1796119636642288e-16


In [13]:
# Chain rule through z = x^T y with E = z^2: dE/dz = 2z, dE/dx = dE/dz * y, dE/dy = dE/dz * x.
dz = 2 * z
dx = dz * y
dy = dz * x
dx_sol = tx.grad.detach().numpy()  # detach() rather than the discouraged .data
dy_sol = ty.grad.detach().numpy()

print(dx)
print(dx_sol)
print(metrics.tdist(dx, dx_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

# Executable checks in addition to the printed comparisons.
np.testing.assert_allclose(dx_sol, dx)
np.testing.assert_allclose(dy_sol, dy)

[ 0.64162316 -0.25024783  0.58195254  0.5249406   0.19211156 -0.27328717
 -0.58695596  0.41147809 -0.54707873 -0.54306029  0.64884119  0.62286809
  0.01604418 -0.10489789]
[ 0.64162316 -0.25024783  0.58195254  0.5249406   0.19211156 -0.27328717
 -0.58695596  0.41147809 -0.54707873 -0.54306029  0.64884119  0.62286809
  0.01604418 -0.10489789]
2.311859930211523e-15
[ 0.37152175 -0.61625417 -0.22857998  0.08432851 -0.58186175 -0.50892006
  0.00678927 -0.03410433 -0.10686017 -0.24079192 -0.46981833 -0.33330914
  1.29597924 -0.26675607]
[ 0.37152175 -0.61625417 -0.22857998  0.08432851 -0.58186175 -0.50892006
  0.00678927 -0.03410433 -0.10686017 -0.24079192 -0.46981833 -0.33330914
  1.29597924 -0.26675607]
2.442545479692608e-15


$$z = Xy, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^m, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial z} y^T$$
$$\frac{\partial E}{\partial y} = X^T \frac{\partial E}{\partial z}$$

In [14]:
# z = X y (matrix-vector product), followed by E = z^T z.
X = np.random.randn(7, 3)
y = np.random.randn(3)
z = X @ y
e = z @ z

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.matmul(tX, ty)
te = torch.dot(tz, tz)
te.backward()  # fills tX.grad and ty.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(e_sol, e)

7.044825278274827
7.044825278274827
0.0


In [15]:
# Backward of z = X y with dE/dz = 2z: dE/dX = dE/dz y^T (outer product), dE/dy = X^T dE/dz.
dz = 2 * z
dX = np.outer(dz, y)
dy = X.T @ dz
dX_sol = tX.grad.detach().numpy()  # detach() rather than the discouraged .data
dy_sol = ty.grad.detach().numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

# Executable checks in addition to the printed comparisons.
np.testing.assert_allclose(dX_sol, dX)
np.testing.assert_allclose(dy_sol, dy)

[[-0.04438874 -0.63716949 -1.34684281]
 [-0.02115185 -0.30362016 -0.6417894 ]
 [-0.05135561 -0.7371741  -1.55823159]
 [ 0.02244455  0.32217592  0.6810124 ]
 [-0.15305772 -2.19703727 -4.64407646]
 [ 0.12999258  1.86595311  3.94423393]
 [ 0.12162297  1.74581325  3.69028343]]
[[-0.04438874 -0.63716949 -1.34684281]
 [-0.02115185 -0.30362016 -0.6417894 ]
 [-0.05135561 -0.7371741  -1.55823159]
 [ 0.02244455  0.32217592  0.6810124 ]
 [-0.15305772 -2.19703727 -4.64407646]
 [ 0.12999258  1.86595311  3.94423393]
 [ 0.12162297  1.74581325  3.69028343]]
0.0
[-6.97979932  9.63875043  5.67732131]
[-6.97979932  9.63875043  5.67732131]
1.7763568394002505e-15


$$z = y^TX, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^n, z \in \mathbb{R}^m$$
$$\frac{\partial E}{\partial X} = y \frac{\partial E}{\partial z}^T$$
$$\frac{\partial E}{\partial y} = X \frac{\partial E}{\partial z}$$

In [16]:
# z = y^T X (vector-matrix product), followed by E = z^T z.
X = np.random.randn(7, 3)
y = np.random.randn(7)
z = y @ X
e = z @ z

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = torch.matmul(ty, tX)
te = torch.dot(tz, tz)
te.backward()  # fills tX.grad and ty.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(e_sol, e)

3.8396984168592354
3.8396984168592354
0.0


In [17]:
# Backward of z = y^T X with dE/dz = 2z: dE/dX = y dE/dz^T (outer product), dE/dy = X dE/dz.
dz = 2 * z
dX = np.outer(y, dz)
dy = X @ dz
dX_sol = tX.grad.detach().numpy()  # detach() rather than the discouraged .data
dy_sol = ty.grad.detach().numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

# Executable checks in addition to the printed comparisons.
np.testing.assert_allclose(dX_sol, dX)
np.testing.assert_allclose(dy_sol, dy)

[[-0.63119366  0.28804921  1.16363545]
 [ 3.28079586 -1.49721187 -6.04830275]
 [-0.74990752  0.34222502  1.38249007]
 [ 0.53283534 -0.24316276 -0.98230721]
 [-1.34046736  0.61173073  2.47121515]
 [ 0.22914034 -0.10456964 -0.42243109]
 [ 1.98750493 -0.9070104  -3.66405959]]
[[-0.63119366  0.28804921  1.16363545]
 [ 3.28079586 -1.49721187 -6.04830275]
 [-0.74990752  0.34222502  1.38249007]
 [ 0.53283534 -0.24316276 -0.98230721]
 [-1.34046736  0.61173073  2.47121515]
 [ 0.22914034 -0.10456964 -0.42243109]
 [ 1.98750493 -0.9070104  -3.66405959]]
2.622130794118103e-16
[-7.06392399 -2.60272907  4.7734634   1.26648965 -0.64150134  6.32889909
 -4.70270526]
[-7.06392399 -2.60272907  4.7734634   1.26648965 -0.64150134  6.32889909
 -4.70270526]
2.220446049250313e-16


$$Z = XY, \space X \in \mathbb{R}^{n*m}, Y \in \mathbb{R}^{m*p}, Z \in \mathbb{R}^{n*p}$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}Y^T$$
$$\frac{\partial E}{\partial Y} = X^T \frac{\partial E}{\partial Z}$$

In [18]:
# Z = X Y (matrix product), followed by E = sum of squared entries of Z.
X = np.random.randn(7, 3)
Y = np.random.randn(3, 2)
Z = X @ Y
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tY = torch.tensor(Y, requires_grad=True)
tZ = torch.matmul(tX, tY)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad and tY.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(e_sol, e)

20.602819165122455
20.602819165122455
0.0


In [19]:
# Backward of Z = X Y with dE/dZ = 2Z: dE/dX = dE/dZ Y^T, dE/dY = X^T dE/dZ.
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ @ Y.T
dY = X.T @ dZ
dX_sol = tX.grad.detach().numpy()  # detach() rather than the discouraged .data
dY_sol = tY.grad.detach().numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dY)
print(dY_sol)
print(metrics.tdist(dY, dY_sol))

# Executable checks in addition to the printed comparisons.
np.testing.assert_allclose(dX_sol, dX)
np.testing.assert_allclose(dY_sol, dY)

[[ 3.97871846 -6.33975355 -4.23418738]
 [-0.39306208 -3.00703587 -0.39979006]
 [ 4.05223587 -5.1486296  -4.01785362]
 [-4.14463001  6.98619505  4.49678062]
 [-1.82270442  2.523923    1.85408595]
 [-0.64980715 -4.20649064 -0.4887448 ]
 [ 0.94733823 -2.78377223 -1.295082  ]]
[[ 3.97871846 -6.33975355 -4.23418738]
 [-0.39306208 -3.00703587 -0.39979006]
 [ 4.05223587 -5.1486296  -4.01785362]
 [-4.14463001  6.98619505  4.49678062]
 [-1.82270442  2.523923    1.85408595]
 [-0.64980715 -4.20649064 -0.4887448 ]
 [ 0.94733823 -2.78377223 -1.295082  ]]
2.180535678842061e-15
[[ -5.90633464 -11.19726466]
 [-10.5344012   -0.37380538]
 [-12.84078589  10.21902099]]
[[ -5.90633464 -11.19726466]
 [-10.5344012   -0.37380538]
 [-12.84078589  10.21902099]]
1.9868029630580076e-15


$$Z = X^TX, \space X \in \mathbb{R}^{n*m}, Z \in \mathbb{R}^{m*m}$$
$$\frac{\partial E}{\partial X} = X(\frac{\partial E}{\partial Z} +    \frac{\partial E}{\partial Z}^T)$$

In [20]:
# Z = X^T X (Gram matrix), followed by E = sum of squared entries of Z.
X = np.random.randn(5, 3)
Z = X.T @ X
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tZ = torch.matmul(torch.transpose(tX, 1, 0), tX)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(e_sol, e)

271.84311934564516
271.84311934564516
0.0


In [21]:
# Backward of Z = X^T X with dE/dZ = 2Z: dE/dX = X (dE/dZ + dE/dZ^T).
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = X @ (dZ + dZ.T)
dX_sol = tX.grad.detach().numpy()  # detach() rather than the discouraged .data

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(dX_sol, dX)

[[  -6.53808255   24.78188913  -32.65780906]
 [ -68.32951866   39.72331895  114.64315097]
 [ -61.11703755   52.95722259  101.65674081]
 [  62.48972191  -29.36472574 -122.19888141]
 [ -39.63881961   -0.17154525   47.63986001]]
[[  -6.53808255   24.78188913  -32.65780906]
 [ -68.32951866   39.72331895  114.64315097]
 [ -61.11703755   52.95722259  101.65674081]
 [  62.48972191  -29.36472574 -122.19888141]
 [ -39.63881961   -0.17154525   47.63986001]]
2.1334777765716796e-14


$Z_I = f(X_I)$, with $Z$ and $X$ tensors of same size, $f: \mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    f'(X_I)$$

In [22]:
# Element-wise function Z = cos(X), followed by E = sum of squared entries of Z.
X = np.random.randn(5, 3)
Z = np.cos(X)
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tZ = torch.cos(tX)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(e_sol, e)

8.519988773275411
8.519988773275413
1.7763568394002505e-15


In [23]:
# Backward of Z = cos(X) with dE/dZ = 2Z: dE/dX = dE/dZ * f'(X), where f' = -sin.
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ * (-np.sin(X))
dX_sol = tX.grad.detach().numpy()  # detach() rather than the discouraged .data

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(dX_sol, dX)

[[ 0.37743023  0.97076711 -0.16627778]
 [-0.17436211 -0.68908609  0.9419013 ]
 [-0.56618871 -0.2965124   0.99917863]
 [ 0.87913338 -0.08090861 -0.32402193]
 [ 0.90011796  0.22875516 -0.75883033]]
[[ 0.37743023  0.97076711 -0.16627778]
 [-0.17436211 -0.68908609  0.9419013 ]
 [-0.56618871 -0.2965124   0.99917863]
 [ 0.87913338 -0.08090861 -0.32402193]
 [ 0.90011796  0.22875516 -0.75883033]]
0.0


$Z_I = f(X_I, Y_I)$, with $Z$, $X$ and $Y$ tensors of same size, $f: \mathbb{R}*\mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial X_I}$$
$$\frac{\partial E}{\partial Y_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial Y_I}$$

In [24]:
# Element-wise binary function Z = X ** Y, followed by E = sum of squared entries of Z.
# X is drawn from [0.1, 1.1) so it stays strictly positive (log(X) is used for dE/dY).
X = np.random.rand(7, 3) + 0.1
Y = np.random.randn(7, 3)
Z = np.power(X, Y)
Z_flat = Z.reshape(-1)
e = Z_flat @ Z_flat

tX = torch.tensor(X, requires_grad=True)
tY = torch.tensor(Y, requires_grad=True)
tZ = torch.pow(tX, tY)
tZ_flat = tZ.view(-1)
te = torch.dot(tZ_flat, tZ_flat)
te.backward()  # fills tX.grad and tY.grad for the following cell

# detach() is the supported way to read a value out of the autograd graph.
e_sol = te.detach().numpy()

print(e)
print(e_sol)
print(metrics.tdist(e, e_sol))

# Executable check in addition to the printed comparison.
np.testing.assert_allclose(e_sol, e)

31.523999640318724
31.52399964031872
3.552713678800501e-15


In [25]:
# Backward of Z = X ** Y with dE/dZ = 2Z:
#   dE/dX = dE/dZ * Y * X**(Y-1)
#   dE/dY = dE/dZ * ln(X) * X**Y
dZ_flat = 2 * Z_flat
dZ = dZ_flat.reshape(Z.shape[0], Z.shape[1])
dX = dZ * Y * np.power(X, Y-1)
dY = dZ * np.log(X) * np.power(X, Y)
dX_sol = tX.grad.detach().numpy()  # detach() rather than the discouraged .data
dY_sol = tY.grad.detach().numpy()

print(dX)
print(dX_sol)
print(metrics.tdist(dX, dX_sol))
print(dY)
print(dY_sol)
print(metrics.tdist(dY, dY_sol))

# Executable checks in addition to the printed comparisons.
np.testing.assert_allclose(dX_sol, dX)
np.testing.assert_allclose(dY_sol, dY)

[[  0.67332577  -1.9056877  -14.16141866]
 [ -2.62874314   1.04251959  -0.24685522]
 [ -2.97332623 -32.53156119 -12.38202496]
 [ -2.45148253  -1.3856159   -4.30861926]
 [  1.50858889  -2.81216764   0.25862449]
 [ -1.13230957   1.1170135    2.31469228]
 [  2.66040311  -0.55871038   0.28672347]]
[[  0.67332577  -1.9056877  -14.16141866]
 [ -2.62874314   1.04251959  -0.24685522]
 [ -2.97332623 -32.53156119 -12.38202496]
 [ -2.45148253  -1.3856159   -4.30861926]
 [  1.50858889  -2.81216764   0.25862449]
 [ -1.13230957   1.1170135    2.31469228]
 [  2.66040311  -0.55871038   0.28672347]]
0.0
[[ 2.47169302e-02  8.84477685e-02 -2.44620876e+00]
 [-1.57985735e+00 -2.29210742e-01 -5.42199613e-01]
 [-4.77586116e+00 -7.77662821e+00 -9.49341509e+00]
 [-1.24734811e+00 -1.34867052e+00 -2.38055701e+00]
 [ 1.97672305e-01  2.92962373e-04 -6.65333391e-02]
 [-8.63374000e-01 -2.96900600e-01 -1.22098813e-01]
 [ 6.24425701e-02 -1.53363054e-01 -7.81627557e-02]]
[[ 2.47169302e-02  8.84477685e-02 -2.44620876e+0

Every tensor sum of an axis can be transformed into a 3D-tensor sum on axis 1, using only reshape.  

$$X \in \mathbb{R}^{m * n * p}, Y \in \mathbb{R}^{m * p}$$
$Y$ is the sum of $X$ on axis $1$ (0-indexed, i.e. the middle axis of size $n$).
$$Y_{ik} = \sum_{j=1}^n X_{ijk}$$
$$\frac{\partial E}{\partial X_{ijk}} = \frac{\partial E}{\partial Y_{ik}}$$

In [26]:
def prod(x):
    """Return the product of the items of `x` (1 when `x` is empty)."""
    res = 1
    for v in x:
        res *= v
    return res


def sum_axis(X, axis):
    """Sum the array `X` along `axis` using only reshapes and a 3-D sum on axis 1.

    `X` is viewed as a 3-D array of shape (prod of leading dims, X.shape[axis],
    prod of trailing dims), summed over its middle axis, and reshaped back to
    the shape of `X` with `axis` removed.

    Parameters
    ----------
    X : numpy.ndarray
        Input array.
    axis : int
        Axis to reduce. Negative values count from the last axis.

    Returns
    -------
    numpy.ndarray
        Array of shape `X.shape[:axis] + X.shape[axis+1:]`.

    Raises
    ------
    IndexError
        If `axis` is outside `[-X.ndim, X.ndim)`.
    """
    if not -X.ndim <= axis < X.ndim:
        raise IndexError(
            "axis {} is out of bounds for array of dimension {}".format(axis, X.ndim))
    # Normalise negative axes: with a raw negative value, X.shape[axis+1:]
    # would not be the trailing dimensions and the reshape below would fail.
    axis %= X.ndim

    shape3 = (prod(X.shape[:axis]), X.shape[axis], prod(X.shape[axis+1:]))
    final_shape = X.shape[:axis] + X.shape[axis+1:]
    return np.sum(X.reshape(shape3), axis=1).reshape(final_shape)

# Compare the reshape-based sum_axis with torch.sum on every axis of a 4-D array.
X = np.random.randn(2, 4, 3, 7)

# Iterate over X.ndim rather than a hard-coded 4 so the check follows the shape of X.
s = [sum_axis(X, i) for i in range(X.ndim)]

tX = torch.tensor(X, requires_grad=True)
s_sol = [torch.sum(tX, i) for i in range(X.ndim)]

for i in range(X.ndim):
    # detach() is the supported way to read a value out of the autograd graph.
    si_sol = s_sol[i].detach().numpy()
    print(s[i].shape)
    print(si_sol.shape)
    print(metrics.tdist(s[i], si_sol))
    # Executable check in addition to the printed comparison.
    np.testing.assert_allclose(si_sol, s[i])

(4, 3, 7)
(4, 3, 7)
0.0
(2, 3, 7)
(2, 3, 7)
0.0
(2, 4, 7)
(2, 4, 7)
0.0
(2, 4, 3)
(2, 4, 3)
0.0


In [27]:
def my_expand_dims3(x, size):
    """Insert a middle axis of length `size` into a 2-D array by repetition.

    The result `y` has shape `(x.shape[0], size, x.shape[1])` and satisfies
    `y[i, j, k] == x[i, k]` for every `j`.

    Parameters
    ----------
    x : numpy.ndarray
        2-D input array.
    size : int
        Length of the new middle axis.

    Returns
    -------
    numpy.ndarray
        A new, writable float64 array (the dtype `np.empty` produced in the
        original loop-based implementation).
    """
    target_shape = (x.shape[0], size, x.shape[1])
    # broadcast_to yields a read-only view without copying; astype then
    # materialises it as an independent float64 array in a single pass,
    # replacing the element-by-element Python triple loop.
    return np.broadcast_to(x[:, np.newaxis, :], target_shape).astype(np.float64)
    

def dsum_axis(X, axis, dout):
    """Gradient of a sum of `X` along `axis`, given the gradient `dout` of the sum.

    `dout` is reshaped to 2-D (prod of leading dims, prod of trailing dims),
    repeated `X.shape[axis]` times along a new middle axis with
    `my_expand_dims3`, and reshaped to `X.shape`, so every element of `X`
    receives the gradient of the output entry it was summed into.

    Parameters
    ----------
    X : numpy.ndarray
        The array that was summed; only its shape is used.
    axis : int
        Axis that was reduced. Negative values count from the last axis.
    dout : numpy.ndarray
        Gradient with respect to the summed result; must hold
        `X.size // X.shape[axis]` elements.

    Returns
    -------
    numpy.ndarray
        Gradient with respect to `X`, of shape `X.shape`.

    Raises
    ------
    IndexError
        If `axis` is outside `[-X.ndim, X.ndim)`.
    """
    if not -X.ndim <= axis < X.ndim:
        raise IndexError(
            "axis {} is out of bounds for array of dimension {}".format(axis, X.ndim))
    # Normalise negative axes: with a raw negative value, X.shape[axis+1:]
    # would not be the trailing dimensions and the reshape below would fail.
    axis %= X.ndim

    dout = dout.reshape((prod(X.shape[:axis]), prod(X.shape[axis+1:])))
    return my_expand_dims3(dout, X.shape[axis]).reshape(X.shape)
    
# Small demo: each row of `a` is repeated twice along a new middle axis.
a = np.array([[1, 2, 3], [4, 5, 6]])
a2 = my_expand_dims3(a, 2)
print(a2)

[[[1. 2. 3.]
  [1. 2. 3.]]

 [[4. 5. 6.]
  [4. 5. 6.]]]


In [28]:
# For every axis, compare dsum_axis with autograd on E = sum(s_i ** 2), where s_i = sum of X on axis i.
for i in range(X.ndim):

    # dE/ds_i = 2 * s_i, pushed back through the sum with dsum_axis.
    ds = 2 * s[i]
    dX = dsum_axis(X, i, ds)

    si_flat = s_sol[i].view(-1)
    tz = torch.dot(si_flat, si_flat)
    tz.backward()
    # .numpy() shares memory with tX.grad, which is zeroed in place below;
    # copy so dX_sol still holds the gradient after the reset.
    dX_sol = tX.grad.detach().numpy().copy()

    print(dX.shape)
    print(dX_sol.shape)
    print(metrics.tdist(dX, dX_sol))
    # Executable check in addition to the printed comparison.
    np.testing.assert_allclose(dX_sol, dX)

    # backward() accumulates into tX.grad, so reset it before the next axis.
    tX.grad.zero_()
    

(2, 4, 3, 7)
(2, 4, 3, 7)
0.0
(2, 4, 3, 7)
(2, 4, 3, 7)
0.0
(2, 4, 3, 7)
(2, 4, 3, 7)
0.0
(2, 4, 3, 7)
(2, 4, 3, 7)
0.0


## Derivatives sheet

$$(c)' = 0, \space c \in \mathbb{R}$$
$$(x)' = 1, \space x \in \mathbb{R}$$
$$(cx)' = c, \space c, x \in \mathbb{R}$$
$$(e^x)' = e^x, \space x \in \mathbb{R}$$
$$(ln(x))' = \frac{1}{x}, \space x \in \mathbb{R}, x > 0$$
$$(\frac{1}{x})' = - \frac{1}{x^2}, \space x \in \mathbb{R}, x \neq 0$$

$$(cos(x))' = -sin(x), \space x \in \mathbb{R}$$
$$(sin(x))' = cos(x), \space x \in \mathbb{R}$$
$$(cosh(x))' = sinh(x), \space x \in \mathbb{R}$$
$$(sinh(x))' = cosh(x), \space x \in \mathbb{R}$$
$$(tanh(x))' = 1 - tanh(x)^2, \space x \in \mathbb{R}$$

$$(\sigma(x))' = \sigma(x)(1 - \sigma(x)), \space x \in \mathbb{R}$$

$$\frac{\partial}{\partial x} x^y = y*x^{y-1}, \space x, y \in \mathbb{R}$$
$$\frac{\partial}{\partial y} x^y = ln(x)*x^{y}, \space x, y \in \mathbb{R}$$

$$\frac{\partial}{\partial x} ||x||_2^2 = 2x, x \in \mathbb{R}^n$$

$$\frac{\partial}{\partial x} \sum_{i=1}^n x_i = \mathbb{1}, x \in \mathbb{R}^n$$

$$z = ||x||_1, \space x \in \mathbb{R^n}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * sgn(x)$$

$$z = c + x, \space x, z \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z}$$
$$\frac{\partial E}{\partial c} = \sum_{i=1}^n \frac{\partial E}{\partial z_i}$$

$$z = c * x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * c$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T x$$

$$z = c / x, \space x \in \mathbb{R^n}, c \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = -c * \frac{\partial E}{\partial z} / (x*x)$$
$$\frac{\partial E}{\partial c} = \frac{\partial E}{\partial z}^T \frac{1}{x}$$

$$z = \sum_{i=1}^n x_i, \space x \in \mathbb{R^n}, z \in \mathbb{R}$$
$$\frac{\partial E}{\partial x_i} = \frac{\partial E}{\partial z}$$

$$x, y \in \mathbb{R}^n$$
$$\frac{\partial x^Ty}{\partial x} = y$$
$$\frac{\partial x^Ty}{\partial y} = x$$

$$x \in \mathbb{R}^n, \space M \in \mathbb{R}^{n*n} \text{ symmetric}$$
$$\frac{\partial x^TMx}{\partial x} = 2Mx$$

$$z = x^Ty, \space x, y \in \mathbb{R}^n, z \in \mathbb{R}$$
$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial z} * y$$
$$\frac{\partial E}{\partial y} = \frac{\partial E}{\partial z} * x$$

$$z = Xy, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^m, z \in \mathbb{R}^n$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial z} y^T$$
$$\frac{\partial E}{\partial y} = X^T \frac{\partial E}{\partial z}$$

$$z = y^TX, \space X \in \mathbb{R}^{n*m}, y \in \mathbb{R}^n, z \in \mathbb{R}^m$$
$$\frac{\partial E}{\partial X} = y \frac{\partial E}{\partial z}^T$$
$$\frac{\partial E}{\partial y} = X \frac{\partial E}{\partial z}$$

$$Z = XY, \space X \in \mathbb{R}^{n*m}, Y \in \mathbb{R}^{m*p}, Z \in \mathbb{R}^{n*p}$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}Y^T$$
$$\frac{\partial E}{\partial Y} = X^T \frac{\partial E}{\partial Z}$$

$$Z = X^TX, \space X \in \mathbb{R}^{n*m}, Z \in \mathbb{R}^{m*m}$$
$$\frac{\partial E}{\partial X} = X(\frac{\partial E}{\partial Z} +    \frac{\partial E}{\partial Z}^T)$$

$Z_I = f(X_I)$, with $Z$ and $X$ tensors of same size, $f: \mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    f'(X_I)$$

$Z_I = f(X_I, Y_I)$, with $Z$, $X$ and $Y$ tensors of same size, $f: \mathbb{R}*\mathbb{R} \rightarrow \mathbb{R}$
$$\frac{\partial E}{\partial X_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial X_I}$$
$$\frac{\partial E}{\partial Y_I} = \frac{\partial E}{\partial Z_I} *    \frac{\partial f(X_I, Y_I)}{\partial Y_I}$$

$x \in \mathbb{R}^n$, and $S_i$ = softmax$(x)_i$
$$\frac{\partial S_i}{\partial x_j} = S_i(1 - S_j) \space (i = j)$$
$$\frac{\partial S_i}{\partial x_j} = -S_iS_j \space (i \neq j)$$