# Scalar

In [2]:
import torch

def x_square(x):
    """Return the square of ``x``.

    Relies only on the ``**`` operator, so it accepts plain Python numbers
    as well as objects such as tensors that implement it.
    """
    squared = x ** 2
    return squared

def d_x_square(x):
    """Return the hand-derived derivative of ``x**2`` with respect to ``x``.

    d(x**2)/dx = 2*x
    """
    slope = 2 * x
    return slope

# Compare autograd's derivative of x**2 with the hand-written 2*x
# at a randomly sampled point.
X = torch.rand(1, requires_grad=True)
Y = x_square(X)
Y.backward()  # fills X.grad with dY/dX

x = X.item()  # plain Python float copy of the sampled point
print(X.grad, "<>", d_x_square(x))

tensor([0.7853]) <> 0.7852791547775269


In [None]:
def lossfn(y_actual, y_pred):
    """Return the signed difference ``y_pred - y_actual``.

    NOTE: the calls visible in this notebook pass the prediction as the
    first positional argument and the target as the second, so the value
    they obtain is ``target - prediction``.
    """
    difference = y_pred - y_actual
    return difference

def d_x_lossfn(y_actual, y_pred, x):
    """Return the hand-derived d(loss)/dx for ``loss = y_actual - y`` with ``y = x**2``.

    Chain rule: d(loss)/dx = d(loss)/dy * dy/dx.
    ``y_actual`` and ``y_pred`` are accepted but not used by the formula.
    """
    dloss_dy = 0 - 1  # derivative of (y_actual - y) with respect to y
    dy_dx = d_x_square(x)
    return dloss_dy * dy_dx

# Check autograd's gradient of the signed-error loss against the chain rule.
y_actual = 10.0
X = torch.rand(1, requires_grad=True)
Y_pred = x_square(X)

# Plain-float twins of the tensors, used by the hand-derived formula.
x = X.item()
y_pred = x_square(x)

# NOTE: the prediction tensor is passed first, so the value computed here is
# y_actual - Y_pred; the -1 factor inside d_x_lossfn matches that ordering.
loss = lossfn(Y_pred, y_actual)
loss.backward()
print(X.grad, "<>", d_x_lossfn(y_actual, y_pred, x))

In [20]:
def lossfn2(y_actual, y_pred):
    """Return the squared error ``(y_pred - y_actual) ** 2``."""
    residual = y_pred - y_actual
    return residual ** 2

def d_lossfn2(y_actual, y_pred, x):
    """Return the hand-derived d(lossfn2)/dx for ``y_pred = x**2``.

    With f = g**2 and g = y_actual - y_pred, the chain rule gives
    df/dx = df/dg * dg/dx = 2*g * dg/dx.
    """
    g = lossfn(y_pred, y_actual)  # argument order yields y_actual - y_pred
    dg_dx = d_x_lossfn(y_actual, y_pred, x)
    return 2 * g * dg_dx
    
# Evaluate at a fixed point so the expected gradient is easy to verify by hand.
x, y_actual = 1, 10
X = torch.tensor(x, dtype=float, requires_grad=True)

# Autograd side: backpropagate through (x**2 - y_actual)**2.
loss = lossfn2(y_actual, x_square(X))
loss.backward()

# Hand-derived side: same point, plain Python numbers.
expected_grad = d_lossfn2(y_actual, x_square(x), x)
print(X.grad, "<>", expected_grad)

tensor(-36., dtype=torch.float64) <> -36


In [22]:
def linear_fn(w, x, b):
    """Return the affine map ``w*x + b``."""
    scaled = w * x
    return scaled + b

def d_w_linear_fn(w, x, b):
    """Return the hand-derived d(w*x + b)/dw, which is ``x``.

    ``w`` and ``b`` are accepted for a signature parallel to ``linear_fn``
    but do not affect the result.
    """
    dz_dw = x
    return dz_dw

def d_b_linear_fn(w, x, b):
    """Return the hand-derived d(w*x + b)/db, which is the constant 1.

    All three arguments are accepted for a signature parallel to
    ``linear_fn`` but are ignored.
    """
    dz_db = 1
    return dz_db

# Fixed evaluation point for z = w*x + b.
w = 5
b = 4
x = 2.5
W = torch.tensor(w, dtype=float, requires_grad=True)
B = torch.tensor(b, dtype=float, requires_grad=True)
X = torch.tensor(x, dtype=float, requires_grad=True)

# Pass the tensor X (not the Python float x) so the input is part of the
# autograd graph as well; W.grad and B.grad are numerically unaffected.
Z = linear_fn(W, X, B)
Z.backward()
print(W.grad, B.grad, "<>", d_w_linear_fn(w, x, b), d_b_linear_fn(w, x, b))

# d(w*x + b)/dx = w; checked silently so the printed output stays the same.
assert X.grad.item() == w


tensor(2.5000, dtype=torch.float64) tensor(1., dtype=torch.float64) <> 2.5 1


# Vector

In [37]:
import numpy as np

def f(x):
    """Return the sum of the squared elements of ``x``.

    ``x`` must support ``** 2`` and iteration over the result, since the
    built-in ``sum`` is used to add the squared elements.
    """
    squares = x ** 2
    return sum(squares)

# Gradient of f(x) = sum(x_i**2) with respect to a 3-element vector.
x = np.array([1.0, 5.0, 6.0])
X = torch.tensor(x, dtype=float, requires_grad=True)

Y = f(X)
Y.backward()  # fills X.grad with dY/dX

print(Y, X)
print(X.grad)


tensor(62., dtype=torch.float64, grad_fn=<AddBackward0>) tensor([1., 5., 6.], dtype=torch.float64, requires_grad=True)
tensor([ 2., 10., 12.], dtype=torch.float64)


## Linear function z = w.x + b

In [4]:
import torch 

# Batched linear map Z = X.W + B reduced to a scalar O = sum(Z), so that
# every gradient has a simple closed form to compare against.
X = torch.randn(5, 2, requires_grad=True)
W = torch.randn(2, 3, requires_grad=True)
B = torch.randn(1, 3, requires_grad=True)

print("B=", B)
print("X=", X)
print("W=", W)

print(X.shape, W.shape)
Q = torch.tensordot(X, W, dims=1)
Q.retain_grad()  # keep the gradient of this intermediate for the checks below
print("Q=", Q.shape, Q)

Z = Q + B
Z.retain_grad()  # keep the gradient of this intermediate for the checks below
print("Z=", Z)

O = torch.sum(Z)
print("O=", O)

O.backward()
print("dw", W.grad)
print("dx", X.grad, "<>", sum(W[0]), sum(W[1]))
print("db", B.grad)

# Closed-form checks (silent, so the printed output is unchanged):
#   dO/dZ = dO/dQ = 1 for every element,
#   dO/dB[0, j] = number of rows of X (B is broadcast over the rows),
#   dO/dW[i, j] = sum_k X[k, i],
#   dO/dX[k, i] = sum_j W[i, j].
assert torch.equal(Z.grad, torch.ones_like(Z))
assert torch.equal(Q.grad, torch.ones_like(Q))
assert torch.equal(B.grad, torch.full_like(B, X.shape[0]))
assert torch.allclose(
    W.grad, X.detach().sum(dim=0, keepdim=True).T.expand_as(W), atol=1e-6
)
assert torch.allclose(X.grad, W.detach().sum(dim=1).expand_as(X), atol=1e-6)

B= tensor([[ 0.9501,  0.2477, -1.2972]], requires_grad=True)
X= tensor([[-0.5632, -0.3472],
        [-1.1954, -0.6511],
        [ 1.8192, -1.7200],
        [ 0.4597, -0.0819],
        [ 0.4172, -0.1399]], requires_grad=True)
W= tensor([[0.4458, 0.6325, 0.4701],
        [0.0154, 0.1349, 0.2547]], requires_grad=True)
torch.Size([5, 2]) torch.Size([2, 3])
Q= torch.Size([5, 3]) tensor([[-0.2564, -0.4031, -0.3532],
        [-0.5430, -0.8440, -0.7278],
        [ 0.7845,  0.9186,  0.4171],
        [ 0.2037,  0.2797,  0.1952],
        [ 0.1839,  0.2451,  0.1605]], grad_fn=<ReshapeAliasBackward0>)
Z= tensor([[ 0.6937, -0.1554, -1.6504],
        [ 0.4072, -0.5963, -2.0250],
        [ 1.7346,  1.1664, -0.8801],
        [ 1.1538,  0.5274, -1.1020],
        [ 1.1340,  0.4928, -1.1367]], grad_fn=<AddBackward0>)
O= tensor(-0.2362, grad_fn=<SumBackward0>)
dz tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


# Recursive function

Computation graph (CS231 course)

h(t) = w_h * h(t-1)
![comp_graph_recursive_function.jpg](./comp_graph_recursive_function.jpg)

In [19]:
import torch

# Leaf tensors: the shared weight and the initial state.
W_h = torch.tensor(2.0, requires_grad=True)
H0 = torch.tensor(0.5, requires_grad=True)

# Unroll h(t) = w_h * h(t-1) for two steps and backpropagate from h(2).
H1 = W_h * H0
H2 = W_h * H1
H2.backward()

# Plain-float copies used by the hand derivation below.
w_h, h0, h1, h2 = W_h.item(), H0.item(), H1.item(), H2.item()

# Derivative of each h(t) with respect to w_h, via the product rule:
# dh(t)/dw_h = h(t-1) + w_h * dh(t-1)/dw_h
dw_h_over_h0 = 0  # h(0) does not depend on w_h
dw_h_over_h1 = h0 + w_h * dw_h_over_h0
dw_h_over_h2 = h1 + w_h * dw_h_over_h1

# Derivative of each h(t) with respect to h(0):
# dh(t)/dh(0) = w_h * dh(t-1)/dh(0)
dh0_over_h0 = 1
dh0_over_h1 = w_h * dh0_over_h0
dh0_over_h2 = w_h * dh0_over_h1

print(W_h.grad, "<>", dw_h_over_h2, "<>", (h0 * w_h + h1))
print(H0.grad, "<>", dh0_over_h2, "<>", (w_h * w_h))

tensor(2.) == 2.0 == 2.0
tensor(4.) == 4.0 == 4.0


# References
https://arxiv.org/pdf/1802.01528.pdf

https://youtu.be/d14TUNcbn1k?si=hyEeGpEt5hP1XVHA

https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html
