In [13]:
import torch
import numpy as np
import torch.nn.functional as F

In [14]:
# Seed the legacy global NumPy RNG so that every Restart & Run All draws the
# same toy problem and the torch / NumPy results below stay comparable.
np.random.seed(0)

# Single input vector with 2 features.
x = np.random.randn(2)

# Layer 1: 2 -> 10 (weights A, bias b).
A = np.random.randn(10, 2)
b = np.random.randn(10)

# Layer 2: 10 -> 10 (weights C, bias d).
C = np.random.randn(10, 10)
d = np.random.randn(10)

# Output layer: 10 -> 1, no bias.
E = np.random.randn(1, 10)

# Scalar regression target.
y = np.random.randn(1)

def mse_torch(y_hat, y):
    """Return half the summed squared error between two torch tensors.

    Args:
        y_hat: predicted values.
        y: target values, broadcastable against ``y_hat``.

    Returns:
        A zero-dimensional tensor equal to ``0.5 * sum((y_hat - y) ** 2)``.
    """
    residual = y_hat - y
    squared_error = residual ** 2
    return 0.5 * torch.sum(squared_error)

def mse_numpy(y_hat, y):
    """Return half the summed squared error between two NumPy arrays.

    Args:
        y_hat: predicted values.
        y: target values, broadcastable against ``y_hat``.

    Returns:
        The scalar ``0.5 * sum((y_hat - y) ** 2)``.
    """
    residual = y_hat - y
    squared_error = residual ** 2
    return 0.5 * np.sum(squared_error)

In [15]:
# Input and target: plain tensors, no gradient is tracked for them.
xt = torch.from_numpy(x)
yt = torch.from_numpy(y)

# Parameters: wrap each NumPy array as a tensor and ask autograd to track it.
# torch.from_numpy shares memory with the source array rather than copying it.
At = torch.from_numpy(A).requires_grad_(True)
bt = torch.from_numpy(b).requires_grad_(True)
Ct = torch.from_numpy(C).requires_grad_(True)
dt = torch.from_numpy(d).requires_grad_(True)
Et = torch.from_numpy(E).requires_grad_(True)

In [16]:
def model_torch(xt):
    """Run the three-layer network forward using the module-level torch parameters.

    Computes ``Et @ relu(Ct @ relu(At @ xt + bt) + dt)``.

    Args:
        xt: input tensor that ``At`` can be matrix-multiplied with.

    Returns:
        The network output tensor (the result of ``Et @ ...``).
    """
    hidden0 = F.relu(At @ xt + bt)
    hidden1 = F.relu(Ct @ hidden0 + dt)
    return Et @ hidden1

In [17]:
# Forward pass through the torch model, then half-sum-of-squares loss against the target yt.
loss = mse_torch(model_torch(xt), yt)

In [18]:
# Let autograd backpropagate from the scalar loss; this fills in .grad on the
# tensors that were marked requires_grad (At, bt, Ct, dt, Et).
loss.backward()
# Display the loss value.
loss

tensor(3.0756, dtype=torch.float64, grad_fn=<MulBackward0>)

In [19]:
# Gradient of the loss with respect to At, as computed by autograd.
At.grad

tensor([[ 0.0000, -0.0000],
        [ 1.2418, -0.2984],
        [-1.6198,  0.3893],
        [ 4.3955, -1.0563],
        [ 0.0000, -0.0000],
        [-8.3285,  2.0014],
        [ 0.9349, -0.2247],
        [ 3.5032, -0.8419],
        [ 0.0000, -0.0000],
        [ 2.1107, -0.5072]], dtype=torch.float64)

# Numpy version

In [None]:
# Forward pass in plain NumPy. Every intermediate is kept under its own name
# because the manual backward pass needs them.

# Layer 1: affine map followed by ReLU.
mul0 = np.matmul(A, x)
add0 = mul0 + b
relu0 = np.maximum(0, add0)

# Layer 2: affine map followed by ReLU.
mul1 = np.matmul(C, relu0)
add1 = mul1 + d
relu1 = np.maximum(0, add1)

# Output layer: linear, no bias. y_hat is an alias of mul2.
mul2 = np.matmul(E, relu1)
y_hat = mul2

# Residual first (it is also the starting delta for backprop), then the loss.
delta_y = y_hat - y
loss = 0.5 * np.sum(delta_y ** 2)

In [None]:
# Display the NumPy loss. On identical data it should equal the torch loss.
# NOTE(review): the saved output below does not match the saved torch loss
# output, so the stored outputs appear to come from different random draws;
# re-run the notebook from a fresh kernel before comparing.
loss

18.199294931170332

In [None]:
# Manual backpropagation through the NumPy forward pass, last operation first.
#
# `delta` always holds dL/d(output of the operation being processed). For an
# operation out = f(inp, w):
#   - the parameter gradient dL/dw is built from `delta` and the recorded input;
#   - `delta` is then replaced by dL/d(inp) before stepping back one operation.
# The starting delta is delta_y = y_hat - y, the derivative of the
# half-sum-of-squares loss with respect to y_hat.

# Last layer: y_hat = mul2 = E @ relu1
a_prime = E.T
E_prime = relu1  # the INPUT that E multiplied (not the output mul2)
grad_E = np.outer(delta_y, E_prime)  # same shape as E
delta = a_prime @ delta_y

# relu1 = relu(add1)
# Strict > so that the derivative at exactly 0 is 0, as in torch's relu.
a_prime = (add1 > 0).astype(add1.dtype)
delta = a_prime * delta

# add1 = mul1 + d
# Addition passes the incoming delta unchanged to both operands.
a_prime = np.ones_like(d)
d_prime = np.ones_like(d)
grad_d = (d_prime * delta)
delta = a_prime * delta

# mul1 = C @ relu0
a_prime = C.T
C_prime = relu0
grad_C = np.outer(delta, C_prime)
delta = a_prime @ delta

# relu0 = relu(add0)
a_prime = (add0 > 0).astype(add0.dtype)
delta = a_prime * delta

# add0 = mul0 + b
a_prime = np.ones_like(b)
b_prime = np.ones_like(b)
grad_b = (b_prime * delta)
delta = a_prime * delta

# mul0 = A @ x
# a_prime would carry delta back to x; it is unused because x needs no gradient.
a_prime = A.T
A_prime = x
grad_A = np.outer(delta, A_prime)

In [None]:
# Side-by-side check: autograd's gradient for b versus the hand-derived grad_b.
bt.grad, grad_b

(tensor([ 10.3594, -17.6940,   0.0000,  21.0760,  -0.4013,   4.2122,   0.0000,
          -9.1358,   0.0000,   0.0000], dtype=torch.float64),
 array([ 10.35944731, -17.69400177,   0.        ,  21.07600819,
         -0.40127032,   4.21216019,  -0.        ,  -9.13577466,
          0.        ,  -0.        ]))

In [None]:
# Inspect the current values of bt without autograd tracking.
# .detach() is the supported replacement for the legacy .data attribute.
bt.detach()

tensor([-1.6089,  0.1655, -1.1472,  0.5495, -0.6791, -1.0682, -1.8927, -0.5223,
        -0.0524, -0.7498], dtype=torch.float64)

At each operation `y = f(x, w)`, we need to record:
- The input, `x`
- The gradient with respect to `x`, $\frac{\partial f}{\partial x} = x'$
- Whether `w` requires a gradient
  - If it does, then record $\frac{\partial f}{\partial w} = w'$

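During the backward step, we start from the loss gradient at the output and walk the tape in reverse order:
- Initialise once, before the loop: $\delta_y = \hat{y} - y$
- For every operation whose `w` requires a gradient: $\frac{\partial L}{\partial w} = w' * \delta$
- Pass the gradient on to the previous operation: $\delta_{l-1} = x' * \delta_l$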


In [21]:
import pickle
# Load the pickled test parameters for the assignment.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
# NOTE(review): this rebinds `d`, which an earlier cell uses for the second-layer
# bias vector. Re-running the NumPy cells after this one would pick up this
# object instead of the bias — consider a distinct name.
with open("./data/assignment-one-test-parameters.pkl", 'rb') as f:
    d = pickle.load(f)

In [23]:
# List the entries stored in the loaded parameter object.
d.keys()

dict_keys(['w1', 'w2', 'w3', 'b1', 'b2', 'b3', 'inputs', 'targets'])

In [25]:
# Shape of the stored 'inputs' entry.
d['inputs'].shape

(200, 2)

Pass in weight sizes, activations, biases

num_neurons []
activations []
bias []

In [None]:
class model:
    """Tape-based reverse-mode differentiation for ``relu(A @ x + b)``.

    Each forward operation appends a record of its inputs to ``self.tape``.
    ``backward`` then walks the tape in reverse, turning the incoming ``delta``
    into a parameter gradient (where the operation has one) and into the delta
    for the preceding operation.

    The operations work on a single input vector ``x`` (1-D); batched inputs
    are not supported because the weight gradient is formed with ``np.outer``.

    NOTE: a later cell defines another class with the same name ``model``,
    which replaces this one once that cell has run.
    """

    def __init__(self):
        # Ordered record of the forward operations (oldest first).
        self.tape = []
        # Parameter gradients from the most recent backward(), keyed by name.
        self.gradients = {}

    def forward(self, x, A, b):
        """Compute ``relu(A @ x + b)`` and record every step on a fresh tape.

        Args:
            x: 1-D input vector.
            A: 2-D weight matrix with ``A.shape[1] == len(x)``.
            b: 1-D bias vector with ``len(b) == A.shape[0]``.

        Returns:
            The activated output vector.
        """
        self.tape = []  # start clean so repeated calls do not accumulate ops
        z = self.matmul(x, A)
        z = self.add(z, b)
        z = self.relu(z)
        return z

    def matmul(self, x, A, name="A"):
        """Return ``A @ x`` and record the operation.

        ``name`` is the key under which the gradient of ``A`` is stored.
        """
        out = A @ x
        self.tape.append({
            'input0': x,
            'input1': A,
            'output': out,
            'function': 'matmul',
            'name': name})
        return out

    def matmul_backward(self, x, A, delta):
        """Backward rule for ``A @ x``.

        Returns:
            ``(delta_x, grad_A)``: the delta for ``x`` and the gradient of ``A``.
        """
        grad_A = np.outer(delta, x)
        delta = A.T @ delta
        return delta, grad_A

    def add(self, x, b, name="b"):
        """Return ``x + b`` and record the operation.

        ``name`` is the key under which the gradient of ``b`` is stored.
        """
        out = x + b
        self.tape.append({
            'input0': x,
            'input1': b,
            'output': out,
            'function': 'add',
            'name': name})
        return out

    def add_backward(self, x, b, delta):
        """Backward rule for ``x + b``: both operands receive ``delta`` unchanged.

        Returns:
            ``(delta_x, grad_b)``.
        """
        grad_b = np.ones_like(b) * delta
        delta = np.ones_like(x) * delta
        return delta, grad_b

    def relu(self, a):
        """Return ``max(0, a)`` element-wise and record the operation."""
        out = np.maximum(0, a)
        self.tape.append({
            'input0': a,
            'output': out,
            'function': 'relu'})
        return out

    def relu_backward(self, a, delta):
        """Backward rule for ReLU; it has no parameters, so the gradient is ``None``.

        Returns:
            ``(delta_a, None)``; ``delta`` is zeroed wherever ``a <= 0``.
        """
        return (a > 0) * delta, None

    def backward(self, y_hat, y):
        """Backpropagate the half-sum-of-squares loss through the tape.

        Args:
            y_hat: output returned by the forward pass.
            y: target with the same shape as ``y_hat``.

        Returns:
            The delta with respect to the input of the first recorded
            operation. Parameter gradients are left in ``self.gradients``.
        """
        self.gradients = {}
        # d/dy_hat of 0.5 * sum((y_hat - y) ** 2)
        delta = y_hat - y
        for item in reversed(self.tape):
            op_backward = getattr(self, item['function'] + '_backward')
            # Unary operations record only 'input0'.
            operands = [item[key] for key in ('input0', 'input1') if key in item]
            delta, param_update = op_backward(*operands, delta)
            if param_update is not None:
                self.gradients[item['name']] = param_update
        return delta
        

In [None]:
class model:
    """Sequential container that chains layers forward and backward.

    Every layer must be callable (``layer(x)`` returns its output) and expose
    ``backward(delta)``, which returns the delta for its own input.

    NOTE: an earlier cell defines another class with the same name ``model``;
    running this cell replaces it.
    """

    def __init__(self, layers=()):
        """Store the layers in execution order.

        Args:
            layers: iterable of layer objects; defaults to no layers.
        """
        self.layers = list(layers)
        # Per-layer input and output of the most recent forward pass.
        self.inputs = []
        self.outputs = []
        # Final prediction of the most recent forward pass (None until then).
        self._y_hat = None

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output.

        The value entering each layer is recorded in ``self.inputs`` and the
        value leaving it in ``self.outputs``.
        """
        self.inputs = []
        self.outputs = []
        for layer in self.layers:
            self.inputs.append(x)  # record BEFORE the call: this is the layer's input
            x = layer(x)
            self.outputs.append(x)
        self._y_hat = x
        return x

    def backward(self, y):
        """Backpropagate the half-sum-of-squares loss against target ``y``.

        Args:
            y: target compatible with the last forward output.

        Returns:
            The delta with respect to the model input.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        if self._y_hat is None:
            raise RuntimeError("forward() must be called before backward()")
        # d/dy_hat of 0.5 * sum((y_hat - y) ** 2)
        delta = self._y_hat - y
        for layer in reversed(self.layers):
            delta = layer.backward(delta)
        return delta