## Extra: Using arrays (unfinished tutorial)

It's not too difficult to extend this system to use `numpy` arrays as input and output for each node in the computation graph.

Recall the multivariable chain rule:

$$\frac{\partial f_i}{\partial x_j} = \sum_k \frac{\partial f_i}{\partial g_k}\frac{\partial g_k}{\partial x_j}$$

That means that if we have a vector at each node, we would in theory need to store the local Jacobian at each stage. However, looking at the formula for the gradient of a single scalar output (fixed $i$), $\frac{\partial f}{\partial \vec x}$, we see that we actually only need the product of the gradient vector $\frac{\partial f}{\partial \vec g}$ with the Jacobian $\frac{\partial \vec g}{\partial \vec x}$ - the **vector-Jacobian product** (vjp).

**TODO**

* Explain all the operations
* Explain vjp of matrix multiplication
* Explain why we may need to sum over batch dimension during backprop
* Explain how we can use `Tensor` objects in the gradient calculation itself, allowing for higher order derivatives

In [None]:
from collections import defaultdict
from itertools import count
import numpy as np
import graphviz

In [None]:
class Tensor:
    """Numpy array wrapper that records its computation graph for reverse-mode autodiff.

    Every operation returns a new ``Tensor`` and registers a ``backward``
    closure on it. Given the gradient flowing into the output, that closure
    returns a list of ``(input_tensor, vjp)`` pairs. The vjps are themselves
    built from ``Tensor`` operations.

    Args:
        array: Value to wrap; converted with ``np.asarray`` so that plain
            Python scalars also expose ``.shape``.
        backward: Optional callable ``grad -> [(input_tensor, vjp), ...]``.
        op: Optional label of the operation that produced this tensor.
        name: Optional label used for this node by ``draw_graph``.
    """

    def __init__(self, array, backward=None, op=None, name=None):
        # Scalars coerced by `wrap` (e.g. the 1 in `1 - x`) must behave like
        # arrays, otherwise `.shape` fails during gradient accumulation.
        self.array = np.asarray(array)
        self._backward = backward
        self.op = op
        self.name = name

    def __repr__(self):
        prefix = "Tensor("
        # Indent continuation lines so multi-line arrays stay aligned.
        separator = "\n" + " " * len(prefix)
        array_repr = separator.join(repr(self.array).splitlines())
        return f"{prefix}{array_repr})"

    @property
    def shape(self):
        """Shape of the wrapped array."""
        return self.array.shape

    def register(self, backward):
        """Store ``backward`` as this tensor's vjp function (usable as a decorator).

        Returns ``backward`` so the decorated name stays bound to the function.
        """
        self._backward = backward
        return backward

    def _children(self):
        """Return the tensors this tensor was directly computed from.

        The children are only exposed through ``_backward``, so it is called
        with a gradient of ones that has this tensor's own shape; the returned
        vjps are discarded.
        """
        if self._backward is None:
            return []
        probe = Tensor(np.ones_like(self.array))
        return [child for child, _ in self._backward(probe)]

    # Kept as a plain function while the class body executes so it can be used
    # as `@wrap` below: the name `Tensor` is not bound yet at that point. It is
    # turned into a staticmethod at the end of the class body.
    def wrap(f):
        """Decorator that converts every non-``Tensor`` positional argument to a ``Tensor``."""

        def op(*args):
            return f(*(arg if isinstance(arg, Tensor) else Tensor(arg) for arg in args))

        return op

    @wrap
    def __add__(self, other):
        out = Tensor(self.array + other.array)

        @out.register
        def backward(grad):
            # function that receives gradient vector and computes vjp
            return [(self, grad), (other, grad)]

        return out

    @wrap
    def __sub__(self, other):
        out = Tensor(self.array - other.array)

        @out.register
        def backward(grad):
            return [(self, grad), (other, -1 * grad)]

        return out

    @wrap
    def __mul__(self, other):
        out = Tensor(self.array * other.array)

        @out.register
        def backward(grad):
            return [(self, other * grad), (other, self * grad)]

        return out

    @wrap
    def __truediv__(self, other):
        out = Tensor(self.array / other.array)

        @out.register
        def backward(grad):
            # d(a/b)/da = 1/b and d(a/b)/db = -a/b**2
            return [(self, grad / other), (other, -1 * grad * self / (other * other))]

        return out

    # Reflected operators, so that `scalar <op> Tensor` works. The vjps above
    # rely on them (e.g. `-1 * grad`).
    __radd__ = __add__
    __rmul__ = __mul__

    def __rsub__(self, other):
        return Tensor.__sub__(other, self)

    def __rtruediv__(self, other):
        return Tensor.__truediv__(other, self)

    def log(self):
        """Element-wise natural logarithm."""
        out = Tensor(np.log(self.array))

        @out.register
        def backward(grad):
            return [(self, grad / self)]

        return out

    @property
    def T(self):
        """Tensor with the last two axes swapped (requires at least two dimensions)."""
        out = Tensor(np.swapaxes(self.array, -1, -2))

        @out.register
        def backward(grad):
            return [(self, grad.T)]

        return out

    @wrap
    def __matmul__(self, other):
        out = Tensor(self.array @ other.array, op="@")

        @out.register
        def backward(grad):
            # For out = A @ B: dA = grad @ B^T and dB = A^T @ grad
            return [
                (self, grad @ other.T),
                (other, self.T @ grad),
            ]

        return out

    def reshape(self, *shape):
        """Return a tensor with the same data and a new shape (ints or one tuple)."""
        out = Tensor(self.array.reshape(*shape))

        @out.register
        def backward(grad):
            return [(self, grad.reshape(self.shape))]

        return out

    def sum(self, axis=None, keepdims=False):
        """Sum over ``axis`` (all axes when ``None``).

        Args:
            axis: Axis or axes to reduce, as accepted by ``ndarray.sum``.
            keepdims: Keep the reduced axes with length one.
        """
        out = Tensor(self.array.sum(axis=axis, keepdims=keepdims))

        @out.register
        def backward(grad):
            if axis is not None and not keepdims:
                # Re-insert the reduced axes so that grad broadcasts along the
                # axes that were summed rather than along the trailing ones.
                grad = grad.reshape(np.expand_dims(out.array, axis).shape)
            return [(self, grad * Tensor(np.ones_like(self.array)))]

        return out

    def relu(self):
        """Element-wise ``max(x, 0)``."""
        out = Tensor(np.maximum(self.array, 0))

        @out.register
        def backward(grad):
            # The boolean mask passes the gradient only where the input was positive.
            return [(self, grad * Tensor(self.array > 0))]

        return out

    def sigmoid(self):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        out = Tensor(1 / (1 + np.exp(-self.array)))

        @out.register
        def backward(grad):
            return [(self, grad * out * (1 - out))]

        return out

    def draw_graph(self):
        """Return a ``graphviz.Digraph`` of the computation graph ending at this tensor.

        Edges point from inputs to outputs. Each tensor is expanded once, so
        sub-graphs shared by several consumers are not drawn repeatedly.
        """
        g = graphviz.Digraph(node_attr={"shape": "record", "height": ".1"})
        g.attr(rankdir="LR")
        counter = count(0)
        nodes = {}

        def visit(tensor):
            if tensor in nodes:
                return
            node_name = f"node{next(counter)}"
            nodes[tensor] = node_name
            g.node(node_name, "Tensor" if not tensor.name else tensor.name)
            for child_tensor in tensor._children():
                visit(child_tensor)
                g.edge(nodes[child_tensor], node_name)

        visit(self)
        return g

    @property
    def topo_ordered_nodes(self):
        """All tensors reachable from this one, inputs first and ``self`` last."""
        nodes = []
        visited = set()

        def add_nodes(tensor):
            if tensor in visited:
                return
            visited.add(tensor)
            for child_tensor in tensor._children():
                add_nodes(child_tensor)
            nodes.append(tensor)

        add_nodes(self)

        return nodes

    @staticmethod
    def _unbroadcast(deriv, shape):
        """Sum ``deriv`` over the axes that broadcasting added, so it matches ``shape``."""
        if deriv.shape == shape:
            return deriv
        # Broadcasting prepends axes: sum the extra leading ones away.
        while len(deriv.shape) > len(shape):
            deriv = deriv.sum(axis=0)
        if len(deriv.shape) != len(shape):
            return deriv
        # Axes of length one in the input were stretched: sum them back to one.
        for axis, size in enumerate(shape):
            if size == 1 and deriv.shape[axis] != 1:
                deriv = deriv.sum(axis=axis, keepdims=True)
        return deriv

    def gradients(self):
        """Compute the gradient of this tensor with respect to every tensor in its graph.

        The output gradient is seeded with ones of this tensor's shape.

        Returns:
            A ``defaultdict`` mapping each tensor to its gradient ``Tensor``;
            tensors that are not part of the graph map to a zero tensor.
        """
        grads = defaultdict(lambda: Tensor(np.array(0)))
        grads[self] = Tensor(np.ones_like(self.array))
        # Reverse topological order: a tensor's gradient is fully accumulated
        # before it is pushed on to its inputs.
        for parent_tensor in reversed(self.topo_ordered_nodes):
            if parent_tensor._backward is None:
                continue
            for child_tensor, deriv in parent_tensor._backward(grads[parent_tensor]):
                grads[child_tensor] += Tensor._unbroadcast(deriv, child_tensor.shape)
        return grads

    # Expose the decorator as `Tensor.wrap`, as before.
    wrap = staticmethod(wrap)
    
# Reflected operators, used when the left operand is not a Tensor (e.g. `2 * t`).
# Addition and multiplication reuse the forward operator as-is.
Tensor.__radd__ = Tensor.__add__
Tensor.__rmul__ = Tensor.__mul__
# Subtraction and division receive (tensor, left_operand) and must swap the
# operands back into their written order before delegating.
Tensor.__rsub__ = lambda tensor, left: Tensor.__sub__(left, tensor)
Tensor.__rtruediv__ = lambda tensor, left: Tensor.__truediv__(left, tensor)

In [None]:
class Layer:
    """Affine layer computing ``inputs @ weights + bias``.

    ``weights`` has shape ``(n_inputs, n_outputs)`` and is drawn from a
    standard normal distribution; ``bias`` has shape ``(n_outputs,)`` and
    starts at zero. Both are ``Tensor`` objects named for graph drawing.
    """

    def __init__(self, n_inputs, n_outputs):
        weight_shape = (n_inputs, n_outputs)
        initial_weights = np.random.normal(size=weight_shape)
        self.weights = Tensor(initial_weights, name="weights")
        self.bias = Tensor(np.zeros(n_outputs), name="bias")

    def __call__(self, inputs):
        """Apply the layer to ``inputs`` and return the resulting tensor."""
        projected = inputs @ self.weights
        return projected + self.bias

In [None]:
# Example input: a 10x5 array of uniform random values in [0, 1), wrapped in a Tensor.
x = Tensor(np.random.rand(10, 5))
x  # last expression of the cell, so the notebook displays the Tensor repr

In [None]:
# A single layer mapping 5 inputs to 6 outputs (5 matches the second dimension of x).
layer = Layer(5, 6)

In [None]:
# Run the layer on x and render the computation graph of the result.
layer(x).draw_graph()

In [None]:
class MLP:
    """Stack of ``Layer`` objects applied one after another.

    Args:
        n_inputs: Size of the input fed to the first layer.
        neurons_per_layer: Output size of each successive layer.
    """

    def __init__(self, n_inputs, neurons_per_layer):
        # Consecutive pairs of sizes give each layer's (inputs, outputs).
        sizes = [n_inputs, *neurons_per_layer]
        self.layers = [
            Layer(fan_in, fan_out) for fan_in, fan_out in zip(sizes, sizes[1:])
        ]

    def __call__(self, inputs):
        """Run ``inputs`` through all layers.

        A relu follows every layer except the last one; the final result is
        passed through a sigmoid.
        """
        hidden = inputs
        for layer in self.layers:
            hidden = layer(hidden)
            is_output_layer = layer is self.layers[-1]
            if not is_output_layer:
                hidden = hidden.relu()
        return hidden.sigmoid()

In [None]:
# Network taking x's second dimension as input size, with layers of 5 and 1 outputs.
mlp = MLP(x.shape[1], [5, 1])

In [None]:
# Forward pass of the network on x.
out = mlp(x)
out  # last expression of the cell, so the notebook displays the Tensor repr

In [None]:
def binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy between ``y_true`` and ``y_pred``.

    Sums ``-(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))`` over all
    elements and divides by the size of the first dimension of ``y_true``.
    """
    eps = 1e-50  # keeps the argument of log() away from exactly 0
    positive_term = y_true * (y_pred + eps).log()
    negative_term = (1 - y_true) * (1 - y_pred + eps).log()
    total = (-1 * (positive_term + negative_term)).sum()
    return total / y_true.shape[0]

In [None]:
# Draw the loss graph, using uniform random values with the same shape as `out` as targets.
binary_crossentropy(Tensor(np.random.rand(*out.shape)), out).draw_graph()