# Minigrad Autodiff Engine

This notebook contains a minimal implementation of an automatic differentiation engine that supports reverse-mode autodiff on N-dimensional tensors:
- Tensor: core data structure and computation graph
- Broadcasting & reductions: broadcasting in forward pass and unbroadcasting in backward pass
- Matmul: matrix multiplication and its vector-Jacobian products (VJPs)
- NN module: neural network library with Linear layers, activations, parameter collection
- Experiments: gradient checking, training simple models on toy datasets

Note: this project is self-contained; its engine is not reused by the CNN / Transformer / Diffusion projects, which use PyTorch autograd instead.


## Setup

In [18]:
import numpy as np
import matplotlib.pyplot as plt
# Print arrays with 3 digits of precision so the gradients shown in cell outputs stay compact
np.set_printoptions(precision=3)
# Seed NumPy's global random generator so any random values drawn later are reproducible
np.random.seed(0)

def _unbroadcast(grad, target_shape):
    """
    Reduce a gradient to match the given target shape.

    Params:
        grad: np.array, the gradient with the broadcasted shape
        target_shape: tuple of int, the shape of the tensor before broadcasting

    Returns:
        result: np.array, the gradient with reduced dimensions and summed so it matches the target shape
    """
    # grad.ndim is number of axes, len(target_shape) is desired number of axes.
    # For [[a, b], [c, d]], the result of sum(axis=0) is [a+c, b+d]
    while grad.ndim > len(target_shape):
        grad = grad.sum(axis=0)
    
    # For every axis where the original tensor had size 1 and was broadcasted, we sum
    # the gradient along that axis to collapse it back.
    for axis, dim in reversed(list(enumerate(target_shape))):
        if dim == 1:
            grad = grad.sum(axis=axis, keepdims=True)
    return grad

## Tensor Data Structure

The `Tensor` is a core data structure used to construct computation graphs or neural networks that process input data, generate predictions, and evaluate a loss function. It also enables us to perform backpropagation to compute gradients at each node so the network's weights can be updated during training.

In [24]:
class Tensor:
    """
    N-dimensional array that records the operations applied to it for reverse-mode autodiff.

    Every operation returns a new Tensor that remembers its parents (`_prev`) and carries a
    `_backward` closure which pushes the output's gradient into those parents. Calling
    `backward()` on a scalar result walks the recorded graph in reverse topological order.
    """

    def __init__(self, data, requires_grad=False):
        # Numpy scalar, vector, matrix (always stored as float64)
        self.data = np.array(data, dtype=np.float64)

        # Determines if we should store gradient
        self.requires_grad = requires_grad

        # Gradient of the loss wrt this tensor (same shape as .data)
        self.grad = None

        # Closure that computes local gradients and accumulates them into parent tensors
        self._backward = lambda: None

        # Parent tensors
        self._prev = ()

        # Op name for logging
        self._op = ''

    def __repr__(self):
        # String representation for logging
        return f"Tensor(data={self.data}, grad={self.grad}, op={self._op})"

    def _accumulate(self, grad):
        """Add one gradient contribution into .grad"""
        if self.grad is None:
            # Copy on first write so .grad never aliases a child's gradient and is never
            # a read-only view (np.broadcast_to returns one).
            self.grad = np.array(grad, dtype=np.float64)
        else:
            self.grad = self.grad + grad

    def backward(self):
        """
        Performs backpropagation on the computation graph and updates internal variables.

        Raises:
            ValueError: if this tensor is not a scalar (0-dimensional).
        """
        if self.data.shape != ():
            raise ValueError("backward() assumes scalar output on the loss layer")

        # Set loss gradient to ones
        self.grad = np.array(1.0, dtype=np.float64)

        # Build DFS topological sort of computation graph. An explicit stack is used
        # instead of recursion so deep graphs do not hit Python's recursion limit.
        topo, visited = [], {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, parents = stack[-1]
            for parent in parents:
                if parent not in visited:
                    visited.add(parent)
                    stack.append((parent, iter(parent._prev)))
                    break
            else:
                # All parents handled: the node can be emitted (post-order)
                stack.pop()
                topo.append(node)

        # Reverse topo sort order (starting from the outermost / loss layer)
        for v in reversed(topo):
            if v.grad is not None:
                v._backward()

    def __add__(self, other):
        """Elementwise addition with broadcasting, returns a new Tensor and sets up gradient flow to both parents"""
        if not isinstance(other, Tensor):
            other = Tensor(other)

        # Construct new tensor resulting from the operation
        output_data = self.data + other.data
        output = Tensor(data=output_data, requires_grad=(self.requires_grad or other.requires_grad))
        output._prev = (self, other)
        output._op = "add"

        # For the new tensor, set backprop method to update the gradients of its parents.
        # For addition, the local derivative with respect to each parent is 1, so the upstream
        # gradient is passed straight through (reduced over any broadcasted axes).
        # .grad has the gradient with respect to the outermost layer.
        def _backward():
            if self.requires_grad:
                self._accumulate(_unbroadcast(output.grad, self.data.shape))
            if other.requires_grad:
                other._accumulate(_unbroadcast(output.grad, other.data.shape))

        output._backward = _backward
        return output

    def __radd__(self, other):
        """Reflected addition (`scalar + tensor`), uses predefined ops"""
        return self + other

    def __mul__(self, other):
        """Elementwise multiplication with broadcasting, returns a new Tensor and sets up gradient flow to both parents"""
        if not isinstance(other, Tensor):
            other = Tensor(other)

        output_data = self.data * other.data
        output = Tensor(data=output_data, requires_grad=(self.requires_grad or other.requires_grad))
        output._prev = (self, other)
        output._op = "mul"

        # d(a*b)/da = b and d(a*b)/db = a, each reduced over any broadcasted axes
        def _backward():
            if self.requires_grad:
                grad_self = _unbroadcast(output.grad * other.data, self.data.shape)
                self._accumulate(grad_self)
            if other.requires_grad:
                grad_other = _unbroadcast(output.grad * self.data, other.data.shape)
                other._accumulate(grad_other)

        output._backward = _backward
        return output

    def __rmul__(self, other):
        """Reflected multiplication (`scalar * tensor`), uses predefined ops"""
        return self * other

    def __matmul__(self, other):
        """Matrix multiplication (@ operator), returns a new Tensor and sets up gradient flow to both parents"""
        if not isinstance(other, Tensor):
            other = Tensor(other)

        output_data = self.data @ other.data
        output = Tensor(data=output_data, requires_grad=(self.requires_grad or other.requires_grad))
        output._prev = (self, other)
        output._op = "matmul"

        def _backward():
            left, right, upstream = self.data, other.data, output.grad

            # np.matmul treats a 1-D left operand as a row (1, k) and a 1-D right operand
            # as a column (k, 1), then drops that axis from the result. Undo this so that
            # everything below is a plain (possibly batched) matrix product.
            if right.ndim == 1:
                right = right[:, None]
                upstream = np.expand_dims(upstream, -1)
            if left.ndim == 1:
                left = left[None, :]
                upstream = np.expand_dims(upstream, -2)

            # For C = A @ B: dL/dA = dL/dC @ B^T and dL/dB = A^T @ dL/dC, where the
            # transpose only swaps the last two (matrix) axes. Batch axes that were
            # broadcast are summed away, then the promoted 1-D shape is restored.
            if self.requires_grad:
                grad_self = upstream @ np.swapaxes(right, -1, -2)
                grad_self = _unbroadcast(grad_self, left.shape).reshape(self.data.shape)
                self._accumulate(grad_self)
            if other.requires_grad:
                grad_other = np.swapaxes(left, -1, -2) @ upstream
                grad_other = _unbroadcast(grad_other, right.shape).reshape(other.data.shape)
                other._accumulate(grad_other)

        output._backward = _backward
        return output

    def __neg__(self):
        """Negative operator, uses predefined ops"""
        return self * -1

    def __sub__(self, other):
        """Subtraction operator, uses predefined ops"""
        if not isinstance(other, Tensor):
            other = Tensor(other)
        return self + (-other)

    def __rsub__(self, other):
        """Reverse subtraction operator, uses predefined ops"""
        if not isinstance(other, Tensor):
            other = Tensor(other)
        return other + (-self)

    def __pow__(self, power):
        """Power (** operator) with a scalar exponent, returns a new Tensor and sets up gradient flow to the parent"""
        assert isinstance(power, (int, float)), "only scalar powers supported"
        output_data = self.data ** power
        output = Tensor(output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "pow"

        # d(x^p)/dx = p * x^(p-1)
        def _backward():
            if self.requires_grad:
                self._accumulate(output.grad * (power * (self.data ** (power - 1))))

        output._backward = _backward
        return output

    def __truediv__(self, other):
        """True division, uses predefined ops"""
        if not isinstance(other, Tensor):
            other = Tensor(other)
        return self * (other ** -1)

    def __rtruediv__(self, other):
        """Reflected true division (`scalar / tensor`), uses predefined ops"""
        if not isinstance(other, Tensor):
            other = Tensor(other)
        return other / self

    def exp(self):
        """e^x, returns a new Tensor and sets up gradient flow to the parent"""
        output_data = np.exp(self.data)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "exp"

        # d(e^x)/dx = e^x, which is already stored as the output
        def _backward():
            if self.requires_grad:
                self._accumulate(output.grad * output.data)

        output._backward = _backward
        return output

    def log(self):
        """Natural logarithm, returns a new Tensor and sets up gradient flow to the parent"""
        output_data = np.log(self.data)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "log"

        # d(ln x)/dx = 1 / x
        def _backward():
            if self.requires_grad:
                self._accumulate(output.grad * (1.0 / self.data))

        output._backward = _backward
        return output

    def relu(self):
        """ReLU activation, returns a new Tensor and sets up gradient flow to the parent"""
        output_data = np.maximum(0, self.data)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "relu"

        # Gradient passes through only where the input was strictly positive
        def _backward():
            if self.requires_grad:
                self._accumulate(output.grad * (self.data > 0).astype(self.data.dtype))

        output._backward = _backward
        return output

    def sigmoid(self):
        """Sigmoid activation, returns a new Tensor and sets up gradient flow to the parent"""
        output_data = 1 / (1 + np.exp(-self.data))
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "sigmoid"

        # d(sigmoid)/dx = s * (1 - s), with s the stored output
        def _backward():
            if self.requires_grad:
                self._accumulate(output.grad * (output.data * (1 - output.data)))

        output._backward = _backward
        return output

    def tanh(self):
        """Tanh activation, returns a new Tensor and sets up gradient flow to the parent"""
        output_data = np.tanh(self.data)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "tanh"

        # d(tanh)/dx = 1 - tanh(x)^2, with tanh(x) the stored output
        def _backward():
            if self.requires_grad:
                self._accumulate(output.grad * (1 - output.data ** 2))

        output._backward = _backward
        return output

    def sum(self, axis=None, keepdims=False):
        """Sum of tensor elements over given axes, returns a new Tensor"""
        output_data = self.data.sum(axis=axis, keepdims=keepdims)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "sum"

        def _backward():
            if self.requires_grad:
                grad_self = output.grad
                # Put the reduced axes back (as size 1) so the gradient can be broadcast
                if not keepdims and axis is not None:
                    grad_self = np.expand_dims(grad_self, axis=axis)
                # Every input element contributed with weight 1
                self._accumulate(np.broadcast_to(grad_self, self.data.shape))

        output._backward = _backward
        return output

    def mean(self, axis=None, keepdims=False):
        """Mean of tensor elements over given axes (int, tuple of ints or None), returns a new Tensor"""
        output_data = self.data.mean(axis=axis, keepdims=keepdims)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "mean"

        def _backward():
            if self.requires_grad:
                grad_self = output.grad
                # Put the reduced axes back (as size 1) so the gradient can be broadcast
                if not keepdims and axis is not None:
                    grad_self = np.expand_dims(grad_self, axis=axis)
                # Number of input elements averaged into each output element; computed
                # per axis so that a tuple of axes is supported as well as a single int
                if axis is None:
                    denom = self.data.size
                else:
                    denom = int(np.prod([self.data.shape[ax] for ax in np.atleast_1d(axis)]))
                self._accumulate(np.broadcast_to(grad_self, self.data.shape) / denom)

        output._backward = _backward
        return output

    def reshape(self, *shape):
        """Reshape tensor to given shape, returns a new Tensor"""
        output_data = self.data.reshape(*shape)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "reshape"

        # Reshape only relabels positions, so the gradient is reshaped back
        def _backward():
            if self.requires_grad:
                self._accumulate(output.grad.reshape(self.data.shape))

        output._backward = _backward
        return output

    def transpose(self, *axes):
        """Transpose tensor according to given axes (reverses all axes if none given), returns a new Tensor"""
        output_data = self.data.transpose(*axes)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "transpose"

        def _backward():
            if self.requires_grad:
                if not axes or axes[0] is None:
                    # Full axis reversal is its own inverse
                    grad_self = output.grad.transpose()
                else:
                    # Accept both transpose(1, 0) and transpose((1, 0)); map negative axes
                    # to their non-negative form before inverting the permutation,
                    # otherwise argsort would order them incorrectly.
                    perm = np.asarray(axes).reshape(-1) % self.data.ndim
                    inv_axes = np.argsort(perm).tolist()
                    grad_self = output.grad.transpose(*inv_axes)
                self._accumulate(grad_self)

        output._backward = _backward
        return output

    def broadcast_to(self, shape):
        """Broadcast tensor to given shape, returns a new Tensor"""
        output_data = np.broadcast_to(self.data, shape)
        output = Tensor(data=output_data, requires_grad=self.requires_grad)
        output._prev = (self,)
        output._op = "broadcast_to"

        # Each input element was copied to several outputs, so their gradients are summed
        def _backward():
            if self.requires_grad:
                self._accumulate(_unbroadcast(output.grad, self.data.shape))

        output._backward = _backward
        return output

### Testing

We can try out a few basic models to see if the Tensor operations are correct.

In [None]:
# Tiny linear model y = W @ x + b scored with a summed squared error against `label`
x = Tensor([1, 2], requires_grad=True)
W = Tensor([[1, 2], [1, 1]], requires_grad=True)
b = Tensor([1, 1], requires_grad=True)
label = Tensor([5, 5])

y = W @ x + b
loss = ((y - label) ** 2).sum()
loss.backward()

# Dump every tensor (data, gradient and producing op) after the backward pass
for caption, tensor in (
    ('x:', x),
    ('W:', W),
    ('b:', b),
    ('y:', y),
    ('label:', label),
    ('loss:', loss),
):
    print(caption, tensor)

x: Tensor(data=[1. 2.], grad=[0. 2.], op=)
W: Tensor(data=[[1. 2.]
 [1. 1.]], grad=[[ 2.  4.]
 [-2. -4.]], op=)
b: Tensor(data=[1. 1.], grad=[ 2. -2.], op=)
y: Tensor(data=[6. 4.], grad=[ 2. -2.], op=add)
label: Tensor(data=[5. 5.], grad=None, op=)
loss: Tensor(data=2.0, grad=1.0, op=sum)


In [36]:
def _leaf(values):
    """Build a leaf tensor that tracks its gradient."""
    return Tensor(values, requires_grad=True)


# Each check builds a small graph, reduces it to a scalar, backpropagates, and prints the
# computed gradient next to the value expected from calculus.

# Addition: both partial derivatives are 1
a, b = _leaf([3.0]), _leaf([4.0])
c = (a + b).sum()
c.backward()
print("add:", a.grad, b.grad, "(expected 1, 1)")

# Multiplication: each operand receives the other operand's value
a, b = _leaf([3.0]), _leaf([4.0])
c = (a * b).sum()
c.backward()
print("mul:", a.grad, b.grad, "(expected 4, 3)")

# Power: d(x^3)/dx = 3 * x^2
x = _leaf([2.0])
y = (x ** 3).sum()
y.backward()
print("pow:", x.grad, "(expected 12)")

# Division: d(a/b)/da = 1/b, d(a/b)/db = -a / b^2
a, b = _leaf([6.0]), _leaf([2.0])
c = (a / b).sum()
c.backward()
print("div:", a.grad, b.grad, "(expected 1/2, -6/4 = -1.5)")

# Exponential: derivative equals the output
x = _leaf([2.0])
y = x.exp().sum()
y.backward()
print("exp:", x.grad, "(expected 7.389)")

# Logarithm: derivative is 1 / x
x = _leaf([2.0])
y = x.log().sum()
y.backward()
print("log:", x.grad, "(expected 0.5)")

# ReLU: gradient only flows through positive inputs
x = _leaf([-1.0, 2.0])
y = x.relu().sum()
y.backward()
print("relu:", x.grad, "(expected [0, 1])")

# Sigmoid at 0: s * (1 - s) = 0.25
x = _leaf([0.0])
y = x.sigmoid().sum()
y.backward()
print("sigmoid:", x.grad, "(expected 0.25)")

# Tanh at 0: 1 - tanh(0)^2 = 1
x = _leaf([0.0])
y = x.tanh().sum()
y.backward()
print("tanh:", x.grad, "(expected 1.0)")

# Matrix-vector product
W = _leaf([[1.0, 2.0]])
x = _leaf([3.0, 4.0])
y = (W @ x).sum()
y.backward()
print("matmul:", W.grad, x.grad, "(expected [[3, 4]], [1, 2])")

# Sum reduction
x = _leaf([1.0, 2.0, 3.0])
y = x.sum()
y.backward()
print("sum:", x.grad, "(expected [1, 1, 1])")

# Mean reduction
x = _leaf([1.0, 2.0, 3.0])
y = x.mean()
y.backward()
print("mean:", x.grad, "(expected [1/3, 1/3, 1/3])")

# Explicit broadcast: the single row is copied three times, so gradients add up
x = _leaf([[1.0, 2.0]])
y = x.broadcast_to((3,2))
z = y.sum()
z.backward()
print("broadcast:", x.grad, "(expected [[3, 3]])")

# Reshape followed by transpose
x = _leaf([[1.0, 2.0], [3.0, 4.0]])
y = x.reshape(4,)
z = y.transpose().sum()
z.backward()
print("reshape+transpose:", x.grad, "(expected all ones)")

add: [1.] [1.] (expected 1, 1)
mul: [4.] [3.] (expected 4, 3)
pow: [12.] (expected 12)
div: [0.5] [-1.5] (expected 1/2, -6/4 = -1.5)
exp: [7.389] (expected 7.389)
log: [0.5] (expected 0.5)
relu: [0. 1.] (expected [0, 1])
sigmoid: [0.25] (expected 0.25)
tanh: [1.] (expected 1.0)
matmul: [[3. 4.]] [1. 2.] (expected [[3, 4]], [1, 2])
sum: [1. 1. 1.] (expected [1, 1, 1])
mean: [0.333 0.333 0.333] (expected [1/3, 1/3, 1/3])
broadcast: [[3. 3.]] (expected [[3, 3]])
reshape+transpose: [[1. 1.]
 [1. 1.]] (expected all ones)
