# Minigrad Autodiff Engine

This notebook contains a minimal implementation of an automatic differentiation engine that supports reverse-mode autodiff on N-dimensional tensors:
- Tensor: core data structure and computation graph
- Broadcasting & reductions: broadcasting in forward pass and unbroadcasting in backward pass
- Matmul: matrix multiplication and its vector-Jacobian products (VJPs)
- NN module: neural network library with Linear layers, activations, parameter collection
- Experiments: gradient checking, training simple models on toy datasets

Note: this project is self-contained and will not be shared with the CNN / Transformer / Diffusion projects; those projects will use PyTorch autograd.


## Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Print floating-point array values with 3 digits of precision to keep output compact.
np.set_printoptions(precision=3)
# Seed NumPy's global random state so that runs of the notebook are reproducible.
np.random.seed(0)

def _unbroadcast(grad, target_shape):
    """
    Sum a broadcasted gradient back down to the shape of the original tensor.

    Params:
        grad: np.array, gradient whose shape is the broadcasted shape
        target_shape: tuple of int, shape the tensor had before it was broadcast

    Returns:
        result: np.array, the gradient summed over the broadcast axes so that
            its shape matches target_shape
    """
    # Broadcasting adds missing axes on the left, so every surplus leading axis
    # is summed away one at a time, e.g. [[a, b], [c, d]] -> [a+c, b+d].
    surplus_axes = grad.ndim - len(target_shape)
    for _ in range(max(surplus_axes, 0)):
        grad = grad.sum(axis=0)

    # Walk the remaining axes from last to first. An axis of size 1 in the
    # target may have been stretched during broadcasting, so it is collapsed
    # by summation while the axis itself is kept.
    for axis in range(len(target_shape) - 1, -1, -1):
        if target_shape[axis] != 1:
            continue
        grad = grad.sum(axis=axis, keepdims=True)
    return grad

## Tensor Data Structure

In [None]:
class Tensor:
    """
    Node of a reverse-mode autodiff computation graph wrapping a NumPy array.

    Each Tensor stores its value (.data), optionally a gradient (.grad), the
    tensors it was computed from (._prev) and a closure (._backward) that
    pushes its gradient to those parents.
    """

    def __init__(self, data, requires_grad=False):
        """
        Params:
            data: scalar, sequence or np.array, converted to a float64 np.array
            requires_grad: bool, whether a gradient should be computed for this tensor
        """
        # Numpy scalar, vector, matrix (always stored as a float64 array)
        self.data = np.array(data, dtype=np.float64)

        # Determines if we should store gradient
        self.requires_grad = requires_grad

        # Gradient of the loss wrt this tensor (same shape as .data)
        self.grad = None

        # Closure that computes local gradients and accumulates them into parent tensors
        self._backward = lambda: None

        # Parent tensors
        self._prev = set()

        # Op name for logging
        self._op = ''

    def __repr__(self):
        # String representation for logging
        return f"Tensor(data={self.data}, grad={self.grad}, op={self._op})"

    def backward(self):
        """
        Performs backpropagation on the computation graph and updates internal variables.

        Every tensor reachable from this one through ._prev that has
        requires_grad set gets its .grad reset to zeros before propagation, so
        gradients do not accumulate across calls. This tensor's own .grad is
        seeded with ones.
        """
        topo, visited = [], set()

        # Build a topological order with an iterative post-order DFS. An explicit
        # stack is used instead of recursion so that long chains of operations
        # cannot exceed Python's recursion limit.
        stack = [(self, False)]
        while stack:
            node, parents_done = stack.pop()
            if parents_done:
                # All parents of this node have already been emitted.
                topo.append(node)
                continue
            if node in visited:
                continue
            visited.add(node)
            # Revisit the node once everything pushed above it has been handled.
            stack.append((node, True))
            stack.extend((p, False) for p in node._prev if p not in visited)

        # Set computation graph gradients to zeros
        for v in topo:
            if v.requires_grad:
                v.grad = np.zeros_like(v.data)

        # Set loss gradient to ones. This must happen AFTER the zeroing loop:
        # self is part of topo, so seeding first would be overwritten with zeros
        # and every gradient in the graph would come out as zero.
        self.grad = np.ones_like(self.data)

        # Reverse topo sort order (starting from the outermost / loss layer)
        for v in reversed(topo):
            if v.requires_grad:
                v._backward()

    def __add__(self, other):
        """
        Elementwise addition with broadcasting, returns a new Tensor and sets up gradient flow to both parents.

        Params:
            other: Tensor, or any value accepted by the Tensor constructor

        Returns:
            output: Tensor holding self.data + other.data; it requires grad if either operand does
        """
        if not isinstance(other, Tensor):
            other = Tensor(other)

        # Construct new tensor resulting from the operation
        output_data = self.data + other.data
        output = Tensor(data=output_data, requires_grad=(self.requires_grad or other.requires_grad))
        output._prev = {self, other}
        output._op = "add"

        # For the new tensor, set backprop method to update the gradients of its parents.
        # For addition, the local derivative with respect to each parent is 1, so the upstream
        # gradient is passed straight through.
        # .grad has the gradient with respect to the outermost layer.
        def _backward():
            output_grad = output.grad
            # Iterate over the operands rather than output._prev: the set
            # collapses `x + x` into a single parent, which would give x only
            # half of its gradient.
            for parent in (self, other):
                if parent.requires_grad:
                    # Sum over the axes that were broadcast in the forward pass.
                    adjusted_grad = _unbroadcast(output_grad, parent.data.shape)
                    parent.grad = (parent.grad if parent.grad is not None else np.zeros_like(parent.data)) + adjusted_grad

        output._backward = _backward
        return output

    # Placeholders below are not implemented yet; each currently returns None.
    def __mul__(self, other): pass
    def __matmul__(self, other): pass
    def __neg__(self): pass
    def __sub__(self, other): pass
    def __truediv__(self, other): pass
    def __pow__(self, power): pass

    def exp(self): pass
    def log(self): pass
    def relu(self): pass
    def sigmoid(self): pass
    def tanh(self): pass

    def sum(self, axis=None, keepdims=False): pass
    def mean(self, axis=None, keepdims=False): pass

    def reshape(self, *shape): pass
    def transpose(self, *axes): pass
    def broadcast_to(self, shape): pass