In [56]:
from dataclasses import dataclass
from typing import Union, List, Callable, Optional, Tuple, Literal

import numpy as np

Scalar = Union[int, float]

Data = Union[Scalar, list, np.ndarray, "Tensor"]


@dataclass(frozen=True)
class Leaf:
    """Immutable record tying a tensor to the function that produces its gradient.

    Tensor.backward iterates over its list of these records, calls ``grad_fn``
    with the gradient it received, and hands the returned array to
    ``value.backward``.
    """
    # Tensor whose backward() is invoked with the array returned by grad_fn.
    value: "Tensor"
    # Callable taking a gradient array and returning the gradient array for `value`.
    grad_fn: Callable[[np.ndarray], np.ndarray]


class Tensor:
    """NumPy-backed array with a minimal reverse-mode autograd.

    A tensor stores its values in ``data`` (always converted to ``dtype``),
    an optional gradient buffer ``grad`` of the same shape, and a list of
    ``dependencies`` describing the tensors it was computed from together with
    the functions that map its gradient back to theirs.
    """

    def __init__(
        self,
        data: "Data",
        requires_grad: bool = False,
        dependencies: Optional[List["Leaf"]] = None,
        dtype=np.float32
    ):
        """Create a tensor.

        Args:
            data: Scalar, (nested) list, ndarray or another Tensor to copy from.
            requires_grad: When True a zero-filled ``grad`` buffer is allocated
                and ``backward`` may be called on this tensor.
            dependencies: Records of the tensors this one was derived from;
                ``None`` is treated as "no dependencies".
            dtype: NumPy dtype the stored array is converted to.
        """
        self._data = Tensor.build_ndarray(data, dtype)
        self.dtype = dtype

        self.requires_grad = requires_grad
        self.dependencies = dependencies or []

        self.grad = np.zeros_like(self._data) if requires_grad else None

    @property
    def data(self) -> np.ndarray:
        """The underlying ndarray."""
        return self._data

    @data.setter
    def data(self, data: "Data"):
        """Replace the stored values and, if gradients are tracked, reset ``grad``."""
        self._data = Tensor.build_ndarray(data, self.dtype)
        if self.requires_grad:
            self.zero_grad()

    @property
    def size(self) -> int:
        """Total number of elements."""
        return self.data.size

    @property
    def shape(self) -> Tuple[int, ...]:
        """Shape of the underlying array."""
        return self.data.shape

    @property
    def ndim(self) -> int:
        """Number of dimensions of the underlying array."""
        return self.data.ndim

    @staticmethod
    def build_ndarray(data: "Data", dtype=np.float32) -> np.ndarray:
        """Convert ``data`` into a new ndarray of the requested ``dtype``."""
        if isinstance(data, Tensor):
            return np.array(data.data, dtype=dtype)
        if isinstance(data, np.ndarray):
            return data.astype(dtype)
        return np.array(data, dtype=dtype)

    def __repr__(self):
        return f"Tensor({self.data}, requires_grad={self.requires_grad}, shape={self.shape})"

    def zero_grad(self):
        """Reset ``grad`` to zeros with the same shape as ``data``.

        The buffer is reallocated when it is missing or when its shape no
        longer matches ``data`` (which happens after the ``data`` setter is
        given values of a different shape); otherwise it is cleared in place.
        """
        if self.grad is None or self.grad.shape != self._data.shape:
            self.grad = np.zeros_like(self._data)
        else:
            self.grad.fill(0.0)

    def backward(self, grad: Optional[np.ndarray] = None) -> None:
        """Accumulate ``grad`` into this tensor and propagate it to dependencies.

        Args:
            grad: Gradient with respect to this tensor. May be omitted only
                for a 0-dimensional tensor, in which case 1.0 is used.

        Raises:
            RuntimeError: If the tensor was created without ``requires_grad``.
            ValueError: If ``grad`` is omitted for a tensor with a non-empty shape.
        """
        if not self.requires_grad:
            raise RuntimeError(
                "Cannot call backward() on a tensor that does not require gradients. "
                "If you need gradients, ensure that requires_grad=True when creating the tensor."
            )

        if grad is None:
            # The decision depends on this tensor's shape; `grad` is None here.
            if self.shape == ():
                grad = np.array(1.0, dtype=self._data.dtype)
            else:
                raise ValueError("Grad must be provided if tensor has shape")

        # requires_grad may have been switched on after construction, in which
        # case no buffer has been allocated yet.
        if self.grad is None:
            self.zero_grad()

        self.grad = self.grad + grad

        for dependency in self.dependencies:
            backward_grad = dependency.grad_fn(grad)
            dependency.value.backward(backward_grad)

    def transpose(self, axes: Optional[Tuple[int, ...]] = None) -> "Tensor":
        """Return a tensor with its axes permuted.

        Args:
            axes: Permutation passed to ``np.transpose``; ``None`` reverses
                the order of the axes.

        Returns:
            A new Tensor holding the transposed values. It tracks gradients
            exactly when this tensor does.
        """
        output = np.transpose(self.data, axes=axes)

        # Undoing a permutation needs its inverse. Reversal (axes=None) is its
        # own inverse; an explicit permutation is inverted with argsort after
        # mapping negative axis indices to their non-negative equivalents.
        if axes is None:
            inverse_axes = None
        else:
            normalized = [axis % self.ndim for axis in axes]
            inverse_axes = tuple(int(i) for i in np.argsort(normalized))

        def _bkwd(grad: np.ndarray) -> np.ndarray:
            return np.transpose(grad, axes=inverse_axes)

        dependencies = []
        if self.requires_grad:
            dependencies.append(
                Leaf(value=self, grad_fn=_bkwd)
            )

        return Tensor(
            output,
            requires_grad=self.requires_grad,
            dependencies=dependencies,
            dtype=self.dtype
        )

    @property
    def T(self):
        """Shorthand for ``transpose()`` with the axes reversed."""
        return self.transpose()

### Chain Rule

If $z$ is a function of $y$, and $y$ is in turn a function of $x$, then:

$$
\frac{dz}{dx} = \frac{dz}{dy} \cdot \frac{dy}{dx}
$$

If we have a function composition:  

$$
f(x) = g(h(x))
$$

Then, by the chain rule:

$$
f'(x) = g'(h(x)) \cdot h'(x)
$$

In [48]:
# Start from a 1-D tensor that tracks gradients.
t = Tensor([1, 2, 3], requires_grad=True)
# Assigning through the `data` property converts the values and resets t.grad.
t.data = [[1, 3, 5], [2, 3, 4]]
# `.T` calls transpose() with no axes, i.e. the axis order is reversed.
t_T = t.T

# Seed the backward pass with ones shaped like the transposed tensor.
t_T.backward(np.ones_like(t_T.data))

In [57]:
# Names of the weight-initialisation schemes that Parameter._init recognises.
InitMethod = Literal["xavier", "he", "he_leaky", "normal", "uniform"]


class Parameter(Tensor):
    """Tensor that always tracks gradients and can initialise itself randomly."""

    def __init__(
        self,
        *shape: int,
        data: Optional[np.ndarray] = None,
        init_method: InitMethod = "xavier",
        gain: float = 1.0,
    ):
        """Create a parameter from explicit values or from a random initialiser.

        Args:
            *shape: Dimensions of the parameter; used only when ``data`` is None.
            data: Explicit initial values. When given, ``shape``,
                ``init_method`` and ``gain`` are ignored.
            init_method: Name of the initialisation scheme (see ``_init``).
            gain: Multiplicative factor applied by the initialiser.

        Raises:
            ValueError: If ``init_method`` is not a recognised scheme.
        """
        # Identity check: `in None` is a membership test and raises TypeError.
        if data is None:
            data = self._init(shape, init_method, gain)

        super().__init__(data=data, requires_grad=True)

    def _init(self, shape: Tuple[int, ...], init_method: InitMethod = "xavier", gain: float = 1.0, alpha: float = 0.01):
        """Draw random initial values of the given ``shape``.

        The first dimension is used as the fan-in. For an empty shape or a
        zero first dimension the fan-in falls back to 1 so the scale factor
        stays finite.

        Args:
            shape: Dimensions of the array to generate.
            init_method: One of "xavier", "he", "he_leaky", "normal", "uniform".
            gain: Factor multiplied into the result.
            alpha: Slope term used only by "he_leaky".

        Returns:
            Array (or a float for an empty shape) of random values.

        Raises:
            ValueError: If ``init_method`` is not one of the names above.
        """
        weights = np.random.randn(*shape)
        fan_in = max(shape[0], 1) if shape else 1

        if init_method == "xavier":
            std = gain * np.sqrt(1.0 / fan_in)
            return std * weights
        if init_method == "he":
            std = gain * np.sqrt(2.0 / fan_in)
            return std * weights
        if init_method == "he_leaky":
            std = gain * np.sqrt(2.0 / (1 + alpha**2) * (1 / fan_in))
            return std * weights
        if init_method == "normal":
            return gain * weights
        if init_method == "uniform":
            return gain * np.random.uniform(-1, 1, size=shape)

        raise ValueError(f"Unknown initialization method: {init_method}")

In [None]:
from typing import Iterator

class Module:
    """Base class for callable components that may own Parameters.

    Subclasses implement ``forward``; calling the instance forwards all
    arguments to it and returns its result.
    """

    def __call__(self, *args, **kwds) -> "Tensor":
        """Run ``forward`` with the given arguments and return its result."""
        return self.forward(*args, **kwds)

    def forward(self, *input):
        """Compute the module's output; must be overridden by subclasses."""
        raise NotImplementedError()

    def parameters(self) -> Iterator["Parameter"]:
        """Yield every Parameter stored as an instance attribute.

        Attributes that are themselves Modules are searched recursively
        through their own ``parameters()``.
        """
        for item in self.__dict__.values():
            if isinstance(item, Parameter):
                yield item
            if isinstance(item, Module):
                yield from item.parameters()

    def zero_grad(self) -> None:
        """Call ``zero_grad`` on every parameter returned by ``parameters()``."""
        for param in self.parameters():
            param.zero_grad()


class Sequential(Module):
    """Container that applies its modules one after another."""

    def __init__(self, *modules: Module):
        """Store ``modules`` in the order in which they will be applied."""
        # `super` without parentheses is the builtin type, not a bound proxy.
        super().__init__()
        self.modules = modules

    def parameters(self) -> Iterator["Parameter"]:
        """Yield the parameters of each contained module, in order."""
        for module in self.modules:
            yield from module.parameters()

    def forward(self, *input):
        """Feed ``input`` through every module and return the last output.

        A module that returns a tuple has that tuple unpacked into the next
        module's positional arguments; any other return value is passed as a
        single argument. With no modules, a single input is returned as-is
        and several inputs are returned as a tuple.
        """
        args = input
        result = input[0] if len(input) == 1 else input
        for module in self.modules:
            result = module(*args)
            # Only tuples are star-unpacked; a lone output must be wrapped.
            args = result if isinstance(result, tuple) else (result,)
        return result