In [4]:
import numpy as np
import cupy as cp
from __future__ import annotations

In [5]:
# Round-trip a small array: create it with CuPy, then fetch a copy via `.get()`.
x_gpu = cp.array([1, 2, 3])
x_cpu = x_gpu.get()  # presumably a host-side numpy.ndarray per CuPy's API — not shown in the output below
# NOTE(review): the value of the next expression is discarded; a notebook cell
# only echoes its final expression, so only type(x_gpu) appears in the output.
type(x_cpu)
type(x_gpu)

# let's keep things purely on the CPU for now

cupy.ndarray

In [181]:
class Tensor:
  """Minimal reverse-mode autodiff node wrapping a float32 NumPy array.

  Every arithmetic operation returns a new ``Tensor`` that remembers its
  operands in ``parents`` and stores a closure in ``_backward`` which adds
  the operation's local gradient contribution to those operands.
  ``backward()`` walks that graph from the result towards the leaves.

  Attributes:
    name:      optional label, only used by ``__repr__``.
    values:    the data, always stored as a float32 ``np.ndarray``.
    parents:   pair of operand tensors; unused slots are ``None``.
    gradient:  accumulated gradient, same shape as ``values``.
    visited:   legacy flag kept so existing code that reads it still works;
               graph traversal no longer depends on it.

  Operands of binary operations are expected to be ``Tensor`` objects whose
  ``values`` have the same shape; no broadcasting-aware gradient reduction is
  performed.
  """

  def __init__(self, list: list) -> None:
    """Wrap ``list`` (any array-like) as float32 values with a zero gradient.

    The parameter name shadows the builtin ``list`` inside this method only;
    it is kept unchanged so keyword callers continue to work.
    """
    self.name: str = ""
    self.values: np.ndarray = np.array(list, dtype="float32")
    # (left operand, right operand); (None, None) marks a leaf.
    self.parents = (None, None)
    self.gradient: np.ndarray = np.zeros_like(self.values)
    # Leaves have nothing to propagate, so their backward step is a no-op.
    self._backward = lambda *args: None
    self.visited = False

  def __add__(self, other: "Tensor") -> "Tensor":
    """Element-wise ``self + other``."""
    child = Tensor(self.values + other.values)
    child.parents = (self, other)
    def _backward() -> None:
      # d(a + b)/da = d(a + b)/db = 1, so the upstream gradient passes through.
      self.gradient += np.ones_like(self.values) * child.gradient
      other.gradient += np.ones_like(other.values) * child.gradient
    child._backward = _backward
    return child

  def __sub__(self, other: "Tensor") -> "Tensor":
    """Element-wise ``self - other``, built as ``self + (-other)``."""
    return self + (-other)

  def __mul__(self, other: "Tensor") -> "Tensor":
    """Element-wise ``self * other``."""
    child = Tensor(self.values * other.values)
    child.parents = (self, other)
    def _backward() -> None:
      # Product rule: each operand's gradient is scaled by the *values* of
      # the other operand (not by the Tensor object itself).
      self.gradient += other.values * child.gradient
      other.gradient += self.values * child.gradient
    child._backward = _backward
    return child

  def __truediv__(self, other: "Tensor") -> "Tensor":
    """Element-wise ``self / other``, built as ``self * other ** -1``."""
    return self * (other ** -1)

  def __neg__(self) -> "Tensor":
    """Element-wise negation."""
    child = Tensor(-self.values)
    child.parents = (self, None)
    def _backward() -> None:
      self.gradient += -np.ones_like(self.values) * child.gradient
    child._backward = _backward
    return child

  def __pow__(self, other: float) -> "Tensor":
    """Element-wise power with a plain numeric (non-Tensor) exponent."""
    child = Tensor(self.values ** other)
    child.parents = (self, None)
    def _backward() -> None:
      # Power rule: d(x ** n)/dx = n * x ** (n - 1).
      self.gradient += other * self.values ** (other - 1) * child.gradient
    child._backward = _backward
    return child

  def sum(self) -> "Tensor":
    """Sum of all elements, returned as a one-element Tensor."""
    child = Tensor([self.values.sum()])
    child.parents = (self, None)
    def _backward() -> None:
      # Every element contributes with weight 1 to the total.
      self.gradient += np.ones_like(self.values) * child.gradient
    child._backward = _backward
    return child

  def topological_sort(self) -> "list[Tensor]":
    """Return every node reachable from ``self``, consumers before operands.

    The result starts with ``self``; each node appears exactly once and only
    after every node (within this graph) that uses it as a parent, which is
    the order ``backward()`` needs.

    The walk is an explicit-stack post-order DFS with a per-call ``seen`` set.
    Compared with a recursive walk that marks nodes via a persistent flag this
      * can be repeated (nothing is left behind on the nodes),
      * expands each node once, so shared sub-expressions stay linear-time,
      * is not bounded by Python's recursion limit on deep graphs.
    Parents are explored left operand first.
    """
    post_order = []
    seen = set()  # ids of nodes already expanded during this call
    stack = [(self, False)]
    while stack:
      node, parents_done = stack.pop()
      if parents_done:
        post_order.append(node)
        continue
      if id(node) in seen:
        continue
      seen.add(id(node))
      # Revisit the node once all of its parents have been emitted.
      stack.append((node, True))
      # Push in reverse so the left operand is popped (explored) first.
      for parent in reversed(node.parents):
        if parent is not None and id(parent) not in seen:
          stack.append((parent, False))
    post_order.reverse()
    return post_order

  def backward(self, verbose: bool = True) -> None:
    """Back-propagate from ``self`` through the whole graph.

    ``self.gradient`` is seeded with ones. Gradients of interior nodes (those
    that have at least one parent) are reset first, because they are
    recomputed from scratch on every pass; leaf gradients are left alone, so
    repeated calls accumulate into the leaves until ``zero_grad()`` is called
    on them.

    Args:
      verbose: when True (the default, matching the previous behaviour) each
        node is printed right after its backward step has run.
    """
    order = self.topological_sort()
    for node in order:
      if any(parent is not None for parent in node.parents):
        # Stale values from an earlier pass would otherwise be propagated again.
        node.gradient = np.zeros_like(node.values)
    self.gradient = np.ones_like(self.values)  # gradient w.r.t. self
    for node in order:
      node._backward()
      if verbose:
        print(node)

  def zero_grad(self) -> None:
    """Reset this node's gradient (and legacy ``visited`` flag) only."""
    self.gradient = np.zeros_like(self.values)
    self.visited = False

  def __repr__(self) -> str:
    return f"{self.name}: {self.values} : {self.gradient}"

In [182]:
# Leaf tensors.
a = Tensor([1, 2, 3])
a.name = "a"
b = Tensor([2, 4, 6])
b.name = "b"
c = Tensor([3, 6, 9])
c.name = "c"

# Build l = sum((-(a * b) - c) ** 2 / a) one named step at a time so that
# every intermediate node can be identified in the backward trace.
d = a * b
d.name = "d"
dn = -d
dn.name = "dn"
e = dn - c
e.name = "e"
e2 = e ** 2
e2.name = "e2"
f = e2 / a
f.name = "f"
l = f.sum()
l.name = "l"

# Back-propagate from the scalar result; each node is printed as it is processed.
l.backward()

l: [366.] : [1.]
f: [ 25.  98. 243.] : [1. 1. 1.]
: [1.         0.5        0.33333334] : [ 25. 196. 729.]
e2: [ 25. 196. 729.] : [1.         0.5        0.33333334]
e: [ -5. -14. -27.] : [-10. -14. -18.]
: [-3. -6. -9.] : [-10. -14. -18.]
c: [3. 6. 9.] : [10. 14. 18.]
dn: [ -2.  -8. -18.] : [-10. -14. -18.]
d: [ 2.  8. 18.] : [10. 14. 18.]
b: [2. 4. 6.] : [10. 28. 54.]
a: [1. 2. 3.] : [-5.  7. 27.]


In [184]:
# comparing with pytorch
import torch
# Same leaves as the hand-written Tensor example, as float64 PyTorch tensors.
a = torch.tensor([1, 2, 3], dtype=torch.float64, requires_grad=True)
b = torch.tensor([2, 4, 6], dtype=torch.float64, requires_grad=True)
c = torch.tensor([3, 6, 9], dtype=torch.float64, requires_grad=True)

# Rebuild the same expression step by step; retain_grad() keeps the gradient
# of each non-leaf result so it can be inspected after backward().
d = a * b
d.retain_grad()
dn = -d
dn.retain_grad()
e = dn - c
e.retain_grad()
e2 = e ** 2
e2.retain_grad()
f = e2 / a
f.retain_grad()
l = f.sum()
l.retain_grad()

l.backward()

In [185]:
# Show the stored gradients starting from the result and ending at the leaves.
tensors = [a, b, c, d, e, e2, f, l][::-1]
for tensor in tensors:
  print(tensor.grad)

tensor(1., dtype=torch.float64)
tensor([1., 1., 1.], dtype=torch.float64)
tensor([1.0000, 0.5000, 0.3333], dtype=torch.float64)
tensor([-10., -14., -18.], dtype=torch.float64)
tensor([10., 14., 18.], dtype=torch.float64)
tensor([10., 14., 18.], dtype=torch.float64)
tensor([10., 28., 54.], dtype=torch.float64)
tensor([-5.,  7., 27.], dtype=torch.float64)


In [153]:
# Show the tensors themselves, starting from the result and ending at the leaves.
tensors = [a, b, c, d, e, e2, f, l]
tensors.reverse()
for tensor in tensors:
  print(tensor)

tensor(2604., dtype=torch.float64, grad_fn=<SumBackward0>)
tensor([  25.,  392., 2187.], dtype=torch.float64, grad_fn=<MulBackward0>)
tensor([ 25., 196., 729.], dtype=torch.float64, grad_fn=<PowBackward0>)
tensor([ -5., -14., -27.], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([ 2.,  8., 18.], dtype=torch.float64, grad_fn=<MulBackward0>)
tensor([3., 6., 9.], dtype=torch.float64, requires_grad=True)
tensor([2., 4., 6.], dtype=torch.float64, requires_grad=True)
tensor([1., 2., 3.], dtype=torch.float64, requires_grad=True)
