In [4]:
import numpy as np
import cupy as cp
from __future__ import annotations

In [5]:
x_gpu = cp.array([1, 2, 3])
x_cpu = x_gpu.get()
type(x_cpu)
type(x_gpu)

# lets keep things on purely on the cpu for now

cupy.ndarray

In [91]:
class Tensor:
  def __init__(self, list: list) -> None:
    self.name: str = ""
    self.values: np.array = np.array(list)
    self.parents: tuple[Tensor, Tensor] = (None, None)
    self.gradient: np.array = np.zeros_like(self.values)
    self._backward: function = lambda *args: None
    self.visited = False

  def __add__(self, other: Tensor) -> Tensor:
    child = Tensor(self.values + other.values)
    child.parents = (self, other)
    def _backward() -> None:
      self.gradient += 1 * child.gradient
      other.gradient += 1 * child.gradient
    child._backward = _backward
    return child
  
  def __mul__(self, other: Tensor) -> Tensor:
    child = Tensor(self.values * other.values)
    child.parents = (self, other)
    def _backward() -> None:
      self.gradient += other.values * child.gradient # mistake made using Tensor object instead of tensor value
      other.gradient += self.values * child.gradient # keep in mind whether dealing with object object.values or object.gradient
    child._backward = _backward
    return child
  
  def sum(self) -> Tensor:
    child = Tensor([self.values.sum()])
    child.parents = (self, None)
    def _backward() -> None:
      self.gradient += np.ones_like(self.values) * child.gradient
    child._backward = _backward
    return child
  
  def topological_sort(self) -> list[Tensor]:
    dfs_sort: list[Tensor] = []
    def dfs(node: Tensor):
      if node.parents[0]:
        dfs(node.parents[0])
      if node.parents[1]:
        dfs(node.parents[1])
      if not node.visited:
        dfs_sort.append(node); node.visited = True
    dfs(self); return list(reversed(dfs_sort))

  def backward(self):
    self.gradient = np.ones_like(self.values) # gradient w.r.t self
    for node in self.topological_sort():
      node._backward()
      print(node)

  def zero_grad(self):
    self.gradient = np.zeros_like(self.values)
    self.visited = False
  
  def __repr__(self) -> str:
    return f"{self.name}: {self.values} : {self.gradient}"

In [92]:
np.array([1, 2]) * np.array([2, 4])

array([2, 8])

In [97]:
a = Tensor([1, 2, 3]); a.name = "a"
b = Tensor([2, 4, 6]); b.name = "b"
c = Tensor([3, 6, 9]); c.name = "c"
d = a * b; d.name = "d"
e = d + c; e.name = "e"
f = e * a; f.name = "f"
l = f.sum(); l.name = "l"
l.backward()


l: [114] : [1]
f: [ 5 28 81] : [1 1 1]
e: [ 5 14 27] : [1 2 3]
c: [3 6 9] : [1 2 3]
d: [ 2  8 18] : [1 2 3]
b: [2 4 6] : [1 4 9]
a: [1 2 3] : [ 7 22 45]


In [94]:
# comparing with pytorch
import torch
a = torch.tensor([1, 2, 3], dtype=float, requires_grad=True)
b = torch.tensor([2, 4, 6], dtype=float, requires_grad=True)
c = torch.tensor([3, 6, 9], dtype=float, requires_grad=True)
d = a * b; d.retain_grad()
e = d + c; e.retain_grad()
f = e * a; f.retain_grad()
l = f.sum(); l.retain_grad()
l.backward()

tensor([ 2.,  8., 18.], dtype=torch.float64, grad_fn=<MulBackward0>)
tensor([ 5., 14., 27.], dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([ 5., 28., 81.], dtype=torch.float64, grad_fn=<MulBackward0>)
tensor(114., dtype=torch.float64, grad_fn=<SumBackward0>)


In [95]:
tensors = [a, b, c, d, e, f, l]
for tensor in tensors:
  print(tensor.grad)

tensor([ 7., 22., 45.], dtype=torch.float64)
tensor([1., 4., 9.], dtype=torch.float64)
tensor([1., 2., 3.], dtype=torch.float64)
tensor([1., 2., 3.], dtype=torch.float64)
tensor([1., 2., 3.], dtype=torch.float64)
tensor([1., 1., 1.], dtype=torch.float64)
tensor(1., dtype=torch.float64)
