## Lectures 7 and 8: Building Autograd engine from scratch

### Learning outcomes
1. Understanding Automated Differentiation Engines at a foundational level
2. Operator Overloading in Objoct Oriented Programming
3. Graph Representation
4. Graph Traversal

To Draw a painting representing the conciseness of the mathematics, the systematism of DSA and the abstraction of programming paradigms like OOP

## Problem Statement
Restricting yourself to <font color="red">Python's Standard Library</font>, build an <font color="red">Autograd Engine</font> capable of estimating the gradients required to solve the following problem using gradient descent.
<br><br>
Find a point in $\mathbb{R}^{2}$ with the least average Euclidean distance to a set of arbitrary points

In [3]:
# previously in the course
from random import Random
from math import ceil, sqrt

def generate_pnts(N=1000):
    rndm_obj = Random(x=5)
    return [rndm_obj.uniform(a=0, b=1) for _ in range(N)],[rndm_obj.uniform(a=0, b=1) for _ in range(N)]

def calc_grad_1(x_p, y_p, batch_x ,batch_y):
  sum_x, sum_y = 0, 0
  n = len(batch_x)
  for x_i, y_i in zip(batch_x ,batch_y):
    inv_sqrt = ((x_i - x_p) ** 2 + (y_i - y_p) ** 2) ** (-0.5)
    sum_x += inv_sqrt * (x_i - x_p)
    sum_y += inv_sqrt * (y_i - y_p)
  return -sum_x/n, -sum_y/n

def loss(x_p, y_p, batch_x ,batch_y):
  return (1/len(batch_x))* sum([sqrt((x_i-x_p)**2+(y_i-y_p)**2) for x_i , y_i in zip(batch_x, batch_y)])


In [392]:
data_x, data_y = generate_pnts()

dl_dx_1, dl_dy_1 = calc_grad_1(0.3,0.3,data_x,data_y)
print(dl_dx_1,dl_dy_1)


-0.3277744397151291 -0.336282174741702


In [393]:
#loss without tensors
loss(0.3,0.3,data_x,data_y)

0.4430528244756474

## PyTorch as an example of Autograd engines

In [394]:
import torch 
pnt = torch.tensor([0.3,0.3])
pnt.requires_grad = True
data_tnsr = torch.tensor([data_x,data_y])
data_tnsr = data_tnsr.t()
#each row, contains two points of the dataset
print(data_tnsr.shape)

torch.Size([1000, 2])


In [395]:
#loss with tensors
loss_torch = torch.mean(torch.sqrt(((data_tnsr-pnt)**2).sum(dim=1)))
loss_torch

tensor(0.4431, grad_fn=<MeanBackward0>)

In [396]:
#make it leaf, to retain grad
pnt.retain_grad()
#if no backward, no grad, it would be none
loss_torch.backward()
torch_grad = pnt.grad.data
torch_grad
pnt.grad.zero_()

tensor([0., 0.])

In [397]:
for i in range(3):
    print(f"iteration {i}")
    loss_torch = torch.mean(torch.sqrt(((data_tnsr-pnt)**2).sum(dim=1)))
    print(f"loss torch:{loss_torch}")
    loss_torch.backward()
    print(f"torch grad = {pnt.grad.data}")
pnt.grad.zero_()


iteration 0
loss torch:0.44305282831192017
torch grad = tensor([-0.3278, -0.3363])
iteration 1
loss torch:0.44305282831192017
torch grad = tensor([-0.6555, -0.6726])
iteration 2
loss torch:0.44305282831192017
torch grad = tensor([-0.9833, -1.0088])


tensor([0., 0.])

In [398]:
for i in range(3):
    print(f"iteration {i}")
    loss_torch = torch.mean(torch.sqrt(((data_tnsr-pnt)**2).sum(dim=1)))
    print(f"loss torch:{loss_torch}")
    loss_torch.backward()
    print(f"torch grad = {pnt.grad.data}")
    pnt.grad.zero_()


iteration 0
loss torch:0.44305282831192017
torch grad = tensor([-0.3278, -0.3363])
iteration 1
loss torch:0.44305282831192017
torch grad = tensor([-0.3278, -0.3363])
iteration 2
loss torch:0.44305282831192017
torch grad = tensor([-0.3278, -0.3363])


## Building an Autograd From Scratch 

In [1]:
# Why using a class? because it's the paradigm that keeps the state and the behaviour
class comp_node:
    def __init__(self,val,children=[], op = "assign"):
        self.val = val # a place holder in the memory
        self.children = children
        self.grad = 0 # initialized with 0 because we do a (plus equal operation)
        self.op = op 
        self.backward_prop = lambda : None
    def __to_comp_node__(self, obj):
        if not isinstance(obj, comp_node):
            return comp_node(val = obj)
        else:
            return obj

    def __sub__(self,other):
        other = self.__to_comp_node__(other)
        out = comp_node(val=self.val - other.val, children=[self,other], op = "subtraction")
        def _backward_prop():
            self.grad -= out.grad * (-1)
            other.grad -= out.grad * (-1)
        out.backward_prop = _backward_prop
        return out
    def __rsub__(self,other):
        other = self.__to_comp_node__(other)
       # out = comp_node(val=other.val - self.val, children=[self,other])
        return other - self

    def __add__(self,other):
        other = self.__to_comp_node__(other)
        out = comp_node(val=self.val + other.val, children=[self,other], op = "addition")
        def _backward_prop():
            self.grad += out.grad *1
            other.grad += out.grad *1
        out.backward_prop = _backward_prop
        return out
    def __radd__(self,other):
        other = self.__to_comp_node__(other)
       # out = comp_node(val=other.val - self.val, children=[self,other])
        return other + self
    def __mul__(self,other):
        other = self.__to_comp_node__(other)
        out = comp_node(val=self.val * other.val, children=[self,other], op = "multplication")
        def __backward_prop():
         self.grad += out.grad * other.val
         other.grad += out.grad * self.val
        out.backward_prop = __backward_prop 
        return out
    def __rmul__(self,other):
        other = self.__to_comp_node__(other)
       # out = comp_node(val=other.val - self.val, children=[self,other])
        return other * self
    def __pow__(self, exponent):
        if not isinstance(exponent, (int, float)):
            raise ValueError("unsupported types")
        out = comp_node(val = self.val ** exponent, children=[self], op = f"power {exponent}")
        def _backward_prop():
          self.grad += out.grad * (exponent * self.val **(exponent-1))
        out.backward_prop = _backward_prop
        return out
    def __eq__(self,other):
        return self.val == other.val
    def __repr__(self):
        return f"op: {self.op} | val: {self.val:0.5f} | number of children {len(self.children)} | gradient {self.grad}"


assert comp_node(val = 5 ).val == 5, "assignment failed"
assert (comp_node(val = 5) - comp_node(val = 3)).val == 2, "nodes subtraction overloading failed"
assert (comp_node(val = 5) - 3).val == 2, "integer and node subtraction overloading failed"
#in rsub-> other - self 
assert ( 5 - comp_node(val =3)).val == 2, "integer and node right subtraction overloading failed"

assert (comp_node(val = 5) + comp_node(val = 3)).val == 8, "nodes addition overloading failed"
assert (comp_node(val = 5) + 3).val == 8, "integer and node addition overloading failed"
#in radd-> other + self 
assert ( 5 + comp_node(val =3)).val == 8, "integer and node right addidtion overloading failed"
assert (comp_node(val=25)**0.5).val == 5, "node power int or float failed"
assert comp_node(val = 5)**2 == comp_node(val =25), "nodes comparison failed"
assert (5 * comp_node(val =2)).val == 10, "node muliplying failed"

In [7]:
data_x, data_y = generate_pnts(N=1)
x_p,y_p = comp_node(val=0.3), comp_node(val=0.3)

def graph_loss(xp, yp, datax , datay):
    #n = len(datax)
    #l,M,gx,gy, Ix, Iy = 0,0,0,0,0,0
    #for datax, datay in zip(datax, datay):
    Ix = datax - xp
    Iy = datay - yp
    gx = Ix**2
    gy = Iy**2
    M = gx + gy
    l = M ** 0.5
    #l = l * (1/n)
    return l, [l,M,gx,gy, Ix, Iy, xp, yp] # 

graph_loss_val, reverse_topo_order = graph_loss(x_p, y_p, data_x[0], data_y[0])
reverse_topo_order[0].grad = 1

for i, node in enumerate(reverse_topo_order):
  node.backward_prop()
  print(i, node)



0 op: power 0.5 | val: 0.54721 | number of children 1 | gradient 1
1 op: addition | val: 0.29944 | number of children 2 | gradient 0.9137222319490423
2 op: power 2 | val: 0.10427 | number of children 1 | gradient 0.9137222319490423
3 op: power 2 | val: 0.19518 | number of children 1 | gradient 0.9137222319490423
4 op: subtraction | val: 0.32290 | number of children 2 | gradient 0.5900849147094943
5 op: subtraction | val: 0.44179 | number of children 2 | gradient 0.8073411877467226
6 op: assign | val: 0.30000 | number of children 0 | gradient 0.5900849147094943
7 op: assign | val: 0.30000 | number of children 0 | gradient 0.8073411877467226
