# Neural Networks: Zero to Hero

Video:

https://www.youtube.com/watch?v=VMj-3S1tku0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ

Reference notebook:

https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/micrograd/micrograd_lecture_second_half_roughly.ipynb

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class Value:
    """Scalar node of a computation graph with reverse-mode autodiff.

    Every arithmetic method returns a new ``Value`` that remembers the nodes
    it was computed from and installs a closure in ``_backward`` which adds
    the result's gradient contribution to those inputs. Contributions are
    always accumulated with ``+=`` so a node that is used more than once
    receives the sum over all of its uses.

    Attributes:
        data: Numeric value held by the node.
        label: Optional display name shown by ``__repr__``.
        _inputs: List of the ``Value`` nodes this node was computed from.
        _operation: Short tag naming the operation that produced the node.
        _grad: Accumulated gradient, initialised to 0.0.
        _backward: Zero-argument callable that propagates ``_grad`` to the
            inputs; a no-op for leaf nodes.
    """

    def __init__(self, data, label="", _in=(), _op=""):
        self.data = data
        self.label = label
        self._inputs = list(_in)
        self._operation = _op
        self._grad = 0.0
        self._backward = lambda: None

    @staticmethod
    def _wrap(other):
        """Return ``other`` unchanged if it is a Value, else wrap it in one."""
        return other if isinstance(other, Value) else Value(other)

    def __add__(self, other):
        """Return ``self + other``; ``other`` may be a Value or a number."""
        other = self._wrap(other)
        res = Value(data=self.data + other.data, _in=(self, other), _op="+")

        def backward():
            self._grad += 1.0 * res._grad
            other._grad += 1.0 * res._grad

        res._backward = backward
        return res

    def __radd__(self, other):
        """Return ``other + self`` for a non-Value left operand."""
        return Value(other) + self

    def __sub__(self, other):
        """Return ``self - other``; ``other`` may be a Value or a number."""
        other = self._wrap(other)
        res = Value(data=self.data - other.data, _in=(self, other), _op="-")

        def backward():
            self._grad += 1.0 * res._grad
            other._grad += -1.0 * res._grad

        res._backward = backward
        return res

    def __rsub__(self, other):
        """Return ``other - self`` for a non-Value left operand."""
        return Value(other) - self

    def __mul__(self, other):
        """Return ``self * other``; ``other`` may be a Value or a number."""
        other = self._wrap(other)
        res = Value(data=self.data * other.data, _in=(self, other), _op="*")

        def backward():
            self._grad += other.data * res._grad
            other._grad += self.data * res._grad

        res._backward = backward
        return res

    def __rmul__(self, other):
        """Return ``other * self`` for a non-Value left operand."""
        return Value(other) * self

    def __truediv__(self, other):
        """Return ``self / other``; ``other`` may be a Value or a number."""
        other = self._wrap(other)
        res = Value(data=self.data / other.data, _in=(self, other), _op="/")

        def backward():
            self._grad += (1.0 / other.data) * res._grad
            other._grad += (-self.data / other.data**2) * res._grad

        res._backward = backward
        return res

    def __rtruediv__(self, other):
        """Return ``other / self`` for a non-Value left operand."""
        return Value(other) / self

    def __pow__(self, other):
        """Return ``self ** other`` for a plain ``int``/``float`` exponent.

        The exponent is treated as a constant, so only ``self`` receives a
        gradient.
        """
        assert isinstance(other, (int, float)), "exponent must be int or float"

        x = self.data
        # The trailing comma matters: ``(self)`` is just ``self``, which
        # ``list()`` in ``__init__`` cannot iterate.
        res = Value(data=x**other, _in=(self,), _op="pow")

        def backward():
            # d(x**k)/dx = k * x**(k-1). For k == 0 the derivative is 0, and
            # skipping the power avoids a division by zero at x == 0.
            local = other * x ** (other - 1) if other != 0 else 0.0
            # Accumulate like every other op so repeated uses of ``self``
            # add up instead of overwriting each other.
            self._grad += local * res._grad

        res._backward = backward
        return res

    def tanh(self):
        """Return the hyperbolic tangent of this node as a new Value."""
        # math.tanh saturates to +/-1.0 for large |x|, whereas the explicit
        # (exp(2x) - 1) / (exp(2x) + 1) form overflows in math.exp.
        o = math.tanh(self.data)
        res = Value(o, _in=(self,), _op="tanh")

        def backward():
            self._grad += (1.0 - res.data**2) * res._grad

        res._backward = backward
        return res

    def exp(self):
        """Return ``e ** self`` as a new Value."""
        x = self.data
        o = math.exp(x)
        res = Value(o, _in=(self,), _op="exp")

        def backward():
            # d(exp(x))/dx is exp(x), i.e. the forward result itself.
            self._grad += o * res._grad

        res._backward = backward
        return res

    def __repr__(self):
        return f"Value(l={self.label} d={self.data:.4f}, g={self._grad:.4f})"

    def _topo_order(self):
        """Return all nodes reachable from ``self``, inputs before consumers.

        This is a depth-first post-order over ``_inputs`` with ``self`` last.
        It is written with an explicit stack so that long chains of
        operations do not hit Python's recursion limit; the resulting order
        is the same as the straightforward recursive traversal.
        """
        topo = []
        visited = {self}
        stack = [(self, iter(self._inputs))]
        while stack:
            node, pending = stack[-1]
            for inp in pending:
                if inp not in visited:
                    visited.add(inp)
                    stack.append((inp, iter(inp._inputs)))
                    break
            else:
                # Every input of ``node`` has been emitted already.
                topo.append(node)
                stack.pop()
        return topo

    def print_topo(self):
        """Print every node of the graph, starting with ``self``.

        Nodes are printed in reverse topological order, which is the order
        in which ``backward`` visits them.
        """
        for node in reversed(self._topo_order()):
            print(node)

    def backward(self):
        """Backpropagate from this node through the whole graph.

        Sets this node's gradient to 1.0 and then runs each node's
        ``_backward`` closure in reverse topological order. Gradients of the
        other nodes are not reset first, so calling this more than once on
        the same graph accumulates into the existing values.
        """
        order = self._topo_order()

        self._grad = 1.0
        for node in reversed(order):
            node._backward()
    

In [None]:
# Exercise each arithmetic operator with the Value on either side of a plain
# number, so both the regular and the reflected operator methods are used.
a = Value(2.0)
demo_results = [
    ("a+2", a + 2),
    ("2+a", 2 + a),
    ("a-2", a - 2),
    ("2-a", 2 - a),
    ("a*2", a * 2),
    ("2*a", 2 * a),
    ("a/2", a / 2),
    ("2/a", 2 / a),
]
for caption, outcome in demo_results:
    print(caption, outcome)

print("a.exp()", a.exp())

In [None]:
from graphviz import Digraph

def build_dot_graph(root_node):
    """Build a graphviz ``Digraph`` of the graph that ends at ``root_node``.

    Every node reachable through ``_inputs`` becomes a record-shaped graph
    node showing its label, operation, data and gradient. A node that has
    inputs additionally gets a small operation node: the inputs point at the
    operation node, which in turn points at the result.

    Returns:
        The populated ``Digraph`` (SVG format, laid out left to right).
    """
    all_nodes = set()

    def collect(current):
        # Walk the inputs depth-first; nodes that were already seen are
        # skipped so shared sub-expressions are added only once.
        if current in all_nodes:
            return
        all_nodes.add(current)
        for upstream in current._inputs:
            collect(upstream)

    collect(root_node)

    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})
    for node in all_nodes:
        node_id = str(id(node))
        node_label = f'{{ {node.label} ({node._operation}) | {{ d={node.data:.4f} | g={node._grad:.4f} }}}}'
        dot.node(node_id, node_label, shape="record")

        # Leaf values have no producing operation to draw.
        if not node._inputs:
            continue

        op_node_id = node_id + '_' + node._operation
        dot.node(op_node_id, label=node._operation)
        dot.edge(op_node_id, node_id)
        for input_node in node._inputs:
            dot.edge(str(id(input_node)), op_node_id)
    return dot

In [None]:
# Inputs
# A single neuron with two inputs: o = tanh(x1*w1 + x2*w2 + b)

# Inputs
x1 = Value(2.0, 'x1')
x2 = Value(0.0, 'x2')

# Weights
w1 = Value(-3.0, 'w1')
w2 = Value(1.0, 'w2')

# Bias
b = Value(6.8813735870195432, 'b')

# Forward pass; each intermediate result is labelled after it is created
x1w1 = x1 * w1
x1w1.label = 'x1w1'

x2w2 = x2 * w2
x2w2.label = 'x2w2'

x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1w1x2w2'

n = x1w1x2w2 + b
n.label = 'n'

o = n.tanh()
o.label = 'o'

# Backward pass: fills in _grad on every node of the graph
o.backward()

In [None]:
# Expected output of o.print_topo() for the tanh neuron built above. Nodes are
# listed output-first, i.e. in the order in which backward() visits them:
# Value(l=o d=0.7071, g=1.0000)
# Value(l=n d=0.8814, g=0.5000)
# Value(l=b d=6.8814, g=0.5000)
# Value(l=x1w1x2w2 d=-6.0000, g=0.5000)
# Value(l=x2w2 d=0.0000, g=0.5000)
# Value(l=w2 d=1.0000, g=0.0000)
# Value(l=x2 d=0.0000, g=0.5000)
# Value(l=x1w1 d=-6.0000, g=0.5000)
# Value(l=w1 d=-3.0000, g=1.0000)
# Value(l=x1 d=2.0000, g=-1.5000)
o.print_topo()

# Trailing expression: presumably rendered by the notebook as the cell result.
build_dot_graph(o)


In [None]:
# The same neuron again, but with tanh spelled out from primitive operations:
#   tanh(n) = (exp(2n) - 1) / (exp(2n) + 1)

# Inputs
x1 = Value(2.0, 'x1')
x2 = Value(0.0, 'x2')

# Weights
w1 = Value(-3.0, 'w1')
w2 = Value(1.0, 'w2')

# Bias
b = Value(6.8813735870195432, 'b')

# Forward pass up to the pre-activation n
x1w1 = x1 * w1
x1w1.label = 'x1w1'

x2w2 = x2 * w2
x2w2.label = 'x2w2'

x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1w1x2w2'

n = x1w1x2w2 + b
n.label = 'n'

# Activation: e is shared by numerator and denominator, so it receives
# gradient from both during the backward pass.
e = (2*n).exp()
e.label = 'e'

o = (e - 1) / (e + 1)
o.label = 'o'

o.backward()

In [None]:
# Reference values from the neuron that used tanh() directly.
# NOTE(review): this listing is not the literal output of print_topo() here.
# With tanh written as (e - 1) / (e + 1) the graph also contains e, 2*n,
# e - 1, e + 1 and the wrapped constants, so more lines are printed. The
# data/grad values of the nodes listed below should still agree; confirm by
# running the cell.
# Value(l=o d=0.7071, g=1.0000)
# Value(l=n d=0.8814, g=0.5000)
# Value(l=b d=6.8814, g=0.5000)
# Value(l=x1w1x2w2 d=-6.0000, g=0.5000)
# Value(l=x2w2 d=0.0000, g=0.5000)
# Value(l=w2 d=1.0000, g=0.0000)
# Value(l=x2 d=0.0000, g=0.5000)
# Value(l=x1w1 d=-6.0000, g=0.5000)
# Value(l=w1 d=-3.0000, g=1.0000)
# Value(l=x1 d=2.0000, g=-1.5000)
o.print_topo()

# Trailing expression: presumably rendered by the notebook as the cell result.
build_dot_graph(o)


In [None]:
import torch

# Rebuild the same neuron with PyTorch autograd to cross-check the gradients.
# All five scalars are leaf tensors that track gradients.
x1, x2, w1, w2, b = (
    torch.tensor(value, requires_grad=True)
    for value in (2.0, 0.0, -3.0, 1.0, 6.8813735870195432)
)

# Forward pass
n = x1*w1 + x2*w2 + b
o = n.tanh()
print(o.data.item())

# Backward pass populates .grad on every leaf tensor
o.backward()

print(f"x1.grad = {x1.grad.item():.4f}")
print(f"x2.grad = {x2.grad.item():.4f}")
print(f"w1.grad = {w1.grad.item():.4f}")
print(f"w2.grad = {w2.grad.item():.4f}")
print(f"b.grad = {b.grad.item():.4f}")

# Part 2 Neural Network