# Neural Networks: Zero to Hero

Video:

https://www.youtube.com/watch?v=VMj-3S1tku0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ

Reference notebook:

https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/micrograd/micrograd_lecture_second_half_roughly.ipynb

# Part 1: Neuron

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class Value:
    """A scalar that records how it was computed so gradients can be back-propagated.

    Each arithmetic operation returns a new ``Value`` whose ``_inputs`` reference
    the operands and whose ``_backward`` closure applies the chain rule for that
    single operation.  Calling :meth:`backward` on a result visits the graph in
    reverse topological order and accumulates d(result)/d(node) into ``_grad``
    of every node reachable through ``_inputs``.
    """

    def __init__(self, data, label="", _in=(), _op=""):
        """Create a graph node.

        Args:
            data: Numeric payload of the node.
            label: Optional human-readable name, shown by ``__repr__``.
            _in: Values this node was computed from (empty for leaves).
            _op: Short name of the operation that produced this node.
        """
        self.data = data
        self.label = label
        self._inputs = list(_in)
        self._operation = _op
        self._grad = 0.0
        # Leaves have nothing to propagate; operations replace this closure.
        self._backward = lambda: None

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        res = Value(data=self.data+other.data, _in=(self, other), _op="+")
        def backward():
            # d(a+b)/da = d(a+b)/db = 1
            self._grad += 1.0 * res._grad
            other._grad += 1.0 * res._grad
        res._backward = backward
        return res

    def __radd__(self, other):
        """Return ``other + self`` for a non-Value left operand."""
        return Value(other) + self

    def __sub__(self, other):
        """Return ``self - other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        res = Value(data=self.data-other.data, _in=(self, other), _op="-")
        def backward():
            # d(a-b)/da = 1, d(a-b)/db = -1
            self._grad += 1.0 * res._grad
            other._grad += -1.0 * res._grad
        res._backward = backward
        return res

    def __rsub__(self, other):
        """Return ``other - self`` for a non-Value left operand."""
        return Value(other) - self

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        res = Value(data=self.data*other.data, _in=(self, other), _op="*")
        def backward():
            # d(a*b)/da = b, d(a*b)/db = a
            self._grad += other.data * res._grad
            other._grad += self.data * res._grad
        res._backward = backward
        return res

    def __rmul__(self, other):
        """Return ``other * self`` for a non-Value left operand."""
        return Value(other) * self

    def __truediv__(self, other):
        """Return ``self / other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        res = Value(data=self.data/other.data, _in=(self, other), _op="/")
        def backward():
            # d(a/b)/da = 1/b, d(a/b)/db = -a/b**2
            self._grad += (1.0 / other.data) * res._grad
            other._grad += (-self.data / other.data**2) * res._grad
        res._backward = backward
        return res

    def __rtruediv__(self, other):
        """Return ``other / self`` for a non-Value left operand."""
        return Value(other) / self

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent.

        The exponent is not part of the graph, so no gradient flows to it.
        """
        assert isinstance(other, (int, float))

        x = self.data
        o = x**other

        res = Value(data=o, _in=(self,), _op="pow")
        def backward():
            # Accumulate (+=) like every other operation: the same node may
            # feed several consumers, and their contributions must add up
            # rather than overwrite each other.
            self._grad += (other * x**(other-1.0)) * res._grad
        res._backward = backward
        return res

    def tanh(self):
        """Return ``tanh(self)`` as a new Value."""
        x = self.data
        try:
            e2x = math.exp(2*x)
            o = (e2x - 1.0) / (e2x + 1)
        except OverflowError:
            # math.exp only overflows for large positive arguments, where
            # tanh(x) equals 1.0 to double precision.
            o = 1.0
        res = Value(o, _in=(self,), _op="tanh")
        def backward():
            # d tanh(x)/dx = 1 - tanh(x)**2
            self._grad += (1.0 - res.data**2) * res._grad
        res._backward = backward
        return res

    def exp(self):
        """Return ``e ** self`` as a new Value."""
        x = self.data
        o = math.exp(x)
        res = Value(o, _in=(self,), _op="exp")
        def backward():
            # d exp(x)/dx = exp(x), which is the forward result itself.
            self._grad += o * res._grad
        res._backward = backward
        return res

    def __repr__(self):
        return f"Value(l={self.label} d={self.data:.4f}, g={self._grad:.4f})"

    def topo_sort(self):
        """Return every node reachable from ``self``, inputs before consumers.

        ``self`` is always the last element of the returned list.
        """
        topo = []
        visited = set()
        def build(v):
            if v not in visited:
                visited.add(v)
                for inp in v._inputs:
                    build(inp)
                # Appended only after all of its inputs have been appended.
                topo.append(v)
        build(self)
        return topo

    def print_topo(self):
        """Print every node of the graph, starting from ``self``."""
        topo = self.topo_sort()
        for node in reversed(topo):
            print(node)

    def backward(self):
        """Back-propagate from ``self``, accumulating into each node's ``_grad``.

        Gradients are added to whatever ``_grad`` already holds, so call
        :meth:`zero_grad` (or reset the leaves) before reusing nodes.
        """
        topo = self.topo_sort()
        # Seed: d(self)/d(self) = 1.
        self._grad = 1.0
        # Consumers run before their inputs, so each node's gradient is
        # complete by the time its own _backward distributes it.
        for node in reversed(topo):
            node._backward()

    def zero_grad(self):
        """Reset ``_grad`` to 0.0 on every node reachable from ``self``."""
        # topo_sort() includes self, so no separate reset is needed.
        for node in self.topo_sort():
            node._grad = 0.0

    def check_grad(self):
        """Count the nodes of the graph whose gradient is zero / non-zero.

        Returns:
            dict with integer counts under ``"grad_is_zero"`` and
            ``"grad_non_zero"``.
        """
        result = {"grad_is_zero": 0, "grad_non_zero": 0}
        for node in self.topo_sort():
            if node._grad == 0.0:
                result["grad_is_zero"] += 1
            else:
                result["grad_non_zero"] += 1
        return result
    

In [None]:
# Smoke-test every operator overload with the Value on either side of the
# operator (a number on the left goes through the reflected __r*__ methods),
# followed by exp().
a = Value(2.0)

checks = (
    ("a+2", a + 2),
    ("2+a", 2 + a),
    ("a-2", a - 2),
    ("2-a", 2 - a),
    ("a*2", a * 2),
    ("2*a", 2 * a),
    ("a/2", a / 2),
    ("2/a", 2 / a),
    ("a.exp()", a.exp()),
)
for caption, outcome in checks:
    print(caption, outcome)

In [None]:
from graphviz import Digraph

def build_dot_graph(root_node):
    """Build a graphviz ``Digraph`` of the computation graph ending at ``root_node``.

    Every Value becomes a record-shaped node showing its label, operation,
    data and gradient.  Each Value that has inputs additionally gets a small
    operation node placed between its inputs and itself.

    Returns:
        The populated ``Digraph`` (SVG format, laid out left-to-right).
    """
    all_nodes = set()

    def collect(current):
        # Depth-first walk along `_inputs`, registering each node once.
        if current in all_nodes:
            return
        all_nodes.add(current)
        for upstream in current._inputs:
            collect(upstream)

    collect(root_node)

    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})
    for node in all_nodes:
        node_id = str(id(node))
        node_label=f'{{ {node.label} ({node._operation}) | {{ d={node.data:.4f} | g={node._grad:.4f} }}}}'
        dot.node(node_id, node_label, shape="record")

        # Leaves have no producing operation, hence no operation node.
        if not node._inputs:
            continue

        op_node_id = node_id + '_' + node._operation
        dot.node(op_node_id, label=node._operation)
        dot.edge(op_node_id, node_id)
        for upstream in node._inputs:
            dot.edge(str(id(upstream)), op_node_id)
    return dot

In [None]:
# Inputs
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

# Weights
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

# Bias
b = Value(6.8813735870195432, label='b')

# Forward pass: n = x1*w1 + x2*w2 + b, o = tanh(n).
# Intermediate results are labelled after creation because the operators
# return unlabelled Values.
x1w1 = x1 * w1
x1w1.label = 'x1w1'

x2w2 = x2 * w2
x2w2.label = 'x2w2'

x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1w1x2w2'

n = x1w1x2w2 + b
n.label = 'n'

o = n.tanh()
o.label = 'o'

# Backward pass: fills in _grad for every node feeding into o.
o.backward()

In [None]:
# Reference output of o.print_topo() recorded by the author
# (l = label, d = data, g = gradient):
# Value(l=o d=0.7071, g=1.0000)
# Value(l=n d=0.8814, g=0.5000)
# Value(l=b d=6.8814, g=0.5000)
# Value(l=x1w1x2w2 d=-6.0000, g=0.5000)
# Value(l=x2w2 d=0.0000, g=0.5000)
# Value(l=w2 d=1.0000, g=0.0000)
# Value(l=x2 d=0.0000, g=0.5000)
# Value(l=x1w1 d=-6.0000, g=0.5000)
# Value(l=w1 d=-3.0000, g=1.0000)
# Value(l=x1 d=2.0000, g=-1.5000)
o.print_topo()

# Left as the last expression of the cell; presumably so the notebook
# renders the returned graph.
build_dot_graph(o)


In [None]:
# Inputs
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

# Weights
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

# Bias
b = Value(6.8813735870195432, label='b')

# Forward pass up to the pre-activation n = x1*w1 + x2*w2 + b.
x1w1 = x1 * w1
x1w1.label = 'x1w1'

x2w2 = x2 * w2
x2w2.label = 'x2w2'

x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1w1x2w2'

n = x1w1x2w2 + b
n.label = 'n'

# tanh spelled out with primitive operations:
# tanh(n) = (exp(2n) - 1) / (exp(2n) + 1)
e = (2*n).exp()
e.label = 'e'

o = (e - 1) / (e + 1)
o.label = 'o'

# Backward pass through the decomposed expression.
o.backward()

In [None]:
# Reference output of o.print_topo() recorded by the author
# (l = label, d = data, g = gradient; nodes with an empty label are the
# unlabelled intermediates and wrapped constants of the decomposed tanh):
# Value(l=o d=0.7071, g=1.0000)
# Value(l= d=6.8284, g=-0.1036)
# Value(l= d=1.0000, g=-0.1036)
# Value(l= d=4.8284, g=0.1464)
# Value(l= d=1.0000, g=-0.1464)
# Value(l=e d=5.8284, g=0.0429)
# Value(l= d=1.7627, g=0.2500)
# Value(l=n d=0.8814, g=0.5000)
# Value(l=b d=6.8814, g=0.5000)
# Value(l=x1w1x2w2 d=-6.0000, g=0.5000)
# Value(l=x2w2 d=0.0000, g=0.5000)
# Value(l=w2 d=1.0000, g=0.0000)
# Value(l=x2 d=0.0000, g=0.5000)
# Value(l=x1w1 d=-6.0000, g=0.5000)
# Value(l=w1 d=-3.0000, g=1.0000)
# Value(l=x1 d=2.0000, g=-1.5000)
# Value(l= d=2.0000, g=0.2203)
o.print_topo()

# Left as the last expression of the cell; presumably so the notebook
# renders the returned graph.
build_dot_graph(o)


In [None]:
import torch

# Cross-check: the same single neuron expressed with PyTorch autograd.
# Every leaf tensor needs requires_grad=True so that .grad gets populated.
x1 = torch.tensor(2.0, requires_grad=True)
x2 = torch.tensor(0.0, requires_grad=True)
w1 = torch.tensor(-3.0, requires_grad=True)
w2 = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(6.8813735870195432, requires_grad=True)

n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

print(o.data.item())
o.backward()

for name, leaf in (("x1", x1), ("x2", x2), ("w1", w1), ("w2", w2), ("b", b)):
    print(f"{name}.grad = {leaf.grad.item():.4f}")

# Reference output recorded by the author:
# 0.7071067094802856
# x1.grad = -1.5000
# x2.grad = 0.5000
# w1.grad = 1.0000
# w2.grad = 0.0000
# b.grad = 0.5000

# Part 2: Neural Network

In [None]:
import random

In [None]:
class Neuron:
    """A single tanh unit: ``tanh(bias + sum(x_i * w_i))``.

    Weights and bias are ``Value`` objects initialised uniformly in [-1, 1]
    and labelled ``<label>_w<i>`` / ``<label>_b``.
    """

    def __init__(self, nin, label=""):
        self.label = label
        # Weights are drawn first, then the bias, so the sequence of
        # random.uniform calls is fixed for a given seed.
        self.weights = [
            Value(random.uniform(-1.0, 1.0), f'{self.label}_w{i}')
            for i in range(nin)
        ]
        self.bias = Value(random.uniform(-1.0, 1.0), f'{self.label}_b')

    def __call__(self, inputs):
        """Return the neuron's output Value for a list of numbers/Values."""
        assert isinstance(inputs, list)
        assert all(isinstance(i, (int, float, Value)) for i in inputs)
        # Start from the bias and add one weighted input at a time.
        activation = self.bias
        for x, w in zip(inputs, self.weights):
            activation = activation + x * w
        return activation.tanh()

    def parameters(self):
        """Return the trainable Values: all weights followed by the bias."""
        return [*self.weights, self.bias]
    
class Layer:
    """``nout`` independent neurons that all receive the same ``nin`` inputs."""

    def __init__(self, nin, nout, label=""):
        self.label = label
        # Neuron labels are derived from the layer label: "<label>_n<i>".
        self.neurons = [
            Neuron(nin, label=f"{self.label}_n{i}")
            for i in range(nout)
        ]

    def __call__(self, inputs):
        """Return a list with one output per neuron, in neuron order."""
        assert isinstance(inputs, list)
        assert all(isinstance(i, (int, float, Value)) for i in inputs)
        return [neuron(inputs) for neuron in self.neurons]

    def parameters(self):
        """Return the parameters of all neurons as one flat list."""
        return [
            param
            for neuron in self.neurons
            for param in neuron.parameters()
        ]
    
class MLP:
    """A stack of layers applied one after another.

    Args:
        nin: Number of inputs to the first layer.
        dims: Output size of each successive layer.
    """

    def __init__(self, nin, dims):
        assert isinstance(nin, int)
        assert isinstance(dims, list)
        assert all(isinstance(d, int) for d in dims)
        # Consecutive pairs of sizes give each layer's (inputs, outputs).
        sizes = [nin, *dims]
        self.layers = [
            Layer(fan_in, fan_out, label=f"l{idx}")
            for idx, (fan_in, fan_out) in enumerate(zip(sizes, sizes[1:]))
        ]

    def __call__(self, inputs):
        """Feed ``inputs`` through every layer in order.

        Returns the bare element when the final result has exactly one
        element, otherwise the full list.
        """
        activations = inputs
        for layer in self.layers:
            activations = layer(activations)
        return activations[0] if len(activations) == 1 else activations

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        return [
            param
            for layer in self.layers
            for param in layer.parameters()
        ]

In [None]:
# Training inputs: four samples with three features each.
X = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
# One desired output per row of X.
Y = [1.0, -1.0, -1.0, 1.0]  # targets

In [None]:
# Fix the RNG seed so the randomly drawn weights are the same on every run.
random.seed(42)

# 3 inputs -> layers of 4, 4 and 1 neurons.
mlp = MLP(nin=3, dims=[4, 4, 1])

In [None]:
# Predictions of the network before any training step.
ypred = [mlp(sample) for sample in X]

# Reference output recorded by the author:
# [0.6994093620224068, 0.5026295816615511, 0.6931545900944501, 0.8755224728708613]
print([prediction.data for prediction in ypred])

In [None]:
# Gradient descent: 20 iterations over the whole dataset.
for i in range(20):

    # Reset the parameter gradients left over from the previous iteration;
    # backward() accumulates into _grad rather than overwriting it.
    for p in mlp.parameters():
        p._grad = 0.0

    # Forward pass
    ypred = [mlp(x) for x in X]

    # Loss: sum of squared errors over all samples.
    loss = sum((yout-y)**2 for yout, y in zip(ypred, Y))
    print(loss.data)

    # Sanity check: no node reachable from the loss may carry a gradient
    # before the backward pass starts.
    assert loss.check_grad()['grad_non_zero'] == 0

    # Backward Pass
    loss.backward()

    # Optimizer Update: step against the gradient with learning rate 0.12.
    for p in mlp.parameters():
        p.data += -1.0 * 0.12 * p._grad

Expected output (the loss printed at each of the 20 iterations):
```text
5.230517512042234
3.8535617226895393
3.9171389480837893
1.1416064377049708
1.924079918586684
3.812200468372784
3.481375033842076
1.580216893836423
0.5856976470695008
0.052233808841902554
0.04550309897751743
0.04052386585920299
0.03661536505330471
0.03342919066295122
0.030763809968969658
0.028491843935593043
0.026527308067071865
0.024809204513010157
0.023292578820850944
0.021943297412743534
```

In [None]:
# Targets next to the predictions from the last training iteration.
# Reference output recorded by the author:
# [1.0, -1.0, -1.0, 1.0]
# [0.9279139635559236, -0.9607378181994956, -0.9045175890704561, 0.9219712165552181]
print(list(Y))
print([prediction.data for prediction in ypred])