In [None]:
import math

# Value

In [None]:
class Value:
    def __init__(self, data, _children=(), _op: str = None, label: str = ""):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._op = _op
        self.label = label
        self._backward: callable = lambda: None

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = (
            other if isinstance(other, Value) else Value(other)
        )  # Supporting add of integers
        out = Value(self.data + other.data, _children=(self, other), _op="+")

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, _children=(self, other), _op="*")

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __rmul__(self, other):  # int + object
        return self.__mul__(other)

    def __pow__(self, other):
        assert isinstance(other, int | float)
        out = Value(self.data**other, (self,), _op=f"**{other}")

        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad

        out._backward = _backward

        return out

    def __truediv__(self, other):
        return self * other**-1

    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), "exp")

        def _backward():
            self.grad = out.data * out.grad

        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2 * x) - 1) / (math.exp(2 * x) + 1)
        out = Value(t, _children=(self,), _op="tanh")

        def _backward():
            self.grad += (1 - t**2) * out.grad

        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)

        build_topo(self)
        self.grad = 1.0
        for node in reversed(topo):  # Needs to be reversed because we start at the end
            node._backward()

# Visualization

In [None]:
# Visualizing the computation graph
from graphviz import Digraph


def trace(root):
    """Collect the graph that ``root`` was computed from.

    Follows ``_prev`` links starting at ``root``.

    Returns:
        A ``(nodes, edges)`` pair of sets: every reachable node (``root`` included),
        and an ``(operand, result)`` tuple for every ``_prev`` link followed.
    """
    seen_nodes = set()
    seen_edges = set()

    def visit(node):
        # A node reachable along several paths is expanded only once.
        if node in seen_nodes:
            return
        seen_nodes.add(node)
        for operand in node._prev:
            visit(operand)
            seen_edges.add((operand, node))

    visit(root)
    return seen_nodes, seen_edges


# for any value use a rectangle, for any operation use a circle
def draw_graph(value: Value):
    """Build a left-to-right Graphviz drawing of the graph that produced ``value``.

    Each node becomes a record box showing its label, data and grad. A node that
    has an ``_op`` additionally gets a small operation node in front of it; the
    node's operands are wired into that operation node.

    Returns:
        The populated ``Digraph`` (SVG output format).
    """
    graph = Digraph(format="svg", graph_attr={"rankdir": "LR"})  # Left to right
    vertices, links = trace(value)

    for vertex in vertices:
        vertex_id = str(id(vertex))
        record_text = "{%s | data %.4f | grad %.4f }" % (
            vertex.label,
            vertex.data,
            vertex.grad,
        )
        graph.node(name=vertex_id, label=record_text, shape="record")

        if not vertex._op:
            continue
        # The operation node's name is the value id plus the op string, so the
        # edge loop below can rebuild the same name from the result node.
        op_id = vertex_id + vertex._op
        graph.node(name=op_id, label=vertex._op)
        graph.edge(op_id, vertex_id)

    for operand, result in links:
        # Operands point at the result's operation node, not at the result box.
        graph.edge(str(id(operand)), str(id(result)) + result._op)

    return graph

In [None]:
# Value * number: __mul__ wraps the plain number in a Value before multiplying.
a = Value(2.0)
a * 2

In [None]:
# Value + number: __add__ wraps the plain number in a Value before adding.
a = Value(2.0)
a + 2

In [None]:
# number * Value: int cannot multiply by a Value itself, so Python falls back to
# Value.__rmul__, which delegates to __mul__.
a = Value(2.0)
2 * a

In [None]:
# Value / Value: __truediv__ rewrites the division as a * b**-1.
a = Value(2.0)
b = Value(4.0)

a / b

To implement division we're going to generalize what division means:

$$
\frac{a}{b} = a \cdot \frac{1}{b} = a \cdot b^{-1}
$$

so we're going to implement a function that can compute:
$$
x^k \text{ for any } x \text{ and any constant (int or float) } k
$$

In [None]:
# Value - Value: __sub__ computes a + (-b). Reuses a and b from the division cell.
a - b

# Mathematical Equivalence

You can define the operations on **any** abstraction level you please as long as it's mathematically correct.

In [None]:
# A single neuron, o = tanh(x1*w1 + x2*w2 + b), built one operation at a time so
# that every intermediate Value can be given a label.
# inputs x1,x2
x1 = Value(2.0, label="x1")
x2 = Value(0.0, label="x2")
# weights w1,w2
w1 = Value(-3.0, label="w1")
w2 = Value(1.0, label="w2")
# bias of the neuron
b = Value(6.8813735870195432, label="b")  # Value set so the numbers come out "nice"
# x1*w1 + x2*w2 + b
x1w1 = x1 * w1
x1w1.label = "x1*w1"
x2w2 = x2 * w2
x2w2.label = "x2*w2"
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = "x1*w1 + x2*w2"
n = x1w1x2w2 + b
n.label = "n"
# activation, using the built-in tanh node
o = n.tanh()
o.label = "o"
# Backpropagate from o to fill in .grad on the nodes it was computed from.
o.backward()

In [None]:
# Draw the graph that produced o, with the data and grad of every node.
draw_graph(o)


The following implements the same functionality as above, but instead of using $\tanh$ directly we use one of its representations, since:
$$
\tanh(x) = \frac{e^{2x} - 1}{e^{2x} + 1}
$$

In [None]:
# The same neuron as before, but with tanh spelled out through exp, -, + and /
# instead of the dedicated tanh node.
# inputs x1,x2
x1 = Value(2.0, label="x1")
x2 = Value(0.0, label="x2")
# weights w1,w2
w1 = Value(-3.0, label="w1")
w2 = Value(1.0, label="w2")
# bias of the neuron
b = Value(6.8813735870195432, label="b")  # Value set so the numbers come out "nice"
# x1*w1 + x2*w2 + b
x1w1 = x1 * w1
x1w1.label = "x1*w1"
x2w2 = x2 * w2
x2w2.label = "x2*w2"
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = "x1*w1 + x2*w2"
n = x1w1x2w2 + b
n.label = "n"
# ---------
# tanh(n) = (e^{2n} - 1) / (e^{2n} + 1); e is used twice, so it receives
# gradient from both the numerator and the denominator.
e = (2 * n).exp()
o = (e - 1) / (e + 1)
# ---------
o.label = "o"
# Backpropagate from o to fill in .grad on the nodes it was computed from.
o.backward()

In [None]:
# Draw the graph that produced o; the intermediate nodes of the explicit tanh
# were not labelled, so only their data and grad are shown.
draw_graph(o)

This returns the same data and gradient values as $\tanh$, just with a longer computational graph, since every step is now explicit.

# PyTorch Sanity Check
We're now going to verify the result using a modern deep learning framework such as PyTorch

In [None]:
import torch

Because these variables are leaf nodes, PyTorch does not track gradients for them by default (for efficiency reasons). We therefore have to request it explicitly by setting ``requires_grad = True`` on each tensor, so that gradients are backpropagated to them when we call ``backward()`` on the final output.

In [None]:
# Leaf tensors mirroring the neuron built with Value above.
# dtype=torch.float64 matches the Python floats that Value works with; with the
# float32 default, b is rounded on construction and the output drifts by ~1e-7.
# requires_grad=True makes autograd track gradients for these leaves.
x1 = torch.tensor([2.0], dtype=torch.float64, requires_grad=True)
w1 = torch.tensor([-3.0], dtype=torch.float64, requires_grad=True)
x2 = torch.tensor([0.0], dtype=torch.float64, requires_grad=True)
w2 = torch.tensor([1.0], dtype=torch.float64, requires_grad=True)
b = torch.tensor([6.8813735870195432], dtype=torch.float64, requires_grad=True)

x1w1 = x1 * w1
x2w2 = x2 * w2

# Pre-activation of the neuron; note that, unlike in the Value cells, this name
# already includes the bias term.
x1w1x2w2 = x1w1 + x2w2 + b

out = torch.tanh(x1w1x2w2)

print(f"out = {out.item()}")

In [None]:
# Run PyTorch's backpropagation from out; this fills .grad on the leaf tensors
# that were created with requires_grad enabled.
out.backward()

In [None]:
# Gradients of out with respect to the inputs and weights (b.grad is not printed).
print(f"x1.grad = {x1.grad.item()}")
print(f"w1.grad = {w1.grad.item()}")
print(f"x2.grad = {x2.grad.item()}")
print(f"w2.grad = {w2.grad.item()}")

In [None]:
# Gradients computed by PyTorch for the inputs and weights.
grads = {
    "x1": x1.grad.item(),
    "w1": w1.grad.item(),
    "x2": x2.grad.item(),
    "w2": w2.grad.item(),
}
# Reference values to compare against. For out = tanh(x1*w1 + x2*w2 + b) the local
# derivative 1 - out**2 is 0.5 at these inputs, so each gradient is 0.5 times the
# matching factor: d/dx1 = 0.5*w1, d/dw1 = 0.5*x1, d/dx2 = 0.5*w2, d/dw2 = 0.5*x2.
# These are the values the Value-based graph is expected to produce as well.
expected_grads = {"x1": -1.5, "w1": 1.0, "x2": 0.5, "w2": 0.0}
# Check if the gradients are the same. Comparing grads[name] with x?.grad.item()
# again would always pass, so compare with the independent reference instead.
# The tolerance is loose enough to also accept float32 tensors.
for name, expected in expected_grads.items():
    assert (
        abs(grads[name] - expected) < 1e-6
    ), f"{name}.grad = {grads[name]}, expected {expected}"