In [1]:
from __future__ import annotations
%matplotlib inline

In [2]:
def display_tree_visualization_of_node(node: Value, prefix="", is_last=True):
    """Display a tree-like visualization of the computational graph."""
    if prefix == "":  # Root node
        print(f"Result: {node.data}")
        print()
        connector = ""
    else:
        connector = "└── " if is_last else "├── "

    print(f"{prefix}{connector}{node.data:<4.2f} ({node._op}, grad:{node.grad:.10f}, id:{node._id}, name='{node._name}')")

    # Set the prefix for the next level
    child_prefix = prefix + ("    " if is_last else "│   ")

    # Process children
    children = list(sorted(node._children))
    for i, child in enumerate(children):
        is_last_child = (i == len(children) - 1)
        display_tree_visualization_of_node(child, child_prefix, is_last_child)

In [3]:
import math
import numpy as np


# we need to look at backward pass of the neuron

# for backward
# let's apply the rules from https://www.mathsisfun.com/calculus/derivatives-rules.html

class Value:
    def __init__(self, data, _children: tuple = None, *, op='leaf', _backward=lambda: None, name=""):
        self.data: float = data
        self._children: set = set() if _children is None else set(_children)
        self._op: str = op
        self.grad: float = 0.0  # Gradient initialized to zero
        self._id: int = id(self)  # Unique identifier for the node, useful for debugging
        self._name: str = name  # Name of the node for better visualization

        self._backward: callable = _backward

    def backward(self):
        # check if I am the root node (level 0), then it will be preset to 1.0
        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(vertex):
            if vertex not in visited:
                visited.add(vertex)
                for child in vertex._children:
                    build_topo(child)
                topo.append(vertex)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __repr__(self):
        return f"Value(data={self.data}, op='{self._op}', id={self._id}, name='{self._name}', grad='{self.grad}')"

    def __add__(self, other):
        if isinstance(other, int) or isinstance(other, float):
            other = Value(other)

        child_a = self
        child_b = other

        result = Value(self.data + other.data, (child_a, child_b), op='+')

        def _backward():
            child_a.grad += 1 * result.grad
            child_b.grad += 1 * result.grad

        result._backward = _backward

        return result

    def __mul__(self, other):
        if isinstance(other, int) or isinstance(other, float):
            other = Value(other)

        child_a = self
        child_b = other

        result = Value(self.data * other.data, (child_a, child_b), op='*')

        def _backward():
            child_a.grad += child_b.data * result.grad
            child_b.grad += child_a.data * result.grad

        result._backward = _backward
        return result

    def __neg__(self):
        return self.data * -1

    def __sub__(self, other):
        return self + (-other)

    def __rmul__(self, other):
        """Fallback - Solve the case where Value is on the right side of the multiplication operator."""
        return self * other

    def __radd__(self, other):
        """Fallback - Solve the case where Value is on the right side of the addition operator."""
        return self + other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __pow__(self, power, modulo=None):
        child = self

        result = Value(self.data ** power, (child,), op="**")

        def _backward():
            # taken from https://www.symbolab.com/cheat-sheets/Derivatives#
            child.grad += power * (child.data ** (power - 1)) * result.grad

        result._backward = _backward
        return result

    def relu(self):
        child = self

        result = Value(0 if child.data < 0 else child.data, (child,), op='ReLU')

        def _backward():
            child.grad += (result.data > 0) * result.grad

        result._backward = _backward

        return result

    def sigmoid(self):
        child = self

        result = Value(1 / (1 + np.exp(-child.data)), (child,), op='sigmoid')

        def _backward():
            # https://en.wikipedia.org/wiki/Logistic_function
            child.grad += result.data * (1 - result.data) * result.grad

        result._backward = _backward

        return result

    def exp(self):
        child = self

        result = Value(math.exp(child.data), (child,), op='exp')

        def _backward():
            child.grad += result.data * result.grad

        result._backward = _backward

        return result


    def __lt__(self, other):
        return self._id < other._id

In [4]:
a = Value(2.0, name="a")
b = Value(3.0, name="b")
c = Value(10.0, name="c")

result = a * b + c + 2
# result = result.relu()
result.backward() # Calculate gradients
display_tree_visualization_of_node(result)

Result: 18.0

18.00 (+, grad:1.0000000000, id:4660096784, name='')
    ├── 2.00 (leaf, grad:1.0000000000, id:4658532944, name='')
    └── 16.00 (+, grad:1.0000000000, id:4660683760, name='')
        ├── 6.00 (*, grad:1.0000000000, id:4660684672, name='')
        │   ├── 3.00 (leaf, grad:2.0000000000, id:4623001680, name='b')
        │   └── 2.00 (leaf, grad:3.0000000000, id:4660050784, name='a')
        └── 10.00 (leaf, grad:1.0000000000, id:4661100624, name='c')


In [11]:
a = Value(2.0, name="a")
b = Value(3.0, name="b")
c = Value(10.0, name="c")

result: Value = a * b + c + 2
result = result.sigmoid()
# result = result.relu()
# result = result.exp()
result.backward() # Calculate gradients
display_tree_visualization_of_node(result)

Result: 0.9999999847700205

1.00 (sigmoid, grad:1.0000000000, id:4892518432, name='')
    └── 18.00 (+, grad:0.0000000152, id:4896097232, name='')
        ├── 16.00 (+, grad:0.0000000152, id:4895071824, name='')
        │   ├── 6.00 (*, grad:0.0000000152, id:4895073744, name='')
        │   │   ├── 2.00 (leaf, grad:0.0000000457, id:4894979712, name='a')
        │   │   └── 3.00 (leaf, grad:0.0000000305, id:4896397904, name='b')
        │   └── 10.00 (leaf, grad:0.0000000152, id:4896399696, name='c')
        └── 2.00 (leaf, grad:0.0000000152, id:4895896336, name='')


In [5]:
# how to get all parameters?

# now let's look at neuron
# https://www.bing.com/images/search?q=neuron+math&form=HDRSC4&first=1
import random
random.seed(42)

class Neuron:
    def __init__(self, weights, bias=0.0):
        self.weights = [Value(random.uniform(-1, 1)) if not x else Value(x) for x in weights]
        self.bias = Value(bias if bias else random.uniform(-1, 1))

    def forward(self, neuron_inputs):
        assert len(neuron_inputs) == len(self.weights), "Input size must match weights size"
        weighted_sum = self.weights[0] * neuron_inputs[0]
        for w, i in zip(self.weights[1:], neuron_inputs[1:]):
            weighted_sum += (w * i)
        output = weighted_sum + self.bias
        return output.sigmoid()

    def __call__(self, *args, **kwargs):
        return self.forward(*args)

    def parameters(self):
        """Return all parameters of the neuron (weights and bias)."""
        return self.weights + [self.bias]

    def zero_grad(self):
        """Set all gradients of the neuron to zero."""
        for w in self.weights:
            w.grad = 0.0
        self.bias.grad = 0.0

class Layer:
    def __init__(self, n_in, n_out):
        self.neurons = [Neuron([None] * n_in) for _ in range(n_out)]

    def forward(self, layer_inputs):
        outputs = [neuron(layer_inputs) for neuron in self.neurons]
        return outputs

    def __call__(self, *args, **kwargs):
        return self.forward(*args)

    def parameters(self):
        """Return all parameters of the layer (weights and biases of all neurons)."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params

    def zero_grad(self):
        """Set all gradients of the layer's neurons to zero."""
        for neuron in self.neurons:
            neuron.zero_grad()

class MLP:
    def __init__(self, n_in: int, internal_dims: list):
        dimensions = [n_in] + internal_dims
        self.layers = [Layer(dimensions[i], dimensions[i + 1]) for i in range(len(dimensions) - 1)]

    def forward(self, mlp_inputs):
        mlp_inputs = [Value(x) if not isinstance(x, Value) else x for x in mlp_inputs]  # Convert inputs to Value objects
        for layer in self.layers:
            print(mlp_inputs)
            mlp_inputs = layer.forward(mlp_inputs)
        return mlp_inputs if len(mlp_inputs) != 1 else mlp_inputs[0]

    def __call__(self, *args, **kwargs):
        return self.forward(*args)

    def parameters(self):
        """Return all parameters of the MLP (weights and biases of all layers)."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

    def zero_grad(self):
        """Set all gradients of the MLP's layers to zero."""
        for layer in self.layers:
            layer.zero_grad()


In [6]:
np.random.seed(42)

train_x = [
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1]
]

train_y = [
    [0],
    [1],
    [1],
    [0]
]

mlp = MLP(n_in=2, internal_dims=[2, 1])

print(mlp.forward(train_x[0]))
print(mlp.forward(train_x[1]))
print(mlp.forward(train_x[2]))
print(mlp.forward(train_x[3]))
print()

for epoch in range(10000):
    mlp.zero_grad()  # Reset gradients before each epoch
    overall_loss = None
    for i in range(len(train_x)):
        inputs = train_x[i]
        target = train_y[i]

        outputs = mlp(inputs)  # Forward pass
        if overall_loss is None:
            overall_loss = (outputs - target[0]) ** 2  # Mean Squared Error
        else:
            overall_loss += (outputs - target[0]) ** 2  # Mean Squared Error - Accumulate loss

    overall_loss.backward()  # Backward pass to compute gradients

    # Update parameters using gradient descent
    learning_rate = 0.1
    for param in mlp.parameters():
        param.data -= learning_rate * param.grad

print(mlp.forward(train_x[0]))
print(mlp.forward(train_x[1]))
print(mlp.forward(train_x[2]))
print(mlp.forward(train_x[3]))


[Value(data=0, op='leaf', id=4661314384, name='', grad='0.0'), Value(data=0, op='leaf', id=4660664944, name='', grad='0.0')]
[Value(data=0.38937470755111603, op='sigmoid', id=4661376208, name='', grad='0.0'), Value(data=0.5874415780754889, op='sigmoid', id=4661265104, name='', grad='0.0')]
Value(data=0.4167704251711119, op='sigmoid', id=4661265744, name='', grad='0.0')
[Value(data=0, op='leaf', id=4661265872, name='', grad='0.0'), Value(data=1, op='leaf', id=4661266000, name='', grad='0.0')]
[Value(data=0.19782882989705186, op='sigmoid', id=4661266640, name='', grad='0.0'), Value(data=0.6955807834931, op='sigmoid', id=4661267280, name='', grad='0.0')]
Value(data=0.3599405159557011, op='sigmoid', id=4661267920, name='', grad='0.0')
[Value(data=1, op='leaf', id=4661268048, name='', grad='0.0'), Value(data=0, op='leaf', id=4661268176, name='', grad='0.0')]
[Value(data=0.45733208546958715, op='sigmoid', id=4661268816, name='', grad='0.0'), Value(data=0.45012156176767953, op='sigmoid', id=4

KeyboardInterrupt: 