In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np

# Value

In [None]:
class Value:
    def __init__(self, data, _children=(), _op: str = None, label: str = ""):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._op = _op
        self.label = label
        self._backward: callable = lambda: None

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = (
            other if isinstance(other, Value) else Value(other)
        )  # Supporting add of integers
        out = Value(self.data + other.data, _children=(self, other), _op="+")

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out

    def __radd__(self, other):
        return self + other

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __gt__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return self.data > other.data

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, _children=(self, other), _op="*")

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __rmul__(self, other):  # int + object
        return self.__mul__(other)

    def __pow__(self, other):
        assert isinstance(other, int | float)
        out = Value(self.data**other, (self,), _op=f"**{other}")

        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad

        out._backward = _backward

        return out

    def __truediv__(self, other):
        return self * other**-1

    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), "exp")

        def _backward():
            self.grad = out.data * out.grad

        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2 * x) - 1) / (math.exp(2 * x) + 1)
        out = Value(t, _children=(self,), _op="tanh")

        def _backward():
            self.grad += (1 - t**2) * out.grad

        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)

        build_topo(self)
        self.grad = 1.0
        for node in reversed(topo):  # Needs to be reversed because we start at the end
            node._backward()

# Visualization

In [None]:
# Visualizing the computation graph
from graphviz import Digraph


def trace(root):
    """Collect every node reachable from ``root`` together with the graph edges.

    Follows the ``_prev`` links of each node. Returns a ``(nodes, edges)``
    tuple of sets, where each edge is an ``(operand, result)`` pair.
    """
    seen = {root}
    links = set()
    pending = [root]

    # Explicit work list: pop a node, record its incoming edges and queue
    # every operand that has not been seen before.
    while pending:
        current = pending.pop()
        for operand in current._prev:
            links.add((operand, current))
            if operand not in seen:
                seen.add(operand)
                pending.append(operand)

    return seen, links


# for any value use a rectangle, for any operation use a circle
def draw_graph(value: Value):
    """Build a left-to-right Graphviz ``Digraph`` of the graph that ends in ``value``.

    Each node becomes a record box showing its label, data and grad. A node
    that was produced by an operation additionally gets a small operation node
    wired in front of it; operands connect to that operation node.
    """
    graph_nodes, graph_edges = trace(value)
    dot = Digraph(format="svg", graph_attr={"rankdir": "LR"})

    for node in graph_nodes:
        node_id = str(id(node))
        record = "{%s | data %.4f | grad %.4f }" % (node.label, node.data, node.grad)
        dot.node(name=node_id, label=record, shape="record")

        if not node._op:
            continue  # leaf value: no operation node to draw

        # The operation node id is the value id suffixed with the op name
        op_id = node_id + node._op
        dot.node(name=op_id, label=node._op)
        dot.edge(op_id, node_id)

    # Operands feed into the operation node of the value they produced
    for source, target in graph_edges:
        dot.edge(str(id(source)), str(id(target)) + target._op)

    return dot

# Building a full Neural Network (aka. Multi-Layer Perceptron)
Now that we have the ``Value`` objects and the computational graph, we would like to build out an actual neural network (Multi-Layer Perceptron).

- An MLP consists of multiple layers
- A layer is a stack of multiple neurons
- A neuron consists of multiple inputs and a matching number of weights, plus a bias

<img src='../img/neural_net.webp' alt='Example Multi-Layer Perceptron'>

## Neuron
A neuron takes a fixed number of inputs and multiplies each of them by its own weight. It then sums the products, adds a bias, and passes the result through an activation function $f$:
$$
o = f (\sum_{i=1}^n x_i w_i + b)
$$
<img src='../img/neuron_model-cs231n.jpeg' alt='Example Neuron'>

In [None]:
class Neuron:
    """Single neuron: weighted sum of the inputs plus a bias, squashed by tanh.

    ``nin`` weights and one bias are created as ``Value`` objects drawn
    uniformly from [-1, 1).
    """

    def __init__(self, nin):
        weights = []
        for _ in range(nin):
            weights.append(Value(np.random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(np.random.uniform(-1, 1))

    def __call__(self, x):
        """Return ``tanh(sum(x_i * w_i) + b)`` for the input sequence ``x``."""
        total = 0
        # zip stops at the shorter of ``x`` and ``self.w``
        for xi, wi in zip(x, self.w):
            total = total + xi * wi
        return (total + self.b).tanh()

    def parameters(self):
        """Return the weights followed by the bias as one flat list."""
        return [*self.w, self.b]

## Layer
A layer is a set of neurons stacked on top of each other with the input passing through every neuron individually and producing a fixed number of outputs ``nout``

In [None]:
class Layer:
    """A group of ``nout`` neurons that all receive the same ``nin`` inputs."""

    def __init__(self, nin, nout):
        self.neurons = []
        for _ in range(nout):
            self.neurons.append(Neuron(nin))

    def __call__(self, x):
        """Feed ``x`` to every neuron.

        Returns the list of neuron outputs, or the bare output itself when the
        layer produced exactly one.
        """
        outputs = []
        for neuron in self.neurons:
            outputs.append(neuron(x))
        if len(outputs) == 1:
            return outputs[0]
        return outputs

    def parameters(self):
        """Return the parameters of all neurons as one flat list."""
        return [p for neuron in self.neurons for p in neuron.parameters()]

In [None]:
# Quick check: a layer with 2 inputs and 4 neurons applied to one 2-element input
x = [2.0, 3.0]
l = Layer(nin=2, nout=4)
l(x)

We then get four outputs, for the four neurons

## MLP
An MLP is a number of layers in sequence, where the output of one layer is the input of the next.

In [None]:
class MLP:
    """Multi-layer perceptron: a chain of layers applied one after another.

    ``nin`` is the number of network inputs and ``nouts`` lists the number of
    neurons of each successive layer.
    """

    def __init__(self, nin: int, nouts: list[int]):
        sizes = [nin] + nouts
        # Consecutive size pairs give the (inputs, outputs) of each layer
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

    def __call__(self, x):
        """Pass ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        return [p for layer in self.layers for p in layer.parameters()]

In [None]:
# Build an MLP with 3 inputs and layers of 4, 4 and 1 neurons, then run one forward pass
x = [2.0, 3.0, 4.0]
mlp = MLP(3, [4, 4, 1])
# The last layer has a single neuron, so the result is one Value rather than a list
y = mlp(x)
print(y)

In [None]:
# Backpropagate from the network output to fill in the gradients of the graph
y.backward()

In [None]:
# Render the computation graph that produced y
draw_graph(y)

# Optimization
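We now want to create an example that shows how an MLP is able to learn a pattern by implementing the standard algorithm used to optimize neural nets: **Stochastic Gradient Descent (SGD)**. (In the loop below every update uses all samples at once, i.e. full-batch gradient descent; SGD applies the same update rule to randomly chosen subsets of the data.)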

In [None]:
# Toy dataset: four samples with three features each
xs = [[2.0, 3.0, -1.0], [3.0, -1.0, 0.5], [0.5, 1.0, 1.0], [1.0, 1.0, -1.0]]
ys = [1.0, -1.0, -1.0, 1.0]  # desired targets

# 3 inputs, two hidden layers of 4 neurons each, 1 output neuron
model = MLP(3, [4, 4, 1])

In [None]:
# Predictions of the freshly initialised model, one Value per sample
ypreds = [model(x) for x in xs]
ypreds

We now need to determine how closely our predictions align with the desired targets. There are multiple ways of doing this; the most straightforward is to compute the difference between the predictions and the targets.

It is important, though, that we can guarantee that our loss is never negative, because a *negative loss* would indicate a gain.
Any operation that removes a possible $-$ sign is well suited. 

Most straightforward and often used is the **Mean-Squared Error (MSE)**:

$$
\text{MSE} = \frac{1}{n}\sum_{i=1}^n (\text{pred}_i - \text{target}_i)^2
$$

In [None]:
def mse(pred, label):
    """Return the mean-squared error between predictions and targets.

    Args:
        pred: Sequence of predictions (plain numbers or ``Value`` objects).
        label: Sequence of targets with the same length as ``pred``.

    Returns:
        The mean of the squared differences ``(pred_i - label_i) ** 2``.

    Raises:
        ValueError: If the sequences differ in length or are empty.
    """
    if len(pred) != len(label):
        # zip() would silently drop the surplus items while the mean is still
        # divided by len(pred), yielding a wrong loss.
        raise ValueError(
            f"pred and label must have the same length, got {len(pred)} and {len(label)}"
        )
    if len(pred) == 0:
        raise ValueError("mse() requires at least one prediction")
    return sum((p - t) ** 2 for p, t in zip(pred, label)) / len(pred)
# Mean-squared error of the current predictions against the targets
loss = mse(ypreds, ys)
loss

Since the outputs of the MLP are ``Value`` objects, computing the loss from the difference between the predictions and the targets yields a ``Value`` object as well, which contains the loss.

That ``Value`` object is therefore attached at the end of the computational graph, and we can call ``.backward()`` on it. This kicks off backpropagation, which computes the gradient of the loss with respect to the individual parameters.

In [None]:
# Backpropagate from the loss so that every parameter receives its gradient
loss.backward()

In [None]:
# Render the full graph, from the inputs and parameters up to the loss
draw_graph(loss)

Since we now have the gradients, which indicate how strongly each variable influences the output, we can start changing the values of the weights.

We nudge every weight by its gradient, but only by a small amount so that we progress slowly. We therefore introduce a new parameter ``alpha``, which will be our **learning rate**.

We then nudge the parameters by decreasing their data by alpha times the gradient.

It is important to note that we **decrease** rather than increase. This is because of the influence the gradients of the weights have on the output.

Our final output is now the **loss**, and we want to decrease it. Since the gradient of a weight tells us in which direction the loss grows, we move every weight against its gradient to bring the output (loss) down as much as possible.

In [None]:
alpha = 0.001  # learning rate

# parameters() hands back the same Value objects on every call, so collect them once
params = model.parameters()

for _ in range(2000):
    # Forward pass over every sample
    ypreds = list(map(model, xs))

    # Gradients accumulate across backward() calls, so reset them first
    for p in params:
        p.grad = 0.0

    loss = mse(ypreds, ys)
    loss.backward()

    # Step against the gradient to reduce the loss
    for p in params:
        p.data = p.data - alpha * p.grad

# Predictions and loss of the last forward pass (taken before the final update)
print(ypreds)
print(f"Loss {loss.data}")

Tuning the learning rate is an art of its own, and there are many tools available that help with it.

There are also more advanced optimization algorithms such as **RMSProp**, **Adam**, etc.
However, those are only refinements of standard **SGD**; they do not tackle the problem from a different angle. If you strip away all their refinements, you end up with Stochastic Gradient Descent.

In other words:

**If you understand SGD you understand the biggest part of training neural nets, as everything else is merely optimization ;)**

# Classification
Now let's try to solve a classification problem using our own NN.

In [None]:
from sklearn.datasets import make_moons

# 200 shuffled samples with noise 0.15; the fixed random_state makes the data reproducible
X, y = make_moons(n_samples=200, shuffle=True, noise=0.15, random_state=42)

In [None]:
# Inspect the feature matrix
X

In [None]:
# Inspect the class labels
y

In [None]:
# Rescale the labels with y * 2 - 1 (0 -> -1, 1 -> +1) so they lie in the output
# range of the tanh activation used by the network
y = y * 2 - 1
y

In [None]:
# Column 0 of X holds the first coordinate (plotted on the x-axis) of every sample;
# show its first ten values
X[:, 0][:10]

In [None]:
def model_predict_visualize_custom(X, y, net=None):
    """Scatter-plot the samples and, if ``net`` is given, shade its decision regions.

    Args:
        X: 2-D array whose columns 0 and 1 are used as plot coordinates.
        y: Per-sample values used to colour the scatter points.
        net: Optional callable taking a 2-element list and returning an object
            with a ``data`` attribute; the sign of ``data`` selects the region.
    """
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap="jet")

    if net is None:
        plt.show()
        return

    # Evaluate the model on a 100 x 100 grid that extends 0.5 beyond the data
    margin = 0.5
    first, second = X[:, 0], X[:, 1]
    axis1 = np.linspace(first.min() - margin, first.max() + margin, 100)
    axis2 = np.linspace(second.min() - margin, second.max() + margin, 100)
    xx1, xx2 = np.meshgrid(axis1, axis2)
    grid_points = np.stack([xx1.ravel(), xx2.ravel()], axis=1)

    # The network works on plain Python lists, hence tolist(); only the sign
    # of the output decides which class region a grid point belongs to
    signs = [np.sign(net(point.tolist()).data) for point in grid_points]
    decision = np.array(signs).reshape(xx1.shape)
    plt.contourf(xx1, xx2, decision, cmap="jet", alpha=0.2)

    plt.show()


# No model passed: plot only the raw samples
model_predict_visualize_custom(X, y)

In [None]:
# Convert the arrays to plain Python lists, because the MLP built above works on
# Python numbers and Value objects rather than on numpy arrays
xs, ys = X.tolist(), y.tolist()
ys

In [None]:
# Classifier: 2 inputs, one hidden layer of 4 neurons, 1 output neuron
mlp = MLP(2, [4, 1])
# Forward pass on the first sample as a quick check
mlp(xs[0])

In [None]:
lr = 0.01
EPOCHS = 500

# The targets never change, so build the comparison array once
targets = np.array(ys)

for epoch in range(EPOCHS):
    # Gradients accumulate across backward() calls, so zero them every epoch
    for p in mlp.parameters():
        p.grad = 0.0

    # Forward pass over the whole dataset
    pred = [mlp(x) for x in xs]
    loss = mse(pred, ys)

    # Accuracy: a positive output counts as class 1, anything else as class -1
    predicted_labels = [1 if out.data > 0 else -1 for out in pred]
    acc = np.mean(np.array(predicted_labels) == targets)

    loss.backward()

    # Gradient-descent update
    for p in mlp.parameters():
        p.data -= lr * p.grad

    # Print the numeric loss (loss.data) rather than the Value repr
    print(f"Epoch: {epoch + 1}, Loss: {loss.data}, Accuracy: {acc}")

In [None]:
# Plot the samples again, this time with the decision regions of the trained model
model_predict_visualize_custom(X, y, mlp)