# Momentum

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons
import sys
import math
from pathlib import Path

# Add the project root to the Python path
project_root = Path().absolute().parent.parent
sys.path.append(str(project_root))

from src.nn import MLP
from src.optimizer import SGD, mse
from src.Engine import Value, draw_graph

In [None]:
# Two interleaving half-moons; the fixed seed keeps the dataset reproducible.
X, y = make_moons(n_samples=200, noise=0.15, shuffle=True, random_state=42)
# Remap the class labels from {0, 1} to {-1, +1}.
y = 2 * y - 1


def model_predict_visualize_custom(X, y, net=None):
    """Scatter-plot the samples and optionally shade the regions predicted by ``net``.

    Args:
        X: 2-D array; columns 0 and 1 are used as the plot coordinates.
        y: Per-sample values used to colour the scatter points.
        net: Optional callable that takes a ``[x1, x2]`` list and returns an
            object whose ``.data`` attribute holds the raw output. When
            given, the sign of that output is evaluated on a 100 x 100 grid
            and drawn as a translucent filled contour behind the points.
    """
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap="jet")

    if net is None:
        plt.show()
        return

    # Extend the plotted area slightly beyond the data on every side.
    margin = 0.5
    steps = 100
    axis_1 = np.linspace(X[:, 0].min() - margin, X[:, 0].max() + margin, steps)
    axis_2 = np.linspace(X[:, 1].min() - margin, X[:, 1].max() + margin, steps)
    grid_1, grid_2 = np.meshgrid(axis_1, axis_2)
    points = np.stack([grid_1.ravel(), grid_2.ravel()], axis=1)

    # The network is called one point at a time on plain Python lists;
    # only the sign of its output is kept.
    signs = [np.sign(net(point.tolist()).data) for point in points]
    regions = np.array(signs).reshape(grid_1.shape)
    plt.contourf(grid_1, grid_2, regions, cmap="jet", alpha=0.2)

    plt.show()


# No model yet: show only the raw samples.
model_predict_visualize_custom(X, y, net=None)

In [None]:
# Converting to list as we don't support numpy
xs = X.tolist()
ys = y.tolist()

In [None]:
# Hyperparameters
lr = 0.001
epochs = 500

# Model, optimizer and a container for the recorded losses
mlp = MLP(2, [4, 1])
optimizer = SGD(mlp, lr=lr)
losses = []
print(optimizer)

# SGD

In [None]:
def sgd(mlp, *, lr=0.001, epochs=500):
    """Train ``mlp`` with plain gradient descent and return the loss history.

    Every epoch runs the forward pass over all samples in the notebook-level
    ``xs``, scores the predictions against ``ys`` with ``mse``,
    backpropagates, and applies ``p.data -= lr * p.grad`` to each parameter.
    Note that each update uses the whole dataset, not a mini-batch.

    Args:
        mlp: Model to train in place; must be callable on a single sample
            and expose ``parameters()``.
        lr: Step size applied to each gradient.
        epochs: Number of passes over the data.

    Returns:
        List with the loss value of each epoch, recorded before that epoch's
        parameter update.
    """
    losses = []

    for epoch in range(1, epochs + 1):
        # Forward pass
        pred = [mlp(x) for x in xs]

        # Clear the gradients left from the previous epoch before backpropagating.
        for p in mlp.parameters():
            p.grad = 0.0

        loss = mse(pred, ys)
        losses.append(loss.data)
        loss.backward()

        # Gradient step
        for p in mlp.parameters():
            p.data -= lr * p.grad

        print(f"Epoch {epoch}: Loss: {loss.data}")

    return losses

# Momentum
Momentum is an extension of SGD that borrows a concept from physics.

Momentum is like a ball rolling down a hill. Standard SGD drops the ball anew at each step - it only knows the current slope. Momentum **remembers the previous motion**, thereby accelerating down consistent slopes and dampening oscillations. **The accumulated influence of previous updates is known as velocity.**

A parameter $\gamma \in [0, 1)$ controls the influence of the velocity on the next update: a gradient from $k$ steps ago contributes with weight $\gamma^k$, so the influence of earlier updates on the current one fades geometrically.

**Momentum Update Rule:**

$$
\begin{split}
&v_{t+1} = \gamma \cdot v_t + \nabla \mathcal{L}(\theta_t) \\
&\theta_{t+1} = \theta_t - \eta \cdot v_{t+1}
\end{split}
$$

This requires allocating additional memory to store the velocity: one extra variable per parameter, i.e. $|\theta|$ additional variables in total.

In [None]:
def sgd_momentum(mlp, *, lr=0.001, gamma=0.9, epochs=500):
    """Train ``mlp`` with momentum gradient descent and return the loss history.

    Every epoch runs the forward pass over all samples in the notebook-level
    ``xs``, scores the predictions against ``ys`` with ``mse``,
    backpropagates, and then updates each parameter ``p`` with

        velocity[p] = gamma * velocity[p] + p.grad
        p.data     -= lr * velocity[p]

    Args:
        mlp: Model to train in place; must be callable on a single sample
            and expose ``parameters()``. The parameter objects are used as
            dictionary keys, so they must be hashable.
        lr: Step size applied to the velocity.
        gamma: Fraction of the previous velocity carried into the next one.
        epochs: Number of passes over the data.

    Returns:
        List with the loss value of each epoch, recorded before that epoch's
        parameter update.
    """
    # One velocity entry per parameter, starting at rest.
    velocity = {p: 0.0 for p in mlp.parameters()}
    losses = []

    for epoch in range(1, epochs + 1):
        # Forward pass
        pred = [mlp(x) for x in xs]

        # Clear the gradients left from the previous epoch before backpropagating.
        for p in mlp.parameters():
            p.grad = 0.0

        loss = mse(pred, ys)
        losses.append(loss.data)
        loss.backward()

        # Blend the fresh gradient into the running velocity, then step along it.
        for p in mlp.parameters():
            velocity[p] = gamma * velocity[p] + p.grad
            p.data -= lr * velocity[p]

        print(f"Epoch {epoch}: Loss: {loss.data}")

    return losses

In [None]:
# Two identically shaped networks, one per optimizer.
mlp_sgd = MLP(2, [4, 1])
mlp_sgd_m = MLP(2, [4, 1])

# Setting equal initial parameters so both optimizers start from the same point
for p, q in zip(mlp_sgd.parameters(), mlp_sgd_m.parameters()):
    q.data = p.data

print("Vanilla SGD")
# The baseline run must execute: the following cells plot sgd_losses and
# visualize the trained mlp_sgd.
sgd_losses = sgd(mlp_sgd)
print("SGD with momentum")
sgd_momentum_losses = sgd_momentum(mlp_sgd_m)

In [None]:
# Overlay both loss histories on the same axes.
for _curve, _label in (
    (sgd_losses, "Vanilla SGD"),
    (sgd_momentum_losses, "SGD with Momentum"),
):
    plt.plot(_curve, label=_label)
plt.legend()

In [None]:
# Regions predicted by the network passed to vanilla SGD.
model_predict_visualize_custom(X, y, net=mlp_sgd)

In [None]:
# Regions predicted by the network passed to SGD with momentum.
model_predict_visualize_custom(X, y, net=mlp_sgd_m)