# Optimizers

Implementations of SGD, Momentum, etc., up to AdamW and Muon.

## SGD (Stochastic Gradient Descent)

**Idea:** Take a step in the negative gradient direction.

```
θ = θ - lr * g
```

That's it. Simple, but struggles with noisy gradients and ill-conditioned landscapes.

In [None]:
import torch

In [None]:
class SDG(torch.optim.Optimizer):
    """Plain stochastic gradient descent: ``θ = θ - lr * g``.

    NOTE: the class name is spelled ``SDG`` (not ``SGD``); it is kept as-is
    so existing callers continue to work.

    Args:
        params: Parameters (or parameter groups) to optimize, forwarded to
            ``torch.optim.Optimizer``.
        lr: Learning rate; must be non-negative.

    Raises:
        ValueError: If ``lr`` is negative.
    """

    def __init__(self, params, lr=0.01):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        defaults = dict(lr=lr)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Apply one SGD update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and
                returns the loss. It is invoked before the update.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure
            was supplied.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad, so autograd must be switched back
            # on for the closure to be able to build a graph / call backward.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            for p in group['params']:
                if p.grad is None:
                    # Parameter did not take part in the backward pass.
                    continue
                # In-place update; safe on a leaf tensor because of no_grad.
                p.add_(p.grad, alpha=-lr)

        return loss


In [None]:
# Fix the RNG so the run is reproducible
torch.manual_seed(42)

# Synthetic regression batch: inputs first, then targets
x = torch.randn(16, 32)
target = torch.randn(16, 64)

# Same starting values held in two separate leaf tensors
W1 = torch.randn(64, 32, requires_grad=True)
W2 = W1.clone().detach().requires_grad_(True)

# Learning rate shared by both optimizers
lr = 0.02

# Reference implementation drives W1, the custom one drives W2
opt_torch = torch.optim.SGD([W1], lr=lr)
opt_custom = SDG([W2], lr=lr)

for i in range(20):
    # Reference update on W1
    opt_torch.zero_grad()
    loss1 = (x @ W1.T - target).pow(2).mean()
    loss1.backward()
    opt_torch.step()

    # Custom update on W2 (independent graph, same data)
    opt_custom.zero_grad()
    loss2 = (x @ W2.T - target).pow(2).mean()
    loss2.backward()
    opt_custom.step()

    # Both weight tensors must stay bit-for-bit identical
    weight_max_diff = torch.max(torch.abs(W1 - W2)).item()
    assert weight_max_diff == 0.0
print(f"All good after {i+1} iterations!")

## SGD with Momentum

**Idea:** Accumulate gradients over time into a "velocity." Smooths out noise, builds up speed in consistent directions.

```
v = β * v + g
θ = θ - lr * v
```

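Typical β = 0.9 (averages over roughly the last 1/(1-β) = 10 steps). For a constant gradient g, the steady-state velocity is g/(1-β), so the effective step is up to 1/(1-β)× larger than in vanilla SGD (10× for β = 0.9).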

In [None]:
import torch

In [None]:
class SDGMomentum(torch.optim.Optimizer):
    """SGD with momentum.

    Update rule per parameter::

        v = β * v + g
        θ = θ - lr * v

    NOTE: the class name is spelled ``SDGMomentum`` (not ``SGDMomentum``);
    it is kept as-is so existing callers continue to work.

    Args:
        params: Parameters (or parameter groups) to optimize, forwarded to
            ``torch.optim.Optimizer``.
        lr: Learning rate; must be non-negative.
        momentum: Momentum coefficient β; must be non-negative.

    Raises:
        ValueError: If ``lr`` or ``momentum`` is negative.
    """

    def __init__(self, params, lr=0.01, momentum=0.9):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if momentum < 0.0:
            raise ValueError(f"Invalid momentum value: {momentum}")
        defaults = dict(lr=lr, momentum=momentum)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Apply one momentum-SGD update to every parameter with a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and
                returns the loss. It is invoked before the update.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure
            was supplied.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad, so autograd must be switched back
            # on for the closure to be able to build a graph / call backward.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            for p in group['params']:
                if p.grad is None:
                    # Parameter did not take part in the backward pass.
                    continue

                # Lazy init: check for the buffer key itself, so a state
                # entry that exists without a buffer is still handled.
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(p)

                # v = β * v + g, updated in place
                buf = state['momentum_buffer']
                buf.mul_(momentum).add_(p.grad)

                # θ = θ - lr * v
                p.add_(buf, alpha=-lr)

        return loss

In [None]:
# Fix the RNG so the run is reproducible
torch.manual_seed(42)

# Synthetic regression batch: inputs first, then targets
x = torch.randn(16, 32)
target = torch.randn(16, 64)

# Same starting values held in two separate leaf tensors
W1 = torch.randn(64, 32, requires_grad=True)
W2 = W1.clone().detach().requires_grad_(True)

# Learning rate shared by both optimizers
lr = 0.02

# Reference implementation drives W1, the custom one drives W2
opt_torch = torch.optim.SGD([W1], lr=lr, momentum=0.9)
opt_custom = SDGMomentum([W2], lr=lr, momentum=0.9)

for i in range(20):
    # Reference update on W1
    opt_torch.zero_grad()
    loss1 = (x @ W1.T - target).pow(2).mean()
    loss1.backward()
    opt_torch.step()

    # Custom update on W2 (independent graph, same data)
    opt_custom.zero_grad()
    loss2 = (x @ W2.T - target).pow(2).mean()
    loss2.backward()
    opt_custom.step()

    # Optimizer state must match: same single key, same buffer contents
    state1 = opt_torch.state[W1]
    state2 = opt_custom.state[W2]
    assert list(state1) == ['momentum_buffer']
    assert torch.equal(state1['momentum_buffer'], state2['momentum_buffer'])

    # Weights must stay bit-for-bit identical as well
    weight_max_diff = torch.max(torch.abs(W1 - W2)).item()
    assert weight_max_diff == 0.0
    print(f"Diff {weight_max_diff}")

print(f"All good after {i+1} iterations!")