In [5]:
import sys
sys.path.append("../torchlet")

In [2]:
from abc import ABC, abstractmethod
from torchlet.engine import Element

class Optimizer(ABC):
    def __init__(self, params):
        self.params = params
    
    @abstractmethod
    def step(self):
        """Perform a single optimization step."""
        pass

    def zero_grad(self):
        """Zero out the gradients for all parameters."""
        for param in self.params:
            param.grad = None

In [4]:
class SGD(Optimizer):
    def __init__(self, params, lr=0.01):
        super().__init__(params)
        self.lr = lr

    def step(self):
        for param in self.params:
            if param.grad is not None:
                param.data -= self.lr * param.grad

In [5]:
class SGDMomentum(Optimizer):
    def __init__(self, params, lr=0.01, momentum=0.9):
        super().__init__(params)
        self.lr = lr
        self.momentum = momentum
        self.velocity = {param: Element(0) for param in self.params}

    def step(self):
        for param in self.params:
            if param.grad is not None:
                self.velocity[param] = self.momentum * self.velocity[param] + self.lr * param.grad
                param.data -= self.velocity[param]

In [6]:
class Nesterov(Optimizer):
    def __init__(self, params, lr=0.01, momentum=0.9):
        super().__init__(params)
        self.lr = lr
        self.momentum = momentum
        self.velocity = {param: Element(0) for param in self.params}

    def step(self):
        for param in self.params:
            if param.grad is not None:
                prev_velocity = self.velocity[param]
                self.velocity[param] = self.momentum * self.velocity[param] + self.lr * param.grad
                param.data -= (self.momentum * prev_velocity + self.lr * param.grad)

In [7]:
class RMSprop(Optimizer):
    def __init__(self, params, lr=0.01, beta=0.9, epsilon=1e-8):
        super().__init__(params)
        self.lr = lr
        self.beta = beta
        self.epsilon = epsilon
        self.squared_grad = {param: Element(0) for param in self.params}

    def step(self):
        for param in self.params:
            if param.grad is not None:
                self.squared_grad[param] = self.beta * self.squared_grad[param] + (1 - self.beta) * param.grad ** 2
                param.data -= self.lr * param.grad / (self.squared_grad[param].data ** 0.5 + self.epsilon)

In [8]:
class AdaGrad(Optimizer):
    def __init__(self, params, lr=0.01, epsilon=1e-8):
        super().__init__(params)
        self.lr = lr
        self.epsilon = epsilon
        self.sum_of_squares = {param: Element(0) for param in self.params}

    def step(self):
        for param in self.params:
            if param.grad is not None:
                self.sum_of_squares[param] += param.grad ** 2
                param.data -= self.lr * param.grad / (self.sum_of_squares[param].data ** 0.5 + self.epsilon)

In [9]:
class Adam(Optimizer):
    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {param: Element(0) for param in self.params}
        self.v = {param: Element(0) for param in self.params}
        self.t = 0

    def step(self):
        self.t += 1
        for param in self.params:
            if param.grad is not None:
                self.m[param] = self.beta1 * self.m[param] + (1 - self.beta1) * param.grad
                self.v[param] = self.beta2 * self.v[param] + (1 - self.beta2) * (param.grad ** 2)

                m_hat = self.m[param] / (1 - self.beta1 ** self.t)
                v_hat = self.v[param] / (1 - self.beta2 ** self.t)

                param.data -= self.lr * m_hat / (v_hat.data ** 0.5 + self.epsilon)

In [10]:
class AdamW(Adam):
    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01):
        super().__init__(params, lr, beta1, beta2, epsilon)
        self.weight_decay = weight_decay

    def step(self):
        self.t += 1
        for param in self.params:
            if param.grad is not None:
                self.m[param] = self.beta1 * self.m[param] + (1 - self.beta1) * param.grad
                self.v[param] = self.beta2 * self.v[param] + (1 - self.beta2) * (param.grad ** 2)

                m_hat = self.m[param] / (1 - self.beta1 ** self.t)
                v_hat = self.v[param] / (1 - self.beta2 ** self.t)

                param.data -= self.lr * m_hat / (v_hat.data ** 0.5 + self.epsilon) + self.weight_decay * param.data

In [13]:
from torchlet.engine import Element
from torchlet.nn import MLP

# Initialize model and optimizer
model = MLP(3, [4, 4, 1])
optimizer = Adam(model.parameters())

# Forward pass
x = [Element(1.0), Element(2.0), Element(3.0)]
y = model(x)

# Compute loss and backpropagate
loss = y * Element(1.0)
loss.backward()

# Step the optimizer
optimizer.step()

# Zero the gradients for the next iteration
optimizer.zero_grad()

UFuncTypeError: Cannot cast ufunc 'subtract' output from dtype('O') to dtype('float64') with casting rule 'same_kind'