Add AdaGrad and RMSProp Optimizers

These optimizers implement parameter-wise adaptive learning rates, which can be beneficial for sparse or multi-scale data. AdaGrad accumulates a running history of past squared gradients, while RMSProp keeps an exponential moving average of them, so its effective learning rate adapts rather than shrinking monotonically. Both optimizers also support L2 regularization through weight decay.
m0saan committed Jun 17, 2023
1 parent b43e00f commit 063e9db
Showing 2 changed files with 384 additions and 2 deletions.
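For orientation, a rough usage sketch of the new optimizers follows. Only the constructor signatures come from the diff below; the `params` iterable, the `mi.optim` import path, and the `loss.backward()` call are assumptions about the surrounding minima API, not something this commit defines.

import minima as mi

# `params` is assumed to be an iterable of the model's parameter tensors,
# i.e. whatever the existing SGD optimizer in minima already accepts.
opt = mi.optim.AdaGrad(params, lr=0.01, wd=1e-4, eps=1e-7)
# or, with a decaying moving average instead of a running sum of squared gradients:
opt = mi.optim.RMSProp(params, lr=0.001, rho=0.9, wd=1e-4, eps=1e-7)

loss.backward()   # assumed to populate p.grad for every parameter
opt.step()        # applies weight decay, then the adaptive per-parameter update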
135 changes: 133 additions & 2 deletions minima/optim.py
@@ -1,7 +1,7 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/04_optim.ipynb.

# %% auto 0
-__all__ = ['Optimizer', 'SGD', 'Adam']
+__all__ = ['Optimizer', 'SGD', 'AdaGrad', 'RMSProp', 'Adam']

# %% ../nbs/04_optim.ipynb 2
import minima as mi
@@ -122,7 +122,138 @@ def _reg_step(self, p):
        if self.wd != 0:
            p.data *= (1 - self.lr * self.wd)

-# %% ../nbs/04_optim.ipynb 9
+# %% ../nbs/04_optim.ipynb 10
class AdaGrad(Optimizer):
    """
    Implements the AdaGrad optimization algorithm.

    AdaGrad adapts the learning rate of each parameter based on how frequently that
    parameter is updated during training. It is particularly useful for sparse data.

    Parameters
    ----------
    params : Iterable
        The parameters of the model to be optimized.
    lr : float, optional
        The initial learning rate.
    wd : float, optional
        The weight decay (L2 regularization).
    eps : float, optional
        A small constant for numerical stability.
    """
    def __init__(
        self,
        params, # The parameters of the model to be optimized.
        lr=0.01, # The initial learning rate.
        wd=0.0, # The weight decay (L2 regularization).
        eps=1e-7, # A small constant for numerical stability.
    ):
        super().__init__(params)

        self.lr = lr
        self.cache = {}
        self.wd = wd
        self.eps = eps

    def step(self):
        """
        Performs a single optimization step.

        This method uses the current gradients to adjust the parameters with the AdaGrad update rule.
        """
        for self.idx, p in enumerate(self.params):
            self._reg_step(p)
            self._opt_step(p)

    def _opt_step(self, p):
        """
        Performs the optimization step for a single parameter tensor.

        It computes a parameter-wise learning rate and updates the parameter accordingly.
        """
        if self.idx not in self.cache:
            self.cache[self.idx] = init.zeros(*p.shape)
        # Accumulate the running sum of squared gradients for this parameter.
        self.cache[self.idx] += p.grad.data ** 2
        # Scale the step by the inverse square root of the accumulated history.
        p.data = p.data - (self.lr / (self.cache[self.idx] + self.eps) ** 0.5) * p.grad.data

    def _reg_step(self, p):
        """
        Applies weight decay to a single parameter tensor.

        This form of L2 regularization can help prevent overfitting.
        """
        if self.wd != 0:
            p.data *= (1 - self.lr * self.wd)
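# --- Editorial illustration, not part of this commit or of optim.py: a minimal sketch of the
# AdaGrad rule implemented in _opt_step above, using plain Python floats. Because the cache
# only accumulates, the effective step size shrinks as a parameter keeps receiving gradients.
grad, cache, param, lr, eps = 0.5, 0.0, 1.0, 0.01, 1e-7
for _ in range(3):
    cache += grad ** 2                              # running sum of squared gradients
    param -= lr / (cache + eps) ** 0.5 * grad       # steps taken: 0.0100, ~0.0071, ~0.0058
print(param)                                        # ~0.9772 after three shrinking steps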

# %% ../nbs/04_optim.ipynb 13
class RMSProp(Optimizer):
    """
    Implements the RMSProp optimization algorithm.

    RMSProp keeps an exponential moving average of squared gradients and adapts the
    learning rate of each parameter individually, which makes it well suited to sparse
    or multi-scale data.

    Parameters
    ----------
    params : Iterable
        The parameters of the model to be optimized.
    lr : float, optional
        The initial learning rate.
    wd : float, optional
        The weight decay (L2 regularization).
    eps : float, optional
        A small constant for numerical stability.
    rho : float, optional
        The decay rate for the moving average of squared gradients.
    """
    def __init__(
        self,
        params, # The parameters of the model to be optimized.
        lr=0.001, # The initial learning rate.
        wd=0.0, # The weight decay (L2 regularization).
        eps=1e-7, # A small constant for numerical stability.
        rho=0.9, # The decay rate for the moving average of squared gradients.
    ):
        super().__init__(params)

        self.lr = lr
        self.cache = {}
        self.wd = wd
        self.eps = eps
        self.rho = rho

    def step(self):
        """
        Performs a single optimization step.

        This method uses the current gradients to adjust the parameters with the RMSProp update rule.
        """
        for self.idx, p in enumerate(self.params):
            self._reg_step(p)
            self._opt_step(p)

    def _opt_step(self, p):
        """
        Performs the optimization step for a single parameter tensor.

        It computes a parameter-wise learning rate and updates the parameter accordingly.
        """
        if self.idx not in self.cache:
            self.cache[self.idx] = init.zeros(*p.shape)
        # Update the exponential moving average of squared gradients.
        self.cache[self.idx] = self.rho * self.cache[self.idx] + (1 - self.rho) * p.grad.data ** 2
        # Scale the step by the inverse square root of the moving average.
        p.data = p.data - (self.lr / (self.cache[self.idx] + self.eps) ** 0.5) * p.grad.data

    def _reg_step(self, p):
        """
        Applies weight decay to a single parameter tensor.

        This form of L2 regularization can help prevent overfitting.
        """
        if self.wd != 0:
            p.data *= (1 - self.lr * self.wd)
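# --- Editorial illustration, not part of this commit or of optim.py: the same toy setup as
# above, but with RMSProp's moving average from _opt_step. Under a constant gradient the
# cache converges to grad ** 2, so the step settles near lr instead of decaying toward zero.
grad, cache, param, lr, eps, rho = 0.5, 0.0, 1.0, 0.001, 1e-7, 0.9
for _ in range(3):
    cache = rho * cache + (1 - rho) * grad ** 2     # exponential moving average of grad ** 2
    param -= lr / (cache + eps) ** 0.5 * grad       # steps taken: ~0.0032, ~0.0023, ~0.0019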

# %% ../nbs/04_optim.ipynb 16
class Adam(Optimizer):
    """
    Implements the Adam optimization algorithm.
