Add AdaGrad and RMSProp Optimizers

These optimizers implement parameter-wise adaptive learning rates, which can be beneficial for sparse or multi-scale data. AdaGrad accumulates a running history of past squared gradients, while RMSProp keeps an exponential moving average of them, so its effective learning rate adapts rather than shrinking monotonically. Both optimizers also support L2 regularization through weight decay.
m0saan committed Jun 17, 2023
1 parent b43e00f commit 063e9db
Showing 2 changed files with 384 additions and 2 deletions.
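For orientation, a rough usage sketch of the new optimizers follows. Only the constructor signatures come from the diff below; the `params` iterable, the `mi.optim` import path, and the `loss.backward()` call are assumptions about the surrounding minima API, not something this commit defines.

import minima as mi

# `params` is assumed to be an iterable of the model's parameter tensors,
# i.e. whatever the existing SGD optimizer in minima already accepts.
opt = mi.optim.AdaGrad(params, lr=0.01, wd=1e-4, eps=1e-7)
# or, with a decaying moving average instead of a running sum of squared gradients:
opt = mi.optim.RMSProp(params, lr=0.001, rho=0.9, wd=1e-4, eps=1e-7)

loss.backward()   # assumed to populate p.grad for every parameter
opt.step()        # applies weight decay, then the adaptive per-parameter update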
135 changes: 133 additions & 2 deletions minima/optim.py
@@ -1,7 +1,7 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/04_optim.ipynb.

# %% auto 0
-__all__ = ['Optimizer', 'SGD', 'Adam']
+__all__ = ['Optimizer', 'SGD', 'AdaGrad', 'RMSProp', 'Adam']

# %% ../nbs/04_optim.ipynb 2
import minima as mi
@@ -122,7 +122,138 @@ def _reg_step(self, p):
        if self.wd != 0:
            p.data *= (1 - self.lr * self.wd)

-# %% ../nbs/04_optim.ipynb 9
+# %% ../nbs/04_optim.ipynb 10
class AdaGrad(Optimizer):
    """
    Implements the AdaGrad optimization algorithm.

    AdaGrad adapts the learning rate of each parameter based on how frequently that
    parameter is updated during training. It is particularly useful for sparse data.

    Parameters
    ----------
    params : Iterable
        The parameters of the model to be optimized.
    lr : float, optional
        The initial learning rate.
    wd : float, optional
        The weight decay (L2 regularization).
    eps : float, optional
        A small constant for numerical stability.
    """
    def __init__(
        self,
        params, # The parameters of the model to be optimized.
        lr=0.01, # The initial learning rate.
        wd=0.0, # The weight decay (L2 regularization).
        eps=1e-7, # A small constant for numerical stability.
    ):
        super().__init__(params)

        self.lr = lr
        self.cache = {}
        self.wd = wd
        self.eps = eps

    def step(self):
        """
        Performs a single optimization step.

        This method uses the current gradients to adjust the parameters with the AdaGrad update rule.
        """
        for self.idx, p in enumerate(self.params):
            self._reg_step(p)
            self._opt_step(p)

    def _opt_step(self, p):
        """
        Performs the optimization step for a single parameter tensor.

        It computes a parameter-wise learning rate and updates the parameter accordingly.
        """
        if self.idx not in self.cache:
            self.cache[self.idx] = init.zeros(*p.shape)
        # Accumulate the running sum of squared gradients for this parameter.
        self.cache[self.idx] += p.grad.data ** 2
        # Scale the step by the inverse square root of the accumulated history.
        p.data = p.data - (self.lr / (self.cache[self.idx] + self.eps) ** 0.5) * p.grad.data

    def _reg_step(self, p):
        """
        Applies weight decay to a single parameter tensor.

        This form of L2 regularization can help prevent overfitting.
        """
        if self.wd != 0:
            p.data *= (1 - self.lr * self.wd)
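# --- Editorial illustration, not part of this commit or of optim.py: a minimal sketch of the
# AdaGrad rule implemented in _opt_step above, using plain Python floats. Because the cache
# only accumulates, the effective step size shrinks as a parameter keeps receiving gradients.
grad, cache, param, lr, eps = 0.5, 0.0, 1.0, 0.01, 1e-7
for _ in range(3):
    cache += grad ** 2                              # running sum of squared gradients
    param -= lr / (cache + eps) ** 0.5 * grad       # steps taken: 0.0100, ~0.0071, ~0.0058
print(param)                                        # ~0.9772 after three shrinking steps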

# %% ../nbs/04_optim.ipynb 13
class RMSProp(Optimizer):
    """
    Implements the RMSProp optimization algorithm.

    RMSProp keeps an exponential moving average of squared gradients and adapts the
    learning rate of each parameter individually, which makes it well suited to sparse
    or multi-scale data.

    Parameters
    ----------
    params : Iterable
        The parameters of the model to be optimized.
    lr : float, optional
        The initial learning rate.
    wd : float, optional
        The weight decay (L2 regularization).
    eps : float, optional
        A small constant for numerical stability.
    rho : float, optional
        The decay rate for the moving average of squared gradients.
    """
    def __init__(
        self,
        params, # The parameters of the model to be optimized.
        lr=0.001, # The initial learning rate.
        wd=0.0, # The weight decay (L2 regularization).
        eps=1e-7, # A small constant for numerical stability.
        rho=0.9, # The decay rate for the moving average of squared gradients.
    ):
        super().__init__(params)

        self.lr = lr
        self.cache = {}
        self.wd = wd
        self.eps = eps
        self.rho = rho

    def step(self):
        """
        Performs a single optimization step.

        This method uses the current gradients to adjust the parameters with the RMSProp update rule.
        """
        for self.idx, p in enumerate(self.params):
            self._reg_step(p)
            self._opt_step(p)

    def _opt_step(self, p):
        """
        Performs the optimization step for a single parameter tensor.

        It computes a parameter-wise learning rate and updates the parameter accordingly.
        """
        if self.idx not in self.cache:
            self.cache[self.idx] = init.zeros(*p.shape)
        # Update the exponential moving average of squared gradients.
        self.cache[self.idx] = self.rho * self.cache[self.idx] + (1 - self.rho) * p.grad.data ** 2
        # Scale the step by the inverse square root of the moving average.
        p.data = p.data - (self.lr / (self.cache[self.idx] + self.eps) ** 0.5) * p.grad.data

    def _reg_step(self, p):
        """
        Applies weight decay to a single parameter tensor.

        This form of L2 regularization can help prevent overfitting.
        """
        if self.wd != 0:
            p.data *= (1 - self.lr * self.wd)
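# --- Editorial illustration, not part of this commit or of optim.py: the same toy setup as
# above, but with RMSProp's moving average from _opt_step. Under a constant gradient the
# cache converges to grad ** 2, so the step settles near lr instead of decaying toward zero.
grad, cache, param, lr, eps, rho = 0.5, 0.0, 1.0, 0.001, 1e-7, 0.9
for _ in range(3):
    cache = rho * cache + (1 - rho) * grad ** 2     # exponential moving average of grad ** 2
    param -= lr / (cache + eps) ** 0.5 * grad       # steps taken: ~0.0032, ~0.0023, ~0.0019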

# %% ../nbs/04_optim.ipynb 16
class Adam(Optimizer):
    """
    Implements the Adam optimization algorithm.
