
Commit

adam
pchavanne committed Jan 14, 2017
1 parent 75f3946 commit 683b413
Showing 4 changed files with 107 additions and 19 deletions.
10 changes: 8 additions & 2 deletions docs/modules/updates.rst
@@ -13,8 +13,11 @@ Updating functions that are passed to the network.
momentum
nesterov_momentum
adagrad
-adadelta
rmsprop
+adadelta
+adam
+adamax
+nadam
hessian_free

Detailed description
@@ -24,6 +27,9 @@ Detailed description
.. autofunction:: momentum
.. autofunction:: nesterov_momentum
.. autofunction:: adagrad
-.. autofunction:: adadelta
.. autofunction:: rmsprop
+.. autofunction:: adadelta
+.. autofunction:: adam
+.. autofunction:: adamax
+.. autofunction:: nadam
.. autofunction:: hessian_free
1 change: 1 addition & 0 deletions yadll/init.py
@@ -184,6 +184,7 @@ def orthogonal(shape, gain=1, name=None, borrow=True, **kwargs):
----------
.. [1] http://smerity.com/articles/2016/orthogonal_init.html
+.. [2] https://arxiv.org/pdf/1312.6120.pdf
"""
if gain == relu:
gain = np.sqrt(2)
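For intuition, the orthogonal initialization described in [1] and [2] can be sketched in plain NumPy; the names below are illustrative, and this is a sketch of the standard SVD construction rather than this module's code.

import numpy as np

def orthogonal_sketch(shape, gain=1.0):
    # flatten to 2D, then orthogonalize a Gaussian draw with an SVD
    flat = (shape[0], int(np.prod(shape[1:])))
    a = np.random.normal(0.0, 1.0, flat)
    # u and v are both orthonormal; keep the one matching the flat shape
    u, _, v = np.linalg.svd(a, full_matrices=False)
    q = u if u.shape == flat else v
    return gain * q.reshape(shape)

With gain = np.sqrt(2), this matches the relu rescaling applied above.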
6 changes: 3 additions & 3 deletions yadll/layers.py
@@ -590,14 +590,14 @@ class BatchNormalization(Layer):
Normalize the input layer over each mini-batch according to [1]_:
.. math::
-x_hat = (x - E[x]) / \\sqrt(\\Var[x] + \\epsilon)
+\\hat{x} = \\frac{x - E[x]}{\\sqrt{Var[x] + \\epsilon}}
-y = \\gamma * x_hat + \\beta
+y = \\gamma * \\hat{x} + \\beta
References
----------
-..[1] http://jmlr.org/proceedings/papers/v37/ioffe15.pdf
+.. [1] http://jmlr.org/proceedings/papers/v37/ioffe15.pdf
"""
nb_instances = 0

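For intuition, the normalization above amounts to the following NumPy sketch (a hypothetical standalone function, not the layer implementation):

import numpy as np

def batch_norm_sketch(x, gamma, beta, epsilon=1e-4):
    # normalize each feature over the mini-batch, then scale and shift
    mean = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mean) / np.sqrt(var + epsilon)
    return gamma * x_hat + beta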
109 changes: 95 additions & 14 deletions yadll/updates.py
@@ -36,7 +36,7 @@ def momentum(cost, params, learning_rate=0.1, momentum=0.9, **kwargs):
"""
updates = sgd(cost, params, learning_rate)
for param in params:
-velocity = shared_variable(np.zeros(param.get_value(borrow=True).shape))
+velocity = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
p = momentum * velocity + updates[param]
updates[velocity] = p - param
updates[param] = p
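The shared-variable bookkeeping above is equivalent to the classical rule velocity := momentum * velocity - learning_rate * gradient; param := param + velocity. A plain NumPy sketch of one step (illustrative names only):

import numpy as np

def momentum_step(param, grad, velocity, lr=0.1, mom=0.9):
    # velocity := momentum * velocity - learning_rate * gradient
    velocity = mom * velocity - lr * grad
    # param := param + velocity
    return param + velocity, velocity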
@@ -47,35 +47,62 @@ def nesterov_momentum(cost, params, learning_rate=0.1, momentum=0.9, **kwargs):
"""Stochastic Gradient Descent (SGD) updates with Nesterov momentum
* ``velocity := momentum * velocity - learning_rate * gradient``
* ``param := param + momentum * velocity - learning_rate * gradient``
+References
+----------
+.. [1] https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617
"""
updates = sgd(cost, params, learning_rate)
for param in params:
-velocity = shared_variable(np.zeros(param.get_value(borrow=True).shape))
+velocity = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
p = momentum * velocity + updates[param] - param
updates[velocity] = p
-updates[param] = momentum * p + updates[param]
+updates[param] += momentum * p
return updates
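The two formulas in the docstring translate directly to NumPy; a sketch of one step under the same conventions (not this function's Theano graph):

import numpy as np

def nesterov_momentum_step(param, grad, velocity, lr=0.1, mom=0.9):
    # velocity := momentum * velocity - learning_rate * gradient
    velocity = mom * velocity - lr * grad
    # param := param + momentum * velocity - learning_rate * gradient
    return param + mom * velocity - lr * grad, velocity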


def adagrad(cost, params, learning_rate=1.0, epsilon=1e-6, **kwargs):
"""Adaptive Gradient Descent
Scale learning rates by dividing by the square root of accumulated
squared gradients
+References
+----------
+.. [1] http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
"""
gparams = T.grad(cost, params)
updates = OrderedDict()
for param, gparam in zip(params, gparams):
-accu = shared_variable(np.zeros(param.get_value(borrow=True).shape))
+accu = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
accu_new = accu + gparam ** 2
updates[accu] = accu_new
updates[param] = param - learning_rate * gparam / T.sqrt(accu_new + epsilon)
return updates
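One adagrad step in NumPy, mirroring the accumulator logic above (a sketch with assumed names):

import numpy as np

def adagrad_step(param, grad, accu, lr=1.0, eps=1e-6):
    # accumulate squared gradients over the whole run
    accu = accu + grad ** 2
    # frequently updated weights get a smaller effective learning rate
    return param - lr * grad / np.sqrt(accu + eps), accu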


+def rmsprop(cost, params, learning_rate=0.01, rho=0.9, epsilon=1e-6, **kwargs):
+"""RMSProp updates
+Scale learning rates by dividing by the moving average of the root mean
+squared (RMS) gradients
+"""
+gparams = T.grad(cost, params)
+updates = OrderedDict()
+for param, gparam in zip(params, gparams):
+accu = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
+accu_new = rho * accu + (1. - rho) * gparam ** 2
+updates[accu] = accu_new
+updates[param] = param - learning_rate * gparam / T.sqrt(accu_new + epsilon)
+return updates
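The only change from adagrad is that the accumulator decays, so old gradients are gradually forgotten; as a NumPy sketch (illustrative names):

import numpy as np

def rmsprop_step(param, grad, accu, lr=0.01, rho=0.9, eps=1e-6):
    # exponential moving average of squared gradients, not a full sum
    accu = rho * accu + (1. - rho) * grad ** 2
    return param - lr * grad / np.sqrt(accu + eps), accu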


def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6, **kwargs):
"""Adadelta Gradient Descent
Scale learning rates by the ratio of accumulated gradients to accumulated
step sizes
+References
+----------
+.. [1] https://arxiv.org/pdf/1212.5701v1.pdf
"""
gparams = T.grad(cost, params)
updates = OrderedDict()
@@ -85,7 +112,7 @@ def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6, **kwargs):
delta_accu = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)

# update accu (as in rmsprop)
-accu_new = rho * accu + (1 - rho) * gparam ** 2
+accu_new = rho * accu + (1. - rho) * gparam ** 2
updates[accu] = accu_new

# compute parameter update, using the 'old' delta_accu
@@ -94,24 +121,75 @@ def adadelta(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6, **kwargs):
updates[param] = param - learning_rate * update

# update delta_accu (as accu, but accumulating updates)
-delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
+delta_accu_new = rho * delta_accu + (1. - rho) * update ** 2
updates[delta_accu] = delta_accu_new

return updates
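One adadelta step in NumPy, mirroring the two accumulators above (a sketch, not the Theano graph):

import numpy as np

def adadelta_step(param, grad, accu, delta_accu, lr=1.0, rho=0.95, eps=1e-6):
    # running average of squared gradients, as in rmsprop
    accu = rho * accu + (1. - rho) * grad ** 2
    # step scaled by RMS of past updates over RMS of gradients
    update = grad * np.sqrt(delta_accu + eps) / np.sqrt(accu + eps)
    # running average of squared updates, using the new update
    delta_accu = rho * delta_accu + (1. - rho) * update ** 2
    return param - lr * update, accu, delta_accu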


-def rmsprop(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6, **kwargs):
-"""RMSProp updates
-Scale learning rates by dividing with the moving average of the root mean
-squared (RMS) gradients
+def adam(cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, **kwargs):
+"""Adam Gradient Descent
+Scale learning rates by adaptive moment estimation
+References
+----------
+.. [1] https://arxiv.org/pdf/1412.6980v8.pdf
"""
gparams = T.grad(cost, params)
updates = OrderedDict()
+t = theano.shared(floatX(0.))
+t_t = 1. + t
+l_r_t = learning_rate * T.sqrt(1. - beta2 ** t_t) / (1. - beta1 ** t_t)
for param, gparam in zip(params, gparams):
-accu = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
-accu_new = rho * accu + (1 - rho) * gparam ** 2
-updates[accu] = accu_new
-updates[param] = param - learning_rate * gparam / T.sqrt(accu_new + epsilon)
+m = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
+v = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
+m_t = beta1 * m + (1. - beta1) * gparam
+v_t = beta2 * v + (1. - beta2) * T.sqr(gparam)
+updates[m] = m_t
+updates[v] = v_t
+updates[param] = param - l_r_t * m_t / (T.sqrt(v_t) + epsilon)
+updates[t] = t_t
return updates
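A hypothetical end-to-end usage sketch for these update functions on a softmax regression; the model variables are assumed for illustration, and only adam comes from this module:

import numpy as np
import theano
import theano.tensor as T
from yadll.updates import adam

x = T.matrix('x')
y = T.ivector('y')
w = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX))
b = theano.shared(np.zeros(10, dtype=theano.config.floatX))
# negative log-likelihood of a softmax regression
cost = -T.mean(T.log(T.nnet.softmax(T.dot(x, w) + b))[T.arange(y.shape[0]), y])
updates = adam(cost, [w, b], learning_rate=0.001)
train = theano.function([x, y], cost, updates=updates)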


+def adamax(cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, **kwargs):
+"""Adamax Gradient Descent
+Variant of Adam based on the infinity norm
+References
+----------
+.. [1] https://arxiv.org/pdf/1412.6980v8.pdf
+"""
+gparams = T.grad(cost, params)
+updates = OrderedDict()
+t = theano.shared(floatX(0.))
+t_t = 1. + t
+l_r_t = learning_rate / (1. - beta1 ** t_t)
+for param, gparam in zip(params, gparams):
+m = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
+u = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
+m_t = beta1 * m + (1. - beta1) * gparam
+u_t = T.maximum(beta2 * u, abs(gparam))
+updates[m] = m_t
+updates[u] = u_t
+updates[param] = param - l_r_t * m_t / (u_t + epsilon)
+updates[t] = t_t
+return updates
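One adamax step in NumPy: the second moment is replaced by an exponentially weighted infinity norm, so only the first moment needs bias correction (a sketch; t is the step count starting at 0):

import numpy as np

def adamax_step(param, grad, m, u, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-6):
    t = t + 1.
    # first moment as in adam
    m = beta1 * m + (1. - beta1) * grad
    # infinity norm in place of the second moment
    u = np.maximum(beta2 * u, np.abs(grad))
    param = param - (lr / (1. - beta1 ** t)) * m / (u + eps)
    return param, m, u, t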


+def nadam(cost, params, learning_rate=1.0, rho=0.95, epsilon=1e-6, **kwargs):
+"""Nadam Gradient Descent
+Adam with Nesterov momentum
+References
+----------
+.. [1] http://cs229.stanford.edu/proj2015/054_report.pdf
+"""
+# TODO implement nadam method
+raise NotImplementedError
+gparams = T.grad(cost, params)
+updates = OrderedDict()
+
+return updates
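As a starting point for the TODO, one common formulation of the Nadam step following [1] (a NumPy sketch under those assumptions, not this library's implementation):

import numpy as np

def nadam_step(param, grad, m, v, t, lr=0.002, beta1=0.9, beta2=0.999, eps=1e-6):
    t = t + 1.
    m = beta1 * m + (1. - beta1) * grad
    v = beta2 * v + (1. - beta2) * grad ** 2
    # bias-corrected moments, with a Nesterov-style look-ahead on m
    m_hat = m / (1. - beta1 ** (t + 1.))
    v_hat = v / (1. - beta2 ** t)
    m_bar = beta1 * m_hat + (1. - beta1) * grad / (1. - beta1 ** t)
    return param - lr * m_bar / (np.sqrt(v_hat) + eps), m, v, t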


@@ -123,6 +201,9 @@ def hessian_free(cost, parms, **kwargs):
----------
.. [1] http://www.cs.toronto.edu/~jmartens/docs/Deep_HessianFree.pdf
.. [2] http://www.cs.toronto.edu/~hinton/absps/momentum.pdf
+.. [3] http://www.cs.utoronto.ca/~ilya/pubs/2011/HF-RNN.pdf
+.. [4] http://olivier.chapelle.cc/pub/precond.pdf
+.. [5] http://www.cs.toronto.edu/~rkiros/papers/shf13.pdf
"""
# TODO implement hessian_free method
raise NotImplementedError
