Adapt MultiheadAttention and LayerNorm to new Layer interface #3547

Merged
Merged 160 commits on Nov 22, 2023

Changes from 156 commits
Commits (160)
53cd8e7
Adapt MultiheadAttention to new Layer interface.
akropp Jun 7, 2023
727323b
Adapt LayerNorm to new Layer interface.
akropp Jun 7, 2023
a79ca49
Adapt MultiheadAttention to new Layer interface.
akropp Jun 7, 2023
8ef9f59
Adapt LayerNorm to new Layer interface.
akropp Jun 7, 2023
beb2873
Pass the correct "input" values to the sub-layers during the Backward…
akropp May 25, 2023
0a48537
Pass the correct "input" values to the sub-layers during the Backward…
akropp May 25, 2023
4de7035
Use MakeAlias to slice input data
Oct 19, 2023
ed0b423
Remove unnecessary template arg
Oct 19, 2023
b686012
Remove unnecessary template arg
Oct 19, 2023
441248f
Update layer_norm.hpp
akropp Oct 20, 2023
dcf4ce3
Adapt multihead_attention
Jun 7, 2023
89b78f8
Add self-attention flag to MultiheadAttention layer
Jun 13, 2023
7993469
Fix LayerNorm
Jun 13, 2023
2791789
Fix references to "input" in the Backwards call.
Jun 7, 2023
a0c3747
Update softmax_impl.hpp
akropp Oct 23, 2023
15c2c0f
Adding input and output parameters to Backward() method on Layer.
akropp Oct 27, 2023
b26a2fc
Merge remote-tracking branch 'upstream/master' into adapt_multihead
akropp Oct 31, 2023
91a62a5
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 1, 2023
efae2fd
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 1, 2023
a4ab4a6
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 1, 2023
a3466f2
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 1, 2023
e4309d0
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 1, 2023
b891e25
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 1, 2023
aec1c16
Update src/mlpack/methods/ann/layer/add.hpp
akropp Nov 1, 2023
e5320bd
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 1, 2023
e8e8432
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 1, 2023
77e41af
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 1, 2023
9c2bf7f
Update src/mlpack/methods/ann/layer/alpha_dropout.hpp
akropp Nov 1, 2023
dd3e259
Update src/mlpack/methods/ann/layer/batch_norm.hpp
akropp Nov 1, 2023
4723829
Update src/mlpack/methods/ann/layer/c_relu_impl.hpp
akropp Nov 1, 2023
bad9160
Update src/mlpack/methods/ann/layer/celu.hpp
akropp Nov 1, 2023
b7c25a8
Update src/mlpack/methods/ann/layer/concat.hpp
akropp Nov 1, 2023
6417e30
Update src/mlpack/methods/ann/layer/concat.hpp
akropp Nov 1, 2023
c7d94a6
Update src/mlpack/methods/ann/layer/concat_impl.hpp
akropp Nov 1, 2023
1dab4e2
Update src/mlpack/methods/ann/layer/concat_impl.hpp
akropp Nov 1, 2023
72a26ba
Update src/mlpack/methods/ann/layer/concatenate.hpp
akropp Nov 1, 2023
06c6dd9
Update src/mlpack/methods/ann/layer/convolution.hpp
akropp Nov 1, 2023
4bae489
Update src/mlpack/methods/ann/layer/dropout.hpp
akropp Nov 1, 2023
bf18a3d
Update src/mlpack/methods/ann/layer/elu.hpp
akropp Nov 1, 2023
28328e2
Update src/mlpack/methods/ann/layer/grouped_convolution.hpp
akropp Nov 1, 2023
b2b0590
Update src/mlpack/methods/ann/layer/identity.hpp
akropp Nov 1, 2023
5cc5e9b
Update src/mlpack/methods/ann/layer/leaky_relu.hpp
akropp Nov 1, 2023
50c8f09
Update src/mlpack/methods/ann/layer/linear.hpp
akropp Nov 1, 2023
bc08ce2
Update src/mlpack/methods/ann/layer/linear3d.hpp
akropp Nov 1, 2023
9aa7e1d
Update src/mlpack/methods/ann/layer/linear_no_bias.hpp
akropp Nov 1, 2023
2f8bf78
Update src/mlpack/methods/ann/layer/lstm.hpp
akropp Nov 1, 2023
a8782e6
Update src/mlpack/methods/ann/layer/log_softmax.hpp
akropp Nov 1, 2023
845ae5b
Update src/mlpack/methods/ann/layer/max_pooling.hpp
akropp Nov 1, 2023
b35fbc0
Update src/mlpack/methods/ann/layer/mean_pooling.hpp
akropp Nov 1, 2023
83f89f7
Update src/mlpack/methods/ann/layer/multi_layer_impl.hpp
akropp Nov 1, 2023
8505b46
Update src/mlpack/methods/ann/layer/multi_layer_impl.hpp
akropp Nov 1, 2023
d874e78
Update src/mlpack/methods/ann/layer/noisylinear.hpp
akropp Nov 1, 2023
85d7e44
Update src/mlpack/methods/ann/layer/padding.hpp
akropp Nov 1, 2023
d2c8e89
Update src/mlpack/methods/ann/layer/parametric_relu.hpp
akropp Nov 1, 2023
8bbb2d9
Update src/mlpack/methods/ann/layer/relu6.hpp
akropp Nov 1, 2023
0872c2a
Update src/mlpack/methods/ann/layer/softmax.hpp
akropp Nov 1, 2023
7acd8a8
Update src/mlpack/methods/ann/layer/softmin.hpp
akropp Nov 1, 2023
38a1fd8
Update src/mlpack/tests/ann/activation_functions_test.cpp
akropp Nov 1, 2023
f46312f
Update src/mlpack/tests/ann/layer/parametric_relu.cpp
akropp Nov 1, 2023
103db42
Update src/mlpack/methods/ann/layer/layer_norm.hpp
akropp Nov 1, 2023
99348fc
Update src/mlpack/methods/ann/layer/layer_norm.hpp
akropp Nov 1, 2023
490b55f
Update src/mlpack/methods/ann/layer/layer_norm.hpp
akropp Nov 1, 2023
99bdb8f
Update src/mlpack/methods/ann/layer/layer_norm_impl.hpp
akropp Nov 1, 2023
4e282dd
Update src/mlpack/methods/ann/layer/layer_norm_impl.hpp
akropp Nov 2, 2023
7d84005
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 2, 2023
bec84f4
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 2, 2023
9388c6f
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 2, 2023
1907e52
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
b79f690
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
346ea05
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
6dd71c1
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
21fe33e
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
c482407
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
d9e7d61
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
b6a2c58
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 2, 2023
45f88d1
Clean-up to fix compilation/test.
akropp Nov 2, 2023
e24bfd2
Remove input sizing (srcSeqLen, embedDim) from constructor.
akropp Nov 2, 2023
8dce94d
Update multihead_attention_impl.hpp
akropp Nov 2, 2023
b0a60bd
Remove dependency on input in Backward call
akropp Nov 2, 2023
636c4fc
consolidate constructors with default arg
akropp Nov 2, 2023
8cec72d
(correctly) change Backward to use input instead of output
akropp Nov 2, 2023
8401f64
Doc updates
akropp Nov 2, 2023
ce846ea
Defining Backward in terms of the inputs
akropp Nov 2, 2023
a553782
Optimization of Backward()
akropp Nov 2, 2023
20a48b8
Unnecessary alias
akropp Nov 2, 2023
b7ab3ba
Fix ftswish to use input instead of output
akropp Nov 3, 2023
360c17c
Fix c_relu test and backward method
akropp Nov 3, 2023
4596810
Add LayerNorm and MultiAttentionHead tests
akropp Nov 3, 2023
cc99b4a
Update activation functions to take input and output values
akropp Nov 8, 2023
517bb0b
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 8, 2023
ef3183e
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 8, 2023
c40787c
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 8, 2023
1b8313e
Update src/mlpack/methods/ann/layer/flexible_relu_impl.hpp
akropp Nov 8, 2023
f6d78ab
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 8, 2023
d91c5f0
Update src/mlpack/methods/ann/layer/c_relu_impl.hpp
akropp Nov 8, 2023
65b4878
Clarify input dimensions documentation, check valid input dims
akropp Nov 8, 2023
7b5e4d2
Merge remote-tracking branch 'upstream/master' into adapt_multihead
akropp Nov 9, 2023
3363707
Add in LayerNorm Test
akropp Nov 10, 2023
2bdae7d
Add MultiheadAttention test, fix dimension calculation
akropp Nov 10, 2023
39f7a10
Comment out unused params
akropp Nov 10, 2023
c990b93
Comment unused parameter
akropp Nov 10, 2023
f5be365
Correct calculation of srcSeqLen with more than 2 input dimensions
akropp Nov 13, 2023
eb3e7f2
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 13, 2023
5f385ff
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 13, 2023
89f93d1
Update src/mlpack/tests/ann/layer/layer_norm.cpp
akropp Nov 13, 2023
fb94b6d
Update src/mlpack/tests/ann/activation_functions_test.cpp
akropp Nov 13, 2023
008aabb
Update src/mlpack/tests/ann/layer/multihead_attention.cpp
akropp Nov 13, 2023
9451a8f
Update src/mlpack/tests/ann/activation_functions_test.cpp
akropp Nov 13, 2023
68f15ed
Update src/mlpack/methods/ann/activation_functions/elish_function.hpp
akropp Nov 13, 2023
d9f9e4f
Style changes, author
akropp Nov 13, 2023
214a731
Merge remote-tracking branch 'origin/adapt_multihead' into adapt_mult…
akropp Nov 13, 2023
a4ead8b
author
akropp Nov 13, 2023
e3a293b
simplify dy expression
akropp Nov 13, 2023
ae578a7
Update src/mlpack/methods/ann/activation_functions/gaussian_function.hpp
akropp Nov 13, 2023
7846bc0
Update src/mlpack/methods/ann/activation_functions/gaussian_function.hpp
akropp Nov 13, 2023
bc94bb9
Update src/mlpack/methods/ann/activation_functions/mish_function.hpp
akropp Nov 13, 2023
8f89db7
Update src/mlpack/methods/ann/activation_functions/lisht_function.hpp
akropp Nov 13, 2023
8de759e
Update src/mlpack/methods/ann/activation_functions/gelu_function.hpp
akropp Nov 13, 2023
1456ab9
Update src/mlpack/methods/ann/activation_functions/hard_swish_functio…
akropp Nov 13, 2023
18bac82
Update src/mlpack/methods/ann/activation_functions/hard_sigmoid_funct…
akropp Nov 13, 2023
d99c88e
Update src/mlpack/methods/ann/activation_functions/rectifier_function…
akropp Nov 13, 2023
6d6768f
Update src/mlpack/methods/ann/activation_functions/multi_quadratic_fu…
akropp Nov 13, 2023
4f66883
Update src/mlpack/methods/ann/activation_functions/poisson1_function.hpp
akropp Nov 13, 2023
7ab7d41
Update src/mlpack/methods/ann/activation_functions/quadratic_function…
akropp Nov 13, 2023
ce538a8
Update src/mlpack/methods/ann/activation_functions/rectifier_function…
akropp Nov 13, 2023
8914a1d
Update src/mlpack/methods/ann/activation_functions/silu_function.hpp
akropp Nov 13, 2023
e8f64d5
Update src/mlpack/methods/ann/activation_functions/softplus_function.hpp
akropp Nov 13, 2023
c8c8dbe
Update src/mlpack/methods/ann/activation_functions/softsign_function.hpp
akropp Nov 13, 2023
1cf92e5
Update src/mlpack/methods/ann/activation_functions/swish_function.hpp
akropp Nov 13, 2023
26b0311
Update src/mlpack/methods/ann/activation_functions/tanh_exponential_f…
akropp Nov 13, 2023
cd2c13e
Update src/mlpack/methods/ann/activation_functions/tanh_function.hpp
akropp Nov 13, 2023
4244a33
Update src/mlpack/methods/ann/activation_functions/spline_function.hpp
akropp Nov 13, 2023
d004462
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 13, 2023
dfcf40a
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 13, 2023
4cb0cd1
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 13, 2023
94bb5ff
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 13, 2023
5c7fde6
Fixes
akropp Nov 13, 2023
deb68b9
Fix deriv
akropp Nov 13, 2023
1539b6e
Remove commentary
akropp Nov 13, 2023
62cedfd
Fix conditionals
akropp Nov 13, 2023
a389184
Update elish_function.hpp
akropp Nov 14, 2023
b788e4a
Perf improvement
akropp Nov 14, 2023
fdb29d3
Update elish_function.hpp
akropp Nov 14, 2023
d90282a
Update elish_function.hpp
akropp Nov 14, 2023
677a6d8
input dimension size calc was wrong in Reset()
akropp Nov 15, 2023
8ad2e22
Temporary debugging spew to see why this fails in azure
akropp Nov 15, 2023
74f42a4
Fix bug with old armadillo version
akropp Nov 16, 2023
130b075
Update src/mlpack/methods/ann/activation_functions/hard_swish_functio…
akropp Nov 16, 2023
27ad35c
Update src/mlpack/methods/ann/layer/layer_norm.hpp
akropp Nov 16, 2023
2328247
Update src/mlpack/methods/ann/layer/multihead_attention_impl.hpp
akropp Nov 16, 2023
def2346
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 16, 2023
13cb8c2
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 16, 2023
0d4eb70
Update src/mlpack/methods/ann/layer/multihead_attention.hpp
akropp Nov 16, 2023
511fe69
Add author lines
akropp Nov 16, 2023
8e0039d
Update HISTORY.md
akropp Nov 16, 2023
df24810
Merge branch 'adapt_multihead' of https://github.com/akropp/mlpack in…
akropp Nov 16, 2023
7bce96a
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 20, 2023
0f7f472
Update src/mlpack/methods/ann/ffn_impl.hpp
akropp Nov 20, 2023
27eae7c
Update HISTORY.md
akropp Nov 20, 2023
cb7c6d3
Update HISTORY.md
akropp Nov 20, 2023
4 changes: 4 additions & 0 deletions HISTORY.md
@@ -11,6 +11,10 @@
* Fix setting number of classes correctly in `SoftmaxRegression::Train()`
(#3553).

* Adapt MultiheadAttention and LayerNorm to new Layer interface (#3547)

* Inconsistent use of the "input" parameter to the Backward method in ANNs (#3551)

### mlpack 4.2.1
###### 2023-09-05
* Reinforcement Learning: Gaussian noise (#3515).
@@ -5,6 +5,24 @@
* Convenience include for all activation functions implemented for mlpack's
* neural network toolkit.
*
* An activation function should define methods to evaluate the function
* and its derivative.
*
* For the forward pass, a class should define
* static double Fn(double x) -- evaluate y = F(x) at a single point
* and
* static void Fn(const InputVecType& x, OutputVecType& y) -- evaluate y = F(x)
* for a vector
*
* For the backward pass, a class should define the derivative function. For
* efficiency of implementation, it will be provided both x (the inputs) and
* y (the result of F(x)). The following should be defined
* static double Deriv(double x, double y) -- evaluate dF(x)/dx for one value
* of x given both x and y=F(x)
* static void Deriv(const InputVecType& x, const OutputVecType& y,
* DerivVecType& dy) -- evaluate dF(x)/dx for a vector x
* and a vector y=F(x)
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
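To make the new interface concrete, below is a minimal sketch (not part of this PR) of an activation function class that satisfies it; the class name SquareFunction and the function F(x) = x^2 are hypothetical and used only for illustration.

class SquareFunction
{
 public:
  // Forward pass at a single point: y = F(x) = x^2.
  static double Fn(const double x)
  {
    return x * x;
  }

  // Forward pass for a vector of inputs.
  template<typename InputVecType, typename OutputVecType>
  static void Fn(const InputVecType& x, OutputVecType& y)
  {
    y = arma::square(x);
  }

  // Backward pass at a single point: dF(x)/dx = 2x, given both x and y = F(x).
  static double Deriv(const double x, const double /* y */)
  {
    return 2 * x;
  }

  // Backward pass for a vector x and a vector y = F(x).
  template<typename InputVecType, typename OutputVecType, typename DerivVecType>
  static void Deriv(const InputVecType& x,
                    const OutputVecType& /* y */,
                    DerivVecType& dy)
  {
    dy = 2 * x;
  }
}; // class SquareFunction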
@@ -54,24 +54,26 @@ class BipolarSigmoidFunction
/**
* Computes the first derivative of the Bipolar Sigmoid function.
*
* @param y Input activation.
* @param x Input activation.
* @param y Result of Fn(x).
* @return f'(x)
*/
static double Deriv(const double y)
static double Deriv(const double /* x */, const double y)
{
return (1.0 - std::pow(y,2 )) / 2.0;
}

/**
* Computes the first derivatives of the Bipolar Sigmoid function.
*
* @param y Input activations.
* @param x The resulting derivatives.
* @param x Input activation.
* @param y Result of Fn(x).
* @param dy The resulting derivatives.
*/
template<typename InputVecType, typename OutputVecType>
static void Deriv(const InputVecType& y, OutputVecType& x)
template<typename InputVecType, typename OutputVecType, typename DerivVecType>
static void Deriv(const InputVecType& /* x */, const OutputVecType& y, DerivVecType& dy)
{
x = (1.0 - arma::pow(y, 2)) / 2.0;
dy = (1.0 - arma::pow(y, 2)) / 2.0;
}
}; // class BipolarSigmoidFunction

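For context, a hypothetical usage sketch of the two-argument Deriv() interface shown above (assuming Armadillo vectors and the mlpack namespace; not taken from the PR's tests):

arma::vec x = arma::linspace(-3.0, 3.0, 7);   // sample inputs
arma::vec y, dy;
mlpack::BipolarSigmoidFunction::Fn(x, y);     // forward pass: y = F(x)
// The backward pass now receives both the inputs x and the activations y = F(x).
mlpack::BipolarSigmoidFunction::Deriv(x, y, dy);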
49 changes: 33 additions & 16 deletions src/mlpack/methods/ann/activation_functions/elish_function.hpp
@@ -1,6 +1,7 @@
/**
* @file methods/ann/activation_functions/elish_function.hpp
* @author Bisakh Mondal
* @author Adam Kropp
*
* Definition and implementation of the ELiSH function as described by
* Mina Basirat and Peter M. Roth.
@@ -70,40 +71,56 @@ class ElishFunction
template<typename InputVecType, typename OutputVecType>
static void Fn(const InputVecType& x, OutputVecType& y)
{
y = ((x < 0.0) % ((arma::exp(x) -1) / (1 + arma::exp(-x))))
y = ((x < 0.0) % ((arma::exp(x) - 1) / (1 + arma::exp(-x))))
+ ((x >= 0.0) % (x / (1 + arma::exp(-x))));
}

/**
* Computes the first derivatives of ELiSH function.
*
* @param y Input data.
* @param x Input activation.
* @param y Result of Fn(x).
* @return f'(x).
*/
static double Deriv(const double y)
static double Deriv(const double x, const double y)
{
if (y < 0.0)
if (x < 0.0)
{
return std::exp(y) - 2 / (1 + std::exp(y)) +
2 / std::pow(1 + std::exp(y) , 2);
return std::exp(x) - 2 / (1 + std::exp(x)) +
2 / std::pow(1 + std::exp(x) , 2);
}
else if (x == 0) {
return 0.5; // the expression below is indeterminate at 0, even though
// the expression solely in terms of x is defined (= 0.5)
} else {
return (y / x) * (1 + x - y);
}

return 1 / (1 + std::exp(-y)) + y * std::exp(-y) /
std::pow(1 + std::exp(-y) , 2);
}

/**
* Computes the first derivatives of the ELiSH function.
*
* @param y Input data.
* @param x The resulting derivatives.
* @param x Input activation.
* @param y Result of Fn(x).
* @param dy The resulting derivatives.
*/
template<typename InputVecType, typename OutputVecType>
static void Deriv(const InputVecType& y, OutputVecType& x)
template<typename InputVecType, typename OutputVecType, typename DerivVecType>
static void Deriv(const InputVecType& x,
const OutputVecType& y,
DerivVecType& dy)
{
x = ((y < 0.0) % (arma::exp(y) - 2 / (1 + arma::exp(y)) + 2 / arma::pow(
1 + arma::exp(y), 2))) + ((y >= 0.0) % (1 / (1 + arma::exp(-y)) + y %
arma::exp(-y) / arma::pow(1 + arma::exp(-y), 2)));
// simplified the x>=0 part to be in terms of x and y -- maybe
// the x<0 part can be as well?
// the expression is indeterminate at 0, even though
// the expression solely in terms of x is defined (= 0.5)
// only calculate exp(x) once for each element where x < 0
// this gives approx 3x speedup, despite allocating the temp vector
DerivVecType ex = (x < 0) % arma::exp(x);
dy = ((x < 0) % ((ex - 2 / (1 + ex) + 2 / arma::pow(1 + ex, 2)))) +
((x > 0) % ((y / x) % (1.0 + x - y)));
// need to do this here, because the /x above gives nans even when the
// condition is not met (e.g. when x > 0 is false)
dy(arma::find(x == 0)).fill(0.5);
}
}; // class ElishFunction

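As a side note (not part of the diff), the simplified x > 0 branch above follows from writing ELiSH as F(x) = x * s(x) for x >= 0, where s(x) = 1 / (1 + e^-x) is the logistic sigmoid, so y = x * s(x). Then

  F'(x) = s(x) + x * s(x) * (1 - s(x))
        = (y / x) * (1 + x - x * s(x))
        = (y / x) * (1 + x - y),

which is why the derivative can be computed from the stored activation y without re-evaluating the sigmoid, and why the x == 0 case (where y / x is indeterminate) is filled separately with the limit value 0.5.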
20 changes: 12 additions & 8 deletions src/mlpack/methods/ann/activation_functions/elliot_function.hpp
@@ -65,24 +65,28 @@ class ElliotFunction
/**
* Computes the first derivative of the Elliot function.
*
* @param y Input data.
* @param x Input activation.
* @param y Result of Fn(x).
* @return f'(x).
*/
static double Deriv(const double y)
static double Deriv(const double x, const double /* y */)
{
return std::pow(1.0 - std::abs(y), 2);
return 1.0 / std::pow(1.0 + std::abs(x), 2);
}

/**
* Computes the first derivatives of the Elliot function.
*
* @param y Input activations.
* @param x The resulting derivatives.
* @param x Input activation.
* @param y Result of Fn(x).
* @param dy The resulting derivatives.
*/
template <typename InputVecType, typename OutputVecType>
static void Deriv(const InputVecType &y, OutputVecType &x)
template <typename InputVecType, typename OutputVecType, typename DerivVecType>
static void Deriv(const InputVecType & x,
const OutputVecType& /* y */,
DerivVecType &dy)
{
x = arma::pow(1.0 - arma::abs(y), 2);
dy = 1.0 / arma::pow(1.0 + arma::abs(x), 2);
}
}; // class ElliotFunction

19 changes: 11 additions & 8 deletions src/mlpack/methods/ann/activation_functions/gaussian_function.hpp
@@ -1,6 +1,7 @@
/**
* @file gaussian_function.hpp
* @author Himanshu Pathak
* @author Adam Kropp
*
* Definition and implementation of the gaussian function.
*
@@ -54,24 +55,26 @@ class GaussianFunction
/**
* Computes the first derivative of the gaussian function.
*
* @param y Input data.
* @param x Input activation.
* @param y Result of Fn(x).
* @return f'(x)
*/
static double Deriv(const double y)
static double Deriv(const double x, const double y)
{
return 2 * -y * std::exp(-1 * std::pow(y, 2));
return -2 * x * y;
}

/**
* Computes the first derivatives of the gaussian function.
*
* @param y Input activations.
* @param x The resulting derivatives.
* @param x Input activation.
* @param y Result of Fn(x).
* @param dy The resulting derivatives.
*/
template<typename InputVecType, typename OutputVecType>
static void Deriv(const InputVecType& y, OutputVecType& x)
template<typename InputVecType, typename OutputVecType, typename DerivVecType>
static void Deriv(const InputVecType& x, const OutputVecType& y, DerivVecType& dy)
{
x = 2 * -y % arma::exp(-1 * arma::pow(y, 2));
dy = -2 * x % y;
}
}; // class GaussianFunction

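For reference (not part of the diff), the simplification above uses the stored activation y = F(x) = e^(-x^2): by the chain rule,

  F'(x) = -2x * e^(-x^2) = -2 * x * y,

so the derivative needs no second evaluation of the exponential.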
37 changes: 22 additions & 15 deletions src/mlpack/methods/ann/activation_functions/gelu_function.hpp
@@ -1,6 +1,7 @@
/**
* @file methods/ann/activation_functions/gelu_function.hpp
* @author Himanshu Pathak
* @author Adam Kropp
*
* Definition and implementation of the Gaussian Error Linear Unit (GELU)
* function.
@@ -22,7 +23,7 @@ namespace mlpack {
*
* @f{eqnarray*}{
* f(x) = 0.5 * x * {1 + tanh[(2/pi)^(1/2) * (x + 0.044715 * x^3)]} \\
* f'(x) = 0.5 * tanh(0.0356774 * x^3) + 0.797885 * x) +
* f'(x) = 0.5 * tanh(0.0356774 * x^3 + 0.797885 * x) +
* (0.0535161x^3 + 0.398942 * x) *
* sech^2(0.0356774 * x^3+0.797885 * x) + 0.5\\
* @f}
@@ -58,30 +59,36 @@ class GELUFunction
/**
* Computes the first derivative of the GELU function.
*
* @param y Input data.
* @param x Input activation.
* @param y Result of Fn(x).
* @return f'(x)
*/
static double Deriv(const double y)
static double Deriv(const double x, const double /* y */)
{
return 0.5 * std::tanh(0.0356774 * std::pow(y, 3) + 0.797885 * y) +
(0.0535161 * std::pow(y, 3) + 0.398942 * y) *
std::pow(1 / std::cosh(0.0356774 * std::pow(y, 3) +
0.797885 * y), 2) + 0.5;
if (x < -10) return 0.0; // catch overflows
return 0.5 * std::tanh(0.0356774 * std::pow(x, 3) + 0.797885 * x) +
(0.0535161 * std::pow(x, 3) + 0.398942 * x) *
std::pow(1 / std::cosh(0.0356774 * std::pow(x, 3) +
0.797885 * x), 2) + 0.5;
}

/**
* Computes the first derivatives of the GELU function.
*
* @param y Input data.
* @param x The resulting derivatives.
* @param x Input activation.
* @param y Result of Fn(x).
* @param dy The resulting derivatives.
*/
template<typename InputVecType, typename OutputVecType>
static void Deriv(const InputVecType& y, OutputVecType& x)
template<typename InputVecType, typename OutputVecType, typename DerivVecType>
static void Deriv(const InputVecType& x,
const OutputVecType& /* y */,
DerivVecType& dy)
{
x = 0.5 * arma::tanh(0.0356774 * arma::pow(y, 3) + 0.797885 * y) +
(0.0535161 * arma::pow(y, 3) + 0.398942 * y) %
arma::pow(1 / arma::cosh(0.0356774 * arma::pow(y, 3) +
0.797885 * y), 2) + 0.5;
dy = 0.5 * arma::tanh(0.0356774 * arma::pow(x, 3) + 0.797885 * x) +
(0.0535161 * arma::pow(x, 3) + 0.398942 * x) %
arma::pow(1 / arma::cosh(0.0356774 * arma::pow(x, 3) +
0.797885 * x), 2) + 0.5;
dy(arma::find(x < -10)).fill(0); // catch overflows
}
}; // class GELUFunction

@@ -63,10 +63,11 @@ class HardSigmoidFunction
/**
* Computes the first derivatives of hard sigmoid function.
*
* @param y Input data.
* @param x Input activation.
* @param y Result of Fn(x).
* @return f'(x)
*/
static double Deriv(const double y)
static double Deriv(const double /* x */, const double y)
{
if (y == 0.0 || y == 1.0)
{
@@ -78,18 +79,21 @@ class HardSigmoidFunction
/**
* Computes the first derivatives of the hard sigmoid function.
*
* @param y Input data.
* @param x The resulting derivatives.
* @param x Input activation.
* @param y Result of Fn(x).
* @param dy The resulting derivatives.
*/
template<typename InputVecType, typename OutputVecType>
static void Deriv(const InputVecType& y, OutputVecType& x)
template<typename InputVecType, typename OutputVecType, typename DerivVecType>
static void Deriv(const InputVecType& x,
const OutputVecType& y,
DerivVecType& dy)
{
x.set_size(size(y));
dy.set_size(size(y));

#pragma omp for
for (size_t i = 0; i < (size_t) y.n_elem; ++i)
{
x(i) = Deriv(y(i));
dy(i) = Deriv(x(i), y(i));
}
}
}; // class HardSigmoidFunction
28 changes: 16 additions & 12 deletions src/mlpack/methods/ann/activation_functions/hard_swish_function.hpp
@@ -81,33 +81,37 @@ class HardSwishFunction
/**
* Computes the first derivative of the Hard Swish function.
*
* @param y Input data.
* @param x Input activation.
* @param * (y) Result of Fn(x).
* @return f'(x).
*/
static double Deriv(const double y)
static double Deriv(const double x, const double /* y */)
{
if (y <= -3)
if (x <= -3)
return 0;
else if (y >= 3)
else if (x >= 3)
return 1;

return (2 * y + 3.0) / 6.0;
return (2 * x + 3.0) / 6.0;
}

/**
* Computes the first derivatives of the Hard Swish function.
*
* @param y Input data.
* @param x The resulting derivatives.
* @param x Input activation.
* @param y Result of Fn(x).
* @param dy The resulting derivatives.
*/
template <typename InputVecType, typename OutputVecType>
static void Deriv(const InputVecType &y, OutputVecType &x)
template <typename InputVecType, typename OutputVecType, typename DerivVecType>
static void Deriv(const InputVecType &x,
const OutputVecType& y,
DerivVecType &dy)
{
x.set_size(size(y));
dy.set_size(size(x));

#pragma omp for
for (size_t i = 0; i < (size_t) y.n_elem; i++)
x(i) = Deriv(y(i));
for (size_t i = 0; i < (size_t) x.n_elem; i++)
dy(i) = Deriv(x(i), y(i));
}
}; // class HardSwishFunction
