Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perform LMUFFT with raw convolution #42

Merged
merged 5 commits into from
Aug 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 18 additions & 4 deletions .nengobones.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@ manifest_in: {}

setup_py:
install_req:
- packaging>=20.9
- scipy>=1.0.0
- tensorflow>=2.1.0
tests_req:
- pytest>=6.1.0
- pytest-rng>=1.0.0
docs_req:
- matplotlib>=3.0.2
- matplotlib>=3.0.2,<3.4.3
- jupyter>=1.0.0
- seaborn>=0.9.0
- sphinx>=1.8
Expand Down Expand Up @@ -65,7 +66,7 @@ travis_yml:
TF_VERSION: tensorflow
jobs:
- script: static
- script: test
- script: remote-test
- script: test
env:
TF_VERSION: tensorflow==2.1.0
Expand Down Expand Up @@ -94,6 +95,19 @@ ci_scripts:
coverage: true
pip_install:
- $TF_VERSION
- template: remote-script
remote_script: test
output_name: remote-test
host: azure
travis_var_key: 2895d60e3414
azure_name: nengo-dl
azure_group: nengo-ci
coverage: true
remote_vars:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
TF_VERSION: $TF_VERSION
remote_setup:
- conda install -y -c conda-forge cudatoolkit=11.3 cudnn=8.2
- template: remote-script
remote_script: docs
output_name: remote-docs
Expand All @@ -102,7 +116,7 @@ ci_scripts:
azure_name: nengo-dl-docs
azure_group: nengo-ci
remote_setup:
- conda install -y -c conda-forge cudatoolkit=11.2 cudnn=8.1
- conda install -y -c conda-forge cudatoolkit=11.3 cudnn=8.2
- template: remote-script
remote_script: examples
output_name: remote-examples
Expand All @@ -111,7 +125,7 @@ ci_scripts:
azure_name: nengo-dl-examples
azure_group: nengo-ci
remote_setup:
- conda install -y -c conda-forge cudatoolkit=11.2 cudnn=8.1
- conda install -y -c conda-forge cudatoolkit=11.3 cudnn=8.2
- template: deploy

codecov_yml: {}
Expand Down
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
SCRIPT="static"
-
env:
SCRIPT="test"
SCRIPT="remote-test"
-
env:
TF_VERSION="tensorflow==2.1.0"
Expand Down
11 changes: 11 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,26 @@ Release history
and ``B`` LMU matrices. This is mainly useful in combination with
``trainable_theta=True``, where setting ``discretizer="euler"`` may improve the
training speed (possibly at the cost of some accuracy). (`#41`_)
- The ``keras_lmu.LMUFFT`` layer can now use raw convolution internally (as opposed to
FFT-based convolution). The new ``conv_mode`` option exposes this. The new
``truncate_ir`` option allows truncating the impulse response when running with a
raw convolution mode, for efficiency. Whether FFT-based or raw convolution is faster
depends on the specific model, hardware, and amount of truncation. (`#42`_)

**Changed**

- The ``A`` and ``B`` matrices are now stored as constants instead of non-trainable
variables. This can improve the training/inference speed, but it means that saved
weights from previous versions will be incompatible. (`#41`_)
- Renamed ``keras_lmu.LMUFFT`` to ``keras_lmu.LMUFeedforward``. (`#42`_)

**Fixed**

- Fixed dropout support in TensorFlow 2.6. (`#42`_)

.. _#40: https://github.com/nengo/keras-lmu/pull/40
.. _#41: https://github.com/nengo/keras-lmu/pull/41
.. _#42: https://github.com/nengo/keras-lmu/pull/42

0.3.1 (November 16, 2020)
=========================
Expand Down
2 changes: 1 addition & 1 deletion keras_lmu/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""KerasLMU provides a package for deep learning with Legendre Memory Units."""

from .layers import LMU, LMUFFT, LMUCell
from .layers import LMU, LMUCell, LMUFeedforward
from .version import version as __version__

__copyright__ = "2019-2021, Applied Brain Research"
Expand Down
127 changes: 96 additions & 31 deletions keras_lmu/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

import numpy as np
import tensorflow as tf
from tensorflow.python.keras.layers.recurrent import DropoutRNNCellMixin
from packaging import version

if version.parse(tf.__version__) < version.parse("2.6.0rc0"):
from tensorflow.python.keras.layers.recurrent import DropoutRNNCellMixin
else:
from keras.layers.recurrent import DropoutRNNCellMixin


class LMUCell(DropoutRNNCellMixin, tf.keras.layers.Layer):
Expand Down Expand Up @@ -516,7 +521,7 @@ def theta(self):
if self.built:
return (
self.layer.theta
if isinstance(self.layer, LMUFFT)
if isinstance(self.layer, LMUFeedforward)
else self.layer.cell.theta
)

Expand All @@ -541,7 +546,7 @@ def build(self, input_shapes):
and input_shapes[1] is not None
and not self.trainable_theta
):
self.layer = LMUFFT(
self.layer = LMUFeedforward(
memory_d=self.memory_d,
order=self.order,
theta=self._init_theta,
Expand Down Expand Up @@ -620,15 +625,14 @@ def from_config(cls, config):
return super().from_config(config)


class LMUFFT(tf.keras.layers.Layer):
class LMUFeedforward(tf.keras.layers.Layer):
"""
Layer class for the FFT variant of the LMU.
Layer class for the feedforward variant of the LMU.

This class assumes no recurrent connections are desired in the memory component.

Produces the output of the delay system by evaluating the convolution of the input
sequence with the impulse response from the LMU cell. The convolution operation is
calculated using the fast Fourier transform (FFT).
sequence with the impulse response from the LMU cell.

Parameters
----------
Expand Down Expand Up @@ -665,6 +669,15 @@ class LMUFFT(tf.keras.layers.Layer):
return_sequences : bool, optional
If True, return the full output sequence. Otherwise, return just the last
output in the output sequence.
conv_mode : "fft" or "raw"
The method for performing the impulse response convolution. "fft" uses FFT
convolution (default). "raw" uses explicit convolution, which may be faster
for particular models on particular hardware.
truncate_ir : float
The portion of the impulse response to truncate when using "raw"
convolution (see ``conv_mode``). This is an approximate upper bound on the error
relative to the exact implementation. Smaller ``theta`` values result in more
truncated elements for a given value of ``truncate_ir``, improving efficiency.
"""

def __init__(
drasmuss marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -678,13 +691,18 @@ def __init__(
kernel_initializer="glorot_uniform",
dropout=0,
return_sequences=False,
conv_mode="fft",
truncate_ir=1e-4,
**kwargs,
):
super().__init__(**kwargs)

if input_to_hidden and hidden_cell is None:
raise ValueError("input_to_hidden must be False if hidden_cell is None")

if conv_mode not in ("fft", "raw"):
raise ValueError(f"Unrecognized conv mode '{conv_mode}'")

self.memory_d = memory_d
self.order = order
self.theta = theta
Expand All @@ -694,6 +712,8 @@ def __init__(
self.kernel_initializer = kernel_initializer
self.dropout = dropout
self.return_sequences = return_sequences
self.conv_mode = conv_mode.lower()
self.truncate_ir = truncate_ir

# create a standard LMUCell to generate the impulse response during `build`
self.delay_layer = tf.keras.layers.RNN(
Expand Down Expand Up @@ -733,19 +753,40 @@ def build(self, input_shape):
# TODO: we could dynamically run the impulse response for longer if
# needed using stateful=True
raise ValueError(
f"LMUFFT requires that the input shape's temporal axis be fully "
f"specified (got {seq_len})"
f"LMUFeedforward requires that the input shape's temporal axis be "
f"fully specified (got {seq_len})"
)

impulse = tf.reshape(tf.eye(seq_len, 1), (1, -1, 1))

self.impulse_response = tf.signal.rfft(
tf.squeeze(
tf.transpose(self.delay_layer(impulse, training=False)), axis=-1
),
fft_length=[2 * seq_len],
self.impulse_response = tf.squeeze(
self.delay_layer(impulse, training=False), axis=0
)

if self.conv_mode == "fft":
self.impulse_response = tf.signal.rfft(
tf.transpose(self.impulse_response),
fft_length=[2 * seq_len],
)
else:
if self.truncate_ir is not None:
assert self.impulse_response.shape == (seq_len, self.order)

cumsum = tf.math.cumsum(
tf.math.abs(self.impulse_response), axis=0, reverse=True
)
cumsum = cumsum / cumsum[0]
to_drop = tf.reduce_all(cumsum < self.truncate_ir, axis=-1)
if to_drop[-1]:
cutoff = tf.where(to_drop)[0, -1]
self.impulse_response = self.impulse_response[:cutoff]

self.impulse_response = tf.reshape(
self.impulse_response,
(self.impulse_response.shape[0], 1, 1, self.order),
)
self.impulse_response = self.impulse_response[::-1, :, :, :]

if self.kernel_initializer is not None:
self.kernel = self.add_weight(
name="kernel",
Expand Down Expand Up @@ -781,8 +822,6 @@ def call(self, inputs, training=None):
if training is None:
training = tf.keras.backend.learning_phase()

seq_len = tf.shape(inputs)[1]

if self.dropout:
inputs = tf.keras.layers.Dropout(
self.dropout, noise_shape=(inputs.shape[0], 1) + inputs.shape[2:]
Expand All @@ -795,21 +834,10 @@ def call(self, inputs, training=None):
else tf.matmul(inputs, self.kernel, name="input_encoder_mult")
)

# FFT requires shape (batch, memory_d, timesteps)
u = tf.transpose(u, perm=[0, 2, 1])

# Pad sequences to avoid circular convolution
# Perform the FFT
fft_input = tf.signal.rfft(u, fft_length=[2 * seq_len], name="input_pad")

# Elementwise product of FFT (with broadcasting)
result = tf.expand_dims(fft_input, axis=-2) * self.impulse_response

# Inverse FFT
m = tf.signal.irfft(result, fft_length=[2 * seq_len])[..., :seq_len]

m = tf.reshape(m, (-1, self.order * self.memory_d, seq_len))
m = tf.transpose(m, perm=[0, 2, 1])
if self.conv_mode == "fft":
m = self._fft_convolution(u)
elif self.conv_mode == "raw":
m = self._raw_convolution(u)

# apply hidden cell
h_in = tf.concat((m, inputs), axis=-1) if self.input_to_hidden else m
Expand All @@ -831,6 +859,41 @@ def call(self, inputs, training=None):

return h

def _fft_convolution(self, u):
    """Compute the delay-system memory via FFT-based convolution."""
    n_steps = tf.shape(u)[1]
    fft_len = [2 * n_steps]  # zero-pad so the circular convolution acts linearly

    # rfft operates over the last axis, so move time there:
    # (batch, timesteps, memory_d) -> (batch, memory_d, timesteps)
    u_freq = tf.signal.rfft(tf.transpose(u, perm=[0, 2, 1]), fft_length=fft_len)

    # convolution theorem: elementwise product of the spectra,
    # broadcasting the impulse response across the new `order` axis
    prod = tf.expand_dims(u_freq, axis=-2) * self.impulse_response

    # back to the time domain, keeping only the first n_steps samples
    conv = tf.signal.irfft(prod, fft_length=fft_len)[..., :n_steps]

    # flatten the (order, memory_d) axes and restore (batch, timesteps, features)
    conv = tf.reshape(conv, (-1, self.order * self.memory_d, n_steps))
    return tf.transpose(conv, perm=[0, 2, 1])

def _raw_convolution(self, u):
    """Compute the delay-system memory with an explicit convolution."""
    n_steps = tf.shape(u)[1]
    kernel_len = self.impulse_response.shape[0]

    # conv2d wants a trailing channel axis; add a singleton one
    expanded = tf.expand_dims(u, -1)

    # left-pad the time axis by kernel_len - 1 so the convolution is causal
    # (each output step depends only on current/past inputs)
    conv = tf.nn.conv2d(
        expanded,
        self.impulse_response,
        strides=1,
        data_format="NHWC",
        padding=[[0, 0], [kernel_len - 1, 0], [0, 0], [0, 0]],
    )
    return tf.reshape(conv, (-1, n_steps, self.memory_d * self.order))

def get_config(self):
"""Return config of layer (for serialization during model saving/loading)."""

Expand All @@ -846,6 +909,8 @@ def get_config(self):
kernel_initializer=self.kernel_initializer,
dropout=self.dropout,
return_sequences=self.return_sequences,
conv_mode=self.conv_mode,
arvoelke marked this conversation as resolved.
Show resolved Hide resolved
truncate_ir=self.truncate_ir,
)
)

Expand Down
18 changes: 18 additions & 0 deletions keras_lmu/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# pylint: disable=missing-docstring

import subprocess
import sys

# Determine whether TensorFlow can see a GPU. The probe runs in a
# separate interpreter because calling list_physical_devices() fixes
# certain process-level TensorFlow configuration options the first
# time it is invoked, and we don't want to do that in this process.
_gpu_probe = (
    "import sys; "
    "import tensorflow as tf; "
    "sys.exit(len(tf.config.list_physical_devices('GPU')) == 0)"
)
tf_gpu_installed = subprocess.call([sys.executable, "-c", _gpu_probe]) == 0