Merge pull request #1009 from dwf/linear_interface_refactor
Refactor common stuff from Linear and Convolutional
dwf committed Mar 1, 2016
2 parents 47285eb + 687c90f commit 3f02718
Showing 8 changed files with 93 additions and 52 deletions.
7 changes: 4 additions & 3 deletions blocks/bricks/__init__.py
@@ -2,7 +2,8 @@
 from .base import application, Brick, lazy
 from .bn import (BatchNormalization, SpatialBatchNormalization,
                  BatchNormalizedMLP)
-from .interfaces import Activation, Feedforward, Initializable, Random
+from .interfaces import (Activation, Feedforward, Initializable, LinearLike,
+                         Random)
 from .simple import (Linear, Bias, Maxout, LinearMaxout, Identity, Tanh,
                      Logistic, Softplus, Rectifier, Softmax,
                      NDimensionalSoftmax)
@@ -11,8 +12,8 @@

 __all__ = ('application', 'Brick', 'lazy', 'BatchNormalization',
            'SpatialBatchNormalization', 'BatchNormalizedMLP',
-           'Activation', 'Feedforward', 'Initializable', 'Random',
-           'Linear', 'Bias', 'Maxout', 'LinearMaxout', 'Identity',
+           'Activation', 'Feedforward', 'Initializable', 'LinearLike',
+           'Random', 'Linear', 'Bias', 'Maxout', 'LinearMaxout', 'Identity',
            'Tanh', 'Logistic', 'Softplus', 'Rectifier', 'Softmax',
            'NDimensionalSoftmax', 'Sequence', 'FeedforwardSequence',
            'MLP', 'WithExtraDims')
24 changes: 6 additions & 18 deletions blocks/bricks/conv.py
@@ -3,13 +3,14 @@
     get_conv_output_shape)
 from theano.tensor.signal.pool import pool_2d, Pool

-from blocks.bricks import Initializable, Feedforward, Sequence, Activation
+from blocks.bricks import (Initializable, Feedforward, Sequence, Activation,
+                           LinearLike)
 from blocks.bricks.base import application, Brick, lazy
 from blocks.roles import add_role, FILTER, BIAS
 from blocks.utils import shared_floatx_nans


-class Convolutional(Initializable):
+class Convolutional(LinearLike):
     """Performs a 2D convolution.

     Parameters
@@ -106,14 +107,6 @@ def _allocate(self):
             self.parameters.append(b)
             self.add_auxiliary_variable(b.norm(2), name='b_norm')

-    def _initialize(self):
-        if self.use_bias:
-            W, b = self.parameters
-            self.biases_init.initialize(b, self.rng)
-        else:
-            W, = self.parameters
-        self.weights_init.initialize(W, self.rng)
-
     @application(inputs=['input_'], outputs=['output'])
     def apply(self, input_):
         """Perform the convolution.
@@ -136,29 +129,24 @@ def apply(self, input_):
             for 'valid' it is ``image_size - filter_size + 1`` while
             for 'full' it is ``image_size + filter_size - 1``.

         """
-        if self.use_bias:
-            W, b = self.parameters
-        else:
-            W, = self.parameters
-
         if self.image_size == (None, None):
             input_shape = None
         else:
             input_shape = (self.batch_size, self.num_channels)
             input_shape += self.image_size

         output = self.conv2d_impl(
-            input_, W,
+            input_, self.W,
             input_shape=input_shape,
             subsample=self.step,
             border_mode=self.border_mode,
             filter_shape=((self.num_filters, self.num_channels) +
                           self.filter_size))
         if self.use_bias:
             if self.tied_biases:
-                output += b.dimshuffle('x', 0, 'x', 'x')
+                output += self.b.dimshuffle('x', 0, 'x', 'x')
             else:
-                output += b.dimshuffle('x', 0, 1, 2)
+                output += self.b.dimshuffle('x', 0, 1, 2)
         return output

     def get_dim(self, name):
33 changes: 33 additions & 0 deletions blocks/bricks/interfaces.py
@@ -167,6 +167,39 @@ def _push_initialization_config(self):
                     child.biases_init = self.biases_init


+class LinearLike(Initializable):
+    """Initializable subclass with logic for :class:`Linear`-like classes.
+
+    Notes
+    -----
+    Provides `W` and `b` properties that can be overridden in subclasses
+    to implement pre-application transformations on the weights and
+    biases. Application methods should refer to ``self.W`` and ``self.b``
+    rather than accessing the parameters list directly.
+
+    This assumes a layout of the parameters list with the weights coming
+    first and biases (if ``use_bias`` is True) coming second.
+
+    """
+    @property
+    def W(self):
+        return self.parameters[0]
+
+    @property
+    def b(self):
+        if self.use_bias:
+            return self.parameters[1]
+        else:
+            raise AttributeError('use_bias is False')
+
+    def _initialize(self):
+        # Use self.parameters[] references in case W and b are overridden
+        # to return non-shared-variables.
+        if self.use_bias:
+            self.biases_init.initialize(self.parameters[1], self.rng)
+        self.weights_init.initialize(self.parameters[0], self.rng)
+
+
 class Random(Brick):
     """A mixin class for Bricks which need Theano RNGs.
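A note on the new interface: because application methods now read ``self.W`` and ``self.b`` while ``_initialize`` keeps writing to ``self.parameters`` directly, a subclass can transform its weights before application simply by overriding the ``W`` property. A minimal sketch of that idea, using the initialization schemes seen elsewhere in this diff (the ``ScaledLinear`` brick and its ``scale`` constant are hypothetical, for illustration only, not part of this change):

from theano import tensor

from blocks.bricks import Linear
from blocks.initialization import Constant, IsotropicGaussian


class ScaledLinear(Linear):
    # Hypothetical LinearLike subclass: Linear.apply reads self.W, so
    # overriding the W property changes the weights that get applied,
    # while _initialize still writes to self.parameters[0] as before.
    scale = 0.5  # assumed constant, purely for illustration

    @property
    def W(self):
        return super(ScaledLinear, self).W * self.scale


brick = ScaledLinear(input_dim=3, output_dim=4,
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0))
brick.initialize()                   # initializes the underlying shared variable
y = brick.apply(tensor.matrix('x'))  # applies the scaled weights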
17 changes: 13 additions & 4 deletions blocks/bricks/sequences.py
@@ -1,4 +1,5 @@
 """Bricks that compose together other bricks in linear sequences."""
+import copy
 from toolz import interleave
 from picklable_itertools.extras import equizip

@@ -86,6 +87,10 @@ class MLP(Sequence, Initializable, Feedforward):
     dims : list of ints
         A list of input dimensions, as well as the output dimension of the
         last layer. Required for :meth:`~.Brick.allocate`.
+    prototype : :class:`.Brick`, optional
+        The transformation prototype. A copy will be created for every
+        activation. If not provided, an instance of :class:`~simple.Linear`
+        will be used.

     Notes
     -----
@@ -107,11 +112,15 @@ class MLP(Sequence, Initializable, Feedforward):
     """
     @lazy(allocation=['dims'])
-    def __init__(self, activations, dims, **kwargs):
+    def __init__(self, activations, dims, prototype=None, **kwargs):
         self.activations = activations
-
-        self.linear_transformations = [Linear(name='linear_{}'.format(i))
-                                       for i in range(len(activations))]
+        self.prototype = Linear() if prototype is None else prototype
+        self.linear_transformations = []
+        for i in range(len(activations)):
+            linear = copy.deepcopy(self.prototype)
+            name = self.prototype.__class__.__name__.lower()
+            linear.name = '{}_{}'.format(name, i)
+            self.linear_transformations.append(linear)
         # Interleave the transformations and activations
         application_methods = []
         for entity in interleave([self.linear_transformations, activations]):
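For reference, a short sketch of how the new ``prototype`` argument might be used, following the copy-and-rename logic added above (the ``NoisyLinear`` class is a made-up stand-in for any custom ``Linear`` subclass):

from blocks.bricks import MLP, Linear, Tanh
from blocks.initialization import Constant, IsotropicGaussian


class NoisyLinear(Linear):
    # Stand-in for a user-defined transformation brick.
    pass


# Each layer is a deep copy of the prototype, named from
# prototype.__class__.__name__.lower() plus the layer index.
mlp = MLP(activations=[Tanh(), Tanh()], dims=[8, 16, 4],
          prototype=NoisyLinear(),
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
assert [lt.name for lt in mlp.linear_transformations] == ['noisylinear_0',
                                                          'noisylinear_1']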
28 changes: 4 additions & 24 deletions blocks/bricks/simple.py
@@ -5,7 +5,7 @@

 from blocks.bricks.base import application, Brick, lazy
 from blocks.bricks.interfaces import Activation, Feedforward, Initializable
-from blocks.bricks.interfaces import Random  # noqa
+from blocks.bricks.interfaces import LinearLike, Random  # noqa

 from blocks.bricks.wrappers import WithExtraDims
 from blocks.roles import add_role, WEIGHT, BIAS
@@ -14,7 +14,7 @@
 logger = logging.getLogger(__name__)


-class Linear(Initializable, Feedforward):
+class Linear(LinearLike, Feedforward):
     r"""A linear transformation with optional bias.

     Brick which applies a linear (affine) transformation by multiplying
@@ -44,14 +44,6 @@ def __init__(self, input_dim, output_dim, **kwargs):
         self.input_dim = input_dim
         self.output_dim = output_dim

-    @property
-    def W(self):
-        return self.parameters[0]
-
-    @property
-    def b(self):
-        return self.parameters[1]
-
     def _allocate(self):
         W = shared_floatx_nans((self.input_dim, self.output_dim), name='W')
         add_role(W, WEIGHT)
@@ -63,14 +55,6 @@ def _allocate(self):
             self.parameters.append(b)
             self.add_auxiliary_variable(b.norm(2), name='b_norm')

-    def _initialize(self):
-        if self.use_bias:
-            W, b = self.parameters
-            self.biases_init.initialize(b, self.rng)
-        else:
-            W, = self.parameters
-        self.weights_init.initialize(W, self.rng)
-
     @application(inputs=['input_'], outputs=['output'])
     def apply(self, input_):
         """Apply the linear transformation.
@@ -86,13 +70,9 @@ def apply(self, input_):
             The transformed input plus optional bias

         """
-        if self.use_bias:
-            W, b = self.parameters
-        else:
-            W, = self.parameters
-        output = tensor.dot(input_, W)
+        output = tensor.dot(input_, self.W)
         if self.use_bias:
-            output += b
+            output += self.b
         return output

     def get_dim(self, name):
6 changes: 3 additions & 3 deletions blocks/serialization.py
@@ -321,7 +321,7 @@ def add_to_dump(object_, file_, name, parameters=None, use_cpickle=False,

     """
     if name in ['_pkl', '_parameters']:
-        raise ValueError("_pkl and _parameters are reserved names and can't" \
+        raise ValueError("_pkl and _parameters are reserved names and can't"
                          " be used as name for your object.")

     external_parameters = {}
@@ -337,7 +337,7 @@ def add_to_dump(object_, file_, name, parameters=None, use_cpickle=False,
         file_.seek(0)  # To be able to read what is in the tar file already.
         with closing(tarfile.TarFile(fileobj=file_, mode='r')) as tar_file:
             if '_parameters' not in tar_file.getnames():
-                raise ValueError("There is no parameters in the archive, so" \
+                raise ValueError("There is no parameters in the archive, so"
                                  " you can't use the argument parameters.")
             else:
                 parameters = numpy.load(
@@ -346,7 +346,7 @@ def add_to_dump(object_, file_, name, parameters=None, use_cpickle=False,
                 s2 = [_unmangle_parameter_name(x)[1] for x in
                       external_parameters.values()]
                 if not s1.issuperset(s2):
-                    raise ValueError('The set of parameters is different' \
+                    raise ValueError('The set of parameters is different'
                                      ' from the one in the archive.')

     if use_cpickle:
10 changes: 10 additions & 0 deletions tests/bricks/test_bricks.py
@@ -362,6 +362,16 @@ def test_mlp():
     assert mlp.rng == mlp.linear_transformations[0].rng


+def test_mlp_prototype_argument():
+    class MyLinear(Linear):
+        pass
+    mlp = MLP(activations=[Tanh(), Tanh(), None],
+              dims=[4, 5, 6, 7], prototype=MyLinear())
+    assert all(isinstance(lt, MyLinear) for lt in mlp.linear_transformations)
+    assert all(lt.name == 'mylinear_{}'.format(i)
+               for i, lt in enumerate(mlp.linear_transformations))
+
+
 def test_mlp_apply():
     x = tensor.matrix()
     x_val = numpy.random.rand(2, 16).astype(theano.config.floatX)
20 changes: 20 additions & 0 deletions tests/bricks/test_interfaces.py
@@ -0,0 +1,20 @@
+import numpy
+import theano
+from theano import tensor
+from blocks.bricks import Linear
+from blocks.initialization import Constant, IsotropicGaussian
+
+
+def test_linearlike_subclass_initialize_works_overridden_w():
+    class NotQuiteLinear(Linear):
+        @property
+        def W(self):
+            W = super(NotQuiteLinear, self).W
+            return W / tensor.sqrt((W ** 2).sum(axis=0))
+
+    brick = NotQuiteLinear(5, 10, weights_init=IsotropicGaussian(0.02),
+                           biases_init=Constant(1))
+    brick.initialize()
+    assert not numpy.isnan(brick.parameters[0].get_value()).any()
+    numpy.testing.assert_allclose((brick.W ** 2).sum(axis=0).eval(), 1,
+                                  rtol=1e-6)
