Merge pull request #1079 from dwf/gradientdescent_refactor
GradientDescent: refactoring, enhancements, bug fixes
dwf committed May 9, 2016
2 parents bf89d49 + 4e7f384 commit 30ab153
Showing 6 changed files with 334 additions and 138 deletions.
316 changes: 188 additions & 128 deletions blocks/algorithms/__init__.py
@@ -56,66 +56,80 @@ def process_batch(self, batch):
pass


class DifferentiableCostMinimizer(TrainingAlgorithm):
"""Minimizes a differentiable cost given as a Theano expression.
variable_mismatch_error = """
Blocks tried to match the sources ({sources}) of the training dataset to \
the names of the Theano variables ({variables}), but failed to do so. \
If you want to train on a subset of the sources that your dataset provides, \
pass the `sources` keyword argument to its constructor, use the \
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \
or on_unused_sources='ignore' to the GradientDescent algorithm."""

source_missing_error = """
Blocks didn't find all the sources ({sources}) of the training dataset \
that match the names of the Theano variables ({variables})."""


determinism_error = """Cannot infer parameter list in a fixed order.
Because dictionaries are unordered (and Python uses randomized hashing, \
which can change the iteration order over the same dictionary from one \
interpreter session to the next), Blocks cannot infer the parameters list \
from a plain dictionary of gradients in an order that is reproducible \
across interpreter sessions; please either specify the parameters \
explicitly or pass gradients as an OrderedDict (though exercise care in \
constructing that OrderedDict, as an OrderedDict created by iterating \
over an unordered iterable (e.g. a dict) will still have an arbitrary \
and unpredictable order that could cause problems with \
reproducibility)."""


Very often the goal of training is to minimize the expected value of a
Theano expression. Batch processing in such cases typically consists of
running one (or a few) Theano functions.
:class:`DifferentiableCostMinimizer` is the base class for such
algorithms.
class UpdatesAlgorithm(TrainingAlgorithm):
"""Base class for algorithms that use Theano functions with updates.
Parameters
----------
cost : :class:`~tensor.TensorVariable`
The objective to be minimized.
parameters : list of :class:`~tensor.TensorSharedVariable`
The parameters to be tuned.
updates : list of tuples or :class:`~collections.OrderedDict`
The updates that should be performed.
theano_func_kwargs : dict, optional
A passthrough to `theano.function` for additional arguments.
Useful for passing `profile` or `mode` arguments to the theano
function that will be compiled for the algorithm.
on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
Controls behavior when not all sources in a batch are used
(i.e. there is no variable with a matching name in the inputs
of the computational graph of the updates).
Attributes
----------
updates : list of :class:`~tensor.TensorSharedVariable` updates
Updates to be done for every batch. It is required that the
updates are done using the old values of optimized parameters.
cost : :class:`~tensor.TensorVariable`
The objective to be minimized.
parameters : list of :class:`~tensor.TensorSharedVariable`
The parameters to be tuned.
Notes
-----
Changing the `updates` attribute or calling `add_updates` after
the `initialize` method is called will have no effect.
.. todo::
Some shared variables are not parameters (e.g. those created by
random streams).
.. todo::
Due to a rather premature status of the :class:`ComputationGraph`
class, parameters used only inside scans are currently not
fetched.
"""
def __init__(self, cost, parameters):
self.cost = cost
self.parameters = parameters
self._cost_computation_graph = ComputationGraph(self.cost)
self._updates = []

@property
def inputs(self):
"""Return inputs of the cost computation graph.
Returns
-------
inputs : list of :class:`~tensor.TensorVariable`
Inputs to this graph.
def __init__(self, updates=None, theano_func_kwargs=None,
on_unused_sources='raise', **kwargs):
self.updates = [] if updates is None else updates
self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
is not None else dict())
self.on_unused_sources = on_unused_sources
super(UpdatesAlgorithm, self).__init__(**kwargs)

"""
return self._cost_computation_graph.inputs
def initialize(self):
logger.info("Initializing the training algorithm")
update_values = [new_value for _, new_value in self.updates]
logger.debug("Inferring graph inputs...")
self.inputs = ComputationGraph(update_values).inputs
logger.debug("Compiling training function...")
self._function = theano.function(
self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
logger.info("The training algorithm is initialized")

@property
def updates(self):
@@ -142,23 +156,40 @@ def add_updates(self, updates):
raise ValueError
self.updates.extend(updates)

def _validate_source_names(self, batch):
in_names = [v.name for v in self.inputs]

variable_mismatch_error = """
Blocks tried to match the sources ({sources}) of the training dataset to \
the names of the Theano variables ({variables}), but failed to do so. \
If you want to train on a subset of the sources that your dataset provides, \
pass the `sources` keyword argument to its constructor. Or pass \
on_unused_sources='warn' or on_unused_sources='ignore' to \
the GradientDescent algorithm."""

source_missing_error = """
if not set(in_names).issubset(set(batch.keys())):
raise ValueError("Didn't find all sources: " +
source_missing_error.format(
sources=batch.keys(),
variables=in_names))
if not set(batch.keys()).issubset(set(in_names)):
if self.on_unused_sources == 'ignore':
pass
elif self.on_unused_sources == 'warn':
if not hasattr(self, '_unused_source_warned'):
logger.warn(variable_mismatch_error.format(
sources=batch.keys(),
variables=in_names))
self._unused_source_warned = True
elif self.on_unused_sources == 'raise':
raise ValueError(
"mismatch of variable names and data sources" +
variable_mismatch_error.format(
sources=batch.keys(),
variables=in_names))
else:
raise ValueError("Wrong value of on_unused_sources: {}."
.format(self.on_unused_sources))

Blocks didn't find all the sources ({sources}) of the training dataset \
that match the names of the Theano variables ({variables})."""
def process_batch(self, batch):
self._validate_source_names(batch)
ordered_batch = [batch[v.name] for v in self.inputs]
self._function(*ordered_batch)
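The two methods above match batch entries to graph inputs purely by variable name. Continuing the UpdatesAlgorithm sketch from earlier (source names again illustrative):

# The extra 'targets' source is not an input of the update graph; with
# on_unused_sources='warn' it is reported once instead of raising.
algorithm.on_unused_sources = 'warn'
algorithm.process_batch({'features': numpy.ones(3, dtype='float64'),
                         'targets': numpy.zeros(3, dtype='float64')})

# A source the graph needs but the batch lacks always raises.
try:
    algorithm.process_batch({'targets': numpy.zeros(3, dtype='float64')})
except ValueError as error:
    print(error)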


class GradientDescent(DifferentiableCostMinimizer):
class GradientDescent(UpdatesAlgorithm):
"""A base class for all gradient descent algorithms.
By "gradient descent" we mean a training algorithm of the following
@@ -177,14 +208,20 @@ class GradientDescent(DifferentiableCostMinimizer):
Parameters
----------
cost : :class:`~tensor.TensorVariable`, optional
The objective to be minimized. Unused if `gradients` is specified.
parameters : list of :class:`~tensor.TensorSharedVariable`, optional
The parameters to be tuned. If not provided, inferred from the
keys of `gradients` (in which case `gradients` *must* be an
`OrderedDict`).
step_rule : instance of :class:`StepRule`, optional
An object encapsulating most of the algorithm's logic. Its
`compute_steps` method is called to get Theano expressions for
steps. Note that the step rule might have state, e.g. to
remember a weighted sum of gradients from previous steps, as is
done in gradient descent with momentum. If ``None``, an instance of
:class:`Scale` is created.
gradients : dict, optional
gradients : OrderedDict, optional
A dictionary mapping a parameter to an expression for the cost's
gradient with respect to the parameter. If ``None``, the gradients
are taken automatically using :func:`theano.gradient.grad`.
@@ -198,100 +235,123 @@ class GradientDescent(DifferentiableCostMinimizer):
A passthrough to `theano.tensor.grad`'s `consider_constant`
argument. A list of expressions through which gradients will not
be backpropagated. Only makes sense when `gradients` is `None`.
on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
Controls behavior when not all sources are used.
theano_func_kwargs : dict, optional
A passthrough to `theano.function` for additional arguments.
Useful for passing `profile` or `mode` arguments to the theano
function that will be compiled for the algorithm.
Attributes
----------
gradients : dict
gradients : OrderedDict
The gradient dictionary.
step_rule : instance of :class:`StepRule`
The step rule.
"""
def __init__(self, step_rule=None, gradients=None, known_grads=None,
consider_constant=None, on_unused_sources='raise',
theano_func_kwargs=None, **kwargs):
if gradients:
kwargs.setdefault("parameters", gradients.keys())
super(GradientDescent, self).__init__(**kwargs)
Notes
-----
Changing the `updates` attribute or calling `add_updates` after
the `initialize` method is called will have no effect.
If a cost and parameters are provided, gradients are taken immediately
upon construction, and changes to these attributes after construction
will have no effect.
`gradients` must be an `OrderedDict` if `parameters` is unspecified
because ordinary dictionaries have an unpredictable iteration
order due to hash randomization (which is enabled by default since
versions 2.7.3 and 3.2.3 of Python). This source of variability,
when combined with Theano's heuristic graph optimizations, can cause
serious reproducibility issues.
"""
def __init__(self, cost=None, parameters=None, step_rule=None,
gradients=None, known_grads=None, consider_constant=None,
**kwargs):
# Set initial values for cost, parameters, gradients.
self.cost = cost
self.parameters = parameters
self.gradients = gradients

# If we don't have gradients, we'll need to infer them from the
# cost and the parameters, both of which must not be None.
if not self.gradients:
logger.info("Taking the cost gradient")
self.gradients = dict(
equizip(self.parameters, tensor.grad(
self.cost, self.parameters,
known_grads=known_grads,
consider_constant=consider_constant)))
logger.info("The cost gradient computation graph is built")
self.gradients = self._compute_gradients(known_grads,
consider_constant)
else:
if cost is not None:
logger.warning(('{}: gradients already specified directly; '
'cost is unused.'
.format(self.__class__.__name__)))
if self.parameters is None and isinstance(gradients, OrderedDict):
# If the dictionary is ordered, it's safe to use the keys
# as they have a deterministic order.
self.parameters = list(self.gradients.keys())
elif self.parameters is not None:
# If parameters and gradients.keys() don't match we can
# try to recover if gradients is ordered.
if set(self.parameters) != set(self.gradients.keys()):
logger.warn("Specified parameters list does not match "
"keys in provided gradient dictionary; "
"using parameters inferred from gradients")
if not isinstance(self.gradients, OrderedDict):
raise ValueError(determinism_error)
self.parameters = list(self.gradients.keys())
else:
# self.parameters is None, and gradients isn't
# an OrderedDict. We can't do anything safe.
raise ValueError(determinism_error)
if known_grads:
raise ValueError("known_grads has no effect when gradients "
"are passed in")
if consider_constant is not None:
raise ValueError("consider_constant has no effect when "
"gradients are passed in")
self.step_rule = step_rule if step_rule else Scale()

self.total_gradient_norm = l2_norm(
self.gradients.values()).copy(name="total_gradient_norm")
# The order in which the different gradient terms appear
# here matters, as floating point addition is not associative (and
# Theano's graph optimizations are not order-independent).
# This is why we do not use .values().
gradient_values = [self.gradients[p] for p in self.parameters]
self.total_gradient_norm = (l2_norm(gradient_values)
.copy(name="total_gradient_norm"))

self.step_rule = step_rule if step_rule else Scale()
logger.debug("Computing parameter steps...")
self.steps, self.step_rule_updates = (
self.step_rule.compute_steps(self.gradients))
self.total_step_norm = l2_norm(
self.steps.values()).copy(name="total_step_norm")
self.on_unused_sources = on_unused_sources
self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
is not None else dict())

def initialize(self):
logger.info("Initializing the training algorithm")
all_updates = self.updates
# Note: the gradients are computed in the same order in which
# the parameters were given. Keep it like that to ensure
# reproducibility.
for parameter in self.parameters:
all_updates.append((parameter, parameter - self.steps[parameter]))
all_updates += self.step_rule_updates
self._function = theano.function(
self.inputs, [], updates=all_updates, **self.theano_func_kwargs)
logger.info("The training algorithm is initialized")

def _validate_source_names(self, batch):
in_names = [v.name for v in self.inputs]

if not set(in_names).issubset(set(batch.keys())):
raise ValueError("Didn't find all sources: " +
source_missing_error.format(
sources=batch.keys(),
variables=in_names))
if not set(batch.keys()).issubset(set(in_names)):
if self.on_unused_sources == 'ignore':
pass
elif self.on_unused_sources == 'warn':
if not hasattr(self, '_unused_source_warned'):
logger.warn(variable_mismatch_error.format(
sources=batch.keys(),
variables=in_names))
self._unused_source_warned = True
elif self.on_unused_sources == 'raise':
raise ValueError(
"mismatch of variable names and data sources" +
variable_mismatch_error.format(
sources=batch.keys(),
variables=in_names))
else:
raise ValueError("Wrong value of on_unused_sources: {}."
.format(self.on_unused_sources))
# Same as gradient_values above: the order may influence a
# bunch of things, so enforce a consistent one (don't use
# .values()).
step_values = [self.steps[p] for p in self.parameters]
self.total_step_norm = (l2_norm(step_values)
.copy(name="total_step_norm"))

# Once again, iteration over gradients may not be deterministically
# ordered if it is not an OrderedDict. We add the updates here in
# the order specified in self.parameters. Keep it this way to
# maintain reproducibility.
kwargs.setdefault('updates', []).extend(
itertools.chain(((parameter, parameter - self.steps[parameter])
for parameter in self.parameters),
self.step_rule_updates)
)
super(GradientDescent, self).__init__(**kwargs)

def process_batch(self, batch):
self._validate_source_names(batch)
ordered_batch = [batch[v.name] for v in self.inputs]
self._function(*ordered_batch)
def _compute_gradients(self, known_grads, consider_constant):
if self.cost is None:
raise ValueError("can't infer gradients; no cost specified")
elif self.parameters is None or len(self.parameters) == 0:
raise ValueError("can't infer gradients; no parameters "
"specified")
# While this strictly speaking could be a dict and not an
# OrderedDict (because we iterate over it in the order of
# self.parameters), this guards a little bit against
# nondeterminism introduced by future refactoring.
logger.info("Taking the cost gradient")
gradients = OrderedDict(
equizip(self.parameters, tensor.grad(
self.cost, self.parameters,
known_grads=known_grads,
consider_constant=consider_constant)))
logger.info("The cost gradient computation graph is built")
return gradients
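For reference, a self-contained sketch of the same computation done by hand, including the `consider_constant` passthrough; the `scale` expression and the other names are illustrative:

from collections import OrderedDict

import numpy
import theano
from theano import tensor

W = theano.shared(numpy.ones(3, dtype=theano.config.floatX), name='W')
scale = (W ** 2).sum()   # treated as a constant during backpropagation
cost = (scale * W).sum()

# Same structure as the return value above: an OrderedDict mapping each
# parameter to its gradient, in the order the parameters were given.
gradients = OrderedDict(
    zip([W], tensor.grad(cost, [W], consider_constant=[scale])))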


@add_metaclass(ABCMeta)
