Commit 4549916
* vae debug and rand uniform
* Add more compact example "random_gradient" to test grad for rng.

Co-authored-by: Ethan Caballero <ethan.victor.caballero@gmail.com>
1 parent 7a2a44a
Showing 14 changed files with 495 additions and 13 deletions.
@@ -0,0 +1,292 @@
"""Example of an MLP in Myia. | ||
Myia is still a work in progress, and this example may change in the future. | ||
""" | ||

import time
from dataclasses import dataclass

import numpy
import torch
from numpy.random import RandomState
from torchvision import datasets, transforms

import myia.public_api as pub
from myia import ArithmeticData, myia, value_and_grad
from myia.api import to_device
from myia.debug import traceback  # noqa
from myia.operations import array_exp, array_pow, random_initialize


###########
# Options #
###########


dtype = "float32"

backend = "pytorch"
# backend = 'relay'  # Uncomment to use relay backend

device_type = "cpu"
# device_type = 'cuda'  # Uncomment to run on the gpu

backend_options_dict = {
    "pytorch": {"device": device_type},
    "relay": {"target": device_type, "device_id": 0},
}

backend_options = backend_options_dict[backend]


###############
# Hyperparams #
###############


lr = getattr(numpy, dtype)(0.01)


########
# Data #
########


# These helpers build random numpy arrays: generate_data produces synthetic
# input/target batches, and param / mlp_parameters are used below to
# initialize the model's weights.


def param(R, *size):
    """Generates a random array using the generator R."""
    return numpy.array(R.rand(*size) * 2 - 1, dtype=dtype)


def generate_data(n, batch_size, input_size, target_size, *, seed=87):
    """Generate inputs and targets.

    Generates n batches of samples of size input_size, matched with
    a single target.
    """
    R = RandomState(seed=seed)
    return [
        (param(R, batch_size, input_size), param(R, batch_size, target_size))
        for i in range(n)
    ]


def mlp_parameters(*layer_sizes, seed=90909):
    """Generates parameters for an MLP given a list of layer sizes."""
    R = RandomState(seed=seed)
    parameters = []
    for i, o in zip(layer_sizes[:-1], layer_sizes[1:]):
        W = param(R, i, o)
        b = param(R, 1, o)
        parameters.append((W, b))
    return parameters


#########
# Model #
#########


# We define a few generic layers (Linear, Tanh, Sequential) and assemble a
# VAE from Linear layers with relu and sigmoid activations.


@dataclass(frozen=True)
class Linear(ArithmeticData):
    """Linear layer."""

    W: "Weights array"
    b: "Biases vector"

    def apply(self, input):
        """Apply the layer."""
        return input @ self.W + self.b


@dataclass(frozen=True)
class Tanh(ArithmeticData):
    """Tanh layer."""

    def apply(self, input):
        """Apply the layer."""
        return numpy.tanh(input)


@dataclass(frozen=True)
class Sequential(ArithmeticData):
    """Sequential layer, applies all sub-layers in order."""

    layers: "Tuple of layers"

    def apply(self, x):
        """Apply the layer."""
        for layer in self.layers:
            x = layer.apply(x)
        return x


@dataclass(frozen=True)
class VAE(ArithmeticData):
    """Variational autoencoder (VAE) assembled from Linear layers."""

    fc1: "layer fc1"
    fc21: "layer fc21"
    fc22: "layer fc22"
    fc3: "layer fc3"
    fc4: "layer fc4"

    def encode(self, x):
        h1 = pub.relu(self.fc1.apply(x))
        return self.fc21.apply(h1), self.fc22.apply(h1)
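
    # Reparameterization trick: z = mu + eps * std, with std = exp(0.5 * logvar).
    # eps is drawn from uniform(-1, 1) here (this commit exercises the uniform
    # RNG and its gradient); a standard VAE samples eps from a standard normal.
    # The sample shape (2, 20) is hard-coded to match batch_size=2 and the
    # 20-dimensional latent space.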
    def reparameterize(self, mu, logvar, rstate):
        std = array_exp(0.5 * logvar)
        eps, rstate = pub.uniform(rstate, (2, 20), -1.0, 1.0)
        return mu + eps * std, rstate

    def decode(self, z):
        h3 = pub.relu(self.fc3.apply(z))
        return pub.sigmoid(self.fc4.apply(h3))

    def forward(self, x, rstate):
        mu, logvar = self.encode(pub.reshape(x, (-1, 784)))
        z, rstate = self.reparameterize(mu, logvar, rstate)
        return self.decode(z), mu, logvar, rstate

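
# Encoder: 784 -> 400 -> (20, 20) for mu and logvar; decoder: 20 -> 400 -> 784.
# mlp_parameters is called once per layer so each Linear gets its own (W, b) pair.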
params = (
    mlp_parameters(*(784, 400))[0],
    mlp_parameters(*(400, 20))[0],
    mlp_parameters(*(400, 20))[0],
    mlp_parameters(*(20, 400))[0],
    mlp_parameters(*(400, 784))[0],
)

model = VAE(
    Linear(params[0][0], params[0][1]),
    Linear(params[1][0], params[1][1]),
    Linear(params[2][0], params[2][1]),
    Linear(params[3][0], params[3][1]),
    Linear(params[4][0], params[4][1]),
)
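
# to_device moves the model's parameters onto the configured backend/device.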
model = to_device(model, backend, backend_options, broaden=False)


# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(recon_x, x, mu, logvar):
    BCE = pub.binary_cross_entropy(
        recon_x, pub.reshape(x, (-1, 784)), reduction="sum"
    )
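
    # KLD is the closed-form KL divergence between the approximate posterior
    # N(mu, sigma^2) and the prior N(0, I), summed over the latent dimensions
    # and the batch.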
    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * pub._sum(1 + logvar - array_pow(mu, 2) - array_exp(logvar))

    return BCE + KLD

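
# cost runs the forward pass and the loss; it also returns the updated random
# state so the caller can thread it into the next call.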
def cost(model, data, rstate):
    recon_batch, mu, logvar, _rstate = model.forward(data, rstate)
    loss = loss_function(recon_batch, data, mu, logvar)
    return loss.item(), _rstate


@myia(backend=backend, backend_options=backend_options, return_backend=True)
def step(model, data, lr, rstate):
    """Run one SGD step; return the loss, the updated model and the rstate.

    value_and_grad will return cost(model, data, rstate) and dcost(...)/dmodel.
    The 'model' argument can be omitted: by default the derivative wrt
    the first argument is returned.
    """
    (_cost, rstate), dmodel = value_and_grad(cost, "model")(
        model, data, rstate, dout=(1, 1)
    )
    return _cost, model - lr * dmodel, rstate


@myia(backend=backend, backend_options=backend_options, return_backend=True)
def step_eval(model, data, rstate):
    """Return the loss on a batch without updating the model."""
    return cost(model, data, rstate)


@myia(backend=backend, backend_options=backend_options, return_backend=True)
def step_init_seed():
    """Initialize and return the random state used for sampling."""
    return random_initialize(1)


lr = getattr(numpy, dtype)(0.01)
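
# The driver below trains the VAE for one epoch on MNIST and then reports the
# average loss on the held-out test split.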
if __name__ == "__main__":
    seed = 123
    cuda = False
    batch_size = 2
    epochs = 1

    torch.manual_seed(seed)

    device = torch.device("cuda" if cuda else "cpu")

    kwargs = {"num_workers": 1, "pin_memory": True} if cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.ToTensor(),
        ),
        batch_size=batch_size,
        shuffle=True,
        **kwargs,
    )
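
    # Initialize the backend RNG once, then thread rand_state through every
    # call to step() / step_eval() below.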
    rand_state = step_init_seed()

    for _ in range(epochs):
        costs = []
        t0 = time.time()
        for i, (data, _) in enumerate(train_loader):
            print("i", i + 1, "/", len(train_loader))
            _cost, model, rand_state = step(
                model, data.reshape((batch_size, 784)).numpy(), lr, rand_state
            )
            costs.append(_cost)
        costs = [float(c.from_device()) for c in costs]
        c = sum(costs) / len(costs)
        t = time.time() - t0
        print(f"Cost: {c:15.10f}\tTime: {t:15.10f}")
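
    # Evaluation: the same loop over the MNIST test split, using step_eval so
    # the model parameters are not updated.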
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST("../data", train=False, transform=transforms.ToTensor()),
        batch_size=batch_size,
        shuffle=True,
        **kwargs,
    )

    costs = []
    t0 = time.time()
    for i, (data, _) in enumerate(test_loader):
        _cost, rand_state = step_eval(
            model, data.reshape((batch_size, 784)).numpy(), rand_state
        )
        costs.append(_cost)
    costs = [float(c.from_device()) for c in costs]
    c = sum(costs) / len(costs)
    t = time.time() - t0
    print(f"Cost: {c:15.10f}\tTime: {t:15.10f}")