More advances towards a VAE.

muammar committed Oct 15, 2019
1 parent bff44b6 commit 1c258a2
Showing 4 changed files with 165 additions and 56 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -6,7 +6,7 @@
[![License](https://img.shields.io/badge/license-BSD-green)](https://github.com/muammar/ml4chem/blob/master/LICENSE)
[![Downloads](https://img.shields.io/github/downloads/muammar/ml4chem/total.svg?maxAge=2592000?style=flat-square)](https://github.com/muammar/ml4chem/releases)
[![GitHub release](https://img.shields.io/github/release/muammar/ml4chem.svg)](https://github.com/muammar/ml4chem/releases/latest)

--------------------------------------------------------------------------------

This package is written in Python 3, and intends to offer modern and rich
features to perform machine learning workflows for chemical physics.
@@ -29,14 +29,14 @@ A list of features and methods are shown below.
- [Messagepack serialization](https://msgpack.org/index.html).


## Citing
### Citing

If you find this software useful, please use this DOI to cite it:

[![DOI](https://zenodo.org/badge/161847010.svg)](https://zenodo.org/badge/latestdoi/161847010)


## Documentation
### Documentation

You can read the documentation at [https://ml4chem.dev](https://ml4chem.dev)
where you can get started. It is arranged in a way that you can go through
@@ -46,12 +46,12 @@ index](https://ml4chem.dev/genindex.html) to get more information about
different classes and functions of ML4Chem.


## Dask dashboard
### Dask dashboard
![](https://raw.githubusercontent.com/muammar/ml4chem/master/docs/source/_static/dask_dashboard.png)

Note: This package is under development.

## Copyright
### Copyright
ML4Chem: Machine Learning for Chemistry and Materials (ML4Chem) Copyright (c) 2019, The
Regents of the University of California, through Lawrence Berkeley National
Laboratory (subject to receipt of any required approvals from the U.S.
1 change: 1 addition & 0 deletions ml4chem/data/visualization.py
@@ -4,6 +4,7 @@
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from ml4chem.data.serialization import load
import time


def parity(predictions, true, scores=False, filename=None, **kwargs):
184 changes: 145 additions & 39 deletions ml4chem/models/autoencoders.py
@@ -97,8 +97,8 @@ def prepare_model(
logger.info("==============")
logger.info("Model name: {}.".format(self.name()))
logger.info(
"Structure of Autoencoder: {}".format(
"(input, " + str(self.hiddenlayers)[1:-1] + ", output)"
"Structure of {}: {}".format(
self.name(), "(input, " + str(self.hiddenlayers)[1:-1] + ", output)"
)
)

@@ -133,21 +133,24 @@ def prepare_model(

# Stacking up the layers.
if self.name() == "VAE":
keys = ["pre", "mu", "logvar"]
keys = ["h", "mu", "logvar"]
mu = []
logvar = []

index = -3
for i in range(2):
for _ in range(2):
index += 1
mu.append(encoder.pop(index))
if index == -2:
mu.append(encoder.pop(index))
else:
encoder.pop(index)

pre = torch.nn.Sequential(*encoder)
h = torch.nn.Sequential(*encoder)
logvar = torch.nn.Linear(inp_dim, out_dim)
logvar = torch.nn.Sequential(*[logvar, activation[self.activation]()])
logvar = torch.nn.Sequential(*[logvar])
mu = torch.nn.Sequential(*mu)

values = [pre, mu, logvar]
values = [h, mu, logvar]
encoder = torch.nn.ModuleDict(list(map(list, zip(keys, values))))
else:
encoder = torch.nn.Sequential(*encoder)
@@ -158,20 +161,31 @@
Decoder
"""
for inp_dim, out_dim in zip(decoder_layers, decoder_layers[1:]):
_decoder = torch.nn.Linear(inp_dim, out_dim)
decoder.append(_decoder)
decoder.append(torch.nn.Linear(inp_dim, out_dim))
decoder.append(activation[self.activation]())

# The last decoder layer for symbol
inp_dim = out_dim
_decoder = torch.nn.Linear(inp_dim, output_dimension)
decoder.append(_decoder)

"""
if self.name() == "VAE":
h = torch.nn.Sequential(*decoder)
mu = torch.nn.Linear(inp_dim, output_dimension)
mu = torch.nn.Sequential(*[mu])
logvar = torch.nn.Linear(inp_dim, output_dimension)
logvar = torch.nn.Sequential(*[logvar])
values = [h, mu, logvar]
decoder = torch.nn.ModuleDict(list(map(list, zip(keys, values))))
else:
"""
# The last decoder layer for symbol
decoder.append(torch.nn.Linear(inp_dim, output_dimension))
# According to this video https://youtu.be/xTU79Zs4XKY?t=416
# real numbered inputs need no activation function in the output
# layer decoder.append(activation[self.activation]())

# Stacking up the layers.
decoder = torch.nn.Sequential(*decoder)

symbol_decoder_pair.append([symbol, decoder])

self.encoders = torch.nn.ModuleDict(symbol_encoder_pair)
@@ -180,7 +194,7 @@ def prepare_model(
logger.info(self.decoders)

if purpose == "training":
# Iterate over all modules and just intialize those that are
# Iterate over all modules and just initialize those that are
# a linear layer.
logger.warning(
"Initialization of weights with Xavier Uniform by " "default."
@@ -331,6 +345,34 @@ def get_latent_space(self, X, svm=False, purpose=None):


class VAE(AutoEncoder):
"""Variational Autoencoder (VAE)
This module uses variational autoencoders for pipelines in chemistry.
Parameters
----------
hiddenlayers : dict
Dictionary with encoder, and decoder layers in the Auto Encoder.
activation : str
The activation function.
Notes
-----
When defining the hiddenlayers keyword argument, input and output
dimensions are automatically determined. For example, suppose you have an
input data point with 10 dimensions and you want to autoencode with
targets having 14 dimensions, a latent space with 4 dimensions and just one
hidden layer with 5 nodes between input-layer / latent-layer and
latent-layer / output-layer. Your `hiddenlayers` dictionary would look like
this:
>>> hiddenlayers = {'encoder': (5, 4), 'decoder': (4, 5)}
That would generate an autoencoder with topology (10, 5, 4 | 4, 5, 14).
"""

NAME = "VAE"

@classmethod
@@ -350,15 +392,59 @@ def encode(self, symbol, x):
Returns
-------
z
Latent vector.
mu, logvar
Mean and variance.
"""
pre = self.encoders[symbol]["pre"](x)
mu = self.encoders[symbol]["mu"](pre)
logvar = self.encoders[symbol]["logvar"](pre)
h = self.encoders[symbol]["h"](x)
mu = self.encoders[symbol]["mu"](h)
logvar = self.encoders[symbol]["logvar"](h)
return mu, logvar

# def decode(self, symbol, z):
# """Decode latent vector, z
#
# Parameters
# ----------
# symbol : str
# Chemical symbol.
# z : array
# Latent vector.
#
# Returns
# -------
# mu, logvar
# Mean and variance.
#
# Notes
# -----
# See page 11 "Kingma, D. P. & Welling, M. Auto-Encoding Variational
# Bayes. (2013)".
# """

# h = self.decoders[symbol]["h"](z)
# mu = self.decoders[symbol]["mu"](h)
# logvar = self.decoders[symbol]["logvar"](h)
# return mu, logvar

def reparameterize(self, mu, logvar):
"""Reparameterization trick
This trick samples the posterior (a latent vector) from a
multivariate Gaussian probability distribution. At the same time it
allows the model to be backward-propagated.
Parameters
----------
mu : [type]
[description]
logvar : [type]
[description]
Returns
-------
[type]
[description]
"""
std = torch.exp(0.5 * logvar)
eps = torch.randn_like(std)
return mu + eps * std
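The snippet below is a self-contained sketch of the reparameterization trick implemented above, run on dummy tensors; it shows that sampling z = mu + eps * std keeps the graph differentiable with respect to mu and logvar. Shapes and values are illustrative only.

```python
# Minimal sketch of the reparameterization trick on dummy tensors.
import torch

mu = torch.zeros(3, requires_grad=True)      # mean of q(z|x)
logvar = torch.zeros(3, requires_grad=True)  # log variance of q(z|x)

std = torch.exp(0.5 * logvar)  # sigma = exp(0.5 * log sigma^2)
eps = torch.randn_like(std)    # noise drawn outside the graph
z = mu + eps * std             # differentiable w.r.t. mu and logvar

z.sum().backward()
print(mu.grad, logvar.grad)    # gradients flow through the sampling step
```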
@@ -375,25 +461,35 @@ def forward(self, X):
Returns
-------
outputs : tensor
mu and logvar for two multivariate Gaussians
Decoded latent vector.
"""

mus_latent = []
logvars_latent = []
# mus_output = []
# logvars_output = []
outputs = []
mus = []
logvars = []
for hash, image in X.items():
for symbol, x in image:
mu, logvar = self.encode(symbol, x)
z = self.reparameterize(mu, logvar)
output = self.decode(symbol, z)
outputs.append(output)
mus.append(mu)
logvars.append(logvar)
mu_latent, logvar_latent = self.encode(symbol, x)
z = self.reparameterize(mu_latent, logvar_latent)
mus_latent.append(mu_latent)
logvars_latent.append(logvar_latent)
reconstruction = self.decode(symbol, z)
# mu_output, logvar_output = self.decode(symbol, z)
# mus_output.append(mu_output)
# logvars_output.append(logvar_output)
outputs.append(reconstruction)

mus_latent = torch.stack(mus_latent)
logvars_latent = torch.stack(logvars_latent)
# mus_output = torch.stack(mus_output)
# logvars_output = torch.stack(logvars_output)
outputs = torch.stack(outputs)
mus = torch.stack(mus)
logvars = torch.stack(logvars)
return outputs, mus, logvars

# return outputs, mus_latent, logvars_latent, mus_output, logvars_output
return outputs, mus_latent, logvars_latent
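For orientation, the following hedged sketch mimics the per-atom loop in `forward()` with toy stand-ins for the per-element encoders and decoders; the chemical symbols, dimensions, and `Linear` stand-ins are assumptions for illustration, not the modules ML4Chem actually builds.

```python
# Toy stand-ins for the per-element encoders/decoders; X maps a structure
# hash to a list of (symbol, feature-vector) pairs, as in forward() above.
from collections import OrderedDict

import torch

feat, latent = 8, 3
encoder_mu = torch.nn.Linear(feat, latent)      # stand-in for encoders[symbol]["mu"]
encoder_logvar = torch.nn.Linear(feat, latent)  # stand-in for encoders[symbol]["logvar"]
decoder = torch.nn.Linear(latent, feat)         # stand-in for decoders[symbol]

X = OrderedDict({"hash-0": [("Cu", torch.randn(feat)), ("O", torch.randn(feat))]})

outputs, mus_latent, logvars_latent = [], [], []
for hash_, image in X.items():
    for symbol, x in image:
        mu, logvar = encoder_mu(x), encoder_logvar(x)
        std = torch.exp(0.5 * logvar)
        z = mu + torch.randn_like(std) * std    # reparameterization
        outputs.append(decoder(z))
        mus_latent.append(mu)
        logvars_latent.append(logvar)

outputs = torch.stack(outputs)                  # (n_atoms, feat)
mus_latent = torch.stack(mus_latent)            # (n_atoms, latent)
logvars_latent = torch.stack(logvars_latent)    # (n_atoms, latent)
```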


class train(object):
@@ -697,11 +793,11 @@ def train_batches(
"""
inputs = OrderedDict(chunk)
if model.name() == "VAE":
outputs, mus, logvars = model(inputs)
# outputs, mus_latent, logvars_latent, mus_output, logvars_output = model(inputs)
outputs, mus_latent, logvars_latent, = model(inputs)
else:
outputs = model(inputs)

args = {"outputs": outputs, "targets": targets[index]}
args = {"outputs": outputs, "targets": targets[index]}

_args, _varargs, _keywords, _defaults = inspect.getargspec(lossfxn)

@@ -713,11 +809,21 @@ def train_batches(
# In the case of using EncoderMapLoss the inputs are needed, too.
args.update({"inputs": inputs_chunk_vals[index]})

elif "mus" in _args and "logvars" in _args:
mus = {"mus": mus}
logvars = {"logvars": logvars}
args.update(mus)
args.update(logvars)
elif "mus_latent" in _args and "logvars_latent" in _args:
args = {
"outputs": outputs,
"targets": targets[index],
"mus_latent": mus_latent,
"logvars_latent": logvars_latent,
}
# elif "mus_latent" in _args and "logvars_latent" in _args:
# args = {
# "targets": targets[index],
# "mus_latent": mus_latent,
# "logvars_latent": logvars_latent,
# "mus_output": mus_output,
# "logvars_output": logvars_output,
# }

loss = lossfxn(**args)
loss.backward()
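The branching above keys the keyword arguments off the loss function's signature. A hedged sketch of that dispatch pattern is shown below; the two toy loss functions and the `call_loss` helper are hypothetical, and `inspect.getfullargspec` stands in for the deprecated `getargspec` used in the diff.

```python
# Hypothetical sketch of signature-based dispatch: pass only the keyword
# arguments that the user-supplied loss function actually declares.
import inspect

import torch


def mse_loss(outputs, targets):
    return torch.nn.functional.mse_loss(outputs, targets)


def vae_like_loss(outputs, targets, mus_latent, logvars_latent):
    rec = torch.nn.functional.mse_loss(outputs, targets)
    kld = -0.5 * torch.sum(
        1 + logvars_latent - mus_latent.pow(2) - logvars_latent.exp()
    )
    return rec + kld


def call_loss(lossfxn, **available):
    # getfullargspec replaces the deprecated getargspec used in the diff.
    wanted = inspect.getfullargspec(lossfxn).args
    return lossfxn(**{k: v for k, v in available.items() if k in wanted})


outputs, targets = torch.randn(4, 2), torch.randn(4, 2)
mus, logvars = torch.zeros(4, 2), torch.zeros(4, 2)
print(call_loss(mse_loss, outputs=outputs, targets=targets))
print(call_loss(vae_like_loss, outputs=outputs, targets=targets,
                mus_latent=mus, logvars_latent=logvars))
```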
26 changes: 14 additions & 12 deletions ml4chem/models/loss.py
@@ -1,4 +1,5 @@
import torch
import numpy as np


def AtomicMSELoss(outputs, targets, atoms_per_image):
@@ -33,8 +34,6 @@ def AtomicMSELoss(outputs, targets, atoms_per_image):
def SumSquaredDiff(outputs, targets):
"""Sum of squared differences loss function
This is the default loss function for a real-valued autoencoder.
Parameters
----------
outputs : tensor
@@ -57,10 +56,7 @@ def SumSquaredDiff(outputs, targets):


def MSELoss(outputs, targets):
"""Default loss function
If user does not input loss function we provide mean-squared error loss
function.
"""Mean-squared error loss function
Parameters
----------
@@ -254,15 +250,21 @@ def get_pairwise_distances(positions, squared=False):

return distances

def VAELoss(outputs, targets, mus, logvars):
# BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')

## def VAELoss(targets, mus_latent, logvars_latent, mus_output, logvars_output):
def VAELoss(outputs, targets, mus_latent, logvars_latent):

# LOG_2_PI = np.log(2 * np.pi)
# loss_rec = LOG_2_PI + torch.sum(logvars_output + (targets - mus_output) ** 2 / (2 * torch.exp(logvars_output)))

loss_rec = MSELoss(outputs, targets)

# see Appendix B from VAE paper:
# Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
# https://arxiv.org/abs/1312.6114
# 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)

criterion = torch.nn.MSELoss()
mse = criterion(outputs, targets) * 0.5
kld = -0.5 * torch.sum(1 + logvars - mus.pow(2) - logvars.exp())
return mse + kld
kld = -0.5 * torch.sum(
1 + logvars_latent - mus_latent.pow(2) - logvars_latent.exp()
)
return loss_rec + kld
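Below is an illustrative, self-contained computation of the objective assembled in `VAELoss`: a reconstruction term plus the closed-form KL divergence -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) from Kingma and Welling (2014). The tensors are dummies, and the plain sum-of-squared-differences term is only a stand-in for ML4Chem's `MSELoss`.

```python
# Dummy-tensor illustration of the VAE objective: reconstruction + KL term.
import torch

outputs = torch.randn(4, 8)         # decoder reconstructions
targets = torch.randn(4, 8)         # reference feature vectors
mus_latent = torch.zeros(4, 3)      # means of q(z|x)
logvars_latent = torch.zeros(4, 3)  # log variances of q(z|x)

# Sum-of-squared-differences stand-in for the reconstruction term.
loss_rec = 0.5 * torch.sum((outputs - targets) ** 2)

# Closed-form KL divergence between q(z|x) and the unit Gaussian prior.
kld = -0.5 * torch.sum(
    1 + logvars_latent - mus_latent.pow(2) - logvars_latent.exp()
)

print(loss_rec + kld)
```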
