General improvements and steps toward a VAE implementation
- examples/: updated the AutoEncoder example.
- models/autoencoders.py: added a VAE class.
- models/loss.py: added VAELoss, a loss function for training the VAE.
muammar committed Oct 10, 2019
1 parent e2ec6f2 commit 66ebbc6
Showing 10 changed files with 991 additions and 156 deletions.
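Before the file-by-file diffs, a minimal sketch of how the new pieces might be wired together, modeled on the updated examples/autoencoder/cu_training.py. The VAE constructor and train() keyword names below follow the existing AutoEncoder example and are assumptions rather than part of this commit; the layer sizes, data_handler, inputs, and targets are placeholders.

from ml4chem.models.autoencoders import VAE, train
from ml4chem.models.loss import VAELoss

# Hypothetical architecture; the hidden/latent layer sizes are illustrative only.
vae = VAE(hiddenlayers={"encoder": (20, 10, 4), "decoder": (4, 10, 20)})

# Optimizer settings taken from the updated cu_training.py example.
optimizer = ("adam", {"lr": 1e-3, "weight_decay": 0, "amsgrad": True})

# As in the AutoEncoder example, the model is trained to reconstruct its inputs.
train(
    inputs,
    targets,
    model=vae,
    data=data_handler,
    optimizer=optimizer,
    epochs=2000,
    convergence={"rmse": 5e-2},
    lossfxn=VAELoss,
)

During training, train_batches() detects a model named "VAE", unpacks the (outputs, mus, logvars) returned by its forward pass, and passes mus/logvars on to VAELoss, as shown in the ml4chem/models/autoencoders.py diff below.
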
14 changes: 4 additions & 10 deletions examples/autoencoder/cu_inference.py
@@ -1,15 +1,12 @@
import logging
import sys

sys.path.append("../../")
from ase.io import Trajectory
from dask.distributed import Client, LocalCluster
from ml4chem.data.handler import DataSet
from ml4chem.fingerprints import LatentFeatures
from ml4chem.models.autoencoders import AutoEncoder, train
from ml4chem.data.serialization import load
import json
import torch
from ml4chem.utils import logger


def autoencode():
@@ -25,7 +22,7 @@ def autoencode():
normalized = True

data_handler = DataSet(images, purpose=purpose)
images, energies = data_handler.get_images(purpose=purpose)
images, energies = data_handler.get_data(purpose=purpose)

fingerprints = (
"Gaussian",
@@ -35,7 +32,7 @@
"save_preprocessor": "inference.scaler",
},
)
encoder = {"model": "model.ml4c", "params": "model.params"}
encoder = {"model": "ml4chem.ml4c", "params": "ml4chem.params"}
preprocessor = ("MinMaxScaler", {"feature_range": (-1, 1)})

fingerprints = LatentFeatures(
@@ -53,10 +50,7 @@


if __name__ == "__main__":
# logging.basicConfig(filename='cu_inference.log', level=logging.INFO,
logging.basicConfig(
level=logging.INFO, format="%(filename)s:%(lineno)s %(levelname)s:%(message)s"
)
logger("cu_inference.log")
cluster = LocalCluster()
client = Client(cluster, asynchronous=True)
autoencode()
Binary file modified examples/autoencoder/cu_training.latent
931 changes: 827 additions & 104 deletions examples/autoencoder/cu_training.log

Large diffs are not rendered by default.

17 changes: 6 additions & 11 deletions examples/autoencoder/cu_training.py
@@ -1,14 +1,14 @@
import logging
import sys

sys.path.append("../../")
from ase.io import Trajectory
from dask.distributed import Client, LocalCluster
from ml4chem import Potentials
from ml4chem.data.handler import DataSet
from ml4chem.fingerprints import Cartesian, Gaussian
from ml4chem.fingerprints import Gaussian
from ml4chem.models.autoencoders import AutoEncoder, train
from ml4chem.data.serialization import dump
from ml4chem.utils import logger


def autoencode():
@@ -23,7 +23,7 @@ def autoencode():
Data Structure Preparation
"""
data_handler = DataSet(images, purpose=purpose)
training_set, energy_targets = data_handler.get_images(purpose=purpose)
training_set, energy_targets = data_handler.get_data(purpose=purpose)

"""
Let's create the targets of the model
@@ -50,12 +50,11 @@ def autoencode():
# Arguments for training the potential
convergence = {"rmse": 5e-2}
epochs = 2000
lr = 1e-0
lr = 1e-3
weight_decay = 0
regularization = None

opt_kwars = {"lr": lr}
optimizer = ("lbfgs", opt_kwars)
optimizer = ("adam", {"lr": lr, "weight_decay": weight_decay, "amsgrad": True})

inputs = targets
train(
@@ -81,11 +80,7 @@


if __name__ == "__main__":
logging.basicConfig(
filename="cu_training.log",
level=logging.INFO,
format="%(filename)s:%(lineno)s %(levelname)s:%(message)s",
)
logger(filename="cu_training.log")
cluster = LocalCluster()
client = Client(cluster, asynchronous=True)
inputs, outputs, data_handler = autoencode()
Binary file modified examples/autoencoder/cu_training.scaler
Binary file modified examples/autoencoder/inference.scaler
Binary file removed examples/autoencoder/model.ml4c
21 changes: 0 additions & 21 deletions examples/autoencoder/model.params

This file was deleted.

151 changes: 141 additions & 10 deletions ml4chem/models/autoencoders.py
@@ -120,7 +120,7 @@ def prepare_model(
"""
Encoder
"""
# The first encoder layer for symbol
# The first encoder's layer for symbol
out_dimension = encoder_layers[0]
_encoder = torch.nn.Linear(input_dimension, out_dimension)
encoder.append(_encoder)
@@ -132,7 +132,26 @@ def prepare_model(
encoder.append(activation[self.activation]())

# Stacking up the layers.
encoder = torch.nn.Sequential(*encoder)
if self.name() == "VAE":
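# For the VAE, split the encoder assembled above into a shared "pre" stack plus
# two heads: "mu" (the last Linear + activation, popped off below) and a newly
# created "logvar" layer, so the encoder outputs the mean and log-variance of a
# Gaussian over the latent space.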
keys = ["pre", "mu", "logvar"]
mu = []
logvar = []

index = -3
for i in range(2):
index += 1
mu.append(encoder.pop(index))

pre = torch.nn.Sequential(*encoder)
logvar = torch.nn.Linear(inp_dim, out_dim)
logvar = torch.nn.Sequential(*[logvar, activation[self.activation]()])
mu = torch.nn.Sequential(*mu)

values = [pre, mu, logvar]
encoder = torch.nn.ModuleDict(list(map(list, zip(keys, values))))
else:
encoder = torch.nn.Sequential(*encoder)

symbol_encoder_pair.append([symbol, encoder])

"""
@@ -171,6 +190,42 @@ def prepare_model(
# nn.init.normal_(m.weight) # , mean=0, std=0.01)
torch.nn.init.xavier_uniform_(m.weight)

def encode(self, symbol, x):
"""Encode input
Parameters
----------
symbol : str
Chemical symbol.
x : array
Input array.
Returns
-------
z
Latent vector.
"""
z = self.encoders[symbol](x)
return z

def decode(self, symbol, z):
"""Decode latent vector, z
Parameters
----------
symbol : str
Chemical symbol.
z : array
Latent vector.
Returns
-------
reconstruction
Tensor with reconstruction.
"""
reconstruction = self.decoders[symbol](z)
return reconstruction

def forward(self, X):
"""Forward propagation
@@ -190,17 +245,17 @@ def forward(self, X):
outputs = []
for hash, image in X.items():
for symbol, x in image:
latent_vector = self.encoders[symbol](x)
decoder = self.decoders[symbol](latent_vector)
outputs.append(decoder)
z = self.encode(symbol, x)
output = self.decode(symbol, z)
outputs.append(output)
outputs = torch.stack(outputs)
return outputs

def get_latent_space(self, X, svm=False, purpose=None):
"""Get latent space for training ML4Chem
"""Get latent space for training ML4Chem models
This method takes an input and uses the encoder to return latent space
in the structure needed for training ML4Chem.
in the structure needed for training ML4Chem models or visualization.
Parameters
----------
@@ -238,7 +293,7 @@ def get_latent_space(self, X, svm=False, purpose=None):
hashes.append(hash)
_symbols = []
for symbol, x in image:
latent_vector = self.encoders[symbol](x)
latent_vector = self.encode(symbol, x)
_symbols.append(symbol)

if svm:
@@ -263,7 +318,7 @@ def get_latent_space(self, X, svm=False, purpose=None):
for hash, image in X.items():
latent_space[hash] = []
for symbol, x in image:
latent_vector = self.encoders[symbol](x)
latent_vector = self.encode(symbol, x)

if svm:
_latent_vector = latent_vector.detach().numpy()
@@ -275,6 +330,72 @@ def get_latent_space(self, X, svm=False, purpose=None):
return latent_space


class VAE(AutoEncoder):
NAME = "VAE"

@classmethod
def name(cls):
"""Returns name of class"""
return cls.NAME

def encode(self, symbol, x):
"""Encode input
Parameters
----------
symbol : str
Chemical symbol.
x : array
Input array.
Returns
-------
z
Latent vector.
"""
pre = self.encoders[symbol]["pre"](x)
mu = self.encoders[symbol]["mu"](pre)
logvar = self.encoders[symbol]["logvar"](pre)
return mu, logvar

def reparameterize(self, mu, logvar):
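# Reparameterization trick (Kingma & Welling, 2014): with sigma = exp(0.5 * logvar)
# and eps ~ N(0, I), z = mu + eps * sigma, which keeps the sampling step
# differentiable with respect to mu and logvar.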
std = torch.exp(0.5 * logvar)
eps = torch.randn_like(std)
return mu + eps * std

def forward(self, X):
"""Forward propagation
This method takes an input and applies encoder and decoder layers.
Parameters
----------
X : list
List of inputs either raw or in the feature space.
Returns
-------
outputs : tensor
Decoded latent vector.
"""

outputs = []
mus = []
logvars = []
for hash, image in X.items():
for symbol, x in image:
mu, logvar = self.encode(symbol, x)
z = self.reparameterize(mu, logvar)
output = self.decode(symbol, z)
outputs.append(output)
mus.append(mu)
logvars.append(logvar)
outputs = torch.stack(outputs)
mus = torch.stack(mus)
logvars = torch.stack(logvars)
return outputs, mus, logvars


class train(object):
"""Train the model
@@ -575,7 +696,11 @@ def train_batches(
The loss function of the batch.
"""
inputs = OrderedDict(chunk)
outputs = model(inputs)
if model.name() == "VAE":
outputs, mus, logvars = model(inputs)
else:
outputs = model(inputs)

args = {"outputs": outputs, "targets": targets[index]}

_args, _varargs, _keywords, _defaults = inspect.getargspec(lossfxn)
@@ -588,6 +713,12 @@ def train_batches(
# In the case of using EncoderMapLoss the inputs are needed, too.
args.update({"inputs": inputs_chunk_vals[index]})

elif "mus" in _args and "logvars" in _args:
mus = {"mus": mus}
logvars = {"logvars": logvars}
args.update(mus)
args.update(logvars)

loss = lossfxn(**args)
loss.backward()

13 changes: 13 additions & 0 deletions ml4chem/models/loss.py
@@ -253,3 +253,16 @@ def get_pairwise_distances(positions, squared=False):
distances = torch.sqrt(distances)

return distances

def VAELoss(outputs, targets, mus, logvars):
# BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')

# see Appendix B from VAE paper:
# Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
# https://arxiv.org/abs/1312.6114
# 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)

criterion = torch.nn.MSELoss()
mse = criterion(outputs, targets) * 0.5
kld = -0.5 * torch.sum(1 + logvars - mus.pow(2) - logvars.exp())
return mse + kld
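
A quick, self-contained sanity check of VAELoss with dummy tensors; the shapes and values here are illustrative assumptions, not taken from this commit:

import torch
from ml4chem.models.loss import VAELoss

outputs = torch.rand(4, 8)   # decoder reconstructions
targets = torch.rand(4, 8)   # original inputs
mus = torch.zeros(4, 3)      # latent means
logvars = torch.zeros(4, 3)  # latent log-variances

# With zero means and zero log-variances the KL term vanishes, so the result
# reduces to 0.5 * MSE(outputs, targets).
print(VAELoss(outputs, targets, mus, logvars).item())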
