
Commit

General improvements.
- Moved `Potentials` under `atomistic`.
- Created a new `DeepLearningTrainer` base class for the `train` class
  inside each model. It adds support for saving model checkpoints through
  its `checkpoint_save` method.
- The `atomistic.models.neuralnetwork` module now supports training with
  uncertainties (see the usage sketch below).
- Small improvements to the `atomistic.models.loss` module.
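For context, a rough usage sketch of the two new keyword arguments on `train`
(`uncertainty` and `checkpoint`). The leading arguments of `train` (inputs,
targets, the model, and so on) are not shown in this diff, so they are elided
here and all values are purely illustrative:

    from ml4chem.atomistic.models.neuralnetwork import train

    # Purely illustrative values: one uncertainty per training image, and a
    # checkpoint spec following the structure documented later in this diff.
    uncertainties = [0.05, 0.02, 0.04]
    checkpoint = {"label": "nn", "checkpoint": 100, "path": "checkpoints/"}

    # train(..., uncertainty=uncertainties, checkpoint=checkpoint,
    #       lr_scheduler=("ReduceLROnPlateau", {"mode": "min", "patience": 10}))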
muammar committed Feb 13, 2020
1 parent 6781f7b commit 15ff500
Showing 6 changed files with 109 additions and 14 deletions.
4 changes: 0 additions & 4 deletions ml4chem/__init__.py
@@ -1,5 +1 @@
-from ml4chem.atomistic.potentials import Potentials
-
-
-__all__ = ["Potentials"]
__version__ = "0.0.8-dev"
4 changes: 4 additions & 0 deletions ml4chem/atomistic/__init__.py
@@ -0,0 +1,4 @@
from ml4chem.atomistic.potentials import Potentials


__all__ = ["Potentials"]
35 changes: 35 additions & 0 deletions ml4chem/atomistic/models/base.py
@@ -1,3 +1,4 @@
from ml4chem.atomistic import Potentials
from abc import ABC, abstractmethod
import torch

@@ -22,3 +23,37 @@ def prepare_model(self, **kwargs):
def forward(self, X):
"""Forward propagation pass"""
pass


class DeepLearningTrainer(ABC, object):
def checkpoint_save(self, epoch, model, label=None, checkpoint=None, path=""):
"""Checkpoint saver
A method that saves the checkpoint of a model during training.
Parameters
----------
epoch : int
Epoch number.
model : object
A DeepLearning object.
label : str, optional
String with checkpoint label, by default None.
checkpoint : int, optional
Set checkpoints. If set to 100, the model will be saved every 100
epochs. Use -1 to save at every epoch. Default is None.
path : str, optional
Path to save the checkpoint, by default "".
"""

if label is None:
label = f"checkpoint-{epoch}"
else:
label = f"{label}-checkpoint-{epoch}"

if checkpoint is None:
pass
elif checkpoint == -1:
Potentials.save(model=model, label=label, path=path)
elif epoch % checkpoint == 0:
Potentials.save(model=model, label=label, path=path)
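As a minimal sketch, this is how a trainer built on `DeepLearningTrainer` is
expected to drive `checkpoint_save`, mirroring the call added to
`train.trainer()` later in this diff. The subclass, its arguments, and the
training loop are hypothetical:

    from ml4chem.atomistic.models.base import DeepLearningTrainer

    class MyTrainer(DeepLearningTrainer):
        """Hypothetical trainer; only the checkpointing hook is shown."""

        def trainer(self, model, epochs, checkpoint=None):
            for epoch in range(1, epochs + 1):
                # ... optimization step for this epoch would go here ...
                if checkpoint is not None:
                    # e.g. checkpoint = {"label": "nn", "checkpoint": 100, "path": "ckpts/"}
                    self.checkpoint_save(epoch, model, **checkpoint)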
4 changes: 2 additions & 2 deletions ml4chem/atomistic/models/loss.py
@@ -38,9 +38,8 @@ def AtomicMSELoss(outputs, targets, atoms_per_image, uncertainty=None):
outputs_atom = torch.div(outputs, atoms_per_image)
targets_atom = torch.div(targets, atoms_per_image)
loss = (
-criterion(outputs_atom, targets_atom) / torch.pow(uncertainty, 2)
+criterion(outputs_atom, targets_atom) / (2 * torch.pow(uncertainty, 1))
).sum() * 0.5

return loss
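For reference, a small self-contained call of `AtomicMSELoss` with the new
`uncertainty` argument. The tensors below are made-up values, one entry per
image in the batch:

    import torch
    from ml4chem.atomistic.models.loss import AtomicMSELoss

    # Made-up batch of 3 images: predicted and reference total energies,
    # number of atoms per image, and one uncertainty per image.
    outputs = torch.tensor([-12.1, -8.4, -15.0])
    targets = torch.tensor([-12.0, -8.5, -14.8])
    atoms_per_image = torch.tensor([3.0, 2.0, 4.0])
    uncertainty = torch.tensor([0.05, 0.10, 0.02])

    loss = AtomicMSELoss(outputs, targets, atoms_per_image, uncertainty=uncertainty)
    # Scalar tensor; images with larger uncertainties contribute less to the loss.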


@@ -64,6 +63,7 @@ def SumSquaredDiff(outputs, targets):
In the literature it is mentioned that for real-valued autoencoders the
reconstruction loss function is the sum of squared differences.
"""

loss = (outputs - targets).pow(2).sum() * 0.5
return loss

65 changes: 59 additions & 6 deletions ml4chem/atomistic/models/neuralnetwork.py
@@ -7,7 +7,7 @@
import numpy as np
from collections import OrderedDict
from ml4chem.metrics import compute_rmse
-from ml4chem.atomistic.models.base import DeepLearningModel
+from ml4chem.atomistic.models.base import DeepLearningModel, DeepLearningTrainer
from ml4chem.atomistic.models.loss import AtomicMSELoss
from ml4chem.optim.handler import get_optimizer, get_lr_scheduler
from ml4chem.utils import convert_elapsed_time, get_chunks, get_number_of_parameters
@@ -209,7 +209,7 @@ def forward(self, X):
return outputs


-class train(object):
+class train(DeepLearningTrainer):
"""Train the model
Parameters
@@ -244,6 +244,17 @@ class train(object):
>>> lr_scheduler = ('ReduceLROnPlateau',
{'mode': 'min', 'patience': 10})
uncertainty : list
A list of uncertainties used to penalize the loss function during
evaluation.
checkpoint : dict
Set checkpoints. Dictionary with the following structure:
>>> checkpoint = {"label": label, "checkpoint": 100, "path": ""}
`label` refers to the name used to save the checkpoint, `checkpoint` is
an integer (save every that many epochs) or -1 to save every epoch, and
`path` is where the checkpoint is stored. Default is None, and no
checkpoint is saved.
"""

def __init__(
@@ -260,6 +271,8 @@ def __init__(
device="cpu",
batch_size=None,
lr_scheduler=None,
uncertainty=None,
checkpoint=None,
):

self.initial_time = time.time()
@@ -275,6 +288,13 @@ def __init__(
targets = list(get_chunks(targets, batch_size, svm=False))
atoms_per_image = list(get_chunks(atoms_per_image, batch_size, svm=False))

if uncertainty != None:
uncertainty = list(get_chunks(uncertainty, batch_size, svm=False))
uncertainty = [
torch.tensor(u, requires_grad=False, dtype=torch.float)
for u in uncertainty
]

logger.info(" ")
logging.info("Batch Information")
logging.info("-----------------")
@@ -342,12 +362,18 @@ def __init__(
self.epochs = epochs
self.model = model
self.lr_scheduler = lr_scheduler
self.checkpoint = checkpoint

# Data scattering
client = dask.distributed.get_client()
self.chunks = [client.scatter(chunk) for chunk in chunks]
self.targets = [client.scatter(target) for target in targets]

if uncertainty != None:
self.uncertainty = [client.scatter(u) for u in uncertainty]
else:
self.uncertainty = uncertainty

if lossfxn is None:
self.lossfxn = AtomicMSELoss
else:
@@ -373,6 +399,7 @@ def trainer(self):
loss, outputs_ = train.closure(
self.chunks,
self.targets,
self.uncertainty,
self.model,
self.lossfxn,
self.atoms_per_image,
@@ -403,10 +430,14 @@ def trainer(self):

ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d " "%H:%M:%S")

logger.info(
"{:6d} {} {:8e} {:8f} {:8f}".format(epoch, ts, loss, rmse, rmse_atom)
)

if self.checkpoint is not None:
self.checkpoint_save(epoch, self.model, **self.checkpoint)

if self.convergence is None and epoch == self.epochs:
converged = True
elif self.convergence is not None and rmse < self.convergence["energy"]:
@@ -420,7 +451,9 @@
)

@classmethod
-def closure(Cls, chunks, targets, model, lossfxn, atoms_per_image, device):
+def closure(
+    Cls, chunks, targets, uncertainty, model, lossfxn, atoms_per_image, device
+):
"""Closure
This class method clears previous gradients, iterates over batches,
@@ -435,6 +468,9 @@ def closure(Cls, chunks, targets, model, lossfxn, atoms_per_image, device):
Tensor with input data points in batch with index.
targets : tensor or list
The targets.
uncertainty : list
A list of uncertainties used to penalize the loss function during
evaluation.
model : obj
Pytorch model to perform forward() and get gradients.
lossfxn : obj
@@ -458,7 +494,16 @@ def closure(Cls, chunks, targets, model, lossfxn, atoms_per_image, device):
accumulation.append(
client.submit(
train.train_batches,
-*(index, chunk, targets, model, lossfxn, atoms_per_image, device)
+*(
+    index,
+    chunk,
+    targets,
+    uncertainty,
+    model,
+    lossfxn,
+    atoms_per_image,
+    device,
+)
)
)
dask.distributed.wait(accumulation)
@@ -482,7 +527,7 @@ def closure(Cls, chunks, targets, model, lossfxn, atoms_per_image, device):

@classmethod
def train_batches(
-Cls, index, chunk, targets, model, lossfxn, atoms_per_image, device
+Cls, index, chunk, targets, uncertainty, model, lossfxn, atoms_per_image, device
):
"""A function that allows training per batches
@@ -497,6 +542,9 @@ def train_batches(
The targets.
model : obj
Pytorch model to perform forward() and get gradients.
uncertainty : list
A list of uncertainties used to penalize the loss function during
evaluation.
lossfxn : obj
A loss function object.
atoms_per_image : list
@@ -512,7 +560,12 @@ def train_batches(
inputs = OrderedDict(chunk)
outputs = model(inputs)

-loss = lossfxn(outputs, targets[index], atoms_per_image[index])
+if uncertainty == None:
+    loss = lossfxn(outputs, targets[index], atoms_per_image[index])
+else:
+    loss = lossfxn(
+        outputs, targets[index], atoms_per_image[index], uncertainty[index]
+    )
loss.backward()

gradients = []
11 changes: 9 additions & 2 deletions ml4chem/atomistic/potentials.py
Expand Up @@ -3,6 +3,7 @@
import copy
import json
import logging
import os
import torch
from ase.calculators.calculator import Calculator
from ml4chem.backends.available import available_backends
@@ -160,9 +161,15 @@ def save(model=None, features=None, path=None, label="ml4chem"):
"""

if path is None:
path = ""
path = "."

-path += label
+if os.path.isdir(path) is False:
+    os.makedirs(path)
+
+if path[-1] == "/":
+    path += label
+else:
+    path = path + "/" + label

if model is not None:
model_name = model.name()
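The new path handling in `Potentials.save` can be summarized with this small
stand-alone sketch. `resolve_save_prefix` is a hypothetical helper written
only to illustrate the logic above; it is not part of the commit:

    import os

    def resolve_save_prefix(path=None, label="ml4chem"):
        """Mirror the logic added above: default to the current directory,
        create the directory if it does not exist, then append the label."""
        if path is None:
            path = "."
        if not os.path.isdir(path):
            os.makedirs(path)
        return path + label if path.endswith("/") else path + "/" + label

    # resolve_save_prefix("checkpoints/", "nn-checkpoint-100")
    # -> "checkpoints/nn-checkpoint-100"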
