Improved features and models modules.

- Pyflakes cleanup.
- Black cleanup.
- features.coulombmatrix module improved.
- models.kernelridge: improved efficiency of KernelRidge regression class.
muammar · Jan 16, 2020 · 823f188 · 823f188
1 parent 94b452e
commit 823f188
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 27 deletions.
diff --git a/ml4chem/features/base.py b/ml4chem/features/base.py
@@ -1,4 +1,3 @@
-import dask
 import torch
 import numpy as np
 from abc import ABC, abstractmethod

diff --git a/ml4chem/features/coulombmatrix.py b/ml4chem/features/coulombmatrix.py
@@ -5,7 +5,6 @@
 import os
 import time
 import torch
-import numpy as np
 import pandas as pd
 from collections import OrderedDict
 from dscribe.descriptors import CoulombMatrix as CoulombMatrixDscribe
@@ -20,7 +19,7 @@
 class CoulombMatrix(AtomisticFeatures, CoulombMatrixDscribe):
     """Coulomb Matrix features
 
-    
+
     Parameters
     ----------
     filename : str
@@ -81,7 +80,6 @@ def __init__(
         # This is a very general way of not forgetting to save variables
         _params = vars()
 
-
         # Delete useless variables
         delete = [
             "self",
@@ -92,7 +90,7 @@ def __init__(
             "value",
             "keys",
             "batch_size",
-            "__class__"
+            "__class__",
         ]
 
         for param in delete:
@@ -246,19 +244,12 @@ def calculate(self, images=None, purpose="training", data=None, svm=False):
         feature_space = []
 
         if svm and purpose == "training":
-            logger.info("Building array with reference space.")
-            reference_space = []
 
             for i, image in enumerate(images.items()):
                 restacked = client.submit(
                     self.restack_image, *(i, image, scaled_feature_space, svm)
                 )
 
-                # image = (hash, ase_image) -> tuple
-                # for atom in image[1]:
-                #     restacked_atom = client.submit(self.restack_atom, *(i, atom, scaled_feature_space))
-                #     reference_space.append(restacked_atom)
-
                 feature_space.append(restacked)
 
         elif svm is False and purpose == "training":
@@ -288,9 +279,11 @@ def calculate(self, images=None, purpose="training", data=None, svm=False):
 
         if svm and purpose == "training":
             # FIXME This might need to be improved
+            logger.info("Building array with reference space.")
             hashes, reference_space = list(zip(*feature_space))
             del hashes
             reference_space = list(itertools.chain.from_iterable(reference_space))
+            logger.info("Finished reference space.")
 
         feature_space = OrderedDict(feature_space)
 

diff --git a/ml4chem/features/gaussian.py b/ml4chem/features/gaussian.py
@@ -3,7 +3,6 @@
 import logging
 import os
 import time
-import torch
 import dask.array as da
 import numpy as np
 import pandas as pd
@@ -142,7 +141,7 @@ def __init__(
             "value",
             "keys",
             "batch_size",
-            "__class__"
+            "__class__",
         ]
 
         for param in delete:

diff --git a/ml4chem/models/kernelridge.py b/ml4chem/models/kernelridge.py
@@ -203,7 +203,6 @@ def prepare_model(
         logger.info("Computing Kernel Matrix...")
         # We start populating computations with delayed functions to
         # operate with dask's scheduler
-        client = dask.distributed.get_client()
         kernel_matrix = self.get_kernel_matrix(
             feature_space, reference_features, purpose=purpose
         )
@@ -259,15 +258,14 @@ def get_kernel_matrix(self, feature_space, reference_features, purpose):
         """
         initial_time = time.time()
 
-        client = dask.distributed.get_client()
         call = {"exponential": exponential, "laplacian": laplacian, "rbf": rbf}
 
         if self.batch_size is None:
             chunks = [feature_space]
         else:
             chunks = list(get_chunks(feature_space, self.batch_size))
             logger.info(
-                "    The calculations are distributed in {} batches of {}.".format(
+                "    The calculations are distributed in {} batches of {} molecules.".format(
                     len(chunks), self.batch_size
                 )
             )
@@ -342,20 +340,33 @@ def get_kernel_matrix(self, feature_space, reference_features, purpose):
         """
         # We build the LT matrix needed for ADA
         if purpose == "training":
+            self.LT = []
             logger.info("Building LT matrix")
-            intermediates = []
+            computations = []
             for index, feature_space in enumerate(feature_space.items()):
-                intermediates.append(self.get_lt(index))
-            intermediates = dask.compute(*intermediates, scheduler=self.scheduler)
+                computations.append(self.get_lt(index))
 
-            self.LT = np.array(intermediates)
-            lt_time = time.time() - initial_time
-            h, m, s = convert_elapsed_time(lt_time)
-            logger.info(
-                "LT matrix built in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
-            )
+            if self.batch_size is not None:
+                computations = list(get_chunks(computations, self.batch_size))
+                logger.info(
+                    "    The calculations are distributed in {} batches of {} molecules.".format(
+                        len(computations), self.batch_size
+                    )
+                )
+                for chunk in computations:
+                    self.LT += dask.compute(*chunk, scheduler=self.scheduler)
+
+                self.LT = np.array(self.LT)
+                del computations
+                del chunk
+                lt_time = time.time() - initial_time
+                h, m, s = convert_elapsed_time(lt_time)
+                logger.info(
+                    "LT matrix built in {} hours {} minutes {:.2f} seconds.".format(
+                        h, m, s
+                    )
+                )
 
-            print(self.LT)
         return kernel_matrix
 
     def train(self, inputs, targets, data=None):