Skip to content

Commit

Permalink
Improved features and models modules.
Browse files: browse the repository at this point in the history
- Pyflakes cleanup.
- Black cleanup.
- features.coulombmatrix module improved.
- models.kernelridge: improved efficiency of KernelRidge regression
  class.
  • Loading branch information
muammar committed Jan 16, 2020
1 parent 94b452e commit 823f188
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 27 deletions.
1 change: 0 additions & 1 deletion ml4chem/features/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import dask
import torch
import numpy as np
from abc import ABC, abstractmethod
Expand Down
15 changes: 4 additions & 11 deletions ml4chem/features/coulombmatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import os
import time
import torch
import numpy as np
import pandas as pd
from collections import OrderedDict
from dscribe.descriptors import CoulombMatrix as CoulombMatrixDscribe
Expand All @@ -20,7 +19,7 @@
class CoulombMatrix(AtomisticFeatures, CoulombMatrixDscribe):
"""Coulomb Matrix features
Parameters
----------
filename : str
Expand Down Expand Up @@ -81,7 +80,6 @@ def __init__(
# This is a very general way of not forgetting to save variables
_params = vars()


# Delete useless variables
delete = [
"self",
Expand All @@ -92,7 +90,7 @@ def __init__(
"value",
"keys",
"batch_size",
"__class__"
"__class__",
]

for param in delete:
Expand Down Expand Up @@ -246,19 +244,12 @@ def calculate(self, images=None, purpose="training", data=None, svm=False):
feature_space = []

if svm and purpose == "training":
logger.info("Building array with reference space.")
reference_space = []

for i, image in enumerate(images.items()):
restacked = client.submit(
self.restack_image, *(i, image, scaled_feature_space, svm)
)

# image = (hash, ase_image) -> tuple
# for atom in image[1]:
# restacked_atom = client.submit(self.restack_atom, *(i, atom, scaled_feature_space))
# reference_space.append(restacked_atom)

feature_space.append(restacked)

elif svm is False and purpose == "training":
Expand Down Expand Up @@ -288,9 +279,11 @@ def calculate(self, images=None, purpose="training", data=None, svm=False):

if svm and purpose == "training":
# FIXME This might need to be improved
logger.info("Building array with reference space.")
hashes, reference_space = list(zip(*feature_space))
del hashes
reference_space = list(itertools.chain.from_iterable(reference_space))
logger.info("Finished reference space.")

feature_space = OrderedDict(feature_space)

Expand Down
3 changes: 1 addition & 2 deletions ml4chem/features/gaussian.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
import os
import time
import torch
import dask.array as da
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -142,7 +141,7 @@ def __init__(
"value",
"keys",
"batch_size",
"__class__"
"__class__",
]

for param in delete:
Expand Down
37 changes: 24 additions & 13 deletions ml4chem/models/kernelridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ def prepare_model(
logger.info("Computing Kernel Matrix...")
# We start populating computations with delayed functions to
# operate with dask's scheduler
client = dask.distributed.get_client()
kernel_matrix = self.get_kernel_matrix(
feature_space, reference_features, purpose=purpose
)
Expand Down Expand Up @@ -259,15 +258,14 @@ def get_kernel_matrix(self, feature_space, reference_features, purpose):
"""
initial_time = time.time()

client = dask.distributed.get_client()
call = {"exponential": exponential, "laplacian": laplacian, "rbf": rbf}

if self.batch_size is None:
chunks = [feature_space]
else:
chunks = list(get_chunks(feature_space, self.batch_size))
logger.info(
" The calculations are distributed in {} batches of {}.".format(
" The calculations are distributed in {} batches of {} molecules.".format(
len(chunks), self.batch_size
)
)
Expand Down Expand Up @@ -342,20 +340,33 @@ def get_kernel_matrix(self, feature_space, reference_features, purpose):
"""
# We build the LT matrix needed for ADA
if purpose == "training":
self.LT = []
logger.info("Building LT matrix")
intermediates = []
computations = []
for index, feature_space in enumerate(feature_space.items()):
intermediates.append(self.get_lt(index))
intermediates = dask.compute(*intermediates, scheduler=self.scheduler)
computations.append(self.get_lt(index))

self.LT = np.array(intermediates)
lt_time = time.time() - initial_time
h, m, s = convert_elapsed_time(lt_time)
logger.info(
"LT matrix built in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
)
if self.batch_size is not None:
computations = list(get_chunks(computations, self.batch_size))
logger.info(
" The calculations are distributed in {} batches of {} molecules.".format(
len(computations), self.batch_size
)
)
for chunk in computations:
self.LT += dask.compute(*chunk, scheduler=self.scheduler)

self.LT = np.array(self.LT)
del computations
del chunk
lt_time = time.time() - initial_time
h, m, s = convert_elapsed_time(lt_time)
logger.info(
"LT matrix built in {} hours {} minutes {:.2f} seconds.".format(
h, m, s
)
)

print(self.LT)
return kernel_matrix

def train(self, inputs, targets, data=None):
Expand Down

0 comments on commit 823f188

Please sign in to comment.