Merge pull request #91 from nlesc-nano/gpytorch
Add support for GPyTorch
felipeZ committed Jun 18, 2021
2 parents 0888025 + 1b760a8 commit 98ff8b6
Showing 34 changed files with 1,543 additions and 1,110 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -4,9 +4,11 @@
## New
* Add interface to scikit regressors (#85)
* Add interface to HDF5 to store the training results (#88)
+* Add interface to GPyTorch (#90)

## Changed
* Fix prediction functionality (#81)
+* Return predicted and expected values scaled back (#90)

# 0.5.0 [04/05/2021]
## Changed
6 changes: 2 additions & 4 deletions README.rst
@@ -1,14 +1,12 @@

-.. image:: https://api.codacy.com/project/badge/Grade/e410d9da7b654d2caf67481f33ae2de7
-   :target: https://www.codacy.com/app/nlesc-jcer/swan?utm_source=github.com&utm_medium=referral&utm_content=nlesc-nano/swan&utm_campaign=Badge_Grade
-.. image:: https://readthedocs.org/projects/swan/badge/?version=latest
-   :target: https://swan.readthedocs.io/en/latest/?badge=latest
.. image:: https://github.com/nlesc-nano/swan/workflows/build%20with%20conda/badge.svg
   :target: https://github.com/nlesc-nano/swan/actions
.. image:: https://codecov.io/gh/nlesc-nano/swan/branch/main/graph/badge.svg?token=1527ficjjx
   :target: https://codecov.io/gh/nlesc-nano/swan
.. image:: https://zenodo.org/badge/191957101.svg
   :target: https://zenodo.org/badge/latestdoi/191957101
+.. image:: https://readthedocs.org/projects/swan/badge/?version=latest
+   :target: https://swan.readthedocs.io/en/latest/?badge=latest

#####################################
Screening Workflows And Nanomaterials
2 changes: 1 addition & 1 deletion scripts/predict.py → scripts/predict_torch.py
@@ -16,7 +16,7 @@
torch.set_default_dtype(torch.float32)

path_files = Path("data/Carboxylic_acids/CDFT")
PATH_DATA = "smiles.csv" # path_files / "cdft_random_500.csv"
PATH_DATA = path_files / "cdft_random_500.csv"

# Datasets
NUMLABELS = 1
68 changes: 68 additions & 0 deletions scripts/run_gp_models.py
@@ -0,0 +1,68 @@
#!/usr/bin/env python

import logging
from pathlib import Path

import torch

from swan.dataset import FingerprintsData, split_dataset
from swan.modeller import GPModeller
from swan.modeller.models import GaussianProcess
from swan.utils.log_config import configure_logger
from swan.utils.plot import create_confidence_plot, create_scatter_plot

# Starting logger
configure_logger(Path("."))
LOGGER = logging.getLogger(__name__)

# Set float size default
torch.set_default_dtype(torch.float32)

# Path to the DATASET
path_data = Path("tests/files/thousand.csv")

# Training variables
nepoch = 100
properties = [
# "Dissocation energy (nucleofuge)",
# "Dissociation energy (electrofuge)",
# "Electroaccepting power(w+)",
# "Electrodonating power (w-)",
# "Electronegativity (chi=-mu)",
# "Electronic chemical potential (mu)",
# "Electronic chemical potential (mu+)",
# "Electronic chemical potential (mu-)",
# "Electrophilicity index (w=omega)",
# "Global Dual Descriptor Deltaf+",
# "Global Dual Descriptor Deltaf-",
"Hardness (eta)",
# "Hyperhardness (gamma)",
# "Net Electrophilicity",
# "Softness (S)"
]
num_labels = len(properties)

# Datasets
data = FingerprintsData(path_data, properties=properties, sanitize=False)

# Split the data into training and validation set
partition = split_dataset(data.fingerprints, data.labels, frac=(0.8, 0.2))

# Model
model = GaussianProcess(partition.features_trainset, partition.labels_trainset.flatten())

# training and validation
researcher = GPModeller(model, data, use_cuda=False, replace_state=True)
researcher.set_optimizer("Adam", lr=0.5)
researcher.set_scheduler(None)
trained_multivariate, expected_train = researcher.train_model(nepoch, partition)

# Print validation scatterplot
print("validation regression")
multi, label_validset = researcher.validate_model()

create_confidence_plot(
    multi, label_validset.flatten(), properties[0], "validation_scatterplot")

create_scatter_plot(
    multi.mean.reshape(-1, 1), label_validset, properties, "simple_scatterplot")
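
The multi object returned by validate_model above behaves like the multivariate normal distribution that GPyTorch regression models produce. As a minimal sketch, assuming multi is a gpytorch.distributions.MultivariateNormal (an assumption; the class is not shown in this diff), its mean and confidence bounds can also be read off directly:

# Sketch, continuing the script above.
mean = multi.mean.detach().cpu().numpy()      # point predictions
stddev = multi.stddev.detach().cpu().numpy()  # predictive standard deviation
lower, upper = multi.confidence_region()      # approximately mean -/+ 2 stddev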
36 changes: 19 additions & 17 deletions scripts/run_scikit_models.py
@@ -11,6 +11,7 @@
from swan.dataset import FingerprintsData
from swan.modeller import SKModeller
from swan.utils.log_config import configure_logger
+from swan.utils.plot import create_scatter_plot

configure_logger(Path("."))

@@ -27,26 +28,26 @@

# Training variables
properties = [
"Dissocation energy (nucleofuge)",
"Dissociation energy (electrofuge)",
"Electroaccepting power(w+)",
"Electrodonating power (w-)",
"Electronegativity (chi=-mu)",
"Electronic chemical potential (mu)",
"Electronic chemical potential (mu+)",
"Electronic chemical potential (mu-)",
"Electrophilicity index (w=omega)",
"Global Dual Descriptor Deltaf+",
"Global Dual Descriptor Deltaf-",
# "Dissocation energy (nucleofuge)",
# "Dissociation energy (electrofuge)",
# "Electroaccepting power(w+)",
# "Electrodonating power (w-)",
# "Electronegativity (chi=-mu)",
# "Electronic chemical potential (mu)",
# "Electronic chemical potential (mu+)",
# "Electronic chemical potential (mu-)",
# "Electrophilicity index (w=omega)",
# "Global Dual Descriptor Deltaf+",
# "Global Dual Descriptor Deltaf-",
"Hardness (eta)",
"Hyperhardness (gamma)",
"Net Electrophilicity",
"Softness (S)"
# "Hyperhardness (gamma)",
# "Net Electrophilicity",
# "Softness (S)"
]


def run_all(path_data: str, output_file: str):
-    nruns = 3
+    nruns = 1
    models = ["gaussian_process"] # ["decision_tree", "svm"]
    rvalues = {}
    for p in properties:
@@ -65,10 +66,11 @@ def run_all(path_data: str, output_file: str):

def run_scikit_model(name_model: str, data: FingerprintsData):
    parameters = dict_parameters[name_model]
-    modeller = SKModeller(data, name_model, **parameters)
+    modeller = SKModeller(name_model, data, **parameters)
    modeller.train_model()
    predicted, expected = modeller.validate_model()
-    reg = stats.linregress(predicted, expected.flatten())
+    create_scatter_plot(predicted, expected, ["Hardness (eta)"], "scikit_validation")
+    reg = stats.linregress(predicted.flatten(), expected.flatten())
    return reg.rvalue


4 changes: 2 additions & 2 deletions scripts/run_model.py → scripts/run_torch_models.py
@@ -4,7 +4,7 @@
from pathlib import Path
import torch
from swan.dataset import TorchGeometricGraphData, FingerprintsData, DGLGraphData
-from swan.modeller import Modeller
+from swan.modeller import TorchModeller
from swan.modeller.models import FingerprintFullyConnected, MPNN, SE3Transformer
from swan.utils.log_config import configure_logger
from swan.utils.plot import create_scatter_plot
@@ -70,7 +70,7 @@

# training and validation
torch.set_default_dtype(torch.float32)
-researcher = Modeller(net, data, use_cuda=False)
+researcher = TorchModeller(net, data, use_cuda=False)
researcher.set_optimizer("Adam", lr=0.0005)
researcher.set_scheduler("StepLR", 0.1)
researcher.data.scale_labels()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion setup.py
@@ -37,7 +37,7 @@
    install_requires=[
        'e3nn@git+https://github.com/e3nn/e3nn@main',
        'equivariant_attention@git+https://github.com/nlesc-nano/se3-transformer-public@dev',
-        'h5py', 'mendeleev', 'numpy', 'pandas', 'pyyaml', 'scikit-learn',
+        'gpytorch', 'h5py', 'mendeleev', 'numpy', 'pandas', 'pyyaml', 'scikit-learn',
        'scipy', 'seaborn', 'schema',
        'torch-geometric'],

12 changes: 6 additions & 6 deletions swan/__init__.py
@@ -1,11 +1,11 @@
"""Swan API."""
from .__version__ import __version__

-from .modeller import Modeller, SKModeller
-from swan.dataset import TorchGeometricGraphData, FingerprintsData, DGLGraphData
-from .modeller.models import FingerprintFullyConnected, MPNN, SE3Transformer
+from .dataset import DGLGraphData, FingerprintsData, TorchGeometricGraphData
+from .modeller import SKModeller, TorchModeller
+from .modeller.models import (MPNN, FingerprintFullyConnected, GaussianProcess,
+                              SE3Transformer)

__all__ = [
"__version__", "Modeller", "SKModeller",
"__version__", "TorchModeller", "SKModeller",
"TorchGeometricGraphData", "FingerprintsData", "DGLGraphData",
"FingerprintFullyConnected", "MPNN", "SE3Transformer"]
"FingerprintFullyConnected", "MPNN", "SE3Transformer", "GaussianProcess"]
5 changes: 3 additions & 2 deletions swan/dataset/__init__.py
@@ -1,5 +1,6 @@
+from .dgl_graph_data import DGLGraphData
from .fingerprints_data import FingerprintsData
+from .splitter import split_dataset, load_split_dataset
from .torch_geometric_graph_data import TorchGeometricGraphData
-from .dgl_graph_data import DGLGraphData

-__all__ = ["DGLGraphData", "FingerprintsData", "TorchGeometricGraphData"]
+__all__ = ["DGLGraphData", "FingerprintsData", "TorchGeometricGraphData", "load_split_dataset", "split_dataset"]
52 changes: 52 additions & 0 deletions swan/dataset/splitter.py
@@ -0,0 +1,52 @@
from typing import Generic, NamedTuple, Tuple, TypeVar, Union

import numpy as np
import torch

from ..state import StateH5
from ..type_hints import PathLike

T_co = TypeVar('T_co', bound=Union[np.ndarray, torch.Tensor], covariant=True)


class SplitDataset(NamedTuple, Generic[T_co]):
    indices: np.ndarray      # Shuffled indices to split the data
    ntrain: int              # Number of points used for training
    features_trainset: T_co  # Features for training
    features_validset: T_co  # Features for validation
    labels_trainset: T_co    # Labels for training
    labels_validset: T_co    # Labels for validation


def split_dataset(features: T_co, labels: T_co, frac: Tuple[float, float] = (0.8, 0.2)) -> SplitDataset:
    """Split the fingerprint dataset into a training and validation set.

    Parameters
    ----------
    features
        Dataset features
    labels
        Dataset labels
    frac
        fraction to divide the dataset, by default (0.8, 0.2)
    """
    # Generate random indices to train and validate the model
    size = len(features)
    indices = np.arange(size)
    np.random.shuffle(indices)

    ntrain = int(size * frac[0])
    features_trainset = features[indices[:ntrain]]
    features_validset = features[indices[ntrain:]]
    labels_trainset = labels[indices[:ntrain]]
    labels_validset = labels[indices[ntrain:]]

    return SplitDataset(indices, ntrain, features_trainset, features_validset, labels_trainset, labels_validset)


def load_split_dataset(state_file: PathLike = "swan_state.h5"):
    """Load the split data used for training from the state file."""
    state = StateH5(state_file)
    return SplitDataset(*[
        state.retrieve_data(x) for x in (
            'indices', 'ntrain', 'features_trainset', 'features_validset', 'labels_trainset', 'labels_validset')])
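
A short usage sketch of the two helpers above. The CSV path and property name are borrowed from scripts/run_gp_models.py; reloading assumes an earlier training run has already written the split to swan_state.h5:

from pathlib import Path

from swan.dataset import FingerprintsData, load_split_dataset, split_dataset

# Fingerprints and labels for a single property, as in run_gp_models.py
data = FingerprintsData(
    Path("tests/files/thousand.csv"), properties=["Hardness (eta)"], sanitize=False)

# Shuffle and split: 80% training, 20% validation
partition = split_dataset(data.fingerprints, data.labels, frac=(0.8, 0.2))
print(partition.ntrain, len(partition.features_validset))

# Recover exactly the same split later from the saved state file
restored = load_split_dataset("swan_state.h5")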
5 changes: 3 additions & 2 deletions swan/modeller/__init__.py
@@ -1,4 +1,5 @@
-from .modeller import Modeller
+from .gp_modeller import GPModeller
from .scikit_modeller import SKModeller
+from .torch_modeller import TorchModeller

-_all__ = ["Modeller", "SKModeller"]
+__all__ = ["GPModeller", "SKModeller", "TorchModeller"]
3 changes: 2 additions & 1 deletion swan/modeller/base_modeller.py
@@ -17,11 +17,12 @@ class BaseModeller(Generic[T_co]):
"""Base class for the modellers."""

def __init__(self, data: SwanDataBase, replace_state: bool) -> None:
self.data = data
self.state = StateH5(replace_state=replace_state)
self.smiles = data.dataframe.smiles.to_numpy()

@abc.abstractmethod
def train_model(self, frac: Tuple[float, float] = (0.8, 0.2), **kwargs):
def train_model(self, nepoch: int, frac: Tuple[float, float] = (0.8, 0.2), **kwargs):
"""Train the model using the given data.
Parameters
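
For orientation, a minimal sketch of a subclass honouring the new abstract signature; the class name and body are illustrative only, and concrete modellers may take different arguments (GPModeller.train_model, for instance, is called above with a precomputed partition rather than a fraction):

from typing import Tuple

from swan.modeller.base_modeller import BaseModeller


class IllustrativeModeller(BaseModeller):
    """Hypothetical subclass; only the train_model signature matters here."""

    def train_model(self, nepoch: int, frac: Tuple[float, float] = (0.8, 0.2), **kwargs):
        # The number of epochs is now an explicit part of the abstract contract.
        ...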
