Merge pull request #92 from nlesc-nano/dev
Minor clean
felipeZ committed Jun 18, 2021
2 parents 8615ec8 + 192d5d4 commit b75afea
Showing 10 changed files with 61 additions and 35 deletions.
1 change: 0 additions & 1 deletion scripts/predict_gp.py
@@ -30,7 +30,6 @@
researcher.load_model("swan_chk.pt")

fingers = data.fingerprints
print("shape fingers: ", fingers.shape)
predicted = researcher.predict(fingers)
df = pd.DataFrame(
{"smiles": data.dataframe.smiles, "mean": predicted.mean, "lower": predicted.lower, "upper": predicted.upper})
22 changes: 22 additions & 0 deletions swan/modeller/base_modeller.py
@@ -67,3 +67,25 @@ def load_model(self, path_model: Optional[PathLike]) -> None:
def save_model(self, *args, **kwargs):
"""Store the trained model."""
raise NotImplementedError

def store_trainset_in_state(self, indices: T_co, ntrain: int, store_features: bool = True) -> None:
"""Store features, indices, smiles, etc. into the state file."""
self.state.store_array("indices", indices, "int")
self.state.store_array("ntrain", ntrain, "int")
self.state.store_array("smiles_train", self.smiles[indices[:ntrain]], dtype="str")
self.state.store_array("smiles_validate", self.smiles[indices[ntrain:]], dtype="str")

if isinstance(self.labels_trainset, torch.Tensor):
self.state.store_array("labels_trainset", self.labels_trainset.numpy())
self.state.store_array("labels_validset", self.labels_validset.numpy())
else:
self.state.store_array("labels_trainset", self.labels_trainset)
self.state.store_array("labels_validset", self.labels_validset)

if store_features:
if isinstance(self.features_trainset, torch.Tensor):
self.state.store_array("features_trainset", self.features_trainset.numpy())
self.state.store_array("features_validset", self.features_validset.numpy())
else:
self.state.store_array("features_trainset", self.features_trainset)
self.state.store_array("features_validset", self.features_validset)
13 changes: 1 addition & 12 deletions swan/modeller/gp_modeller.py
@@ -68,16 +68,7 @@ def split_data(self, partition: SplitDataset) -> None:
self.labels_validset = partition.labels_validset
warnings.warn("The labels have not been scaled. Is this the intended behavior?", UserWarning)

indices = partition.indices
ntrain = partition.ntrain
self.state.store_array("smiles_train", self.smiles[indices[:ntrain]], dtype="str")
self.state.store_array("smiles_validate", self.smiles[indices[ntrain:]], dtype="str")
self.state.store_array("features_trainset", self.features_trainset.numpy())
self.state.store_array("features_validset", self.features_validset.numpy())
self.state.store_array("labels_trainset", self.labels_trainset.numpy())
self.state.store_array("labels_validset", self.labels_validset.numpy())
self.state.store_array("indices", indices, "int")
self.state.store_array("ntrain", ntrain, "int")
self.store_trainset_in_state(partition.indices, partition.ntrain)

def train_model(self,
nepoch: int,
@@ -174,8 +165,6 @@ def predict(self, inp_data: Tensor) -> GPMultivariate:
self.network.likelihood.eval()

with torch.no_grad(), gp.settings.fast_pred_var():
first = self.network(inp_data)
print(first.mean)
output = self.network.likelihood(self.network(inp_data))
return self._create_result_object(output)

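With the leftover debug evaluation removed, predict runs the network once through the likelihood and wraps the output in a GPMultivariate. A sketch of the call site, following scripts/predict_gp.py above (variable names are taken from that script):

    # Sketch of the prediction path after the cleanup.
    fingers = data.fingerprints              # model inputs, as in predict_gp.py
    predicted = researcher.predict(fingers)  # returns a GPMultivariate
    # The result bundles the posterior mean with a confidence band:
    print(predicted.mean, predicted.lower, predicted.upper)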
7 changes: 2 additions & 5 deletions swan/modeller/scikit_modeller.py
@@ -69,10 +69,7 @@ def split_data(self, frac: Tuple[float, float]) -> None:
self.labels_validset = partition.labels_validset

# Split the smiles using the same partition as the features
indices = partition.indices
ntrain = partition.ntrain
self.state.store_array("smiles_train", self.smiles[indices[:ntrain]], dtype="str")
self.state.store_array("smiles_validate", self.smiles[indices[ntrain:]], dtype="str")
self.store_trainset_in_state(partition.indices, partition.ntrain)

def save_model(self):
"""Store the trained model."""
@@ -105,7 +102,7 @@ def predict(self, inp_data: np.ndarray) -> np.ndarray:
-------
Array containing the predicted results
"""
return self.model.predict(inp_data)
return self.inverse_transform(self.model.predict(inp_data))

def inverse_transform(self, arr: np.ndarray) -> np.ndarray:
"""Unscale ``arr`` using the fitted scaler.
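predict now undoes the label scaling before returning, so callers get values in the original units instead of scaled ones. A sketch of the round trip using scikit-learn's StandardScaler (the concrete scaler is an assumption; the modeller only needs inverse_transform to reverse whatever scaling was fitted):

    # Why the extra call matters: the model is fitted on scaled labels,
    # so its raw predictions live in scaled space.
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    labels = np.array([[1.0], [2.0], [3.0]])
    scaler = StandardScaler().fit(labels)
    scaled = scaler.transform(labels)            # what the model trains on
    restored = scaler.inverse_transform(scaled)  # what predict should return
    assert np.allclose(restored, labels)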
7 changes: 3 additions & 4 deletions swan/modeller/torch_modeller.py
@@ -121,10 +121,9 @@ def split_data(self, frac: Tuple[float, float], batch_size: int):
"""
# create the dataloader
indices_train, indices_validate = self.data.create_data_loader(frac=frac, batch_size=batch_size)

# Store the smiles used for training and validation
self.state.store_array("smiles_train", self.smiles[indices_train], dtype="str")
self.state.store_array("smiles_validate", self.smiles[indices_validate], dtype="str")
self.labels_trainset = self.data.labels[indices_train]
self.labels_validset = self.data.labels[indices_validate]
self.store_trainset_in_state(np.concatenate((indices_train, indices_validate)), len(indices_train), store_features=False)

def train_model(self,
nepoch: int,
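Because store_trainset_in_state slices indices[:ntrain] for training and indices[ntrain:] for validation, the concatenation order and the ntrain argument must agree. A sketch of the invariant (sizes hypothetical):

    # Training indices must come first in the concatenated array, and
    # ntrain must equal their count for the slices to line up.
    import numpy as np

    indices_train = np.arange(0, 800)       # hypothetical split
    indices_validate = np.arange(800, 1000)
    indices = np.concatenate((indices_train, indices_validate))
    ntrain = len(indices_train)
    assert (indices[:ntrain] == indices_train).all()
    assert (indices[ntrain:] == indices_validate).all()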
4 changes: 2 additions & 2 deletions swan/state/state.py
@@ -1,7 +1,7 @@
"""Module to interact with HDF5."""

from pathlib import Path
from typing import Any, Optional
from typing import Any, List, Optional, Union

import h5py
import numpy as np
@@ -22,7 +22,7 @@ def __init__(self, path_hdf5: Optional[PathLike] = None, replace_state: bool = False):
if not self.path.exists():
self.path.touch()

def has_data(self, data: ArrayLike) -> bool:
def has_data(self, data: Union[str, List[str]]) -> bool:
"""Search if the node exists in the HDF5 file.
Parameters
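The narrowed annotation documents that has_data accepts either a single node name or a list of names. A sketch of both call forms, consistent with the lookups added in tests/test_state.py (whether the list form reports one combined result or per-node results is not shown in this hunk):

    # Both call forms permitted by the new annotation; state is an
    # existing StateH5 instance.
    state.has_data("smiles")                         # single node name
    state.has_data(["smiles", "features_trainset"])  # list of node names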
2 changes: 1 addition & 1 deletion tests/test_mpnn.py
@@ -18,7 +18,7 @@ def setUp(self):
self.data = PATH_TEST / "thousand.csv"
self.data = TorchGeometricGraphData(self.data, properties=["Hardness (eta)"])
self.net = MPNN()
self.modeller = TorchModeller(self.net, self.data)
self.modeller = TorchModeller(self.net, self.data, replace_state=True)

def test_train_data_mpnn(self):

29 changes: 22 additions & 7 deletions tests/test_scikit_models.py
@@ -7,30 +7,45 @@

from .utils_test import PATH_TEST

DATA = FingerprintsData(PATH_TEST / "thousand.csv", properties=["Hardness (eta)"], sanitize=False)
DATA.scale_labels()


def run_test(model: str, **kwargs):
"""Run the training and validation step for the given model."""
modeller = SKModeller(model, DATA)
data = FingerprintsData(PATH_TEST / "thousand.csv", properties=["Hardness (eta)"], sanitize=False)
data.scale_labels()
modeller = SKModeller(model, data)
modeller.train_model()
predicted, expected = modeller.validate_model()
reg = stats.linregress(predicted.flatten(), expected.flatten())
assert not np.isnan(reg.rvalue)


def run_prediction(model: str):
"""Check the prediction functionality."""
data = FingerprintsData(PATH_TEST / "smiles.csv", sanitize=False)
modeller = SKModeller(model, data)
modeller.load_model("swan_skmodeller.pkl")
modeller.data.load_scale()
predicted = modeller.predict(data.fingerprints)
assert not np.isnan(predicted).all()


def test_decision_tree():
"""Check the interface to the Decisiontree class."""
run_test("decision_tree")
model = "decision_tree"
run_test(model)
run_prediction(model)


def test_svm():
"""Check the interface to the support vector machine."""
run_test("svm")
model = "svm"
run_test(model)
run_prediction(model)


def test_gaussian_process():
"""Check the interface to the support vector machine."""
kernel = ConstantKernel(constant_value=10)
run_test("gaussian_process", kernel=kernel)
model = "gaussian_process"
run_test(model, kernel=kernel)
run_prediction(model)
4 changes: 2 additions & 2 deletions tests/test_se3_transformer.py
@@ -15,12 +15,12 @@
torch.set_default_dtype(torch.float32)

CSV_FILE = PATH_TEST / "thousand.csv"
DATA = DGLGraphData(CSV_FILE, properties=["Hardness (eta)"])
DATA = DGLGraphData(CSV_FILE, properties=["Hardness (eta)"], optimize_molecule=True)


def run_modeller(net: torch.nn.Module):
"""Run a given model."""
modeller = TorchModeller(net, DATA, use_cuda=False, replace_state=False)
modeller = TorchModeller(net, DATA, use_cuda=False, replace_state=True)

modeller.data.scale_labels()
modeller.train_model(nepoch=1, batch_size=64)
7 changes: 6 additions & 1 deletion tests/test_state.py
@@ -25,11 +25,14 @@ def test_state(tmp_path: Path, capsys):
out, _ = capsys.readouterr()
assert "Available data" in out

assert not any(state.has_data(f"non_existing_{i}") for i in range(2))


def test_state_unknown_key(tmp_path: Path):
"""Check that an error is raised if there is not data."""
path_hdf5 = tmp_path / "swan_state.h5"
state = StateH5(path_hdf5)
path_hdf5.touch()
state = StateH5(path_hdf5, replace_state=True)

with pytest.raises(KeyError):
state.retrieve_data("nonexisting property")
@@ -46,3 +49,5 @@ def store_smiles_in_state(tmp_path: Path):
state.store_array("smiles", smiles, "str")
data = [x.decode() for x in state.retrieve_data("smiles")]
assert data == smiles.tolist()

