Merge pull request #86 from nlesc-nano/dev

add interface to scikit regressors
nlesc-nano · May 20, 2021 · 282e86e · 282e86e
2 parents 6cb80e6 + 422f884
commit 282e86e
Show file tree

Hide file tree

Showing 7 changed files with 164 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,16 @@
 # Change Log
 
+# 0.6.0 [Unreleased]
+## New
+* Add interface to scikit regressors (#85)
+
+## Changed
+* Fix prediction functionality (#81)
+
 # 0.5.0 [04/05/2021]
 ## Changed
 * Fix graph neural network implementation (#59)
 * Rename the graph neural network to MPNN (message passing NN)
-* Fix prediction functionality (#81)
 
 ## New
 * Interface to [se3-transformer](https://www.dgl.ai/pages/start.html) (#57)

diff --git a/swan/__version__.py b/swan/__version__.py
@@ -1 +1 @@
-__version__ = '0.5.0'
+__version__ = '0.6.0'
diff --git a/swan/dataset/swan_data_base.py b/swan/dataset/swan_data_base.py
@@ -113,15 +113,15 @@ def clean_dataframe(self, sanitize: bool) -> None:
         self.dataframe.reset_index(drop=True, inplace=True)
 
     def create_data_loader(self,
-                           frac=[0.8, 0.2],
+                           frac: Tuple[float, float] = (0.8, 0.2),
                            batch_size: int = 64) -> None:
         """create the train/valid data loaders
 
         Parameters
         ----------
-        frac : list, optional
+        frac
             fraction to divide the dataset, by default [0.8, 0.2]
-        batch_size : int, optional
+        batch_size
             batchsize, by default 64
         """
         ntotal = len(self.dataset)

diff --git a/swan/modeller/__init__.py b/swan/modeller/__init__.py
@@ -1,2 +1,4 @@
 from .modeller import Modeller
-_all__ = ["Modeller"]
+from .scikit_modeller import SKModeller
+
+_all__ = ["Modeller", "SKModeller"]
diff --git a/swan/modeller/modeller.py b/swan/modeller/modeller.py
@@ -2,7 +2,8 @@
 
 import logging
 from pathlib import Path
-from typing import List, Tuple
+from swan.dataset.fingerprints_data import PathLike
+from typing import Tuple
 
 import torch
 from torch import Tensor, nn
@@ -104,7 +105,7 @@ def set_scheduler(self, name, *args, **kwargs) -> None:
 
     def train_model(self,
                     nepoch: int,
-                    frac: List[float] = [0.8, 0.2],
+                    frac: Tuple[float, float] = (0.8, 0.2),
                     batch_size: int = 64) -> Tuple[Tensor, Tensor]:
         """Train the model
 
@@ -245,7 +246,7 @@ def save_model(self,
                 'loss': loss
             }, path)
 
-    def load_model(self, filename) -> None:
+    def load_model(self, filename: PathLike) -> None:
         """Load the model from the state file."""
         checkpoint = torch.load(filename)
         self.network.load_state_dict(checkpoint['model_state_dict'])

diff --git a/swan/modeller/scikit_modeller.py b/swan/modeller/scikit_modeller.py
@@ -0,0 +1,110 @@
+"""Module to create statistical models using scikit learn."""
+
+import logging
+import pickle
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+import numpy as np
+from sklearn import gaussian_process, svm, tree
+
+from ..dataset.fingerprints_data import FingerprintsData
+
+PathLike = Union[str, Path]
+
+LOGGER = logging.getLogger(__name__)
+
+
+class SKModeller:
+    """Create statistical models using the scikit learn library."""
+
+    def __init__(self, data: FingerprintsData, name: str, **kwargs):
+        """Class constructor.
+
+        Parameters
+        ----------
+        data
+            FingerprintsData object containing the dataset
+        name
+            scikit learn model to use
+        """
+        self.fingerprints = data.fingerprints.numpy()
+        self.labels = data.dataset.labels.numpy()
+        self.path_model = "swan_skmodeller.pkl"
+
+        supported_models = {
+            "decisiontree": tree.DecisionTreeRegressor,
+            "svm": svm.SVR,
+            "gaussianprocess": gaussian_process.GaussianProcessRegressor
+        }
+
+        if name.lower() in supported_models:
+            self.model = supported_models[name.lower()](**kwargs)
+        else:
+            raise RuntimeError(f"There is not model name: {name}")
+
+        LOGGER.info(f"Created {name} model")
+
+    def split_data(self, frac: Tuple[float, float]):
+        """Split the data into a training and validation set.
+
+        Parameters
+        ----------
+        frac
+            fraction to divide the dataset, by default [0.8, 0.2]
+        """
+        # Generate random indices to train and validate the model
+        size = len(self.fingerprints)
+        indices = np.arange(size)
+        np.random.shuffle(indices)
+
+        ntrain = int(size * frac[0])
+        self.features_trainset = self.fingerprints[indices[:ntrain]]
+        self.features_validset = self.fingerprints[indices[ntrain:]]
+        self.labels_trainset = self.labels[indices[:ntrain]]
+        self.labels_validset = self.labels[indices[ntrain:]]
+
+    def train_model(self, frac: Tuple[float, float] = (0.8, 0.2)):
+        """Train the model using the given data.
+
+        Parameters
+        ----------
+        frac
+            fraction to divide the dataset, by default [0.8, 0.2]
+        """
+        self.split_data(frac)
+        self.model.fit(self.features_trainset, self.labels_trainset)
+        self.save_model()
+
+    def save_model(self):
+        """Store the trained model."""
+        with open(self.path_model, 'wb') as handler:
+            pickle.dump(self.model, handler)
+
+    def validate_model(self) -> Tuple[np.ndarray, np.ndarray]:
+        """Check the model prediction power."""
+        predicted = self.model.predict(self.features_validset)
+        expected = self.labels_validset
+        score = self.model.score(self.features_validset, expected)
+        LOGGER.info(f"Validation R^2 score: {score}")
+        return predicted, expected
+
+    def load_model(self, path_model: Optional[PathLike]) -> None:
+        """Load the model from the state file."""
+        path_model = self.path_model if path_model is None else path_model
+        with open(path_model, 'rb') as handler:
+            self.model = pickle.load(handler)
+
+    def predict(self, inp_data: np.ndarray) -> np.ndarray:
+        """Used the previously trained model to predict properties.
+
+        Parameters
+        ----------
+        inp_data
+            Matrix containing a given fingerprint for each row
+
+        Returns
+        -------
+        Array containing the predicted results
+        """
+        return self.model.predict(inp_data)
diff --git a/tests/test_scikit_models.py b/tests/test_scikit_models.py
@@ -0,0 +1,36 @@
+import numpy as np
+from scipy import stats
+from sklearn.gaussian_process.kernels import ConstantKernel
+
+from swan.dataset import FingerprintsData
+from swan.modeller import SKModeller
+
+from .utils_test import PATH_TEST
+
+DATA = FingerprintsData(PATH_TEST / "thousand.csv", properties=["gammas"], sanitize=False)
+DATA.scale_labels()
+
+
+def run_test(model: str, **kwargs):
+    """Run the training and validation step for the given model."""
+    modeller = SKModeller(DATA, model)
+    modeller.train_model()
+    predicted, expected = modeller.validate_model()
+    reg = stats.linregress(predicted.flatten(), expected.flatten())
+    assert not np.isnan(reg.rvalue)
+
+
+def test_decision_tree():
+    """Check the interface to the Decisiontree class."""
+    run_test("decisiontree")
+
+
+def test_svm():
+    """Check the interface to the support vector machine."""
+    run_test("svm")
+
+
+def test_gaussian_process():
+    """Check the interface to the support vector machine."""
+    kernel = ConstantKernel(constant_value=10)
+    run_test("gaussianprocess", kernel=kernel)