Merge pull request #106 from GENESIS-EFRC/dev
Add support for multiprocessing via ray, add builders and pydantic models, bug fixes
mattmcdermott committed Mar 31, 2022
2 parents bf6a458 + f11c27f commit b0b6536
Showing 30 changed files with 5,843 additions and 880 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -127,3 +127,7 @@ notebooks/metadata.json.gz
 notebooks/pathways.json.gz
 
 notebooks/rxns.json.gz
+
+notebooks/network-Copy1.ipynb
+
+notebooks/entries.json.gz
12 changes: 11 additions & 1 deletion README.md
@@ -20,7 +20,14 @@ conda activate gt
 conda install -c conda-forge graph-tool
 ```
 
-To install an editable version of the rxn-network code, simply download (clone) the
+The rxn-network package can then be installed via pip:
+
+```properties
+pip install reaction-network
+```
+
+## For developers:
+To install an editable version of the rxn-network code, simply clone the
 code from this repository, navigate to its directory, and then run the
 following command to install the requirements:
 
@@ -29,6 +36,9 @@ pip install -r requirements.txt
 pip install -e .
 ```
 
+Note that this only works if the repository is cloned from GitHub, such that it
+contains the proper metadata.
+
 # Tutorial notebooks
 
 The `notebooks` folder contains two demonstration notebooks:
Binary file added notebooks/Bi-Fe-O-K-F.png
1,739 changes: 1,389 additions & 350 deletions notebooks/enumerators.ipynb


2,842 changes: 2,842 additions & 0 deletions notebooks/network-Copy1.ipynb


1,025 changes: 608 additions & 417 deletions notebooks/network.ipynb


2 changes: 1 addition & 1 deletion requirements-docs.txt
@@ -1,4 +1,4 @@
-mkdocs==1.2.3
+mkdocs==1.3.0
 mkdocs-autorefs==0.3.1
 mkdocs-coverage==0.2.5
 mkdocs-jupyter==0.20.0
4 changes: 2 additions & 2 deletions requirements-testing.txt
@@ -1,10 +1,10 @@
 pre-commit==2.17.0
-pytest==6.2.5
+pytest==7.0.1
 pytest-cov==3.0.0
 pycodestyle==2.8.0
 pydocstyle==6.1.1
 flake8==4.0.1
 mypy==0.931
 mypy-extensions==0.4.3
 pylint==2.12.2
-black==22.1.0
+black==22.3.0
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,3 +3,4 @@ numpy==1.22.2
 pymatgen==2022.2.10
 fireworks==2.0.2
 maggma==0.41.0
+ray==1.10.0
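
`ray` is the new dependency backing the parallelized calculators and builders added below. A minimal smoke test of the ray primitives this PR uses (illustrative only, not part of this commit):

```python
import ray

# Start (or reuse) a local ray instance; ignore_reinit_error makes repeated
# calls safe, e.g. when re-running a notebook cell.
ray.init(ignore_reinit_error=True)


@ray.remote
def square(x: int) -> int:
    return x * x


# .remote() schedules each task and returns an ObjectRef immediately;
# ray.get() blocks until all results are ready.
print(ray.get([square.remote(i) for i in range(4)]))  # [0, 1, 4, 9]
```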
3 changes: 2 additions & 1 deletion setup.cfg
@@ -42,5 +42,6 @@ disable=unsubscriptable-object,
 too-many-statements,
 too-many-boolean-expressions,
 R0902,
+R0903,
 R0913,
-R0914
+R0914,
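
For reference, the pylint codes involved: R0903 (too-few-public-methods) is newly disabled here, while R0902 (too-many-instance-attributes), R0913 (too-many-arguments), and R0914 (too-many-locals) were already in the disable list; the change also adds a trailing comma after R0914.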
Empty file.
196 changes: 196 additions & 0 deletions src/rxn_network/builders/retrosynthesis.py
@@ -0,0 +1,196 @@
""" Builder(s) for generating synthesis recipe documents."""
from datetime import datetime
from math import ceil
from typing import Any, Dict, Optional

from maggma.builders import Builder
from maggma.core import Store
from maggma.stores import GridFSStore
from maggma.utils import grouper
from monty.json import MontyDecoder, jsanitize
from pymatgen.core.composition import Composition

from rxn_network.core.cost_function import CostFunction
from rxn_network.utils.models import (
ComputedSynthesisRecipe,
ComputedSynthesisRecipesDoc,
)


class SynthesisRecipeBuilder(Builder):
"""
Build a synthesis recipe document from the reaction results from EnumeratorWF.
"""

def __init__(
self,
tasks: Store,
recipes: Store,
cf: CostFunction,
tasks_fs: Optional[GridFSStore] = None,
recipes_fs: Optional[GridFSStore] = None,
query: Optional[Dict] = None,
**kwargs,
):
self.tasks = tasks
self.tasks_fs = tasks_fs
self.recipes = recipes
self.recipes_fs = recipes_fs
self.cf = cf
self.query = query
self.kwargs = kwargs

sources = [tasks]
targets = [recipes]

if tasks_fs:
sources.append(tasks_fs)
if recipes_fs:
targets.append(recipes_fs)

super().__init__(sources=sources, targets=targets, **kwargs)

def ensure_indexes(self):
"""
Ensures indexes for the tasks, (tasks_fs), and recipes collections.
"""
self.tasks.ensure_index(self.tasks.key)
self.tasks.ensure_index(self.tasks.last_updated_field)
self.recipes.ensure_index(self.recipes.key)
self.recipes.ensure_index(self.recipes.last_updated_field)

if self.tasks_fs:
self.tasks_fs.ensure_index(self.tasks_fs.key)
if self.recipes_fs:
self.recipes_fs.ensure_index(self.recipes_fs.key)

def prechunk(self, number_splits: int):
"""
Prechunk method to perform chunking by the key field.
"""
keys = self._find_to_process()

N = ceil(len(keys) / number_splits)

for split in grouper(keys, N):
yield {"query": {self.tasks.key: {"$in": list(split)}}}

def get_items(self):
"""Get the items to process."""
to_process_task_ids = self._find_to_process()

self.total = len(to_process_task_ids)
self.logger.info(f"Processing {self.total} task docs for synthesis recipes")

for task_id in to_process_task_ids:
task = self.tasks.query_one({"task_id": task_id})
if self.tasks_fs:
rxns = self.tasks_fs.query_one(
{"task_id": task_id},
)["rxns"]
task["rxns"] = rxns
if not rxns:
self.logger.warning(
f"Missing rxns from GridFSStore for task_id {task_id}"
)
else:
if not task.get("rxns"):
self.logger.warning(f"Missing rxns in task {task_id}")

if task is not None:
yield task
else:
pass

def process_item(self, item):
"""Creates a synthesis recipe document from the task document."""
item = MontyDecoder().process_decoded(item)

task_id = item["task_id"]
task_label = item["task_label"]
rxns = item["rxns"]
targets = item["targets"]
elements = item["elements"]
chemsys = item["chemsys"]
added_elements = item["added_elements"]
added_chemsys = item["added_chemsys"]
enumerators = item["enumerators"]
mu_func = None # incorporate this?

if len(targets) > 1:
self.logger.warning(
f"Enumerator has multiple targets for task_id {item['task_id']}"
)
self.logger.warning("Selecting first target...")

target = item["targets"][0]
target_comp = Composition(target)

self.logger.debug(f"Creating synthesis recipes for {item['task_id']}")

rxns = rxns.get_rxns()
costs = [self.cf.evaluate(rxn) for rxn in rxns]
recipes = [
ComputedSynthesisRecipe.from_computed_rxn(
rxn, cost=cost, target=target_comp, mu_func=mu_func
)
for rxn, cost in zip(rxns, costs)
]

d: Dict[str, Any] = {}

d["task_id"] = task_id
d["task_label"] = task_label
d["last_updated"] = datetime.utcnow()
d["recipes"] = recipes
d["target_composition"] = target_comp
d["target_formula"] = target
d["elements"] = elements
d["chemsys"] = chemsys
d["added_elements"] = added_elements
d["added_chemsys"] = added_chemsys
d["enumerators"] = enumerators
d["cost_function"] = self.cf

doc = ComputedSynthesisRecipesDoc(**d)

return jsanitize(
doc.dict(),
strict=True,
allow_bson=True,
)

def update_targets(self, items):
"""
Inserts the new synthesis recipe docs into the Synthesis Recipes collection.
Stores recipes in GridFS if a recipes GridFSStore is provided.
"""
docs = list(filter(None, items))

if len(docs) > 0:
self.logger.info(f"Found {len(docs)} synthesis recipe docs to update")

if self.recipes_fs:
recipes = []
for d in docs:
d["use_gridfs"] = True
recipe = {"task_id": d["task_id"], "recipes": d.pop("recipes")}
recipes.append(recipe)

self.recipes_fs.update(
recipes, key="task_id", additional_metadata=["task_id"]
)

self.recipes.update(docs)
else:
self.logger.info("No items to update")

def _find_to_process(self):
self.logger.info("Synthesis Recipe builder started.")

self.logger.info("Setting up indexes.")
self.ensure_indexes()

task_keys = set(self.tasks.distinct("task_id", criteria=self.query))
updated_tasks = set(self.recipes.newer_in(self.tasks, exhaustive=True))
return updated_tasks & task_keys
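
For context, a hypothetical driver script for this builder. The store names, connection details, and the `Softplus` cost function are illustrative assumptions; only `SynthesisRecipeBuilder` itself is defined in this commit:

```python
from maggma.stores import MongoStore

from rxn_network.builders.retrosynthesis import SynthesisRecipeBuilder
from rxn_network.costs.softplus import Softplus  # assumed import path for a CostFunction

# Hypothetical MongoDB collections: EnumeratorWF task docs in, recipe docs out.
tasks = MongoStore("rxn_db", "tasks", key="task_id")
recipes = MongoStore("rxn_db", "recipes", key="task_id")

builder = SynthesisRecipeBuilder(tasks=tasks, recipes=recipes, cf=Softplus())

# maggma builders run as get_items -> process_item -> update_targets;
# builders can also be dispatched through maggma's `mrun` CLI.
builder.run()
```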
60 changes: 60 additions & 0 deletions src/rxn_network/costs/competitiveness.py
@@ -5,7 +5,9 @@
 from typing import Dict, Iterable, List, Optional, Union
 
 import numpy as np
+import ray
 from pymatgen.core.composition import Composition, Element
+from tqdm import tqdm
 
 from rxn_network.core.calculator import Calculator
 from rxn_network.core.cost_function import CostFunction
@@ -17,6 +19,17 @@
 )
 from rxn_network.reactions.computed import ComputedReaction
 from rxn_network.reactions.reaction_set import ReactionSet
+from rxn_network.utils import initialize_ray, to_iterator
+
+
+@ray.remote
+def _calculate_ray(obj, rxn):
+    return obj.calculate(rxn)
+
+
+@ray.remote
+def _decorate_ray(obj, rxn):
+    return obj.decorate(rxn)
 
 
 class CompetitivenessScoreCalculator(Calculator):
@@ -97,6 +110,53 @@ def calculate(self, rxn: ComputedReaction) -> float:
 
         return c_score
 
+    def calculate_many(self, rxns: List[ComputedReaction]) -> List[float]:
+        """
+        Calculates the competitiveness score for a list of reactions by enumerating
+        competing reactions, evaluating their cost with the supplied cost function,
+        and then using the c-score formula, i.e. the _get_c_score() method, to
+        determine the competitiveness score. Parallelized with ray.
+
+        Args:
+            rxns: the list of ComputedReaction objects to be evaluated
+
+        Returns:
+            The list of competitiveness scores
+        """
+        initialize_ray()
+        obj = ray.put(self)
+
+        costs = [_calculate_ray.remote(obj, rxn) for rxn in rxns]
+        iterator = tqdm(to_iterator(costs), total=len(costs))
+
+        results = []
+        for r in iterator:
+            results.append(r)
+
+        return results
+
+    def decorate_many(self, rxns: List[ComputedReaction]) -> List[ComputedReaction]:
+        """
+        Decorates a list of reactions with the competitiveness score.
+        Parallelized with ray.
+
+        Args:
+            rxns: the list of ComputedReaction objects to be decorated
+
+        Returns:
+            The list of decorated ComputedReaction objects
+        """
+        obj = ray.put(self)
+
+        new_rxns = [_decorate_ray.remote(obj, rxn) for rxn in rxns]
+        iterator = tqdm(to_iterator(new_rxns), total=len(new_rxns))
+
+        results = []
+        for r in iterator:
+            results.append(r)
+
+        return results
+
     @lru_cache(maxsize=1)
     def get_competing_rxns(self, rxn: ComputedReaction) -> List[ComputedReaction]:
         """
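
The `initialize_ray` and `to_iterator` helpers imported above from `rxn_network.utils` do not appear in this diff excerpt. A minimal sketch of plausible implementations, assuming `to_iterator` yields results as tasks finish so the `tqdm` wrappers can show live progress (assumptions, not the committed code):

```python
import ray


def initialize_ray():
    # Start a ray instance only if one is not already running.
    if not ray.is_initialized():
        ray.init()


def to_iterator(obj_refs):
    # Yield each result as soon as its task completes, instead of blocking
    # on the whole batch, so a tqdm wrapper can track progress.
    refs = list(obj_refs)
    while refs:
        done, refs = ray.wait(refs)
        yield ray.get(done[0])
```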
2 changes: 2 additions & 0 deletions src/rxn_network/enumerators/__init__.py
@@ -1,7 +1,9 @@
 """
 Reaction enumerator classes and associated utilities.
 """
+
 from rxn_network.enumerators.basic import BasicEnumerator, BasicOpenEnumerator
+
 from rxn_network.enumerators.minimize import (
     MinimizeGibbsEnumerator,
     MinimizeGrandPotentialEnumerator,
