Commit 2139c36

Merge remote-tracking branch 'origin/main' into elec_dev

jmmshn committed Mar 23, 2021
2 parents f2c8a87 + 30566da
Showing 16 changed files with 454 additions and 57 deletions.
213 changes: 213 additions & 0 deletions emmet-builders/emmet/builders/materials/provenance.py
@@ -0,0 +1,213 @@
from collections import defaultdict
from itertools import chain
from typing import Dict, Iterator, List, Optional, Tuple, Union

import numpy as np
from maggma.core import Builder, Store
from maggma.utils import grouper
from pymatgen.analysis.structure_matcher import StructureMatcher
from pymatgen.core import Structure
from pymatgen.util.provenance import StructureNL

from emmet.builders import SETTINGS
from emmet.builders.settings import EmmetBuildSettings
from emmet.core.provenance import ProvenanceDoc
from emmet.core.utils import group_structures
from emmet.core.vasp.calc_types import run_type, task_type
from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc


class ProvenanceBuilder(Builder):
    def __init__(
        self,
        materials: Store,
        provenance: Store,
        source_snls: List[Store],
        settings: Optional[EmmetBuildSettings] = None,
        query: Optional[Dict] = None,
        **kwargs,
    ):
        """
        Creates provenance from source SNLs and materials

        Args:
            materials: Store of materials docs to tag with SNLs
            provenance: Store to update with provenance data
            source_snls: List of locations to grab SNLs
            settings: EmmetBuildSettings to use; autoloaded if not given
            query: query on materials to limit search
        """
        self.materials = materials
        self.provenance = provenance
        self.source_snls = source_snls
        self.settings = EmmetBuildSettings.autoload(settings)
        self.query = query or {}  # guard against None so it can be merged below
        self.kwargs = kwargs

        super().__init__(
            sources=[materials, *source_snls], targets=[provenance], **kwargs
        )

    def ensure_indices(self):
        self.materials.ensure_index("material_id", unique=True)
        self.materials.ensure_index("formula_pretty")

        self.provenance.ensure_index("material_id", unique=True)
        self.provenance.ensure_index("formula_pretty")

        for s in self.source_snls:
            s.ensure_index("snl_id")
            s.ensure_index("formula_pretty")

    def get_items(self) -> Iterator[Tuple[List[Dict], List[Dict]]]:
        """
        Gets all materials to associate with SNLs

        Returns:
            generator of materials and SNLs that could match
        """
        self.logger.info("Provenance Builder Started")

        self.logger.info("Setting indexes")
        self.ensure_indices()

        # Find all formulas for materials that have been updated since this
        # builder was last run
        q = {**self.query, "property_name": ProvenanceDoc.property_name}
        updated_materials = self.provenance.newer_in(
            self.materials,
            criteria=q,
            exhaustive=True,
        )
        forms_to_update = set(
            self.materials.distinct(
                "formula_pretty", {"material_id": {"$in": updated_materials}}
            )
        )

        # Find all new SNL formulas since the builder was last run
        for source in self.source_snls:
            new_snls = self.provenance.newer_in(source)
            forms_to_update |= set(source.distinct("formula_pretty", new_snls))

        # Now reduce to the set of formulas we actually have
        forms_avail = set(self.materials.distinct("formula_pretty", self.query))
        forms_to_update = forms_to_update & forms_avail

        self.logger.info(f"Found {len(forms_to_update)} new/updated systems to process")

        self.total = len(forms_to_update)

        for formulas in grouper(forms_to_update, self.chunk_size):
            snls = []
            for source in self.source_snls:
                snls.extend(
                    source.query(criteria={"formula_pretty": {"$in": formulas}})
                )

            mats = list(
                self.materials.query(
                    properties=[
                        "material_id",
                        "last_updated",
                        "structure",
                        "initial_structures",
                        "formula_pretty",
                    ],
                    criteria={"formula_pretty": {"$in": formulas}},
                )
            )

            # Bucket SNLs and materials by formula so each yielded item is
            # one formula's worth of matching work
            form_groups = defaultdict(list)
            for snl in snls:
                form_groups[snl["formula_pretty"]].append(snl)

            mat_groups = defaultdict(list)
            for mat in mats:
                mat_groups[mat["formula_pretty"]].append(mat)

            for formula, snl_group in form_groups.items():
                mat_group = mat_groups[formula]

                self.logger.debug(
                    f"Found {len(snl_group)} SNLs and {len(mat_group)} materials"
                )
                yield mat_group, snl_group

def process_item(self, item) -> List[Dict]:
"""
Matches SNLS and Materials
Args:
item (tuple): a tuple of materials and snls
Returns:
list(dict): a list of collected snls with material ids
"""
mats, source_snls = item
formula_pretty = mats[0]["formula_pretty"]
snl_docs = list()
self.logger.debug(f"Finding Provenance {formula_pretty}")

# Match up SNLS with materials
for mat in mats:
matched_snls = list(self.match(source_snls, mat))
if len(matched_snls) > 0:
doc = ProvenanceDoc.from_SNLs(
material_id=mat["material_id"], snls=matched_snls
)

doc.authors.append(self.settings.DEFAULT_AUTHOR)
doc.history.append(self.settings.DEFAULT_HISTORY)
doc.references.append(self.settings.DEFAULT_REFERENCE)

snl_docs.append(doc.dict())

return snl_docs

    def match(self, snls, mat):
        """
        Finds the SNLs that structure-match a given materials doc

        Args:
            snls ([dict]): the list of SNLs to match against
            mat (dict): a materials doc

        Returns:
            list of SNLs that match the material
        """
        m_strucs = [Structure.from_dict(mat["structure"])] + [
            Structure.from_dict(init_struc) for init_struc in mat["initial_structures"]
        ]
        snl_strucs = [StructureNL.from_dict(snl) for snl in snls]

        groups = group_structures(
            m_strucs + snl_strucs,
            ltol=self.settings.LTOL,
            stol=self.settings.STOL,
            angle_tol=self.settings.ANGLE_TOL,
        )
        # Keep only groups that contain at least one of the material's structures
        matched_groups = [
            group
            for group in groups
            if any(isinstance(struc, Structure) for struc in group)
        ]
        # Collect the SNLs from those groups; the outer `for group` clause
        # must come first so `group` is bound before `for struc in group`
        snls = [
            struc
            for group in matched_groups
            for struc in group
            if isinstance(struc, StructureNL)
        ]

        self.logger.debug(f"Found {len(snls)} SNLs for {mat['material_id']}")
        return snls

    def update_targets(self, items):
        """
        Inserts the new provenance docs into the provenance collection
        """
        snls = list(filter(None, chain.from_iterable(items)))

        if len(snls) > 0:
            self.logger.info(f"Found {len(snls)} SNLs to update")
            self.provenance.update(snls)
        else:
            self.logger.info("No items to update")
29 changes: 29 additions & 0 deletions emmet-builders/emmet/builders/settings.py
@@ -5,6 +5,7 @@
from pydantic.fields import Field
from emmet.core.settings import EmmetSettings
from emmet.core.vasp.calc_types import TaskType
from emmet.core.provenance import Author, History


class EmmetBuildSettings(EmmetSettings):
@@ -30,3 +31,31 @@ class EmmetBuildSettings(EmmetSettings):
        [t.value for t in TaskType],
        description="Allowed task_types to build materials from",
    )

    DEFAULT_REFERENCE: str = Field(
        "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
        "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "
        "Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David "
        "and Ceder, Gerbrand and Persson, Kristin a.},\n"
        "doi = {10.1063/1.4812323},\nissn = {2166532X},\n"
        "journal = {APL Materials},\nnumber = {1},\npages = {011002},\n"
        "title = {{The Materials Project: A materials genome approach to "
        "accelerating materials innovation}},\n"
        "url = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\n"
        "volume = {1},\nyear = {2013}\n}\n\n@misc{MaterialsProject,\n"
        "title = {{Materials Project}},\nurl = {http://www.materialsproject.org}\n}",
        description="Default bibtex citation for all provenance",
    )

    DEFAULT_AUTHOR: Author = Field(
        Author(name="Materials Project", email="feedback@materialsproject.org"),
        description="Default Author for provenance",
    )

    DEFAULT_HISTORY: History = Field(
        History(
            name="Materials Project Optimized Structure",
            url="http://www.materialsproject.org",
        ),
        description="Default History for provenance",
    )
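
As a hedged illustration of how these defaults feed the builder (the values are made up; the mechanism is a plain pydantic field override), a deployment could swap in its own provenance author:

# Sketch: overriding the new provenance defaults; values are illustrative.
from emmet.builders.settings import EmmetBuildSettings
from emmet.core.provenance import Author

settings = EmmetBuildSettings(
    DEFAULT_AUTHOR=Author(name="My Group", email="admin@example.org")
)
# ProvenanceBuilder resolves this via EmmetBuildSettings.autoload(settings),
# so every ProvenanceDoc it emits is tagged with this author instead.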
5 changes: 4 additions & 1 deletion emmet-builders/emmet/builders/vasp/thermo.py
@@ -19,6 +19,7 @@
from emmet.core.thermo import ThermoDoc
from emmet.core.vasp.calc_types import run_type


class Thermo(Builder):
    def __init__(
        self,
@@ -143,7 +144,9 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
            )
            return []
        except Exception as e:
-            self.logger.error(f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}")
+            self.logger.error(
+                f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}"
+            )
            return []

        return [d.dict() for d in docs]
1 change: 1 addition & 0 deletions emmet-builders/setup.py
@@ -1,6 +1,7 @@
import datetime
from pathlib import Path
from setuptools import setup, find_namespace_packages

required = []

with open(Path(__file__).parent / "requirements.txt") as f:
3 changes: 2 additions & 1 deletion emmet-cli/emmet/cli/calc.py
@@ -37,6 +37,7 @@ def get_format(fname):

def load_canonical_structures(ctx, full_name, formula):
    from emmet.core.vasp.calc_types import task_type  # TODO import error

    collection = ctx.obj["COLLECTIONS"][full_name]

    if formula not in canonical_structures[full_name]:
@@ -169,7 +170,7 @@ def calc(ctx, specs, nmax, skip):
help="Author to assign to all structures.",
)
@click.pass_context
def prep(ctx, archive, authors):
def prep(ctx, archive, authors): # noqa: C901
"""prep structures from an archive for submission"""
run = ctx.obj["RUN"]
collections = ctx.obj["COLLECTIONS"]
5 changes: 4 additions & 1 deletion emmet-cli/emmet/cli/decorators.py
@@ -106,7 +106,10 @@ def wrapper(*args, **kwargs):
        run = ctx.grand_parent.params["run"]
        ntries = ctx.grand_parent.params["ntries"]
        if run:
-            click.secho(f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.", fg="green")
+            click.secho(
+                f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.",
+                fg="green",
+            )

        directory = ctx.parent.params.get("directory")
        if not directory:
9 changes: 7 additions & 2 deletions emmet-cli/emmet/cli/entry_point.py
@@ -31,7 +31,12 @@ def opt_prompt():
@click.option("--run", is_flag=True, help="Run DB/filesystem write operations.")
@click.option("--issue", type=int, help="Production tracker issue (required if --run).")
@click.option("--sbatch", is_flag=True, help="Switch to SBatch mode.")
@click.option("--ntries", default=1, show_default=True, help="Number of jobs (for walltime > 48h).")
@click.option(
"--ntries",
default=1,
show_default=True,
help="Number of jobs (for walltime > 48h).",
)
@click.option("--bb", is_flag=True, help="Use burst buffer.")
@click.option("--yes", is_flag=True, help="Automatic yes to all prompts.")
@click.option("--no-dupe-check", is_flag=True, help="Skip duplicate check(s).")
@@ -66,7 +71,7 @@ def emmet(spec_or_dbfile, run, issue, sbatch, ntries, bb, yes, no_dupe_check, ve

    if run:
        if not issue:
-            raise EmmetCliError(f"Need issue number via --issue!")
+            raise EmmetCliError("Need issue number via --issue!")

    ctx.obj["LOG_STREAM"] = StringIO()
    memory_handler = logging.StreamHandler(ctx.obj["LOG_STREAM"])
18 changes: 12 additions & 6 deletions emmet-cli/emmet/cli/tasks.py
@@ -110,11 +110,15 @@ def check_pattern(nested_allowed=False):
    if not nested_allowed and os.sep in pattern:
        raise EmmetCliError(f"Nested pattern ({pattern}) not allowed!")
    elif not any(pattern.startswith(p) for p in PREFIXES):
-        raise EmmetCliError(f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!")
+        raise EmmetCliError(
+            f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!"
+        )


def load_block_launchers():
-    prefix = "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    prefix = (
+        "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    )
    block_launchers = defaultdict(list)
    gen = VaspDirsGenerator()
    for idx, vasp_dir in enumerate(gen):
@@ -136,7 +140,7 @@ def extract_filename(line):
@sbatch
@click.option("--clean", is_flag=True, help="Remove original launchers.")
@click.option("--check", is_flag=True, help="Check backup consistency.")
-def backup(clean, check):
+def backup(clean, check):  # noqa: C901
    """Backup directory to HPSS"""
    ctx = click.get_current_context()
    run = ctx.parent.parent.params["run"]
@@ -232,7 +236,7 @@ def backup(clean, check):
    default=FILE_FILTERS_DEFAULT,
    help="Set the file filter(s) to match files against in each launcher.",
)
-def restore(inputfile, file_filter):
+def restore(inputfile, file_filter):  # noqa: C901
    """Restore launchers from HPSS"""
    ctx = click.get_current_context()
    run = ctx.parent.parent.params["run"]
@@ -357,7 +361,7 @@ def restore(inputfile, file_filter):
    default=STORE_VOLUMETRIC_DATA,
    help="Store any of CHGCAR, LOCPOT, AECCAR0, AECCAR1, AECCAR2, ELFCAR.",
)
-def parse(task_ids, snl_metas, nproc, store_volumetric_data):
+def parse(task_ids, snl_metas, nproc, store_volumetric_data):  # noqa: C901
    """Parse VASP launchers into tasks"""
    ctx = click.get_current_context()
    if "CLIENT" not in ctx.obj:
@@ -398,7 +402,9 @@ def parse(task_ids, snl_metas, nproc, store_volumetric_data):
    # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
    # NOTE use regex first to reduce size of distinct below 16MB
    q = {"task_id": {"$regex": r"^mp-\d{7,}$"}}
-    all_task_ids = [t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})]
+    all_task_ids = [
+        t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})
+    ]
    if not all_task_ids:
        all_task_ids = target.collection.distinct("task_id")