From c7f8d1d8a794360d8e363b61f98679ca8423643b Mon Sep 17 00:00:00 2001 From: Shyam D Date: Sun, 7 Mar 2021 17:14:14 -0800 Subject: [PATCH 01/19] default bson_compatible types --- emmet-core/emmet/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet-core/emmet/core/utils.py b/emmet-core/emmet/core/utils.py index 046e79946f..0baec7e683 100644 --- a/emmet-core/emmet/core/utils.py +++ b/emmet-core/emmet/core/utils.py @@ -62,7 +62,7 @@ def _get_sg(struc): yield group -def jsanitize(obj, strict=False, allow_bson=False): +def jsanitize(obj, strict=False, allow_bson=True): """ This method cleans an input json-like object, either a list or a dict or some sequence, nested or otherwise, by converting all non-string From a059d2be2f2e039db760759eb6d30a18310754a3 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Sun, 7 Mar 2021 17:15:41 -0800 Subject: [PATCH 02/19] update provenance doc structure --- emmet-core/emmet/core/provenance.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py index 6e0da58e39..da70b82fd0 100644 --- a/emmet-core/emmet/core/provenance.py +++ b/emmet-core/emmet/core/provenance.py @@ -1,11 +1,15 @@ """ Core definition of a Provenance Document """ +from collections import defaultdict from datetime import datetime -from typing import ClassVar, Dict, List +from typing import ClassVar, Dict, List, Union from pybtex.database import BibliographyData, parse_string from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator +from pydash.objects import get +from pymatgen.util.provenance import StructureNL from emmet.core.material_property import PropertyDoc +from emmet.core.mpid import MPID from emmet.core.utils import ValueEnum @@ -48,32 +52,31 @@ class Provenance(PropertyDoc): property_name: ClassVar[str] = "provenance" created_at: datetime = Field( - None, + ..., description="creation date for the first structure corresponding to this material", ) - projects: List[str] = Field( - None, description="List of projects this material belongs to" - ) - bibtex_string: str = Field( - None, description="Bibtex reference string for this material" + references: List[str] = Field( + None, description="Bibtex reference strings for this material" ) + + authors: List[Author] = Field(None, description="List of authors for this material") + remarks: List[str] = Field( None, description="List of remarks for the provenance of this material" ) - authors: List[Author] = Field(None, description="List of authors for this material") theoretical: bool = Field( True, description="If this material has any experimental provenance or not" ) - database_IDs: Dict[Database, List[str]] = Field( + database_IDs: Dict[str, List[str]] = Field( None, description="Database IDs corresponding to this material" ) history: List[History] = Field( None, - description="List of history nodes specifying the transformations or orignation of this material", + description="List of history nodes specifying the transformations or orignation of this material for the entry closest matching the material input", ) @validator("authors") From c6f415e03685f4ab04463012dd3474b2df9ddd6f Mon Sep 17 00:00:00 2001 From: Shyam D Date: Wed, 10 Mar 2021 12:03:40 -0800 Subject: [PATCH 03/19] add to provenance document --- emmet-core/emmet/core/provenance.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py index da70b82fd0..3794c53f99 100644 --- a/emmet-core/emmet/core/provenance.py +++ b/emmet-core/emmet/core/provenance.py @@ -57,25 +57,27 @@ class Provenance(PropertyDoc): ) references: List[str] = Field( - None, description="Bibtex reference strings for this material" + [], description="Bibtex reference strings for this material" ) - authors: List[Author] = Field(None, description="List of authors for this material") + authors: List[Author] = Field([], description="List of authors for this material") remarks: List[str] = Field( - None, description="List of remarks for the provenance of this material" + [], description="List of remarks for the provenance of this material" ) + tags: List[str] = Field([]) + theoretical: bool = Field( True, description="If this material has any experimental provenance or not" ) database_IDs: Dict[str, List[str]] = Field( - None, description="Database IDs corresponding to this material" + dict(), description="Database IDs corresponding to this material" ) history: List[History] = Field( - None, + [], description="List of history nodes specifying the transformations or orignation of this material for the entry closest matching the material input", ) From 528c508fb92cbe48913eacbdcf7d4ec886e293c9 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Wed, 10 Mar 2021 12:03:57 -0800 Subject: [PATCH 04/19] Add method to convert from SNLs --- emmet-core/emmet/core/provenance.py | 88 ++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 2 deletions(-) diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py index 3794c53f99..ccd03323f2 100644 --- a/emmet-core/emmet/core/provenance.py +++ b/emmet-core/emmet/core/provenance.py @@ -6,11 +6,12 @@ from pybtex.database import BibliographyData, parse_string from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator from pydash.objects import get +from pymatgen.core import Structure from pymatgen.util.provenance import StructureNL from emmet.core.material_property import PropertyDoc from emmet.core.mpid import MPID -from emmet.core.utils import ValueEnum +from emmet.core.utils import ValueEnum, group_structures class Database(ValueEnum): @@ -19,7 +20,7 @@ class Database(ValueEnum): """ ICSD = "icsd" - PaulingFiles = "pf" + Pauling_Files = "pf" COD = "cod" @@ -85,3 +86,86 @@ class Provenance(PropertyDoc): def remove_duplicate_authors(cls, authors): authors_dict = {entry.name.lower(): entry for entry in authors} return list(authors_dict.items()) + + @classmethod + def from_SNLs( + cls, + material_id: Union[MPID, int], + snls: List[StructureNL], + ) -> "Provenance": + """ + Converts legacy Pymatgen SNLs into a single provenance document + """ + + # Choose earliest created_at + created_at = sorted( + [get(snl, "about.created_at.string", datetime.max) for snl in snls] + )[0] + + # Choose earliest history + history = sorted( + snls, key=lambda snl: get(snl, "about.created_at.string", datetime.max) + )[0]["about"]["history"] + + # Aggregate all references into one dict to remove duplicates + refs = {} + for snl in snls: + try: + entries = parse_string(snl["about"]["references"], bib_format="bibtex") + refs.update(entries.entries) + except Exception: + logger.debug(f"Failed parsing bibtex: {snl['about']['references']}") + + bib_data = BibliographyData(entries=refs) + references = [ref.to_string("bibtex") for ref in bib_data.entries] + + # TODO: Maybe we should combine this robocrystallographer? + # TODO: Refine these tags / remarks + remarks = list( + set([remark for snl in snls for remark in snl["about"]["remarks"]]) + ) + tags = [r for r in remarks if len(r) < 140] + + # Aggregate all authors - Converting a single dictionary first + # performs duplicate checking + authors = { + entry["name"].lower(): entry["email"] + for snl in snls + for entry in snl["about"]["authors"] + } + authors = [ + {"name": name.title(), "email": email} for name, email in authors.items() + ] + + # Check if this entry is experimental + if any(get(snl, "about.history.0.experimental", False) for snl in snls): + experimental = True + + # Aggregate all the database IDs + snl_ids = [snl.snl_id for snl in snls] + db_ids = { + Database[db_id]: [snl_id for snl_id in snl_ids if db_id in snl_id] + for db_id in map(str, Database) + } + + # remove Nones and empty lists + db_ids = {k: list(filter(None, v)) for k, v in db_ids.items()} + db_ids = {k: v for k, v in db_ids.items() if len(v) > 0} + + # Get experimental bool + experimental = any( + snl.get("about.history.0.experimental", False) for snl in snls + ) + + snl_fields = { + "created_at": created_at, + "references": references, + "authors": authors, + "remarks": remarks, + "tags": tags, + "database_IDs": db_ids, + "theoretical": not experimental, + "history": history, + } + + return Provenance(material_id=material_id, **snl_fields) From cea55df5bbadc93908df6900b3edba38fafe0d38 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 08:56:24 -0800 Subject: [PATCH 05/19] remove stale file --- tests/emmet-core/test_settings.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 tests/emmet-core/test_settings.json diff --git a/tests/emmet-core/test_settings.json b/tests/emmet-core/test_settings.json deleted file mode 100644 index 087cb5ae6d..0000000000 --- a/tests/emmet-core/test_settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "ANGLE_TOL": 1.0 -} From 637945ca5cca0acb2e1c6280aa9405f6e08b80b8 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 09:20:13 -0800 Subject: [PATCH 06/19] test for provenance from SNLs --- tests/emmet-core/test_provenance.py | 52 +++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 tests/emmet-core/test_provenance.py diff --git a/tests/emmet-core/test_provenance.py b/tests/emmet-core/test_provenance.py new file mode 100644 index 0000000000..0db465eab3 --- /dev/null +++ b/tests/emmet-core/test_provenance.py @@ -0,0 +1,52 @@ +from datetime import datetime + +import pytest +from pymatgen import Element, Lattice +from pymatgen.core import Structure +from pymatgen.util.provenance import Author, HistoryNode, StructureNL + +from emmet.core.provenance import Database, ProvenanceDoc + + +@pytest.fixture +def structure(): + test_latt = Lattice.cubic(3.0) + test_struc = Structure(lattice=test_latt, species=["Fe"], coords=[[0, 0, 0]]) + return test_struc + + +@pytest.fixture +def snls(structure): + + docs = [ + StructureNL( + structure, + authors=[Author("test{i}", "test@test.com").as_dict()], + history=[HistoryNode("nothing", "url.com", {})], + created_at=datetime.utcnow(), + ).as_dict() + for i in range(3) + ] + docs[0]["snl_id"] = "icsd-2" + docs[1]["snl_id"] = "user-1" + docs[2]["snl_id"] = "pf-3" + + return docs + + +def test_from_snls(snls): + + doc = ProvenanceDoc.from_SNLs(material_id="mp-3", snls=snls) + + assert isinstance(doc, ProvenanceDoc) + assert doc.property_name == "provenance" + assert doc.material_id == "mp-3" + assert doc.theoretical is True + assert doc.database_IDs == { + Database.ICSD: ["icsd-2"], + Database.Pauling_Files: ["pf-3"], + } + + # Test experimental detection + snls[0]["about"]["history"][0]["experimental"] = True + assert ProvenanceDoc.from_SNLs(material_id="mp-3", snls=snls).theoretical is False From f8efbac89ccd2fd9443b4581629fcfcd964d0c22 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 09:20:37 -0800 Subject: [PATCH 07/19] clean up --- .../emmet/builders/vasp/task_validator.py | 3 --- emmet-core/emmet/core/provenance.py | 20 +++++++++---------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/emmet-builders/emmet/builders/vasp/task_validator.py b/emmet-builders/emmet/builders/vasp/task_validator.py index a2ab69b6c7..64f0029c8c 100644 --- a/emmet-builders/emmet/builders/vasp/task_validator.py +++ b/emmet-builders/emmet/builders/vasp/task_validator.py @@ -11,9 +11,6 @@ from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc from emmet.builders.settings import EmmetBuildSettings -__author__ = "Shyam Dwaraknath" -__email__ = "shyamd@lbl.gov" - class TaskValidator(MapBuilder): def __init__( diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py index ccd03323f2..254761ed4f 100644 --- a/emmet-core/emmet/core/provenance.py +++ b/emmet-core/emmet/core/provenance.py @@ -1,7 +1,7 @@ """ Core definition of a Provenance Document """ from collections import defaultdict from datetime import datetime -from typing import ClassVar, Dict, List, Union +from typing import ClassVar, Dict, List, Optional, Union from pybtex.database import BibliographyData, parse_string from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator @@ -39,13 +39,13 @@ class History(BaseModel): """ name: str - url: HttpUrl - description: Dict = Field( + url: str + description: Optional[Dict] = Field( None, description="Dictionary of exra data for this history node" ) -class Provenance(PropertyDoc): +class ProvenanceDoc(PropertyDoc): """ A provenance property block """ @@ -73,7 +73,7 @@ class Provenance(PropertyDoc): True, description="If this material has any experimental provenance or not" ) - database_IDs: Dict[str, List[str]] = Field( + database_IDs: Dict[Database, List[str]] = Field( dict(), description="Database IDs corresponding to this material" ) @@ -91,7 +91,7 @@ def remove_duplicate_authors(cls, authors): def from_SNLs( cls, material_id: Union[MPID, int], - snls: List[StructureNL], + snls: List[Dict], ) -> "Provenance": """ Converts legacy Pymatgen SNLs into a single provenance document @@ -142,9 +142,9 @@ def from_SNLs( experimental = True # Aggregate all the database IDs - snl_ids = [snl.snl_id for snl in snls] + snl_ids = [snl.get("snl_id", "") for snl in snls] db_ids = { - Database[db_id]: [snl_id for snl_id in snl_ids if db_id in snl_id] + Database(db_id): [snl_id for snl_id in snl_ids if db_id in snl_id] for db_id in map(str, Database) } @@ -154,7 +154,7 @@ def from_SNLs( # Get experimental bool experimental = any( - snl.get("about.history.0.experimental", False) for snl in snls + get(snl, "about.history.0.experimental", False) for snl in snls ) snl_fields = { @@ -168,4 +168,4 @@ def from_SNLs( "history": history, } - return Provenance(material_id=material_id, **snl_fields) + return ProvenanceDoc(material_id=material_id, **snl_fields) From 618824af1aaf4edfdbaa07438c05eb43acdf7081 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 09:20:49 -0800 Subject: [PATCH 08/19] Add provenance settings to build settings --- emmet-builders/emmet/builders/settings.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py index 4b6c3cf6c0..72aa1cc6d1 100644 --- a/emmet-builders/emmet/builders/settings.py +++ b/emmet-builders/emmet/builders/settings.py @@ -5,6 +5,7 @@ from pydantic.fields import Field from emmet.core.settings import EmmetSettings from emmet.core.vasp.calc_types import TaskType +from emmet.core.provenance import Author, History class EmmetBuildSettings(EmmetSettings): @@ -30,3 +31,31 @@ class EmmetBuildSettings(EmmetSettings): [t.value for t in TaskType], description="Allowed task_types to build materials from", ) + + DEFAULT_REFERENCE: str = Field( + "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and " + "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and " + "Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David " + "and Ceder, Gerbrand and Persson, Kristin a.},\n" + "doi = {10.1063/1.4812323},\nissn = {2166532X},\n" + "journal = {APL Materials},\nnumber = {1},\npages = {011002},\n" + "title = {{The Materials Project: A materials genome approach to " + "accelerating materials innovation}},\n" + "url = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\n" + "volume = {1},\nyear = {2013}\n}\n\n@misc{MaterialsProject,\n" + "title = {{Materials Project}},\nurl = {http://www.materialsproject.org}\n}", + description="Default bibtex citation for all provenance", + ) + + DEFAULT_AUTHOR: Author = Field( + Author(name="Materials Project", email="feedback@materialsproject.org"), + description="Default Author for provenance ", + ) + + DEFAULT_HISTORY: History = Field( + History( + name="Materials Project Optimized Structure", + url="http://www.materialsproject.org", + ), + description="Default History for provenance ", + ) From 559806184844092a83e04cd666b15b4ac7ebed3f Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 12:19:30 -0800 Subject: [PATCH 09/19] Add provenance builder --- .../emmet/builders/materials/provenance.py | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 emmet-builders/emmet/builders/materials/provenance.py diff --git a/emmet-builders/emmet/builders/materials/provenance.py b/emmet-builders/emmet/builders/materials/provenance.py new file mode 100644 index 0000000000..e1e7552e6e --- /dev/null +++ b/emmet-builders/emmet/builders/materials/provenance.py @@ -0,0 +1,213 @@ +from collections import defaultdict +from itertools import chain +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +from maggma.core import Builder, Store +from maggma.utils import grouper +from pymatgen.analysis.structure_matcher import StructureMatcher +from pymatgen.core import Structure +from pymatgen.util.provenance import StructureNL + +from emmet.builders import SETTINGS +from emmet.builders.settings import EmmetBuildSettings +from emmet.core.provenance import ProvenanceDoc +from emmet.core.utils import group_structures +from emmet.core.vasp.calc_types import run_type, task_type +from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc + + +class ProvenanceBuilder(Builder): + def __init__( + self, + materials: Store, + provenance: Store, + source_snls: List[Store], + settings: Optional[EmmetBuildSettings] = None, + query: Optional[Dict] = None, + **kwargs, + ): + """ + Creates provenance from source SNLs and materials + + Args: + materials: Store of materials docs to tag with SNLs + provenance: Store to update with provenance data + source_snls: List of locations to grab SNLs + query : query on materials to limit search + """ + self.materials = materials + self.provenance = provenance + self.source_snls = source_snls + self.settings = EmmetBuildSettings.autoload(settings) + self.query = query + self.kwargs = kwargs + + super().__init__( + sources=[materials, *source_snls], targets=[provenance], **kwargs + ) + + def ensure_indicies(self): + + self.materials.ensure_index("material_id", unique=True) + self.materials.ensure_index("formula_pretty") + + self.provenance.ensure_index("material_id", unique=True) + self.provenance.ensure_index("formula_pretty") + + for s in self.source_snls: + s.ensure_index("snl_id") + s.ensure_index("formula_pretty") + + def get_items(self) -> Tuple[List[Dict], List[Dict]]: + """ + Gets all materials to assocaite with SNLs + Returns: + generator of materials and SNLs that could match + """ + self.logger.info("Provenance Builder Started") + + self.logger.info("Setting indexes") + self.ensure_indicies() + + # Find all formulas for materials that have been updated since this + # builder was last ran + q = {**self.query, "property_name": ProvenanceDoc.property_name} + updated_materials = self.provenance.newer_in( + self.materials, + criteria=q, + exhaustive=True, + ) + forms_to_update = set( + self.materials.distinct( + "formula_pretty", {"material_id": {"$in": updated_materials}} + ) + ) + + # Find all new SNL formulas since the builder was last run + for source in self.source_snls: + new_snls = self.provenance.newer_in(source) + forms_to_update |= set(source.distinct("formula_pretty", new_snls)) + + # Now reduce to the set of formulas we actually have + forms_avail = set(self.materials.distinct("formula_pretty", self.query)) + forms_to_update = forms_to_update & forms_avail + + self.logger.info(f"Found {len(forms_to_update)} new/updated systems to proces") + + self.total = len(forms_to_update) + + for formulas in grouper(forms_to_update, self.chunk_size): + snls = [] + for source in self.source_snls: + snls.extend( + source.query(criteria={"formula_pretty": {"$in": formulas}}) + ) + + mats = list( + self.materials.query( + properties=[ + "material_id", + "last_updated", + "structure", + "initial_structures", + "formula_pretty", + ], + criteria={"formula_pretty": {"$in": formulas}}, + ) + ) + + form_groups = defaultdict(list) + for snl in snls: + form_groups[snl["formula_pretty"]].append(snl) + + mat_groups = defaultdict(list) + for mat in mats: + mat_groups[mat["formula_pretty"]].append(mat) + + for formula, snl_group in form_groups.items(): + + mat_group = mat_groups[formula] + + self.logger.debug( + f"Found {len(snl_group)} snls and {len(mat_group)} mats" + ) + yield mat_group, snl_group + + def process_item(self, item) -> List[Dict]: + """ + Matches SNLS and Materials + Args: + item (tuple): a tuple of materials and snls + Returns: + list(dict): a list of collected snls with material ids + """ + mats, source_snls = item + formula_pretty = mats[0]["formula_pretty"] + snl_docs = list() + self.logger.debug(f"Finding Provenance {formula_pretty}") + + # Match up SNLS with materials + for mat in mats: + matched_snls = list(self.match(source_snls, mat)) + if len(matched_snls) > 0: + doc = ProvenanceDoc.from_SNLs( + material_id=mat["material_id"], snls=matched_snls + ) + + doc.authors.append(self.settings.DEFAULT_AUTHOR) + doc.history.append(self.settings.DEFAULT_HISTORY) + doc.references.append(self.settings.DEFAULT_REFERENCE) + + snl_docs.append(doc.dict()) + + return snl_docs + + def match(self, snls, mat): + """ + Finds a material doc that matches with the given snl + Args: + snl ([dict]): the snls list + mat (dict): a materials doc + Returns: + generator of materials doc keys + """ + + m_strucs = [Structure.from_dict(mat["structure"])] + [ + Structure.from_dict(init_struc) for init_struc in mat["initial_structures"] + ] + snl_strucs = [StructureNL.from_dict(snl) for snl in snls] + + groups = group_structures( + m_strucs + snl_strucs, + ltol=self.settings.LTOL, + stol=self.settings.STOL, + angle_tol=self.settings.ANGLE_TOL, + ) + matched_groups = [ + group + for group in groups + if any(isinstance(struc, Structure) for struc in group) + ] + snls = [ + struc + for struc in group + for group in matched_groups + if isinstance(struc, StructureNL) + ] + + self.logger.debug(f"Found {len(snls)} SNLs for {mat['material_id']}") + return snls + + def update_targets(self, items): + """ + Inserts the new SNL docs into the SNL collection + """ + + snls = list(filter(None, chain.from_iterable(items))) + + if len(snls) > 0: + self.logger.info(f"Found {len(snls)} SNLs to update") + self.provenance.update(snls) + else: + self.logger.info("No items to update") From e3209b9c06e4c474ebc112d9796396c545bd8626 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 12:28:55 -0800 Subject: [PATCH 10/19] Fix linting problems --- emmet-core/emmet/core/provenance.py | 10 ++++++---- emmet-core/emmet/core/vasp/task.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py index 254761ed4f..8b5268e372 100644 --- a/emmet-core/emmet/core/provenance.py +++ b/emmet-core/emmet/core/provenance.py @@ -1,10 +1,11 @@ """ Core definition of a Provenance Document """ +import warnings from collections import defaultdict from datetime import datetime from typing import ClassVar, Dict, List, Optional, Union from pybtex.database import BibliographyData, parse_string -from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator +from pydantic import BaseModel, EmailStr, Field, validator from pydash.objects import get from pymatgen.core import Structure from pymatgen.util.provenance import StructureNL @@ -79,7 +80,8 @@ class ProvenanceDoc(PropertyDoc): history: List[History] = Field( [], - description="List of history nodes specifying the transformations or orignation of this material for the entry closest matching the material input", + description="List of history nodes specifying the transformations or orignation" + " of this material for the entry closest matching the material input", ) @validator("authors") @@ -92,7 +94,7 @@ def from_SNLs( cls, material_id: Union[MPID, int], snls: List[Dict], - ) -> "Provenance": + ) -> "ProvenanceDoc": """ Converts legacy Pymatgen SNLs into a single provenance document """ @@ -114,7 +116,7 @@ def from_SNLs( entries = parse_string(snl["about"]["references"], bib_format="bibtex") refs.update(entries.entries) except Exception: - logger.debug(f"Failed parsing bibtex: {snl['about']['references']}") + warnings.warn(f"Failed parsing bibtex: {snl['about']['references']}") bib_data = BibliographyData(entries=refs) references = [ref.to_string("bibtex") for ref in bib_data.entries] diff --git a/emmet-core/emmet/core/vasp/task.py b/emmet-core/emmet/core/vasp/task.py index c8b289582f..82bb793e6b 100644 --- a/emmet-core/emmet/core/vasp/task.py +++ b/emmet-core/emmet/core/vasp/task.py @@ -1,7 +1,7 @@ """ Core definition of a VASP Task Document """ from datetime import datetime from functools import lru_cache, partial -from typing import ClassVar, Dict, List, Optional, Union, Any +from typing import Any, ClassVar, Dict, List, Optional, Union from pydantic import BaseModel, Field, validator from pymatgen.analysis.magnetism import CollinearMagneticStructureAnalyzer, Ordering From 1fc3845832f22a3f29a54743e5ebe0d69c402aa8 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 12:29:15 -0800 Subject: [PATCH 11/19] revert behavior and change in self-contained PR --- emmet-core/emmet/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet-core/emmet/core/utils.py b/emmet-core/emmet/core/utils.py index 0baec7e683..046e79946f 100644 --- a/emmet-core/emmet/core/utils.py +++ b/emmet-core/emmet/core/utils.py @@ -62,7 +62,7 @@ def _get_sg(struc): yield group -def jsanitize(obj, strict=False, allow_bson=True): +def jsanitize(obj, strict=False, allow_bson=False): """ This method cleans an input json-like object, either a list or a dict or some sequence, nested or otherwise, by converting all non-string From 0df27eb5b8522ea5e773fbad9f47c2a31bae7ee9 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 12:38:36 -0800 Subject: [PATCH 12/19] fix linting --- emmet-core/emmet/core/provenance.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py index 8b5268e372..b83bb6f330 100644 --- a/emmet-core/emmet/core/provenance.py +++ b/emmet-core/emmet/core/provenance.py @@ -130,13 +130,14 @@ def from_SNLs( # Aggregate all authors - Converting a single dictionary first # performs duplicate checking - authors = { + authors_dict = { entry["name"].lower(): entry["email"] for snl in snls for entry in snl["about"]["authors"] } authors = [ - {"name": name.title(), "email": email} for name, email in authors.items() + {"name": name.title(), "email": email} + for name, email in authors_dict.items() ] # Check if this entry is experimental From 9cdf1db9266524bba4b25d1a8d9a89247e6907b9 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 12:38:59 -0800 Subject: [PATCH 13/19] update mypy --- requirements-testing.txt | 2 +- setup.cfg | 20 +------------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/requirements-testing.txt b/requirements-testing.txt index fde65cc68e..e8387f199f 100644 --- a/requirements-testing.txt +++ b/requirements-testing.txt @@ -4,5 +4,5 @@ pytest-cov==2.8.1 pycodestyle==2.5.0 pydocstyle==5.0.2 flake8==3.7.9 -mypy==0.770 +mypy==0.812 mypy-extensions==0.4.3 diff --git a/setup.cfg b/setup.cfg index 75e9591afa..531b69dc09 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,23 +19,5 @@ profile=black [pydocstyle] ignore = D105,D2,D4 -[mypy-numpy.*] -ignore_missing_imports = True - -[mypy-bson.*] -ignore_missing_imports = True - -[mypy-pymatgen.*] -ignore_missing_imports = True - -[mypy-pytest] -ignore_missing_imports = True - -[mypy-monty.*] -ignore_missing_imports = True - -[mypy-pybtex.*] -ignore_missing_imports = True - -[mypy-ruamel.*] +[mypy] ignore_missing_imports = True From 0784751020e6a1c7f33257f069d691e06bed3b42 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Thu, 11 Mar 2021 15:24:12 -0800 Subject: [PATCH 14/19] revert mypy version --- requirements-testing.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-testing.txt b/requirements-testing.txt index e8387f199f..fde65cc68e 100644 --- a/requirements-testing.txt +++ b/requirements-testing.txt @@ -4,5 +4,5 @@ pytest-cov==2.8.1 pycodestyle==2.5.0 pydocstyle==5.0.2 flake8==3.7.9 -mypy==0.812 +mypy==0.770 mypy-extensions==0.4.3 From 026bc42e61fba2999427ca47f356193254267d2f Mon Sep 17 00:00:00 2001 From: Shyam Dwaraknath Date: Mon, 22 Mar 2021 16:33:05 -0700 Subject: [PATCH 15/19] Update emmet-core/emmet/core/vasp/material.py --- emmet-core/emmet/core/vasp/material.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet-core/emmet/core/vasp/material.py b/emmet-core/emmet/core/vasp/material.py index 7613abd3c4..8d2dfbeb8f 100644 --- a/emmet-core/emmet/core/vasp/material.py +++ b/emmet-core/emmet/core/vasp/material.py @@ -54,7 +54,7 @@ def from_tasks( quality_scores: quality scores for various calculation types use_statics: Use statics to define a material """ - if task_group == 0: + if len(task_group) == 0: raise Exception("Must have more than one task in the group.") # Material ID From aaa394206954f021f6765b3857f9a523d9be1ec4 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Mon, 22 Mar 2021 16:50:49 -0700 Subject: [PATCH 16/19] fix import --- tests/emmet-core/test_provenance.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/emmet-core/test_provenance.py b/tests/emmet-core/test_provenance.py index 0db465eab3..30111c5896 100644 --- a/tests/emmet-core/test_provenance.py +++ b/tests/emmet-core/test_provenance.py @@ -1,8 +1,7 @@ from datetime import datetime import pytest -from pymatgen import Element, Lattice -from pymatgen.core import Structure +from pymatgen.core import Element, Lattice, Structure from pymatgen.util.provenance import Author, HistoryNode, StructureNL from emmet.core.provenance import Database, ProvenanceDoc From ae6306001abfc87b787724141d3654890825ba47 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Mon, 22 Mar 2021 16:59:25 -0700 Subject: [PATCH 17/19] fix missing test file --- tests/emmet-core/test_settings.py | 2 +- tests/test_files/test_settings.json | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 tests/test_files/test_settings.json diff --git a/tests/emmet-core/test_settings.py b/tests/emmet-core/test_settings.py index c98c8aeaca..0a9df9b37c 100644 --- a/tests/emmet-core/test_settings.py +++ b/tests/emmet-core/test_settings.py @@ -38,7 +38,7 @@ def test_from_url(): os.environ[ "EMMET_CONFIG_FILE" - ] = "https://raw.githubusercontent.com/materialsproject/emmet/master/tests/emmet-core/test_settings.json" + ] = "https://raw.githubusercontent.com/materialsproject/emmet/master/tests/test_files/test_settings.json" test_config = EmmetSettings() diff --git a/tests/test_files/test_settings.json b/tests/test_files/test_settings.json new file mode 100644 index 0000000000..087cb5ae6d --- /dev/null +++ b/tests/test_files/test_settings.json @@ -0,0 +1,3 @@ +{ + "ANGLE_TOL": 1.0 +} From c45f44870d2b32b68545e240a30568138dff17e9 Mon Sep 17 00:00:00 2001 From: Shyam D Date: Mon, 22 Mar 2021 17:05:11 -0700 Subject: [PATCH 18/19] use conventional standard structure from VASP --- emmet-core/emmet/core/vasp/material.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/emmet-core/emmet/core/vasp/material.py b/emmet-core/emmet/core/vasp/material.py index 8d2dfbeb8f..1eac69d829 100644 --- a/emmet-core/emmet/core/vasp/material.py +++ b/emmet-core/emmet/core/vasp/material.py @@ -4,6 +4,7 @@ from typing import ClassVar, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union from pydantic import BaseModel, Field, create_model +from pymatgen.analysis.structure_analyzer import SpacegroupAnalyzer from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher from pymatgen.core import Structure from pymatgen.entries.computed_entries import ComputedStructureEntry @@ -105,7 +106,9 @@ def _structure_eval(task: TaskDocument): ) best_structure_calc = sorted(structure_calcs, key=_structure_eval)[0] - structure = best_structure_calc.output.structure + structure = SpacegroupAnalyzer( + best_structure_calc.output.structure, symprec=0.1 + ).get_conventional_standard_structure() # Initial Structures initial_structures = [task.input.structure for task in task_group] From 30566daaf0af0b72b4efcb7ae24a38d2c506f32b Mon Sep 17 00:00:00 2001 From: Shyam D Date: Mon, 22 Mar 2021 17:09:45 -0700 Subject: [PATCH 19/19] fix linting issues --- emmet-builders/emmet/builders/vasp/thermo.py | 5 ++++- emmet-builders/setup.py | 1 + emmet-cli/emmet/cli/calc.py | 3 ++- emmet-cli/emmet/cli/decorators.py | 5 ++++- emmet-cli/emmet/cli/entry_point.py | 9 +++++++-- emmet-cli/emmet/cli/tasks.py | 18 ++++++++++++------ emmet-cli/emmet/cli/utils.py | 18 +++++++++++++----- 7 files changed, 43 insertions(+), 16 deletions(-) diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py index 124c83222d..191a9955d6 100644 --- a/emmet-builders/emmet/builders/vasp/thermo.py +++ b/emmet-builders/emmet/builders/vasp/thermo.py @@ -19,6 +19,7 @@ from emmet.core.thermo import ThermoDoc from emmet.core.vasp.calc_types import run_type + class Thermo(Builder): def __init__( self, @@ -143,7 +144,9 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]): ) return [] except Exception as e: - self.logger.error(f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}") + self.logger.error( + f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}" + ) return [] return [d.dict() for d in docs] diff --git a/emmet-builders/setup.py b/emmet-builders/setup.py index ae5d444266..2f1655a2a4 100644 --- a/emmet-builders/setup.py +++ b/emmet-builders/setup.py @@ -1,6 +1,7 @@ import datetime from pathlib import Path from setuptools import setup, find_namespace_packages + required = [] with open(Path(__file__).parent / "requirements.txt") as f: diff --git a/emmet-cli/emmet/cli/calc.py b/emmet-cli/emmet/cli/calc.py index f17a2d37a7..5b33d72e34 100644 --- a/emmet-cli/emmet/cli/calc.py +++ b/emmet-cli/emmet/cli/calc.py @@ -37,6 +37,7 @@ def get_format(fname): def load_canonical_structures(ctx, full_name, formula): from emmet.core.vasp.calc_types import task_type # TODO import error + collection = ctx.obj["COLLECTIONS"][full_name] if formula not in canonical_structures[full_name]: @@ -169,7 +170,7 @@ def calc(ctx, specs, nmax, skip): help="Author to assign to all structures.", ) @click.pass_context -def prep(ctx, archive, authors): +def prep(ctx, archive, authors): # noqa: C901 """prep structures from an archive for submission""" run = ctx.obj["RUN"] collections = ctx.obj["COLLECTIONS"] diff --git a/emmet-cli/emmet/cli/decorators.py b/emmet-cli/emmet/cli/decorators.py index 22bfb31f12..ea9f60dd2a 100644 --- a/emmet-cli/emmet/cli/decorators.py +++ b/emmet-cli/emmet/cli/decorators.py @@ -106,7 +106,10 @@ def wrapper(*args, **kwargs): run = ctx.grand_parent.params["run"] ntries = ctx.grand_parent.params["ntries"] if run: - click.secho(f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.", fg="green") + click.secho( + f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.", + fg="green", + ) directory = ctx.parent.params.get("directory") if not directory: diff --git a/emmet-cli/emmet/cli/entry_point.py b/emmet-cli/emmet/cli/entry_point.py index 610998c28f..0e8cd963c4 100644 --- a/emmet-cli/emmet/cli/entry_point.py +++ b/emmet-cli/emmet/cli/entry_point.py @@ -31,7 +31,12 @@ def opt_prompt(): @click.option("--run", is_flag=True, help="Run DB/filesystem write operations.") @click.option("--issue", type=int, help="Production tracker issue (required if --run).") @click.option("--sbatch", is_flag=True, help="Switch to SBatch mode.") -@click.option("--ntries", default=1, show_default=True, help="Number of jobs (for walltime > 48h).") +@click.option( + "--ntries", + default=1, + show_default=True, + help="Number of jobs (for walltime > 48h).", +) @click.option("--bb", is_flag=True, help="Use burst buffer.") @click.option("--yes", is_flag=True, help="Automatic yes to all prompts.") @click.option("--no-dupe-check", is_flag=True, help="Skip duplicate check(s).") @@ -66,7 +71,7 @@ def emmet(spec_or_dbfile, run, issue, sbatch, ntries, bb, yes, no_dupe_check, ve if run: if not issue: - raise EmmetCliError(f"Need issue number via --issue!") + raise EmmetCliError("Need issue number via --issue!") ctx.obj["LOG_STREAM"] = StringIO() memory_handler = logging.StreamHandler(ctx.obj["LOG_STREAM"]) diff --git a/emmet-cli/emmet/cli/tasks.py b/emmet-cli/emmet/cli/tasks.py index 47e75fc386..02e9d1fa59 100644 --- a/emmet-cli/emmet/cli/tasks.py +++ b/emmet-cli/emmet/cli/tasks.py @@ -110,11 +110,15 @@ def check_pattern(nested_allowed=False): if not nested_allowed and os.sep in pattern: raise EmmetCliError(f"Nested pattern ({pattern}) not allowed!") elif not any(pattern.startswith(p) for p in PREFIXES): - raise EmmetCliError(f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!") + raise EmmetCliError( + f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!" + ) def load_block_launchers(): - prefix = "block_" # TODO old prefixes (e.g. res/aflow) might not be needed for backup + prefix = ( + "block_" # TODO old prefixes (e.g. res/aflow) might not be needed for backup + ) block_launchers = defaultdict(list) gen = VaspDirsGenerator() for idx, vasp_dir in enumerate(gen): @@ -136,7 +140,7 @@ def extract_filename(line): @sbatch @click.option("--clean", is_flag=True, help="Remove original launchers.") @click.option("--check", is_flag=True, help="Check backup consistency.") -def backup(clean, check): +def backup(clean, check): # noqa: C901 """Backup directory to HPSS""" ctx = click.get_current_context() run = ctx.parent.parent.params["run"] @@ -232,7 +236,7 @@ def backup(clean, check): default=FILE_FILTERS_DEFAULT, help="Set the file filter(s) to match files against in each launcher.", ) -def restore(inputfile, file_filter): +def restore(inputfile, file_filter): # noqa: C901 """Restore launchers from HPSS""" ctx = click.get_current_context() run = ctx.parent.parent.params["run"] @@ -357,7 +361,7 @@ def restore(inputfile, file_filter): default=STORE_VOLUMETRIC_DATA, help="Store any of CHGCAR, LOCPOT, AECCAR0, AECCAR1, AECCAR2, ELFCAR.", ) -def parse(task_ids, snl_metas, nproc, store_volumetric_data): +def parse(task_ids, snl_metas, nproc, store_volumetric_data): # noqa: C901 """Parse VASP launchers into tasks""" ctx = click.get_current_context() if "CLIENT" not in ctx.obj: @@ -398,7 +402,9 @@ def parse(task_ids, snl_metas, nproc, store_volumetric_data): # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs # NOTE use regex first to reduce size of distinct below 16MB q = {"task_id": {"$regex": r"^mp-\d{7,}$"}} - all_task_ids = [t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})] + all_task_ids = [ + t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1}) + ] if not all_task_ids: all_task_ids = target.collection.distinct("task_id") diff --git a/emmet-cli/emmet/cli/utils.py b/emmet-cli/emmet/cli/utils.py index 1155b09efc..edf3e5e760 100644 --- a/emmet-cli/emmet/cli/utils.py +++ b/emmet-cli/emmet/cli/utils.py @@ -327,7 +327,7 @@ def reconstruct_command(sbatch=False): return " ".join(command).strip().strip("\\") -def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas): +def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas): # noqa: C901 process = multiprocessing.current_process() name = process.name chunk_idx = int(name.rsplit("-")[1]) - 1 @@ -345,7 +345,7 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas): count = 0 drone = VaspDrone( additional_fields={"tags": tags}, - store_volumetric_data=ctx.params['store_volumetric_data'] + store_volumetric_data=ctx.params["store_volumetric_data"], ) for vaspdir in vaspdirs: @@ -393,7 +393,9 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas): snl_meta = snl_metas.get(launcher) if snl_meta: references = snl_meta.get("references") - authors = snl_meta.get("authors", ["Materials Project "]) + authors = snl_meta.get( + "authors", ["Materials Project "] + ) kwargs = {"projects": [tag]} if references: kwargs["references"] = references @@ -416,7 +418,11 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas): target.insert_task(task_doc, use_gridfs=True) except DocumentTooLarge: output = dotty(task_doc["calcs_reversed"][0]["output"]) - pop_keys = ["normalmode_eigenvecs", "force_constants", "outcar.onsite_density_matrices"] + pop_keys = [ + "normalmode_eigenvecs", + "force_constants", + "outcar.onsite_density_matrices", + ] for k in pop_keys: if k not in output: @@ -436,7 +442,9 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas): if target.collection.count(query): if snl_dct: result = snl_collection.insert_one(snl_dct) - logger.info(f"SNL {result.inserted_id} inserted into {snl_collection.full_name}.") + logger.info( + f"SNL {result.inserted_id} inserted into {snl_collection.full_name}." + ) shutil.rmtree(vaspdir) logger.info(f"{name} Successfully parsed and removed {launcher}.")