Skip to content

Commit

Permalink
Merge pull request #176 from materialsproject/provenance_builder
Browse files Browse the repository at this point in the history
Provenance Builder
  • Loading branch information
shyamd committed Mar 22, 2021
2 parents 2be2b1a + 0784751 commit 37016f7
Show file tree
Hide file tree
Showing 7 changed files with 406 additions and 41 deletions.
213 changes: 213 additions & 0 deletions emmet-builders/emmet/builders/materials/provenance.py
@@ -0,0 +1,213 @@
from collections import defaultdict
from itertools import chain
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
from maggma.core import Builder, Store
from maggma.utils import grouper
from pymatgen.analysis.structure_matcher import StructureMatcher
from pymatgen.core import Structure
from pymatgen.util.provenance import StructureNL

from emmet.builders import SETTINGS
from emmet.builders.settings import EmmetBuildSettings
from emmet.core.provenance import ProvenanceDoc
from emmet.core.utils import group_structures
from emmet.core.vasp.calc_types import run_type, task_type
from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc


class ProvenanceBuilder(Builder):
    """
    Builder that matches source SNLs to materials and writes provenance docs.

    For every formula that has new/updated materials or SNLs, the materials
    and SNLs sharing that formula are structure-matched, and each material
    with at least one matching SNL gets a ProvenanceDoc in the target store.
    """

    def __init__(
        self,
        materials: Store,
        provenance: Store,
        source_snls: List[Store],
        settings: Optional[EmmetBuildSettings] = None,
        query: Optional[Dict] = None,
        **kwargs,
    ):
        """
        Creates provenance from source SNLs and materials
        Args:
            materials: Store of materials docs to tag with SNLs
            provenance: Store to update with provenance data
            source_snls: List of locations to grab SNLs
            settings: build settings, passed to EmmetBuildSettings.autoload
            query : query on materials to limit search
            **kwargs: forwarded unchanged to the Builder base class
        """
        self.materials = materials
        self.provenance = provenance
        self.source_snls = source_snls
        self.settings = EmmetBuildSettings.autoload(settings)
        self.query = query
        self.kwargs = kwargs

        super().__init__(
            sources=[materials, *source_snls], targets=[provenance], **kwargs
        )

    def ensure_indicies(self):
        """
        Ensures the indexes this builder queries on exist in every store
        """
        self.materials.ensure_index("material_id", unique=True)
        self.materials.ensure_index("formula_pretty")

        self.provenance.ensure_index("material_id", unique=True)
        self.provenance.ensure_index("formula_pretty")

        for s in self.source_snls:
            s.ensure_index("snl_id")
            s.ensure_index("formula_pretty")

    def get_items(self) -> Tuple[List[Dict], List[Dict]]:
        """
        Gets all materials to associate with SNLs
        Returns:
            generator of (materials, SNLs) tuples, one per formula, where
            both lists share the same formula_pretty
        """
        self.logger.info("Provenance Builder Started")

        self.logger.info("Setting indexes")
        self.ensure_indicies()

        # Find all formulas for materials that have been updated since this
        # builder was last ran
        # self.query defaults to None, which cannot be unpacked with ** below
        base_query = self.query if self.query else {}
        q = {**base_query, "property_name": ProvenanceDoc.property_name}
        updated_materials = self.provenance.newer_in(
            self.materials,
            criteria=q,
            exhaustive=True,
        )
        forms_to_update = set(
            self.materials.distinct(
                "formula_pretty", {"material_id": {"$in": updated_materials}}
            )
        )

        # Find all new SNL formulas since the builder was last run
        for source in self.source_snls:
            new_snls = self.provenance.newer_in(source)
            # NOTE(review): new_snls (the value returned by newer_in) is passed
            # as the criteria argument of distinct -- confirm the store accepts
            # it in that position or build an explicit {"<key>": {"$in": ...}}
            forms_to_update |= set(source.distinct("formula_pretty", new_snls))

        # Now reduce to the set of formulas we actually have
        forms_avail = set(self.materials.distinct("formula_pretty", self.query))
        forms_to_update = forms_to_update & forms_avail

        self.logger.info(f"Found {len(forms_to_update)} new/updated systems to proces")

        self.total = len(forms_to_update)

        for formulas in grouper(forms_to_update, self.chunk_size):
            snls = []
            for source in self.source_snls:
                snls.extend(
                    source.query(criteria={"formula_pretty": {"$in": formulas}})
                )

            mats = list(
                self.materials.query(
                    properties=[
                        "material_id",
                        "last_updated",
                        "structure",
                        "initial_structures",
                        "formula_pretty",
                    ],
                    criteria={"formula_pretty": {"$in": formulas}},
                )
            )

            # Bucket both SNLs and materials by formula so that each yielded
            # item only contains candidates that could possibly match
            form_groups = defaultdict(list)
            for snl in snls:
                form_groups[snl["formula_pretty"]].append(snl)

            mat_groups = defaultdict(list)
            for mat in mats:
                mat_groups[mat["formula_pretty"]].append(mat)

            for formula, snl_group in form_groups.items():

                mat_group = mat_groups[formula]

                self.logger.debug(
                    f"Found {len(snl_group)} snls and {len(mat_group)} mats"
                )
                yield mat_group, snl_group

    def process_item(self, item) -> List[Dict]:
        """
        Matches SNLS and Materials
        Args:
            item (tuple): a tuple of materials and snls
        Returns:
            list(dict): a list of provenance docs (as dicts), one for each
                material that matched at least one SNL
        """
        mats, source_snls = item
        if not mats:
            # Nothing to tag; also avoids an IndexError on mats[0] below
            return []

        formula_pretty = mats[0]["formula_pretty"]
        snl_docs = list()
        self.logger.debug(f"Finding Provenance {formula_pretty}")

        # Match up SNLS with materials
        for mat in mats:
            matched_snls = list(self.match(source_snls, mat))
            if matched_snls:
                doc = ProvenanceDoc.from_SNLs(
                    material_id=mat["material_id"], snls=matched_snls
                )

                # Every provenance doc additionally carries the defaults
                # configured in the build settings
                doc.authors.append(self.settings.DEFAULT_AUTHOR)
                doc.history.append(self.settings.DEFAULT_HISTORY)
                doc.references.append(self.settings.DEFAULT_REFERENCE)

                snl_docs.append(doc.dict())

        return snl_docs

    def match(self, snls, mat):
        """
        Finds the SNLs whose structure matches the given material
        Args:
            snls ([dict]): the snls list
            mat (dict): a materials doc
        Returns:
            list of StructureNL objects grouped together with at least one
            of the material's structures
        """

        m_strucs = [Structure.from_dict(mat["structure"])] + [
            Structure.from_dict(init_struc) for init_struc in mat["initial_structures"]
        ]
        snl_strucs = [StructureNL.from_dict(snl) for snl in snls]

        groups = group_structures(
            m_strucs + snl_strucs,
            ltol=self.settings.LTOL,
            stol=self.settings.STOL,
            angle_tol=self.settings.ANGLE_TOL,
        )
        # Only groups containing one of the material's own structures are
        # relevant for this material
        matched_groups = [
            group
            for group in groups
            if any(isinstance(struc, Structure) for struc in group)
        ]
        # The outer loop must come first: the original clause order read
        # `group` before it was bound and raised a NameError
        matched_snls = [
            struc
            for group in matched_groups
            for struc in group
            if isinstance(struc, StructureNL)
        ]

        self.logger.debug(f"Found {len(matched_snls)} SNLs for {mat['material_id']}")
        return matched_snls

    def update_targets(self, items):
        """
        Inserts the new provenance docs into the provenance collection
        Args:
            items ([[dict]]): lists of docs as returned by process_item
        """

        # Flatten the per-item lists and drop any empty entries
        snls = list(filter(None, chain.from_iterable(items)))

        if snls:
            self.logger.info(f"Found {len(snls)} SNLs to update")
            self.provenance.update(snls)
        else:
            self.logger.info("No items to update")
29 changes: 29 additions & 0 deletions emmet-builders/emmet/builders/settings.py
Expand Up @@ -5,6 +5,7 @@
from pydantic.fields import Field
from emmet.core.settings import EmmetSettings
from emmet.core.vasp.calc_types import TaskType
from emmet.core.provenance import Author, History


class EmmetBuildSettings(EmmetSettings):
Expand All @@ -30,3 +31,31 @@ class EmmetBuildSettings(EmmetSettings):
[t.value for t in TaskType],
description="Allowed task_types to build materials from",
)

DEFAULT_REFERENCE: str = Field(
"@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
"Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "
"Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David "
"and Ceder, Gerbrand and Persson, Kristin a.},\n"
"doi = {10.1063/1.4812323},\nissn = {2166532X},\n"
"journal = {APL Materials},\nnumber = {1},\npages = {011002},\n"
"title = {{The Materials Project: A materials genome approach to "
"accelerating materials innovation}},\n"
"url = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\n"
"volume = {1},\nyear = {2013}\n}\n\n@misc{MaterialsProject,\n"
"title = {{Materials Project}},\nurl = {http://www.materialsproject.org}\n}",
description="Default bibtex citation for all provenance",
)

DEFAULT_AUTHOR: Author = Field(
Author(name="Materials Project", email="feedback@materialsproject.org"),
description="Default Author for provenance ",
)

DEFAULT_HISTORY: History = Field(
History(
name="Materials Project Optimized Structure",
url="http://www.materialsproject.org",
),
description="Default History for provenance ",
)

0 comments on commit 37016f7

Please sign in to comment.