Commit 2139c36

Merge remote-tracking branch 'origin/main' into elec_dev

jmmshn committed Mar 23, 2021
2 parents f2c8a87 + 30566da
Showing 16 changed files with 454 additions and 57 deletions.
213 changes: 213 additions & 0 deletions emmet-builders/emmet/builders/materials/provenance.py
@@ -0,0 +1,213 @@
from collections import defaultdict
from itertools import chain
from typing import Dict, Iterator, List, Optional, Tuple, Union

import numpy as np
from maggma.core import Builder, Store
from maggma.utils import grouper
from pymatgen.analysis.structure_matcher import StructureMatcher
from pymatgen.core import Structure
from pymatgen.util.provenance import StructureNL

from emmet.builders import SETTINGS
from emmet.builders.settings import EmmetBuildSettings
from emmet.core.provenance import ProvenanceDoc
from emmet.core.utils import group_structures
from emmet.core.vasp.calc_types import run_type, task_type
from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc


class ProvenanceBuilder(Builder):
    def __init__(
        self,
        materials: Store,
        provenance: Store,
        source_snls: List[Store],
        settings: Optional[EmmetBuildSettings] = None,
        query: Optional[Dict] = None,
        **kwargs,
    ):
        """
        Creates provenance from source SNLs and materials

        Args:
            materials: Store of materials docs to tag with SNLs
            provenance: Store to update with provenance data
            source_snls: List of locations to grab SNLs
            settings: EmmetBuildSettings to use; autoloaded if not given
            query: query on materials to limit search
        """
        self.materials = materials
        self.provenance = provenance
        self.source_snls = source_snls
        self.settings = EmmetBuildSettings.autoload(settings)
        self.query = query or {}  # guard against None so it can be merged below
        self.kwargs = kwargs

        super().__init__(
            sources=[materials, *source_snls], targets=[provenance], **kwargs
        )

    def ensure_indices(self):
        self.materials.ensure_index("material_id", unique=True)
        self.materials.ensure_index("formula_pretty")

        self.provenance.ensure_index("material_id", unique=True)
        self.provenance.ensure_index("formula_pretty")

        for s in self.source_snls:
            s.ensure_index("snl_id")
            s.ensure_index("formula_pretty")

    def get_items(self) -> Iterator[Tuple[List[Dict], List[Dict]]]:
        """
        Gets all materials to associate with SNLs

        Returns:
            generator of materials and SNLs that could match
        """
        self.logger.info("Provenance Builder Started")

        self.logger.info("Setting indexes")
        self.ensure_indices()

        # Find all formulas for materials that have been updated since this
        # builder was last run
        q = {**self.query, "property_name": ProvenanceDoc.property_name}
        updated_materials = self.provenance.newer_in(
            self.materials,
            criteria=q,
            exhaustive=True,
        )
        forms_to_update = set(
            self.materials.distinct(
                "formula_pretty", {"material_id": {"$in": updated_materials}}
            )
        )

        # Find all new SNL formulas since the builder was last run
        for source in self.source_snls:
            new_snls = self.provenance.newer_in(source)
            forms_to_update |= set(source.distinct("formula_pretty", new_snls))

        # Now reduce to the set of formulas we actually have
        forms_avail = set(self.materials.distinct("formula_pretty", self.query))
        forms_to_update = forms_to_update & forms_avail

        self.logger.info(f"Found {len(forms_to_update)} new/updated systems to process")

        self.total = len(forms_to_update)

        for formulas in grouper(forms_to_update, self.chunk_size):
            snls = []
            for source in self.source_snls:
                snls.extend(
                    source.query(criteria={"formula_pretty": {"$in": formulas}})
                )

            mats = list(
                self.materials.query(
                    properties=[
                        "material_id",
                        "last_updated",
                        "structure",
                        "initial_structures",
                        "formula_pretty",
                    ],
                    criteria={"formula_pretty": {"$in": formulas}},
                )
            )

            # Bucket SNLs and materials by formula so each yielded item is
            # one formula's worth of matching work
            form_groups = defaultdict(list)
            for snl in snls:
                form_groups[snl["formula_pretty"]].append(snl)

            mat_groups = defaultdict(list)
            for mat in mats:
                mat_groups[mat["formula_pretty"]].append(mat)

            for formula, snl_group in form_groups.items():
                mat_group = mat_groups[formula]

                self.logger.debug(
                    f"Found {len(snl_group)} SNLs and {len(mat_group)} materials"
                )
                yield mat_group, snl_group

def process_item(self, item) -> List[Dict]:
"""
Matches SNLS and Materials
Args:
item (tuple): a tuple of materials and snls
Returns:
list(dict): a list of collected snls with material ids
"""
mats, source_snls = item
formula_pretty = mats[0]["formula_pretty"]
snl_docs = list()
self.logger.debug(f"Finding Provenance {formula_pretty}")

# Match up SNLS with materials
for mat in mats:
matched_snls = list(self.match(source_snls, mat))
if len(matched_snls) > 0:
doc = ProvenanceDoc.from_SNLs(
material_id=mat["material_id"], snls=matched_snls
)

doc.authors.append(self.settings.DEFAULT_AUTHOR)
doc.history.append(self.settings.DEFAULT_HISTORY)
doc.references.append(self.settings.DEFAULT_REFERENCE)

snl_docs.append(doc.dict())

return snl_docs

    def match(self, snls, mat):
        """
        Finds the SNLs that structure-match a given materials doc

        Args:
            snls ([dict]): the list of SNLs to match against
            mat (dict): a materials doc

        Returns:
            list of SNLs that match the material
        """
        m_strucs = [Structure.from_dict(mat["structure"])] + [
            Structure.from_dict(init_struc) for init_struc in mat["initial_structures"]
        ]
        snl_strucs = [StructureNL.from_dict(snl) for snl in snls]

        groups = group_structures(
            m_strucs + snl_strucs,
            ltol=self.settings.LTOL,
            stol=self.settings.STOL,
            angle_tol=self.settings.ANGLE_TOL,
        )
        # Keep only groups that contain at least one of the material's structures
        matched_groups = [
            group
            for group in groups
            if any(isinstance(struc, Structure) for struc in group)
        ]
        # Collect the SNLs from those groups; the outer `for group` clause
        # must come first so `group` is bound before `for struc in group`
        snls = [
            struc
            for group in matched_groups
            for struc in group
            if isinstance(struc, StructureNL)
        ]

        self.logger.debug(f"Found {len(snls)} SNLs for {mat['material_id']}")
        return snls

    def update_targets(self, items):
        """
        Inserts the new provenance docs into the provenance collection
        """
        snls = list(filter(None, chain.from_iterable(items)))

        if len(snls) > 0:
            self.logger.info(f"Found {len(snls)} SNLs to update")
            self.provenance.update(snls)
        else:
            self.logger.info("No items to update")
29 changes: 29 additions & 0 deletions emmet-builders/emmet/builders/settings.py
@@ -5,6 +5,7 @@
from pydantic.fields import Field
from emmet.core.settings import EmmetSettings
from emmet.core.vasp.calc_types import TaskType
from emmet.core.provenance import Author, History


class EmmetBuildSettings(EmmetSettings):
@@ -30,3 +31,31 @@ class EmmetBuildSettings(EmmetSettings):
        [t.value for t in TaskType],
        description="Allowed task_types to build materials from",
    )

    DEFAULT_REFERENCE: str = Field(
        "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
        "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "
        "Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David "
        "and Ceder, Gerbrand and Persson, Kristin a.},\n"
        "doi = {10.1063/1.4812323},\nissn = {2166532X},\n"
        "journal = {APL Materials},\nnumber = {1},\npages = {011002},\n"
        "title = {{The Materials Project: A materials genome approach to "
        "accelerating materials innovation}},\n"
        "url = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\n"
        "volume = {1},\nyear = {2013}\n}\n\n@misc{MaterialsProject,\n"
        "title = {{Materials Project}},\nurl = {http://www.materialsproject.org}\n}",
        description="Default bibtex citation for all provenance",
    )

    DEFAULT_AUTHOR: Author = Field(
        Author(name="Materials Project", email="feedback@materialsproject.org"),
        description="Default Author for provenance",
    )

    DEFAULT_HISTORY: History = Field(
        History(
            name="Materials Project Optimized Structure",
            url="http://www.materialsproject.org",
        ),
        description="Default History for provenance",
    )
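
As a hedged illustration of how these defaults feed the builder (the values are made up; the mechanism is a plain pydantic field override), a deployment could swap in its own provenance author:

# Sketch: overriding the new provenance defaults; values are illustrative.
from emmet.builders.settings import EmmetBuildSettings
from emmet.core.provenance import Author

settings = EmmetBuildSettings(
    DEFAULT_AUTHOR=Author(name="My Group", email="admin@example.org")
)
# ProvenanceBuilder resolves this via EmmetBuildSettings.autoload(settings),
# so every ProvenanceDoc it emits is tagged with this author instead.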
5 changes: 4 additions & 1 deletion emmet-builders/emmet/builders/vasp/thermo.py
@@ -19,6 +19,7 @@
from emmet.core.thermo import ThermoDoc
from emmet.core.vasp.calc_types import run_type


class Thermo(Builder):
    def __init__(
        self,
@@ -143,7 +144,9 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
            )
            return []
        except Exception as e:
-            self.logger.error(f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}")
+            self.logger.error(
+                f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}"
+            )
            return []

        return [d.dict() for d in docs]
1 change: 1 addition & 0 deletions emmet-builders/setup.py
@@ -1,6 +1,7 @@
import datetime
from pathlib import Path
from setuptools import setup, find_namespace_packages

required = []

with open(Path(__file__).parent / "requirements.txt") as f:
3 changes: 2 additions & 1 deletion emmet-cli/emmet/cli/calc.py
@@ -37,6 +37,7 @@ def get_format(fname):

def load_canonical_structures(ctx, full_name, formula):
    from emmet.core.vasp.calc_types import task_type  # TODO import error

    collection = ctx.obj["COLLECTIONS"][full_name]

    if formula not in canonical_structures[full_name]:
@@ -169,7 +170,7 @@ def calc(ctx, specs, nmax, skip):
help="Author to assign to all structures.",
)
@click.pass_context
def prep(ctx, archive, authors):
def prep(ctx, archive, authors): # noqa: C901
"""prep structures from an archive for submission"""
run = ctx.obj["RUN"]
collections = ctx.obj["COLLECTIONS"]
5 changes: 4 additions & 1 deletion emmet-cli/emmet/cli/decorators.py
@@ -106,7 +106,10 @@ def wrapper(*args, **kwargs):
        run = ctx.grand_parent.params["run"]
        ntries = ctx.grand_parent.params["ntries"]
        if run:
-            click.secho(f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.", fg="green")
+            click.secho(
+                f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.",
+                fg="green",
+            )

        directory = ctx.parent.params.get("directory")
        if not directory:
9 changes: 7 additions & 2 deletions emmet-cli/emmet/cli/entry_point.py
@@ -31,7 +31,12 @@ def opt_prompt():
@click.option("--run", is_flag=True, help="Run DB/filesystem write operations.")
@click.option("--issue", type=int, help="Production tracker issue (required if --run).")
@click.option("--sbatch", is_flag=True, help="Switch to SBatch mode.")
@click.option("--ntries", default=1, show_default=True, help="Number of jobs (for walltime > 48h).")
@click.option(
"--ntries",
default=1,
show_default=True,
help="Number of jobs (for walltime > 48h).",
)
@click.option("--bb", is_flag=True, help="Use burst buffer.")
@click.option("--yes", is_flag=True, help="Automatic yes to all prompts.")
@click.option("--no-dupe-check", is_flag=True, help="Skip duplicate check(s).")
@@ -66,7 +71,7 @@ def emmet(spec_or_dbfile, run, issue, sbatch, ntries, bb, yes, no_dupe_check, ve

    if run:
        if not issue:
-            raise EmmetCliError(f"Need issue number via --issue!")
+            raise EmmetCliError("Need issue number via --issue!")

    ctx.obj["LOG_STREAM"] = StringIO()
    memory_handler = logging.StreamHandler(ctx.obj["LOG_STREAM"])
18 changes: 12 additions & 6 deletions emmet-cli/emmet/cli/tasks.py
@@ -110,11 +110,15 @@ def check_pattern(nested_allowed=False):
    if not nested_allowed and os.sep in pattern:
        raise EmmetCliError(f"Nested pattern ({pattern}) not allowed!")
    elif not any(pattern.startswith(p) for p in PREFIXES):
-        raise EmmetCliError(f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!")
+        raise EmmetCliError(
+            f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!"
+        )


def load_block_launchers():
-    prefix = "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    prefix = (
+        "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    )
    block_launchers = defaultdict(list)
    gen = VaspDirsGenerator()
    for idx, vasp_dir in enumerate(gen):
@@ -136,7 +140,7 @@ def extract_filename(line):
@sbatch
@click.option("--clean", is_flag=True, help="Remove original launchers.")
@click.option("--check", is_flag=True, help="Check backup consistency.")
-def backup(clean, check):
+def backup(clean, check):  # noqa: C901
    """Backup directory to HPSS"""
    ctx = click.get_current_context()
    run = ctx.parent.parent.params["run"]
@@ -232,7 +236,7 @@ def backup(clean, check):
    default=FILE_FILTERS_DEFAULT,
    help="Set the file filter(s) to match files against in each launcher.",
)
-def restore(inputfile, file_filter):
+def restore(inputfile, file_filter):  # noqa: C901
    """Restore launchers from HPSS"""
    ctx = click.get_current_context()
    run = ctx.parent.parent.params["run"]
@@ -357,7 +361,7 @@ def restore(inputfile, file_filter):
    default=STORE_VOLUMETRIC_DATA,
    help="Store any of CHGCAR, LOCPOT, AECCAR0, AECCAR1, AECCAR2, ELFCAR.",
)
-def parse(task_ids, snl_metas, nproc, store_volumetric_data):
+def parse(task_ids, snl_metas, nproc, store_volumetric_data):  # noqa: C901
    """Parse VASP launchers into tasks"""
    ctx = click.get_current_context()
    if "CLIENT" not in ctx.obj:
@@ -398,7 +402,9 @@ def parse(task_ids, snl_metas, nproc, store_volumetric_data):
    # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
    # NOTE use regex first to reduce size of distinct below 16MB
    q = {"task_id": {"$regex": r"^mp-\d{7,}$"}}
-    all_task_ids = [t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})]
+    all_task_ids = [
+        t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})
+    ]
    if not all_task_ids:
        all_task_ids = target.collection.distinct("task_id")