From c7f8d1d8a794360d8e363b61f98679ca8423643b Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Sun, 7 Mar 2021 17:14:14 -0800
Subject: [PATCH 01/19] default bson_compatible types

---
 emmet-core/emmet/core/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emmet-core/emmet/core/utils.py b/emmet-core/emmet/core/utils.py
index 046e79946f..0baec7e683 100644
--- a/emmet-core/emmet/core/utils.py
+++ b/emmet-core/emmet/core/utils.py
@@ -62,7 +62,7 @@ def _get_sg(struc):
             yield group
 
 
-def jsanitize(obj, strict=False, allow_bson=False):
+def jsanitize(obj, strict=False, allow_bson=True):
     """
     This method cleans an input json-like object, either a list or a dict or
     some sequence, nested or otherwise, by converting all non-string

From a059d2be2f2e039db760759eb6d30a18310754a3 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Sun, 7 Mar 2021 17:15:41 -0800
Subject: [PATCH 02/19] update provenance doc structure

---
 emmet-core/emmet/core/provenance.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py
index 6e0da58e39..da70b82fd0 100644
--- a/emmet-core/emmet/core/provenance.py
+++ b/emmet-core/emmet/core/provenance.py
@@ -1,11 +1,15 @@
 """ Core definition of a Provenance Document """
+from collections import defaultdict
 from datetime import datetime
-from typing import ClassVar, Dict, List
+from typing import ClassVar, Dict, List, Union
 
 from pybtex.database import BibliographyData, parse_string
 from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator
+from pydash.objects import get
+from pymatgen.util.provenance import StructureNL
 
 from emmet.core.material_property import PropertyDoc
+from emmet.core.mpid import MPID
 from emmet.core.utils import ValueEnum
 
 
@@ -48,32 +52,31 @@ class Provenance(PropertyDoc):
     property_name: ClassVar[str] = "provenance"
 
     created_at: datetime = Field(
-        None,
+        ...,
         description="creation date for the first structure corresponding to this material",
     )
 
-    projects: List[str] = Field(
-        None, description="List of projects this material belongs to"
-    )
-    bibtex_string: str = Field(
-        None, description="Bibtex reference string for this material"
+    references: List[str] = Field(
+        None, description="Bibtex reference strings for this material"
     )
+
+    authors: List[Author] = Field(None, description="List of authors for this material")
+
     remarks: List[str] = Field(
         None, description="List of remarks for the provenance of this material"
     )
-    authors: List[Author] = Field(None, description="List of authors for this material")
 
     theoretical: bool = Field(
         True, description="If this material has any experimental provenance or not"
     )
 
-    database_IDs: Dict[Database, List[str]] = Field(
+    database_IDs: Dict[str, List[str]] = Field(
         None, description="Database IDs corresponding to this material"
     )
 
     history: List[History] = Field(
         None,
-        description="List of history nodes specifying the transformations or orignation of this material",
+        description="List of history nodes specifying the transformations or orignation of this material for the entry closest matching the material input",
     )
 
     @validator("authors")

From c6f415e03685f4ab04463012dd3474b2df9ddd6f Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Wed, 10 Mar 2021 12:03:40 -0800
Subject: [PATCH 03/19] add to provenance document

---
 emmet-core/emmet/core/provenance.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py
index da70b82fd0..3794c53f99 100644
--- a/emmet-core/emmet/core/provenance.py
+++ b/emmet-core/emmet/core/provenance.py
@@ -57,25 +57,27 @@ class Provenance(PropertyDoc):
     )
 
     references: List[str] = Field(
-        None, description="Bibtex reference strings for this material"
+        [], description="Bibtex reference strings for this material"
     )
 
-    authors: List[Author] = Field(None, description="List of authors for this material")
+    authors: List[Author] = Field([], description="List of authors for this material")
 
     remarks: List[str] = Field(
-        None, description="List of remarks for the provenance of this material"
+        [], description="List of remarks for the provenance of this material"
     )
 
+    tags: List[str] = Field([])
+
     theoretical: bool = Field(
         True, description="If this material has any experimental provenance or not"
     )
 
     database_IDs: Dict[str, List[str]] = Field(
-        None, description="Database IDs corresponding to this material"
+        dict(), description="Database IDs corresponding to this material"
     )
 
     history: List[History] = Field(
-        None,
+        [],
         description="List of history nodes specifying the transformations or orignation of this material for the entry closest matching the material input",
     )
 

From 528c508fb92cbe48913eacbdcf7d4ec886e293c9 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Wed, 10 Mar 2021 12:03:57 -0800
Subject: [PATCH 04/19] Add method to convert from SNLs

---
 emmet-core/emmet/core/provenance.py | 88 ++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py
index 3794c53f99..ccd03323f2 100644
--- a/emmet-core/emmet/core/provenance.py
+++ b/emmet-core/emmet/core/provenance.py
@@ -6,11 +6,12 @@
 from pybtex.database import BibliographyData, parse_string
 from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator
 from pydash.objects import get
+from pymatgen.core import Structure
 from pymatgen.util.provenance import StructureNL
 
 from emmet.core.material_property import PropertyDoc
 from emmet.core.mpid import MPID
-from emmet.core.utils import ValueEnum
+from emmet.core.utils import ValueEnum, group_structures
 
 
 class Database(ValueEnum):
@@ -19,7 +20,7 @@ class Database(ValueEnum):
     """
 
     ICSD = "icsd"
-    PaulingFiles = "pf"
+    Pauling_Files = "pf"
     COD = "cod"
 
 
@@ -85,3 +86,86 @@ class Provenance(PropertyDoc):
     def remove_duplicate_authors(cls, authors):
         authors_dict = {entry.name.lower(): entry for entry in authors}
         return list(authors_dict.items())
+
+    @classmethod
+    def from_SNLs(
+        cls,
+        material_id: Union[MPID, int],
+        snls: List[StructureNL],
+    ) -> "Provenance":
+        """
+        Converts legacy Pymatgen SNLs into a single provenance document
+        """
+
+        # Choose earliest created_at
+        created_at = sorted(
+            [get(snl, "about.created_at.string", datetime.max) for snl in snls]
+        )[0]
+
+        # Choose earliest history
+        history = sorted(
+            snls, key=lambda snl: get(snl, "about.created_at.string", datetime.max)
+        )[0]["about"]["history"]
+
+        # Aggregate all references into one dict to remove duplicates
+        refs = {}
+        for snl in snls:
+            try:
+                entries = parse_string(snl["about"]["references"], bib_format="bibtex")
+                refs.update(entries.entries)
+            except Exception:
+                logger.debug(f"Failed parsing bibtex: {snl['about']['references']}")
+
+        bib_data = BibliographyData(entries=refs)
+        references = [ref.to_string("bibtex") for ref in bib_data.entries]
+
+        # TODO: Maybe we should combine this robocrystallographer?
+        # TODO: Refine these tags / remarks
+        remarks = list(
+            set([remark for snl in snls for remark in snl["about"]["remarks"]])
+        )
+        tags = [r for r in remarks if len(r) < 140]
+
+        # Aggregate all authors - Converting a single dictionary first
+        # performs duplicate checking
+        authors = {
+            entry["name"].lower(): entry["email"]
+            for snl in snls
+            for entry in snl["about"]["authors"]
+        }
+        authors = [
+            {"name": name.title(), "email": email} for name, email in authors.items()
+        ]
+
+        # Check if this entry is experimental
+        if any(get(snl, "about.history.0.experimental", False) for snl in snls):
+            experimental = True
+
+        # Aggregate all the database IDs
+        snl_ids = [snl.snl_id for snl in snls]
+        db_ids = {
+            Database[db_id]: [snl_id for snl_id in snl_ids if db_id in snl_id]
+            for db_id in map(str, Database)
+        }
+
+        # remove Nones and empty lists
+        db_ids = {k: list(filter(None, v)) for k, v in db_ids.items()}
+        db_ids = {k: v for k, v in db_ids.items() if len(v) > 0}
+
+        # Get experimental bool
+        experimental = any(
+            snl.get("about.history.0.experimental", False) for snl in snls
+        )
+
+        snl_fields = {
+            "created_at": created_at,
+            "references": references,
+            "authors": authors,
+            "remarks": remarks,
+            "tags": tags,
+            "database_IDs": db_ids,
+            "theoretical": not experimental,
+            "history": history,
+        }
+
+        return Provenance(material_id=material_id, **snl_fields)

From cea55df5bbadc93908df6900b3edba38fafe0d38 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 08:56:24 -0800
Subject: [PATCH 05/19] remove stale file

---
 tests/emmet-core/test_settings.json | 3 ---
 1 file changed, 3 deletions(-)
 delete mode 100644 tests/emmet-core/test_settings.json

diff --git a/tests/emmet-core/test_settings.json b/tests/emmet-core/test_settings.json
deleted file mode 100644
index 087cb5ae6d..0000000000
--- a/tests/emmet-core/test_settings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "ANGLE_TOL": 1.0
-}

From 637945ca5cca0acb2e1c6280aa9405f6e08b80b8 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 09:20:13 -0800
Subject: [PATCH 06/19] test for provenance from SNLs

---
 tests/emmet-core/test_provenance.py | 52 +++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 tests/emmet-core/test_provenance.py

diff --git a/tests/emmet-core/test_provenance.py b/tests/emmet-core/test_provenance.py
new file mode 100644
index 0000000000..0db465eab3
--- /dev/null
+++ b/tests/emmet-core/test_provenance.py
@@ -0,0 +1,52 @@
+from datetime import datetime
+
+import pytest
+from pymatgen import Element, Lattice
+from pymatgen.core import Structure
+from pymatgen.util.provenance import Author, HistoryNode, StructureNL
+
+from emmet.core.provenance import Database, ProvenanceDoc
+
+
+@pytest.fixture
+def structure():
+    """Build a one-site Fe structure on a cubic lattice with edge length 3.0."""
+    cubic = Lattice.cubic(3.0)
+    return Structure(lattice=cubic, species=["Fe"], coords=[[0, 0, 0]])
+
+
+@pytest.fixture
+def snls(structure):
+    """Three SNL dicts with distinct authors; two carry database-prefixed snl_ids."""
+    docs = [
+        StructureNL(
+            structure,
+            authors=[Author(f"test{i}", "test@test.com").as_dict()],
+            history=[HistoryNode("nothing", "url.com", {})],
+            created_at=datetime.utcnow(),
+        ).as_dict()
+        for i in range(3)
+    ]
+    docs[0]["snl_id"] = "icsd-2"
+    docs[1]["snl_id"] = "user-1"
+    docs[2]["snl_id"] = "pf-3"
+
+    return docs
+
+
+def test_from_snls(snls):
+    """from_SNLs should set material_id, database_IDs and the theoretical flag."""
+    doc = ProvenanceDoc.from_SNLs(material_id="mp-3", snls=snls)
+
+    assert isinstance(doc, ProvenanceDoc)
+    assert doc.property_name == "provenance"
+    assert doc.material_id == "mp-3"
+    assert doc.theoretical is True
+    assert doc.database_IDs == {
+        Database.ICSD: ["icsd-2"],
+        Database.Pauling_Files: ["pf-3"],
+    }
+
+    # Test experimental detection
+    snls[0]["about"]["history"][0]["experimental"] = True
+    assert ProvenanceDoc.from_SNLs(material_id="mp-3", snls=snls).theoretical is False

From f8efbac89ccd2fd9443b4581629fcfcd964d0c22 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 09:20:37 -0800
Subject: [PATCH 07/19] clean up

---
 .../emmet/builders/vasp/task_validator.py     |  3 ---
 emmet-core/emmet/core/provenance.py           | 20 +++++++++----------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/emmet-builders/emmet/builders/vasp/task_validator.py b/emmet-builders/emmet/builders/vasp/task_validator.py
index a2ab69b6c7..64f0029c8c 100644
--- a/emmet-builders/emmet/builders/vasp/task_validator.py
+++ b/emmet-builders/emmet/builders/vasp/task_validator.py
@@ -11,9 +11,6 @@
 from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc
 from emmet.builders.settings import EmmetBuildSettings
 
-__author__ = "Shyam Dwaraknath"
-__email__ = "shyamd@lbl.gov"
-
 
 class TaskValidator(MapBuilder):
     def __init__(
diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py
index ccd03323f2..254761ed4f 100644
--- a/emmet-core/emmet/core/provenance.py
+++ b/emmet-core/emmet/core/provenance.py
@@ -1,7 +1,7 @@
 """ Core definition of a Provenance Document """
 from collections import defaultdict
 from datetime import datetime
-from typing import ClassVar, Dict, List, Union
+from typing import ClassVar, Dict, List, Optional, Union
 
 from pybtex.database import BibliographyData, parse_string
 from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator
@@ -39,13 +39,13 @@ class History(BaseModel):
     """
 
     name: str
-    url: HttpUrl
-    description: Dict = Field(
+    url: str
+    description: Optional[Dict] = Field(
         None, description="Dictionary of exra data for this history node"
     )
 
 
-class Provenance(PropertyDoc):
+class ProvenanceDoc(PropertyDoc):
     """
     A provenance property block
     """
@@ -73,7 +73,7 @@ class Provenance(PropertyDoc):
         True, description="If this material has any experimental provenance or not"
     )
 
-    database_IDs: Dict[str, List[str]] = Field(
+    database_IDs: Dict[Database, List[str]] = Field(
         dict(), description="Database IDs corresponding to this material"
     )
 
@@ -91,7 +91,7 @@ def remove_duplicate_authors(cls, authors):
     def from_SNLs(
         cls,
         material_id: Union[MPID, int],
-        snls: List[StructureNL],
+        snls: List[Dict],
     ) -> "Provenance":
         """
         Converts legacy Pymatgen SNLs into a single provenance document
@@ -142,9 +142,9 @@ def from_SNLs(
             experimental = True
 
         # Aggregate all the database IDs
-        snl_ids = [snl.snl_id for snl in snls]
+        snl_ids = [snl.get("snl_id", "") for snl in snls]
         db_ids = {
-            Database[db_id]: [snl_id for snl_id in snl_ids if db_id in snl_id]
+            Database(db_id): [snl_id for snl_id in snl_ids if db_id in snl_id]
             for db_id in map(str, Database)
         }
 
@@ -154,7 +154,7 @@ def from_SNLs(
 
         # Get experimental bool
         experimental = any(
-            snl.get("about.history.0.experimental", False) for snl in snls
+            get(snl, "about.history.0.experimental", False) for snl in snls
         )
 
         snl_fields = {
@@ -168,4 +168,4 @@ def from_SNLs(
             "history": history,
         }
 
-        return Provenance(material_id=material_id, **snl_fields)
+        return ProvenanceDoc(material_id=material_id, **snl_fields)

From 618824af1aaf4edfdbaa07438c05eb43acdf7081 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 09:20:49 -0800
Subject: [PATCH 08/19] Add provenance settings to build settings

---
 emmet-builders/emmet/builders/settings.py | 29 +++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py
index 4b6c3cf6c0..72aa1cc6d1 100644
--- a/emmet-builders/emmet/builders/settings.py
+++ b/emmet-builders/emmet/builders/settings.py
@@ -5,6 +5,7 @@
 from pydantic.fields import Field
 from emmet.core.settings import EmmetSettings
 from emmet.core.vasp.calc_types import TaskType
+from emmet.core.provenance import Author, History
 
 
 class EmmetBuildSettings(EmmetSettings):
@@ -30,3 +31,31 @@ class EmmetBuildSettings(EmmetSettings):
         [t.value for t in TaskType],
         description="Allowed task_types to build materials from",
     )
+
+    DEFAULT_REFERENCE: str = Field(
+        "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
+        "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "
+        "Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David "
+        "and Ceder, Gerbrand and Persson, Kristin a.},\n"
+        "doi = {10.1063/1.4812323},\nissn = {2166532X},\n"
+        "journal = {APL Materials},\nnumber = {1},\npages = {011002},\n"
+        "title = {{The Materials Project: A materials genome approach to "
+        "accelerating materials innovation}},\n"
+        "url = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\n"
+        "volume = {1},\nyear = {2013}\n}\n\n@misc{MaterialsProject,\n"
+        "title = {{Materials Project}},\nurl = {http://www.materialsproject.org}\n}",
+        description="Default bibtex citation for all provenance",
+    )
+
+    DEFAULT_AUTHOR: Author = Field(
+        Author(name="Materials Project", email="feedback@materialsproject.org"),
+        description="Default Author for provenance ",
+    )
+
+    DEFAULT_HISTORY: History = Field(
+        History(
+            name="Materials Project Optimized Structure",
+            url="http://www.materialsproject.org",
+        ),
+        description="Default History for provenance ",
+    )

From 559806184844092a83e04cd666b15b4ac7ebed3f Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 12:19:30 -0800
Subject: [PATCH 09/19] Add provenance builder

---
 .../emmet/builders/materials/provenance.py    | 213 ++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 emmet-builders/emmet/builders/materials/provenance.py

diff --git a/emmet-builders/emmet/builders/materials/provenance.py b/emmet-builders/emmet/builders/materials/provenance.py
new file mode 100644
index 0000000000..e1e7552e6e
--- /dev/null
+++ b/emmet-builders/emmet/builders/materials/provenance.py
@@ -0,0 +1,213 @@
+from collections import defaultdict
+from itertools import chain
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+from maggma.core import Builder, Store
+from maggma.utils import grouper
+from pymatgen.analysis.structure_matcher import StructureMatcher
+from pymatgen.core import Structure
+from pymatgen.util.provenance import StructureNL
+
+from emmet.builders import SETTINGS
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.core.provenance import ProvenanceDoc
+from emmet.core.utils import group_structures
+from emmet.core.vasp.calc_types import run_type, task_type
+from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc
+
+
+class ProvenanceBuilder(Builder):
+    def __init__(
+        self,
+        materials: Store,
+        provenance: Store,
+        source_snls: List[Store],
+        settings: Optional[EmmetBuildSettings] = None,
+        query: Optional[Dict] = None,
+        **kwargs,
+    ):
+        """
+        Creates provenance from source SNLs and materials
+
+        Args:
+            materials: Store of materials docs to tag with SNLs
+            provenance: Store to update with provenance data
+            source_snls: List of locations to grab SNLs
+            settings: build settings passed to EmmetBuildSettings.autoload
+            query: query on materials to limit search; None means no restriction
+        """
+        self.materials = materials
+        self.provenance = provenance
+        self.source_snls = source_snls
+        self.settings = EmmetBuildSettings.autoload(settings)
+        self.query = query or {}  # must be a mapping: get_items unpacks it with **
+        self.kwargs = kwargs
+        super().__init__(
+            sources=[materials, *source_snls], targets=[provenance], **kwargs
+        )
+
+    def ensure_indicies(self):
+        """Ensure the indexes on materials, provenance and every SNL source store."""
+        self.materials.ensure_index("material_id", unique=True)
+        self.materials.ensure_index("formula_pretty")
+
+        self.provenance.ensure_index("material_id", unique=True)
+        self.provenance.ensure_index("formula_pretty")
+
+        for s in self.source_snls:
+            s.ensure_index("snl_id")
+            s.ensure_index("formula_pretty")
+
+    def get_items(self) -> Tuple[List[Dict], List[Dict]]:
+        """
+        Gets all materials to associate with SNLs
+        Returns:
+            generator of (materials, SNLs) groups sharing a formula_pretty
+        """
+        self.logger.info("Provenance Builder Started")
+
+        self.logger.info("Setting indexes")
+        self.ensure_indicies()
+
+        # Find all formulas for materials that have been updated since this
+        # builder was last run
+        q = {**self.query, "property_name": ProvenanceDoc.property_name}
+        updated_materials = self.provenance.newer_in(
+            self.materials,
+            criteria=q,
+            exhaustive=True,
+        )
+        forms_to_update = set(
+            self.materials.distinct(
+                "formula_pretty", {"material_id": {"$in": updated_materials}}
+            )
+        )
+
+        # Find all new SNL formulas since the builder was last run
+        for source in self.source_snls:
+            new_snls = self.provenance.newer_in(source)
+            forms_to_update |= set(source.distinct("formula_pretty", new_snls))
+
+        # Now reduce to the set of formulas we actually have
+        forms_avail = set(self.materials.distinct("formula_pretty", self.query))
+        forms_to_update = forms_to_update & forms_avail
+
+        self.logger.info(f"Found {len(forms_to_update)} new/updated systems to proces")
+
+        self.total = len(forms_to_update)
+
+        for formulas in grouper(forms_to_update, self.chunk_size):
+            snls = []
+            for source in self.source_snls:
+                snls.extend(
+                    source.query(criteria={"formula_pretty": {"$in": formulas}})
+                )
+
+            mats = list(
+                self.materials.query(
+                    properties=[
+                        "material_id",
+                        "last_updated",
+                        "structure",
+                        "initial_structures",
+                        "formula_pretty",
+                    ],
+                    criteria={"formula_pretty": {"$in": formulas}},
+                )
+            )
+
+            form_groups = defaultdict(list)
+            for snl in snls:
+                form_groups[snl["formula_pretty"]].append(snl)
+
+            mat_groups = defaultdict(list)
+            for mat in mats:
+                mat_groups[mat["formula_pretty"]].append(mat)
+
+            for formula, snl_group in form_groups.items():
+
+                mat_group = mat_groups[formula]
+
+                self.logger.debug(
+                    f"Found {len(snl_group)} snls and {len(mat_group)} mats"
+                )
+                yield mat_group, snl_group
+
+    def process_item(self, item) -> List[Dict]:
+        """
+        Matches SNLS and Materials
+        Args:
+            item (tuple): a tuple of materials and snls
+        Returns:
+            list(dict): one provenance doc (as a dict) per material with matched SNLs
+        """
+        mats, source_snls = item
+        formula_pretty = mats[0]["formula_pretty"]
+        snl_docs = list()
+        self.logger.debug(f"Finding Provenance {formula_pretty}")
+
+        # Match up SNLS with materials
+        for mat in mats:
+            matched_snls = list(self.match(source_snls, mat))
+            if len(matched_snls) > 0:
+                doc = ProvenanceDoc.from_SNLs(
+                    material_id=mat["material_id"], snls=matched_snls
+                )
+
+                doc.authors.append(self.settings.DEFAULT_AUTHOR)
+                doc.history.append(self.settings.DEFAULT_HISTORY)
+                doc.references.append(self.settings.DEFAULT_REFERENCE)
+
+                snl_docs.append(doc.dict())
+
+        return snl_docs
+
+    def match(self, snls, mat):
+        """
+        Finds the SNLs whose structures match the given material
+        Args:
+            snls ([dict]): the snls list
+            mat (dict): a materials doc
+        Returns:
+            list of StructureNL objects grouped with any of the material's structures
+        """
+
+        m_strucs = [Structure.from_dict(mat["structure"])] + [
+            Structure.from_dict(init_struc) for init_struc in mat["initial_structures"]
+        ]
+        snl_strucs = [StructureNL.from_dict(snl) for snl in snls]
+
+        groups = group_structures(
+            m_strucs + snl_strucs,
+            ltol=self.settings.LTOL,
+            stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL,
+        )
+        matched_groups = [
+            group
+            for group in groups
+            if any(isinstance(struc, Structure) for struc in group)
+        ]
+        snls = [
+            struc
+            for group in matched_groups  # outer loop first so `group` is bound
+            for struc in group
+            if isinstance(struc, StructureNL)
+        ]
+
+        self.logger.debug(f"Found {len(snls)} SNLs for {mat['material_id']}")
+        return snls
+
+    def update_targets(self, items):
+        """
+        Inserts the new provenance docs into the provenance store
+        """
+
+        snls = list(filter(None, chain.from_iterable(items)))
+
+        if len(snls) > 0:
+            self.logger.info(f"Found {len(snls)} SNLs to update")
+            self.provenance.update(snls)
+        else:
+            self.logger.info("No items to update")

From e3209b9c06e4c474ebc112d9796396c545bd8626 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 12:28:55 -0800
Subject: [PATCH 10/19] Fix linting problems

---
 emmet-core/emmet/core/provenance.py | 10 ++++++----
 emmet-core/emmet/core/vasp/task.py  |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py
index 254761ed4f..8b5268e372 100644
--- a/emmet-core/emmet/core/provenance.py
+++ b/emmet-core/emmet/core/provenance.py
@@ -1,10 +1,11 @@
 """ Core definition of a Provenance Document """
+import warnings
 from collections import defaultdict
 from datetime import datetime
 from typing import ClassVar, Dict, List, Optional, Union
 
 from pybtex.database import BibliographyData, parse_string
-from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator
+from pydantic import BaseModel, EmailStr, Field, validator
 from pydash.objects import get
 from pymatgen.core import Structure
 from pymatgen.util.provenance import StructureNL
@@ -79,7 +80,8 @@ class ProvenanceDoc(PropertyDoc):
 
     history: List[History] = Field(
         [],
-        description="List of history nodes specifying the transformations or orignation of this material for the entry closest matching the material input",
+        description="List of history nodes specifying the transformations or orignation"
+        " of this material for the entry closest matching the material input",
     )
 
     @validator("authors")
@@ -92,7 +94,7 @@ def from_SNLs(
         cls,
         material_id: Union[MPID, int],
         snls: List[Dict],
-    ) -> "Provenance":
+    ) -> "ProvenanceDoc":
         """
         Converts legacy Pymatgen SNLs into a single provenance document
         """
@@ -114,7 +116,7 @@ def from_SNLs(
                 entries = parse_string(snl["about"]["references"], bib_format="bibtex")
                 refs.update(entries.entries)
             except Exception:
-                logger.debug(f"Failed parsing bibtex: {snl['about']['references']}")
+                warnings.warn(f"Failed parsing bibtex: {snl['about']['references']}")
 
         bib_data = BibliographyData(entries=refs)
         references = [ref.to_string("bibtex") for ref in bib_data.entries]
diff --git a/emmet-core/emmet/core/vasp/task.py b/emmet-core/emmet/core/vasp/task.py
index c8b289582f..82bb793e6b 100644
--- a/emmet-core/emmet/core/vasp/task.py
+++ b/emmet-core/emmet/core/vasp/task.py
@@ -1,7 +1,7 @@
 """ Core definition of a VASP Task Document """
 from datetime import datetime
 from functools import lru_cache, partial
-from typing import ClassVar, Dict, List, Optional, Union, Any
+from typing import Any, ClassVar, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field, validator
 from pymatgen.analysis.magnetism import CollinearMagneticStructureAnalyzer, Ordering

From 1fc3845832f22a3f29a54743e5ebe0d69c402aa8 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 12:29:15 -0800
Subject: [PATCH 11/19] revert behavior and change in self-contained PR

---
 emmet-core/emmet/core/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emmet-core/emmet/core/utils.py b/emmet-core/emmet/core/utils.py
index 0baec7e683..046e79946f 100644
--- a/emmet-core/emmet/core/utils.py
+++ b/emmet-core/emmet/core/utils.py
@@ -62,7 +62,7 @@ def _get_sg(struc):
             yield group
 
 
-def jsanitize(obj, strict=False, allow_bson=True):
+def jsanitize(obj, strict=False, allow_bson=False):
     """
     This method cleans an input json-like object, either a list or a dict or
     some sequence, nested or otherwise, by converting all non-string

From 0df27eb5b8522ea5e773fbad9f47c2a31bae7ee9 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 12:38:36 -0800
Subject: [PATCH 12/19] fix linting

---
 emmet-core/emmet/core/provenance.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py
index 8b5268e372..b83bb6f330 100644
--- a/emmet-core/emmet/core/provenance.py
+++ b/emmet-core/emmet/core/provenance.py
@@ -130,13 +130,14 @@ def from_SNLs(
 
         # Aggregate all authors - Converting a single dictionary first
         # performs duplicate checking
-        authors = {
+        authors_dict = {
             entry["name"].lower(): entry["email"]
             for snl in snls
             for entry in snl["about"]["authors"]
         }
         authors = [
-            {"name": name.title(), "email": email} for name, email in authors.items()
+            {"name": name.title(), "email": email}
+            for name, email in authors_dict.items()
         ]
 
         # Check if this entry is experimental

From 9cdf1db9266524bba4b25d1a8d9a89247e6907b9 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 12:38:59 -0800
Subject: [PATCH 13/19] update mypy

---
 requirements-testing.txt |  2 +-
 setup.cfg                | 20 +-------------------
 2 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/requirements-testing.txt b/requirements-testing.txt
index fde65cc68e..e8387f199f 100644
--- a/requirements-testing.txt
+++ b/requirements-testing.txt
@@ -4,5 +4,5 @@ pytest-cov==2.8.1
 pycodestyle==2.5.0
 pydocstyle==5.0.2
 flake8==3.7.9
-mypy==0.770
+mypy==0.812
 mypy-extensions==0.4.3
diff --git a/setup.cfg b/setup.cfg
index 75e9591afa..531b69dc09 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,23 +19,5 @@ profile=black
 [pydocstyle]
 ignore = D105,D2,D4
 
-[mypy-numpy.*]
-ignore_missing_imports = True
-
-[mypy-bson.*]
-ignore_missing_imports = True
-
-[mypy-pymatgen.*]
-ignore_missing_imports = True
-
-[mypy-pytest]
-ignore_missing_imports = True
-
-[mypy-monty.*]
-ignore_missing_imports = True
-
-[mypy-pybtex.*]
-ignore_missing_imports = True
-
-[mypy-ruamel.*]
+[mypy]
 ignore_missing_imports = True

From 0784751020e6a1c7f33257f069d691e06bed3b42 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Thu, 11 Mar 2021 15:24:12 -0800
Subject: [PATCH 14/19] revert mypy version

---
 requirements-testing.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-testing.txt b/requirements-testing.txt
index e8387f199f..fde65cc68e 100644
--- a/requirements-testing.txt
+++ b/requirements-testing.txt
@@ -4,5 +4,5 @@ pytest-cov==2.8.1
 pycodestyle==2.5.0
 pydocstyle==5.0.2
 flake8==3.7.9
-mypy==0.812
+mypy==0.770
 mypy-extensions==0.4.3

From 026bc42e61fba2999427ca47f356193254267d2f Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Mon, 22 Mar 2021 16:33:05 -0700
Subject: [PATCH 15/19] Update emmet-core/emmet/core/vasp/material.py

---
 emmet-core/emmet/core/vasp/material.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emmet-core/emmet/core/vasp/material.py b/emmet-core/emmet/core/vasp/material.py
index 7613abd3c4..8d2dfbeb8f 100644
--- a/emmet-core/emmet/core/vasp/material.py
+++ b/emmet-core/emmet/core/vasp/material.py
@@ -54,7 +54,7 @@ def from_tasks(
             quality_scores: quality scores for various calculation types
             use_statics: Use statics to define a material
         """
-        if task_group == 0:
+        if len(task_group) == 0:
             raise Exception("Must have more than one task in the group.")
 
         # Material ID

From aaa394206954f021f6765b3857f9a523d9be1ec4 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Mon, 22 Mar 2021 16:50:49 -0700
Subject: [PATCH 16/19] fix import

---
 tests/emmet-core/test_provenance.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/emmet-core/test_provenance.py b/tests/emmet-core/test_provenance.py
index 0db465eab3..30111c5896 100644
--- a/tests/emmet-core/test_provenance.py
+++ b/tests/emmet-core/test_provenance.py
@@ -1,8 +1,7 @@
 from datetime import datetime
 
 import pytest
-from pymatgen import Element, Lattice
-from pymatgen.core import Structure
+from pymatgen.core import Element, Lattice, Structure
 from pymatgen.util.provenance import Author, HistoryNode, StructureNL
 
 from emmet.core.provenance import Database, ProvenanceDoc

From ae6306001abfc87b787724141d3654890825ba47 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Mon, 22 Mar 2021 16:59:25 -0700
Subject: [PATCH 17/19] fix missing test file

---
 tests/emmet-core/test_settings.py   | 2 +-
 tests/test_files/test_settings.json | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_files/test_settings.json

diff --git a/tests/emmet-core/test_settings.py b/tests/emmet-core/test_settings.py
index c98c8aeaca..0a9df9b37c 100644
--- a/tests/emmet-core/test_settings.py
+++ b/tests/emmet-core/test_settings.py
@@ -38,7 +38,7 @@ def test_from_url():
 
     os.environ[
         "EMMET_CONFIG_FILE"
-    ] = "https://raw.githubusercontent.com/materialsproject/emmet/master/tests/emmet-core/test_settings.json"
+    ] = "https://raw.githubusercontent.com/materialsproject/emmet/master/tests/test_files/test_settings.json"
 
     test_config = EmmetSettings()
 
diff --git a/tests/test_files/test_settings.json b/tests/test_files/test_settings.json
new file mode 100644
index 0000000000..087cb5ae6d
--- /dev/null
+++ b/tests/test_files/test_settings.json
@@ -0,0 +1,3 @@
+{
+    "ANGLE_TOL": 1.0
+}

From c45f44870d2b32b68545e240a30568138dff17e9 Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Mon, 22 Mar 2021 17:05:11 -0700
Subject: [PATCH 18/19] use conventional standard structure from VASP

---
 emmet-core/emmet/core/vasp/material.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/emmet-core/emmet/core/vasp/material.py b/emmet-core/emmet/core/vasp/material.py
index 8d2dfbeb8f..1eac69d829 100644
--- a/emmet-core/emmet/core/vasp/material.py
+++ b/emmet-core/emmet/core/vasp/material.py
@@ -4,6 +4,7 @@
 from typing import ClassVar, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union
 
 from pydantic import BaseModel, Field, create_model
+from pymatgen.analysis.structure_analyzer import SpacegroupAnalyzer
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
 from pymatgen.core import Structure
 from pymatgen.entries.computed_entries import ComputedStructureEntry
@@ -105,7 +106,9 @@ def _structure_eval(task: TaskDocument):
             )
 
         best_structure_calc = sorted(structure_calcs, key=_structure_eval)[0]
-        structure = best_structure_calc.output.structure
+        structure = SpacegroupAnalyzer(
+            best_structure_calc.output.structure, symprec=0.1
+        ).get_conventional_standard_structure()
 
         # Initial Structures
         initial_structures = [task.input.structure for task in task_group]

From 30566daaf0af0b72b4efcb7ae24a38d2c506f32b Mon Sep 17 00:00:00 2001
From: Shyam D <shyamd@lbl.gov>
Date: Mon, 22 Mar 2021 17:09:45 -0700
Subject: [PATCH 19/19] fix linting issues

---
 emmet-builders/emmet/builders/vasp/thermo.py |  5 ++++-
 emmet-builders/setup.py                      |  1 +
 emmet-cli/emmet/cli/calc.py                  |  3 ++-
 emmet-cli/emmet/cli/decorators.py            |  5 ++++-
 emmet-cli/emmet/cli/entry_point.py           |  9 +++++++--
 emmet-cli/emmet/cli/tasks.py                 | 18 ++++++++++++------
 emmet-cli/emmet/cli/utils.py                 | 18 +++++++++++++-----
 7 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py
index 124c83222d..191a9955d6 100644
--- a/emmet-builders/emmet/builders/vasp/thermo.py
+++ b/emmet-builders/emmet/builders/vasp/thermo.py
@@ -19,6 +19,7 @@
 from emmet.core.thermo import ThermoDoc
 from emmet.core.vasp.calc_types import run_type
 
+
 class Thermo(Builder):
     def __init__(
         self,
@@ -143,7 +144,9 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
             )
             return []
         except Exception as e:
-            self.logger.error(f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}")
+            self.logger.error(
+                f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}"
+            )
             return []
 
         return [d.dict() for d in docs]
diff --git a/emmet-builders/setup.py b/emmet-builders/setup.py
index ae5d444266..2f1655a2a4 100644
--- a/emmet-builders/setup.py
+++ b/emmet-builders/setup.py
@@ -1,6 +1,7 @@
 import datetime
 from pathlib import Path
 from setuptools import setup, find_namespace_packages
+
 required = []
 
 with open(Path(__file__).parent / "requirements.txt") as f:
diff --git a/emmet-cli/emmet/cli/calc.py b/emmet-cli/emmet/cli/calc.py
index f17a2d37a7..5b33d72e34 100644
--- a/emmet-cli/emmet/cli/calc.py
+++ b/emmet-cli/emmet/cli/calc.py
@@ -37,6 +37,7 @@ def get_format(fname):
 
 def load_canonical_structures(ctx, full_name, formula):
     from emmet.core.vasp.calc_types import task_type  # TODO import error
+
     collection = ctx.obj["COLLECTIONS"][full_name]
 
     if formula not in canonical_structures[full_name]:
@@ -169,7 +170,7 @@ def calc(ctx, specs, nmax, skip):
     help="Author to assign to all structures.",
 )
 @click.pass_context
-def prep(ctx, archive, authors):
+def prep(ctx, archive, authors):  # noqa: C901
     """prep structures from an archive for submission"""
     run = ctx.obj["RUN"]
     collections = ctx.obj["COLLECTIONS"]
diff --git a/emmet-cli/emmet/cli/decorators.py b/emmet-cli/emmet/cli/decorators.py
index 22bfb31f12..ea9f60dd2a 100644
--- a/emmet-cli/emmet/cli/decorators.py
+++ b/emmet-cli/emmet/cli/decorators.py
@@ -106,7 +106,10 @@ def wrapper(*args, **kwargs):
         run = ctx.grand_parent.params["run"]
         ntries = ctx.grand_parent.params["ntries"]
         if run:
-            click.secho(f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.", fg="green")
+            click.secho(
+                f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.",
+                fg="green",
+            )
 
         directory = ctx.parent.params.get("directory")
         if not directory:
diff --git a/emmet-cli/emmet/cli/entry_point.py b/emmet-cli/emmet/cli/entry_point.py
index 610998c28f..0e8cd963c4 100644
--- a/emmet-cli/emmet/cli/entry_point.py
+++ b/emmet-cli/emmet/cli/entry_point.py
@@ -31,7 +31,12 @@ def opt_prompt():
 @click.option("--run", is_flag=True, help="Run DB/filesystem write operations.")
 @click.option("--issue", type=int, help="Production tracker issue (required if --run).")
 @click.option("--sbatch", is_flag=True, help="Switch to SBatch mode.")
-@click.option("--ntries", default=1, show_default=True, help="Number of jobs (for walltime > 48h).")
+@click.option(
+    "--ntries",
+    default=1,
+    show_default=True,
+    help="Number of jobs (for walltime > 48h).",
+)
 @click.option("--bb", is_flag=True, help="Use burst buffer.")
 @click.option("--yes", is_flag=True, help="Automatic yes to all prompts.")
 @click.option("--no-dupe-check", is_flag=True, help="Skip duplicate check(s).")
@@ -66,7 +71,7 @@ def emmet(spec_or_dbfile, run, issue, sbatch, ntries, bb, yes, no_dupe_check, ve
 
     if run:
         if not issue:
-            raise EmmetCliError(f"Need issue number via --issue!")
+            raise EmmetCliError("Need issue number via --issue!")
 
         ctx.obj["LOG_STREAM"] = StringIO()
         memory_handler = logging.StreamHandler(ctx.obj["LOG_STREAM"])
diff --git a/emmet-cli/emmet/cli/tasks.py b/emmet-cli/emmet/cli/tasks.py
index 47e75fc386..02e9d1fa59 100644
--- a/emmet-cli/emmet/cli/tasks.py
+++ b/emmet-cli/emmet/cli/tasks.py
@@ -110,11 +110,15 @@ def check_pattern(nested_allowed=False):
     if not nested_allowed and os.sep in pattern:
         raise EmmetCliError(f"Nested pattern ({pattern}) not allowed!")
     elif not any(pattern.startswith(p) for p in PREFIXES):
-        raise EmmetCliError(f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!")
+        raise EmmetCliError(
+            f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!"
+        )
 
 
 def load_block_launchers():
-    prefix = "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    prefix = (
+        "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    )
     block_launchers = defaultdict(list)
     gen = VaspDirsGenerator()
     for idx, vasp_dir in enumerate(gen):
@@ -136,7 +140,7 @@ def extract_filename(line):
 @sbatch
 @click.option("--clean", is_flag=True, help="Remove original launchers.")
 @click.option("--check", is_flag=True, help="Check backup consistency.")
-def backup(clean, check):
+def backup(clean, check):  # noqa: C901
     """Backup directory to HPSS"""
     ctx = click.get_current_context()
     run = ctx.parent.parent.params["run"]
@@ -232,7 +236,7 @@ def backup(clean, check):
     default=FILE_FILTERS_DEFAULT,
     help="Set the file filter(s) to match files against in each launcher.",
 )
-def restore(inputfile, file_filter):
+def restore(inputfile, file_filter):  # noqa: C901
     """Restore launchers from HPSS"""
     ctx = click.get_current_context()
     run = ctx.parent.parent.params["run"]
@@ -357,7 +361,7 @@ def restore(inputfile, file_filter):
     default=STORE_VOLUMETRIC_DATA,
     help="Store any of CHGCAR, LOCPOT, AECCAR0, AECCAR1, AECCAR2, ELFCAR.",
 )
-def parse(task_ids, snl_metas, nproc, store_volumetric_data):
+def parse(task_ids, snl_metas, nproc, store_volumetric_data):  # noqa: C901
     """Parse VASP launchers into tasks"""
     ctx = click.get_current_context()
     if "CLIENT" not in ctx.obj:
@@ -398,7 +402,9 @@ def parse(task_ids, snl_metas, nproc, store_volumetric_data):
         # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
         # NOTE use regex first to reduce size of distinct below 16MB
         q = {"task_id": {"$regex": r"^mp-\d{7,}$"}}
-        all_task_ids = [t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})]
+        all_task_ids = [
+            t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})
+        ]
         if not all_task_ids:
             all_task_ids = target.collection.distinct("task_id")
 
diff --git a/emmet-cli/emmet/cli/utils.py b/emmet-cli/emmet/cli/utils.py
index 1155b09efc..edf3e5e760 100644
--- a/emmet-cli/emmet/cli/utils.py
+++ b/emmet-cli/emmet/cli/utils.py
@@ -327,7 +327,7 @@ def reconstruct_command(sbatch=False):
     return " ".join(command).strip().strip("\\")
 
 
-def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
+def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):  # noqa: C901
     process = multiprocessing.current_process()
     name = process.name
     chunk_idx = int(name.rsplit("-")[1]) - 1
@@ -345,7 +345,7 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
     count = 0
     drone = VaspDrone(
         additional_fields={"tags": tags},
-        store_volumetric_data=ctx.params['store_volumetric_data']
+        store_volumetric_data=ctx.params["store_volumetric_data"],
     )
 
     for vaspdir in vaspdirs:
@@ -393,7 +393,9 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
             snl_meta = snl_metas.get(launcher)
             if snl_meta:
                 references = snl_meta.get("references")
-                authors = snl_meta.get("authors", ["Materials Project <feedback@materialsproject.org>"])
+                authors = snl_meta.get(
+                    "authors", ["Materials Project <feedback@materialsproject.org>"]
+                )
                 kwargs = {"projects": [tag]}
                 if references:
                     kwargs["references"] = references
@@ -416,7 +418,11 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
                     target.insert_task(task_doc, use_gridfs=True)
                 except DocumentTooLarge:
                     output = dotty(task_doc["calcs_reversed"][0]["output"])
-                    pop_keys = ["normalmode_eigenvecs", "force_constants", "outcar.onsite_density_matrices"]
+                    pop_keys = [
+                        "normalmode_eigenvecs",
+                        "force_constants",
+                        "outcar.onsite_density_matrices",
+                    ]
 
                     for k in pop_keys:
                         if k not in output:
@@ -436,7 +442,9 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
                 if target.collection.count(query):
                     if snl_dct:
                         result = snl_collection.insert_one(snl_dct)
-                        logger.info(f"SNL {result.inserted_id} inserted into {snl_collection.full_name}.")
+                        logger.info(
+                            f"SNL {result.inserted_id} inserted into {snl_collection.full_name}."
+                        )
 
                     shutil.rmtree(vaspdir)
                     logger.info(f"{name} Successfully parsed and removed {launcher}.")