diff --git a/emmet-builders/emmet/builders/materials/provenance.py b/emmet-builders/emmet/builders/materials/provenance.py
new file mode 100644
index 0000000000..e1e7552e6e
--- /dev/null
+++ b/emmet-builders/emmet/builders/materials/provenance.py
@@ -0,0 +1,213 @@
+from collections import defaultdict
+from itertools import chain
+from typing import Dict, Iterator, List, Optional, Tuple
+
+from maggma.core import Builder, Store
+from maggma.utils import grouper
+from pymatgen.core import Structure
+from pymatgen.util.provenance import StructureNL
+
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.core.provenance import ProvenanceDoc
+from emmet.core.utils import group_structures
+
+
+class ProvenanceBuilder(Builder):
+    def __init__(
+        self,
+        materials: Store,
+        provenance: Store,
+        source_snls: List[Store],
+        settings: Optional[EmmetBuildSettings] = None,
+        query: Optional[Dict] = None,
+        **kwargs,
+    ):
+        """
+        Creates provenance docs from source SNLs and materials.
+
+        Args:
+            materials: Store of materials docs to tag with SNLs
+            provenance: Store to update with provenance data
+            source_snls: List of locations to grab SNLs
+            settings: EmmetBuildSettings to use for this build
+            query: query on materials to limit search
+        """
+        self.materials = materials
+        self.provenance = provenance
+        self.source_snls = source_snls
+        self.settings = EmmetBuildSettings.autoload(settings)
+        # Default to an empty dict so the query can be safely merged below
+        self.query = query if query else {}
+        self.kwargs = kwargs
+
+        super().__init__(
+            sources=[materials, *source_snls], targets=[provenance], **kwargs
+        )
+
+    def ensure_indexes(self):
+
+        self.materials.ensure_index("material_id", unique=True)
+        self.materials.ensure_index("formula_pretty")
+
+        self.provenance.ensure_index("material_id", unique=True)
+        self.provenance.ensure_index("formula_pretty")
+
+        for s in self.source_snls:
+            s.ensure_index("snl_id")
+            s.ensure_index("formula_pretty")
+
+    def get_items(self) -> Iterator[Tuple[List[Dict], List[Dict]]]:
+        """
+        Gets all materials to associate with SNLs.
+
+        Returns:
+            generator of materials and SNLs that could match
+        """
+        self.logger.info("Provenance Builder Started")
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Find all formulas for materials that have been updated since this
+        # builder was last run
+        q = {**self.query, "property_name": ProvenanceDoc.property_name}
+        updated_materials = self.provenance.newer_in(
+            self.materials,
+            criteria=q,
+            exhaustive=True,
+        )
+        forms_to_update = set(
+            self.materials.distinct(
+                "formula_pretty", {"material_id": {"$in": updated_materials}}
+            )
+        )
+
+        # Find all new SNL formulas since the builder was last run
+        for source in self.source_snls:
+            new_snls = self.provenance.newer_in(source)
+            forms_to_update |= set(source.distinct("formula_pretty", new_snls))
+
+        # Now reduce to the set of formulas we actually have
+        forms_avail = set(self.materials.distinct("formula_pretty", self.query))
+        forms_to_update = forms_to_update & forms_avail
+
+        self.logger.info(f"Found {len(forms_to_update)} new/updated systems to process")
+
+        self.total = len(forms_to_update)
+
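+        # Process formulas in chunks: every SNL and every material sharing a
+        # formula travels in the same item, so process_item can structure-match
+        # them without any further database queries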
"last_updated", + "structure", + "initial_structures", + "formula_pretty", + ], + criteria={"formula_pretty": {"$in": formulas}}, + ) + ) + + form_groups = defaultdict(list) + for snl in snls: + form_groups[snl["formula_pretty"]].append(snl) + + mat_groups = defaultdict(list) + for mat in mats: + mat_groups[mat["formula_pretty"]].append(mat) + + for formula, snl_group in form_groups.items(): + + mat_group = mat_groups[formula] + + self.logger.debug( + f"Found {len(snl_group)} snls and {len(mat_group)} mats" + ) + yield mat_group, snl_group + + def process_item(self, item) -> List[Dict]: + """ + Matches SNLS and Materials + Args: + item (tuple): a tuple of materials and snls + Returns: + list(dict): a list of collected snls with material ids + """ + mats, source_snls = item + formula_pretty = mats[0]["formula_pretty"] + snl_docs = list() + self.logger.debug(f"Finding Provenance {formula_pretty}") + + # Match up SNLS with materials + for mat in mats: + matched_snls = list(self.match(source_snls, mat)) + if len(matched_snls) > 0: + doc = ProvenanceDoc.from_SNLs( + material_id=mat["material_id"], snls=matched_snls + ) + + doc.authors.append(self.settings.DEFAULT_AUTHOR) + doc.history.append(self.settings.DEFAULT_HISTORY) + doc.references.append(self.settings.DEFAULT_REFERENCE) + + snl_docs.append(doc.dict()) + + return snl_docs + + def match(self, snls, mat): + """ + Finds a material doc that matches with the given snl + Args: + snl ([dict]): the snls list + mat (dict): a materials doc + Returns: + generator of materials doc keys + """ + + m_strucs = [Structure.from_dict(mat["structure"])] + [ + Structure.from_dict(init_struc) for init_struc in mat["initial_structures"] + ] + snl_strucs = [StructureNL.from_dict(snl) for snl in snls] + + groups = group_structures( + m_strucs + snl_strucs, + ltol=self.settings.LTOL, + stol=self.settings.STOL, + angle_tol=self.settings.ANGLE_TOL, + ) + matched_groups = [ + group + for group in groups + if any(isinstance(struc, Structure) for struc in group) + ] + snls = [ + struc + for struc in group + for group in matched_groups + if isinstance(struc, StructureNL) + ] + + self.logger.debug(f"Found {len(snls)} SNLs for {mat['material_id']}") + return snls + + def update_targets(self, items): + """ + Inserts the new SNL docs into the SNL collection + """ + + snls = list(filter(None, chain.from_iterable(items))) + + if len(snls) > 0: + self.logger.info(f"Found {len(snls)} SNLs to update") + self.provenance.update(snls) + else: + self.logger.info("No items to update") diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py index 4b6c3cf6c0..72aa1cc6d1 100644 --- a/emmet-builders/emmet/builders/settings.py +++ b/emmet-builders/emmet/builders/settings.py @@ -5,6 +5,7 @@ from pydantic.fields import Field from emmet.core.settings import EmmetSettings from emmet.core.vasp.calc_types import TaskType +from emmet.core.provenance import Author, History class EmmetBuildSettings(EmmetSettings): @@ -30,3 +31,31 @@ class EmmetBuildSettings(EmmetSettings): [t.value for t in TaskType], description="Allowed task_types to build materials from", ) + + DEFAULT_REFERENCE: str = Field( + "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and " + "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and " + "Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David " + "and Ceder, Gerbrand and Persson, Kristin a.},\n" + "doi = {10.1063/1.4812323},\nissn = {2166532X},\n" + "journal = {APL Materials},\nnumber = 
+        groups = group_structures(
+            m_strucs + snl_strucs,
+            ltol=self.settings.LTOL,
+            stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL,
+        )
+        matched_groups = [
+            group
+            for group in groups
+            if any(isinstance(struc, Structure) for struc in group)
+        ]
+        snls = [
+            struc
+            for group in matched_groups
+            for struc in group
+            if isinstance(struc, StructureNL)
+        ]
+
+        self.logger.debug(f"Found {len(snls)} SNLs for {mat['material_id']}")
+        return snls
+
+    def update_targets(self, items):
+        """
+        Inserts the new provenance docs into the provenance collection
+        """
+
+        snls = list(filter(None, chain.from_iterable(items)))
+
+        if len(snls) > 0:
+            self.logger.info(f"Found {len(snls)} provenance docs to update")
+            self.provenance.update(snls)
+        else:
+            self.logger.info("No items to update")
diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py
index 4b6c3cf6c0..72aa1cc6d1 100644
--- a/emmet-builders/emmet/builders/settings.py
+++ b/emmet-builders/emmet/builders/settings.py
@@ -5,6 +5,7 @@
 from pydantic.fields import Field
 
 from emmet.core.settings import EmmetSettings
 from emmet.core.vasp.calc_types import TaskType
+from emmet.core.provenance import Author, History
 
 class EmmetBuildSettings(EmmetSettings):
@@ -30,3 +31,31 @@ class EmmetBuildSettings(EmmetSettings):
         [t.value for t in TaskType],
         description="Allowed task_types to build materials from",
     )
+
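+    # These defaults are appended to every provenance doc assembled by
+    # ProvenanceBuilder.process_item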
+    DEFAULT_REFERENCE: str = Field(
+        "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
+        "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "
+        "Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David "
+        "and Ceder, Gerbrand and Persson, Kristin a.},\n"
+        "doi = {10.1063/1.4812323},\nissn = {2166532X},\n"
+        "journal = {APL Materials},\nnumber = {1},\npages = {011002},\n"
+        "title = {{The Materials Project: A materials genome approach to "
+        "accelerating materials innovation}},\n"
+        "url = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\n"
+        "volume = {1},\nyear = {2013}\n}\n\n@misc{MaterialsProject,\n"
+        "title = {{Materials Project}},\nurl = {http://www.materialsproject.org}\n}",
+        description="Default bibtex citation for all provenance",
+    )
+
+    DEFAULT_AUTHOR: Author = Field(
+        Author(name="Materials Project", email="feedback@materialsproject.org"),
+        description="Default author for provenance",
+    )
+
+    DEFAULT_HISTORY: History = Field(
+        History(
+            name="Materials Project Optimized Structure",
+            url="http://www.materialsproject.org",
+        ),
+        description="Default history node for provenance",
+    )
diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py
index 124c83222d..191a9955d6 100644
--- a/emmet-builders/emmet/builders/vasp/thermo.py
+++ b/emmet-builders/emmet/builders/vasp/thermo.py
@@ -19,6 +19,7 @@
 from emmet.core.thermo import ThermoDoc
 from emmet.core.vasp.calc_types import run_type
 
+
 class Thermo(Builder):
     def __init__(
         self,
@@ -143,7 +144,9 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
             )
             return []
         except Exception as e:
-            self.logger.error(f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}")
+            self.logger.error(
+                f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}"
+            )
             return []
 
         return [d.dict() for d in docs]
diff --git a/emmet-builders/setup.py b/emmet-builders/setup.py
index ae5d444266..2f1655a2a4 100644
--- a/emmet-builders/setup.py
+++ b/emmet-builders/setup.py
@@ -1,6 +1,7 @@
 import datetime
 from pathlib import Path
 from setuptools import setup, find_namespace_packages
+
 required = []
 
 with open(Path(__file__).parent / "requirements.txt") as f:
diff --git a/emmet-cli/emmet/cli/calc.py b/emmet-cli/emmet/cli/calc.py
index f17a2d37a7..5b33d72e34 100644
--- a/emmet-cli/emmet/cli/calc.py
+++ b/emmet-cli/emmet/cli/calc.py
@@ -37,6 +37,7 @@ def get_format(fname):
 def load_canonical_structures(ctx, full_name, formula):
     from emmet.core.vasp.calc_types import task_type  # TODO import error
+
     collection = ctx.obj["COLLECTIONS"][full_name]
 
     if formula not in canonical_structures[full_name]:
@@ -169,7 +170,7 @@ def calc(ctx, specs, nmax, skip):
     help="Author to assign to all structures.",
 )
 @click.pass_context
-def prep(ctx, archive, authors):
+def prep(ctx, archive, authors):  # noqa: C901
     """prep structures from an archive for submission"""
     run = ctx.obj["RUN"]
     collections = ctx.obj["COLLECTIONS"]
diff --git a/emmet-cli/emmet/cli/decorators.py b/emmet-cli/emmet/cli/decorators.py
index 22bfb31f12..ea9f60dd2a 100644
--- a/emmet-cli/emmet/cli/decorators.py
+++ b/emmet-cli/emmet/cli/decorators.py
@@ -106,7 +106,10 @@ def wrapper(*args, **kwargs):
         run = ctx.grand_parent.params["run"]
         ntries = ctx.grand_parent.params["ntries"]
         if run:
-            click.secho(f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.", fg="green")
+            click.secho(
+                f"SBATCH MODE! Submitting to SLURM queue with {ntries} tries.",
+                fg="green",
+            )
 
             directory = ctx.parent.params.get("directory")
             if not directory:
diff --git a/emmet-cli/emmet/cli/entry_point.py b/emmet-cli/emmet/cli/entry_point.py
index 610998c28f..0e8cd963c4 100644
--- a/emmet-cli/emmet/cli/entry_point.py
+++ b/emmet-cli/emmet/cli/entry_point.py
@@ -31,7 +31,12 @@ def opt_prompt():
 @click.option("--run", is_flag=True, help="Run DB/filesystem write operations.")
 @click.option("--issue", type=int, help="Production tracker issue (required if --run).")
 @click.option("--sbatch", is_flag=True, help="Switch to SBatch mode.")
-@click.option("--ntries", default=1, show_default=True, help="Number of jobs (for walltime > 48h).")
+@click.option(
+    "--ntries",
+    default=1,
+    show_default=True,
+    help="Number of jobs (for walltime > 48h).",
+)
 @click.option("--bb", is_flag=True, help="Use burst buffer.")
 @click.option("--yes", is_flag=True, help="Automatic yes to all prompts.")
 @click.option("--no-dupe-check", is_flag=True, help="Skip duplicate check(s).")
@@ -66,7 +71,7 @@ def emmet(spec_or_dbfile, run, issue, sbatch, ntries, bb, yes, no_dupe_check, ve
 
     if run:
         if not issue:
-            raise EmmetCliError(f"Need issue number via --issue!")
+            raise EmmetCliError("Need issue number via --issue!")
 
     ctx.obj["LOG_STREAM"] = StringIO()
     memory_handler = logging.StreamHandler(ctx.obj["LOG_STREAM"])
diff --git a/emmet-cli/emmet/cli/tasks.py b/emmet-cli/emmet/cli/tasks.py
index 47e75fc386..02e9d1fa59 100644
--- a/emmet-cli/emmet/cli/tasks.py
+++ b/emmet-cli/emmet/cli/tasks.py
@@ -110,11 +110,15 @@ def check_pattern(nested_allowed=False):
     if not nested_allowed and os.sep in pattern:
         raise EmmetCliError(f"Nested pattern ({pattern}) not allowed!")
     elif not any(pattern.startswith(p) for p in PREFIXES):
-        raise EmmetCliError(f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!")
+        raise EmmetCliError(
+            f"Pattern ({pattern}) only allowed to start with one of {PREFIXES}!"
+        )
 
 
 def load_block_launchers():
-    prefix = "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    prefix = (
+        "block_"  # TODO old prefixes (e.g. res/aflow) might not be needed for backup
+    )
     block_launchers = defaultdict(list)
     gen = VaspDirsGenerator()
     for idx, vasp_dir in enumerate(gen):
@@ -136,7 +140,7 @@ def extract_filename(line):
 @sbatch
 @click.option("--clean", is_flag=True, help="Remove original launchers.")
 @click.option("--check", is_flag=True, help="Check backup consistency.")
-def backup(clean, check):
+def backup(clean, check):  # noqa: C901
     """Backup directory to HPSS"""
     ctx = click.get_current_context()
     run = ctx.parent.parent.params["run"]
@@ -232,7 +236,7 @@ def backup(clean, check):
     default=FILE_FILTERS_DEFAULT,
     help="Set the file filter(s) to match files against in each launcher.",
 )
-def restore(inputfile, file_filter):
+def restore(inputfile, file_filter):  # noqa: C901
     """Restore launchers from HPSS"""
     ctx = click.get_current_context()
     run = ctx.parent.parent.params["run"]
@@ -357,7 +361,7 @@ def restore(inputfile, file_filter):
     default=STORE_VOLUMETRIC_DATA,
     help="Store any of CHGCAR, LOCPOT, AECCAR0, AECCAR1, AECCAR2, ELFCAR.",
 )
-def parse(task_ids, snl_metas, nproc, store_volumetric_data):
+def parse(task_ids, snl_metas, nproc, store_volumetric_data):  # noqa: C901
     """Parse VASP launchers into tasks"""
     ctx = click.get_current_context()
     if "CLIENT" not in ctx.obj:
@@ -398,7 +402,9 @@ def parse(task_ids, snl_metas, nproc, store_volumetric_data):
     # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
     # NOTE use regex first to reduce size of distinct below 16MB
     q = {"task_id": {"$regex": r"^mp-\d{7,}$"}}
-    all_task_ids = [t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})]
+    all_task_ids = [
+        t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})
+    ]
     if not all_task_ids:
         all_task_ids = target.collection.distinct("task_id")
diff --git a/emmet-cli/emmet/cli/utils.py b/emmet-cli/emmet/cli/utils.py
index 1155b09efc..edf3e5e760 100644
--- a/emmet-cli/emmet/cli/utils.py
+++ b/emmet-cli/emmet/cli/utils.py
@@ -327,7 +327,7 @@ def reconstruct_command(sbatch=False):
     return " ".join(command).strip().strip("\\")
 
 
-def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
+def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):  # noqa: C901
     process = multiprocessing.current_process()
     name = process.name
     chunk_idx = int(name.rsplit("-")[1]) - 1
@@ -345,7 +345,7 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
     count = 0
     drone = VaspDrone(
         additional_fields={"tags": tags},
-        store_volumetric_data=ctx.params['store_volumetric_data']
+        store_volumetric_data=ctx.params["store_volumetric_data"],
     )
 
     for vaspdir in vaspdirs:
@@ -393,7 +393,9 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
             snl_meta = snl_metas.get(launcher)
             if snl_meta:
                 references = snl_meta.get("references")
-                authors = snl_meta.get("authors", ["Materials Project <feedback@materialsproject.org>"])
+                authors = snl_meta.get(
+                    "authors", ["Materials Project <feedback@materialsproject.org>"]
+                )
                 kwargs = {"projects": [tag]}
                 if references:
                     kwargs["references"] = references
@@ -416,7 +418,11 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
                 target.insert_task(task_doc, use_gridfs=True)
             except DocumentTooLarge:
                 output = dotty(task_doc["calcs_reversed"][0]["output"])
-                pop_keys = ["normalmode_eigenvecs", "force_constants", "outcar.onsite_density_matrices"]
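+                # Drop the largest arrays one at a time until the task doc
+                # fits under MongoDB's 16MB document limit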
+                pop_keys = [
+                    "normalmode_eigenvecs",
+                    "force_constants",
+                    "outcar.onsite_density_matrices",
+                ]
 
                 for k in pop_keys:
                     if k not in output:
@@ -436,7 +442,9 @@ def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):
             if target.collection.count(query):
                 if snl_dct:
                     result = snl_collection.insert_one(snl_dct)
-                    logger.info(f"SNL {result.inserted_id} inserted into {snl_collection.full_name}.")
+                    logger.info(
+                        f"SNL {result.inserted_id} inserted into {snl_collection.full_name}."
+                    )
 
                 shutil.rmtree(vaspdir)
                 logger.info(f"{name} Successfully parsed and removed {launcher}.")
diff --git a/emmet-core/emmet/core/provenance.py b/emmet-core/emmet/core/provenance.py
index 6e0da58e39..b83bb6f330 100644
--- a/emmet-core/emmet/core/provenance.py
+++ b/emmet-core/emmet/core/provenance.py
@@ -1,12 +1,18 @@
 """ Core definition of a Provenance Document """
+import warnings
+from collections import defaultdict
 from datetime import datetime
-from typing import ClassVar, Dict, List
+from typing import ClassVar, Dict, List, Optional, Union
 
 from pybtex.database import BibliographyData, parse_string
-from pydantic import BaseModel, EmailStr, Field, HttpUrl, validator
+from pydantic import BaseModel, EmailStr, Field, validator
+from pydash.objects import get
+from pymatgen.core import Structure
+from pymatgen.util.provenance import StructureNL
 
 from emmet.core.material_property import PropertyDoc
-from emmet.core.utils import ValueEnum
+from emmet.core.mpid import MPID
+from emmet.core.utils import ValueEnum, group_structures
 
 
 class Database(ValueEnum):
@@ -15,7 +21,7 @@ class Database(ValueEnum):
     """
 
     ICSD = "icsd"
-    PaulingFiles = "pf"
+    Pauling_Files = "pf"
     COD = "cod"
 
 
@@ -34,13 +40,13 @@ class History(BaseModel):
     """
 
     name: str
-    url: HttpUrl
-    description: Dict = Field(
-        None, description="Dictionary of exra data for this history node"
-    )
+    url: str
+    description: Optional[Dict] = Field(
+        None, description="Dictionary of extra data for this history node"
+    )
 
 
-class Provenance(PropertyDoc):
+class ProvenanceDoc(PropertyDoc):
     """
     A provenance property block
     """
 
     property_name: ClassVar[str] = "provenance"
 
     created_at: datetime = Field(
-        None,
+        ...,
         description="creation date for the first structure corresponding to this material",
     )
 
-    projects: List[str] = Field(
-        None, description="List of projects this material belongs to"
-    )
-    bibtex_string: str = Field(
-        None, description="Bibtex reference string for this material"
+    references: List[str] = Field(
+        [], description="Bibtex reference strings for this material"
     )
+
+    authors: List[Author] = Field([], description="List of authors for this material")
+
     remarks: List[str] = Field(
-        None, description="List of remarks for the provenance of this material"
+        [], description="List of remarks for the provenance of this material"
     )
-    authors: List[Author] = Field(None, description="List of authors for this material")
+
+    tags: List[str] = Field(
+        [], description="Short remarks (under 140 characters) used as tags"
+    )
 
     theoretical: bool = Field(
         True, description="If this material has any experimental provenance or not"
     )
 
     database_IDs: Dict[Database, List[str]] = Field(
-        None, description="Database IDs corresponding to this material"
+        dict(), description="Database IDs corresponding to this material"
    )
 
     history: List[History] = Field(
-        None,
-        description="List of history nodes specifying the transformations or orignation of this material",
+        [],
+        description="List of history nodes specifying the transformations or origination"
+        " of this material for the entry closest matching the material input",
     )
 
     @validator("authors")
     def remove_duplicate_authors(cls, authors):
         authors_dict = {entry.name.lower(): entry for entry in authors}
-        return list(authors_dict.items())
+        return list(authors_dict.values())
+
+    @classmethod
+    def from_SNLs(
+        cls,
+        material_id: Union[MPID, int],
+        snls: List[Dict],
+    ) -> "ProvenanceDoc":
+        """
+        Converts legacy pymatgen SNL dicts into a single provenance document
+        """
+
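+        # Aggregation strategy: the earliest SNL wins for created_at and
+        # history, while references, remarks, authors, and database IDs are
+        # merged across all SNLs with duplicates removed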
+        # Choose earliest created_at
+        created_at = min(
+            get(snl, "about.created_at.string", datetime.max) for snl in snls
+        )
+
+        # Choose the history of the earliest SNL
+        history = min(
+            snls, key=lambda snl: get(snl, "about.created_at.string", datetime.max)
+        )["about"]["history"]
+
+        # Aggregate all references into one dict to remove duplicates
+        refs = {}
+        for snl in snls:
+            try:
+                entries = parse_string(snl["about"]["references"], bib_format="bibtex")
+                refs.update(entries.entries)
+            except Exception:
+                warnings.warn(f"Failed parsing bibtex: {snl['about']['references']}")
+
+        bib_data = BibliographyData(entries=refs)
+        references = [entry.to_string("bibtex") for entry in bib_data.entries.values()]
+
+        # TODO: Maybe we should combine this with robocrystallographer?
+        # TODO: Refine these tags / remarks
+        remarks = list({remark for snl in snls for remark in snl["about"]["remarks"]})
+        tags = [r for r in remarks if len(r) < 140]
+
+        # Aggregate all authors - converting to a dict first removes
+        # duplicate names
+        authors_dict = {
+            entry["name"].lower(): entry["email"]
+            for snl in snls
+            for entry in snl["about"]["authors"]
+        }
+        authors = [
+            {"name": name.title(), "email": email}
+            for name, email in authors_dict.items()
+        ]
+
+        # Aggregate all the database IDs
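+        # snl_ids carry their source database as a prefix (e.g. "icsd-2"),
+        # so a substring check against each Database value recovers the IDs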
+        snl_ids = [snl.get("snl_id", "") for snl in snls]
+        db_ids = {
+            Database(db_id): [snl_id for snl_id in snl_ids if db_id in snl_id]
+            for db_id in map(str, Database)
+        }
+
+        # Remove Nones and empty lists
+        db_ids = {k: list(filter(None, v)) for k, v in db_ids.items()}
+        db_ids = {k: v for k, v in db_ids.items() if len(v) > 0}
+
+        # Get experimental bool
+        experimental = any(
+            get(snl, "about.history.0.experimental", False) for snl in snls
+        )
+
+        snl_fields = {
+            "created_at": created_at,
+            "references": references,
+            "authors": authors,
+            "remarks": remarks,
+            "tags": tags,
+            "database_IDs": db_ids,
+            "theoretical": not experimental,
+            "history": history,
+        }
+
+        return ProvenanceDoc(material_id=material_id, **snl_fields)
diff --git a/emmet-core/emmet/core/vasp/material.py b/emmet-core/emmet/core/vasp/material.py
index 7613abd3c4..1eac69d829 100644
--- a/emmet-core/emmet/core/vasp/material.py
+++ b/emmet-core/emmet/core/vasp/material.py
@@ -4,6 +4,7 @@
 from typing import ClassVar, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union
 
 from pydantic import BaseModel, Field, create_model
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
 from pymatgen.core import Structure
 from pymatgen.entries.computed_entries import ComputedStructureEntry
@@ -54,7 +55,7 @@ def from_tasks(
             quality_scores: quality scores for various calculation types
             use_statics: Use statics to define a material
         """
-        if task_group == 0:
-            raise Exception("Must have more than one task in the group.")
+        if len(task_group) == 0:
+            raise Exception("Must have at least one task in the group.")
 
         # Material ID
@@ -105,7 +106,9 @@ def _structure_eval(task: TaskDocument):
         )
 
         best_structure_calc = sorted(structure_calcs, key=_structure_eval)[0]
-        structure = best_structure_calc.output.structure
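+        # Standardize the best structure to its conventional cell so that
+        # stored material structures share a consistent symmetry setting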
+        structure = SpacegroupAnalyzer(
+            best_structure_calc.output.structure, symprec=0.1
+        ).get_conventional_standard_structure()
 
         # Initial Structures
         initial_structures = [task.input.structure for task in task_group]
diff --git a/emmet-core/emmet/core/vasp/task.py b/emmet-core/emmet/core/vasp/task.py
index c8b289582f..82bb793e6b 100644
--- a/emmet-core/emmet/core/vasp/task.py
+++ b/emmet-core/emmet/core/vasp/task.py
@@ -1,7 +1,7 @@
 """ Core definition of a VASP Task Document """
 from datetime import datetime
 from functools import lru_cache, partial
-from typing import ClassVar, Dict, List, Optional, Union, Any
+from typing import Any, ClassVar, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field, validator
 from pymatgen.analysis.magnetism import CollinearMagneticStructureAnalyzer, Ordering
diff --git a/setup.cfg b/setup.cfg
index 75e9591afa..531b69dc09 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,23 +19,5 @@ profile=black
 
 [pydocstyle]
 ignore = D105,D2,D4
 
-[mypy-numpy.*]
-ignore_missing_imports = True
-
-[mypy-bson.*]
-ignore_missing_imports = True
-
-[mypy-pymatgen.*]
-ignore_missing_imports = True
-
-[mypy-pytest]
-ignore_missing_imports = True
-
-[mypy-monty.*]
-ignore_missing_imports = True
-
-[mypy-pybtex.*]
-ignore_missing_imports = True
-
-[mypy-ruamel.*]
+[mypy]
 ignore_missing_imports = True
diff --git a/tests/emmet-core/test_provenance.py b/tests/emmet-core/test_provenance.py
new file mode 100644
index 0000000000..30111c5896
--- /dev/null
+++ b/tests/emmet-core/test_provenance.py
@@ -0,0 +1,51 @@
+from datetime import datetime
+
+import pytest
+from pymatgen.core import Lattice, Structure
+from pymatgen.util.provenance import Author, HistoryNode, StructureNL
+
+from emmet.core.provenance import Database, ProvenanceDoc
+
+
+@pytest.fixture
+def structure():
+    test_latt = Lattice.cubic(3.0)
+    test_struc = Structure(lattice=test_latt, species=["Fe"], coords=[[0, 0, 0]])
+    return test_struc
+
+
+@pytest.fixture
+def snls(structure):
+
+    docs = [
+        StructureNL(
+            structure,
+            authors=[Author(f"test{i}", "test@test.com").as_dict()],
+            history=[HistoryNode("nothing", "url.com", {})],
+            created_at=datetime.utcnow(),
+        ).as_dict()
+        for i in range(3)
+    ]
+    docs[0]["snl_id"] = "icsd-2"
+    docs[1]["snl_id"] = "user-1"
+    docs[2]["snl_id"] = "pf-3"
+
+    return docs
+
+
+def test_from_snls(snls):
+
+    doc = ProvenanceDoc.from_SNLs(material_id="mp-3", snls=snls)
+
+    assert isinstance(doc, ProvenanceDoc)
+    assert doc.property_name == "provenance"
+    assert doc.material_id == "mp-3"
+    assert doc.theoretical is True
+    assert doc.database_IDs == {
+        Database.ICSD: ["icsd-2"],
+        Database.Pauling_Files: ["pf-3"],
+    }
+
+    # Test experimental detection
+    snls[0]["about"]["history"][0]["experimental"] = True
+    assert ProvenanceDoc.from_SNLs(material_id="mp-3", snls=snls).theoretical is False
diff --git a/tests/emmet-core/test_settings.py b/tests/emmet-core/test_settings.py
index c98c8aeaca..0a9df9b37c 100644
--- a/tests/emmet-core/test_settings.py
+++ b/tests/emmet-core/test_settings.py
@@ -38,7 +38,7 @@ def test_from_url():
 
     os.environ[
         "EMMET_CONFIG_FILE"
-    ] = "https://raw.githubusercontent.com/materialsproject/emmet/master/tests/emmet-core/test_settings.json"
+    ] = "https://raw.githubusercontent.com/materialsproject/emmet/master/tests/test_files/test_settings.json"
 
     test_config = EmmetSettings()
diff --git a/tests/emmet-core/test_settings.json b/tests/test_files/test_settings.json
similarity index 100%
rename from tests/emmet-core/test_settings.json
rename to tests/test_files/test_settings.json