From 7dd33fd04408f2da416a755ef13def579f1c2a6f Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@lbl.gov>
Date: Thu, 11 Mar 2021 14:02:45 -0800
Subject: [PATCH 1/8] working insertion electrode builder

---
 .../emmet/builders/materials/electrodes.py    | 309 +-----------------
 .../builders/materials/structure_groups.py    | 269 +++++++++++++++
 emmet-builders/emmet/builders/vasp/thermo.py  |  10 +-
 emmet-core/emmet/core/electrode.py            |   7 +-
 4 files changed, 292 insertions(+), 303 deletions(-)
 create mode 100644 emmet-builders/emmet/builders/materials/structure_groups.py

diff --git a/emmet-builders/emmet/builders/materials/electrodes.py b/emmet-builders/emmet/builders/materials/electrodes.py
index 4b43c1e287..73be677873 100644
--- a/emmet-builders/emmet/builders/materials/electrodes.py
+++ b/emmet-builders/emmet/builders/materials/electrodes.py
@@ -18,280 +18,23 @@
 from pymatgen.analysis.structure_matcher import StructureMatcher, ElementComparator
 from pymatgen.apps.battery.insertion_battery import InsertionElectrode
 from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
+from pymatgen.entries.computed_entries import ComputedStructureEntry, ComputedEntry
 
 __author__ = "Jimmy Shen"
 __email__ = "jmmshn@lbl.gov"
 
-from pymatgen.entries.computed_entries import ComputedEntry
-
-
-def s_hash(el):
-    return el.data["comp_delith"]
-
-
-# MatDoc = namedtuple("MatDoc", ["material_id", "structure", "formula_pretty", "framework"])
-
-REDOX_ELEMENTS = [
-    "Ti",
-    "V",
-    "Cr",
-    "Mn",
-    "Fe",
-    "Co",
-    "Ni",
-    "Cu",
-    "Nb",
-    "Mo",
-    "Sn",
-    "Sb",
-    "W",
-    "Re",
-    "Bi",
-    "C",
-    "Hf",
-]
-
-# WORKING_IONS = ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"]
-
-MAT_PROPS = [
-    "structure",
-    "material_id",
-    "formula_pretty",
-]
-
-sg_fields = ["number", "hall_number", "international", "hall", "choice"]
-
-
-def generic_groupby(list_in, comp=operator.eq):
-    """
-    Group a list of unsortable objects
-    Args:
-        list_in: A list of generic objects
-        comp: (Default value = operator.eq) The comparator
-    Returns:
-        [int] list of labels for the input list
-    """
-    list_out = [None] * len(list_in)
-    label_num = 0
-    for i1, ls1 in enumerate(list_out):
-        if ls1 is not None:
-            continue
-        list_out[i1] = label_num
-        for i2, ls2 in list(enumerate(list_out))[i1 + 1 :]:
-            if comp(list_in[i1], list_in[i2]):
-                if list_out[i2] is None:
-                    list_out[i2] = list_out[i1]
-                else:
-                    list_out[i1] = list_out[i2]
-                    label_num -= 1
-        label_num += 1
-    return list_out
-
-
-class StructureGroupBuilder(Builder):
-    def __init__(
-        self,
-        materials: MongoStore,
-        sgroups: MongoStore,
-        working_ion: str,
-        query: dict = None,
-        ltol: float = 0.2,
-        stol: float = 0.3,
-        angle_tol: float = 5.0,
-        check_newer: bool = True,
-        **kwargs,
-    ):
-        """
-        Aggregate materials entries into sgroups that are topotactically similar to each other.
-        This is an incremental builder that makes ensures that each materials id belongs to one StructureGroupDoc document
-        Args:
-            materials (Store): Store of materials documents that contains the structures
-            sgroups (Store): Store of grouped material ids
-            query (dict): dictionary to limit materials to be analyzed ---
-                            only applied to the materials when we need to group structures
-                            the phase diagram is still constructed with the entire set
-        """
-        self.materials = materials
-        self.sgroups = sgroups
-        self.working_ion = working_ion
-        self.query = query if query else {}
-        self.ltol = ltol
-        self.stol = stol
-        self.angle_tol = angle_tol
-        self.check_newer = check_newer
-        super().__init__(sources=[materials], targets=[sgroups], **kwargs)
-
-    def prechunk(self, number_splits: int) -> Iterable[Dict]:
-        """
-        TODO can implement this for distributed runs by adding filters
-        """
-        pass
-
-    def get_items(self):
-        """
-        Summary of the steps:
-        - query the materials database for different chemical systems that satisfies the base query
-          "contains redox element and working ion"
-        - Get the full chemsys list of interest
-        - The main loop is over all these chemsys.  within the main loop:
-            - get newest timestamp for the material documents (max_mat_time)
-            - get the oldest timestamp for the target documents (min_target_time)
-            - if min_target_time is < max_mat_time then nuke all the target documents
-        """
-
-        # All potentially interesting chemsys must contain the working ion
-        base_query = {
-            "$and": [
-                {"elements": {"$in": REDOX_ELEMENTS + [self.working_ion]}},
-                self.query.copy(),
-            ]
-        }
-        self.logger.debug(f"Initial Chemsys QUERY: {base_query}")
-
-        # get a chemsys that only contains the working ion since the working ion
-        # must be present for there to be voltage steps
-        all_chemsys = self.materials.distinct("chemsys", criteria=base_query)
-        # Contains the working ion but not ONLY the working ion
-        all_chemsys = [
-            *filter(
-                lambda x: self.working_ion in x and len(x) > 1,
-                [chemsys_.split("-") for chemsys_ in all_chemsys],
-            )
-        ]
-
-        self.logger.debug(
-            f"Performing initial checks on {len(all_chemsys)} chemical systems containing redox elements with or without the Working Ion."
-        )
-        self.total = len(all_chemsys)
-
-        for chemsys_l in all_chemsys:
-            chemsys = "-".join(sorted(chemsys_l))
-            chemsys_wo = "-".join(sorted(set(chemsys_l) - {self.working_ion}))
-            chemsys_query = {
-                "$and": [
-                    {"chemsys": {"$in": [chemsys_wo, chemsys]}},
-                    self.query.copy(),
-                ]
-            }
-            self.logger.debug(f"QUERY: {chemsys_query}")
-            all_mats_in_chemsys = list(
-                self.materials.query(
-                    criteria=chemsys_query,
-                    properties=MAT_PROPS + [self.materials.last_updated_field],
-                )
-            )
-            self.logger.debug(
-                f"Found {len(all_mats_in_chemsys)} materials in {chemsys_wo}"
-            )
-            if self.check_newer:
-                all_target_docs = list(
-                    self.sgroups.query(
-                        criteria={"chemsys": chemsys},
-                        properties=[
-                            "material_id",
-                            self.sgroups.last_updated_field,
-                            "grouped_ids",
-                        ],
-                    )
-                )
-                self.logger.debug(
-                    f"Found {len(all_target_docs)} Grouped documents in {chemsys_wo}"
-                )
-
-                mat_times = [
-                    mat_doc[self.materials.last_updated_field]
-                    for mat_doc in all_mats_in_chemsys
-                ]
-                max_mat_time = max(mat_times, default=datetime.min)
-                self.logger.debug(
-                    f"The newest material doc was generated at {max_mat_time}."
-                )
-
-                target_times = [
-                    g_doc[self.materials.last_updated_field]
-                    for g_doc in all_target_docs
-                ]
-                min_target_time = min(target_times, default=datetime.max)
-                self.logger.debug(
-                    f"The newest GROUP doc was generated at {min_target_time}."
-                )
-
-                mat_ids = set(
-                    [mat_doc["material_id"] for mat_doc in all_mats_in_chemsys]
-                )
-
-                # If any material id is missing or if any material id has been updated
-                target_mat_ids = set()
-                for g_doc in all_target_docs:
-                    target_mat_ids |= set(g_doc["grouped_ids"])
-
-                self.logger.debug(
-                    f"There are {len(mat_ids)} material ids in the source database vs {len(target_mat_ids)} in the target database."
-                )
-                if mat_ids == target_mat_ids and max_mat_time < min_target_time:
-                    continue
-                else:
-                    self.logger.info(
-                        f"Nuking all {len(target_mat_ids)} documents in chemsys {chemsys} in the target database."
-                    )
-                    self._remove_targets(list(target_mat_ids))
-
-            yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
-
-    def update_targets(self, items: List):
-        items = list(filter(None, chain.from_iterable(items)))
-        if len(items) > 0:
-            self.logger.info("Updating {} sgroups documents".format(len(items)))
-            for struct_group_dict in items:
-                struct_group_dict[self.sgroups.last_updated_field] = datetime.utcnow()
-            self.sgroups.update(docs=items, key=["material_id"])
-        else:
-            self.logger.info("No items to update")
-
-    def _entry_from_mat_doc(self, mdoc):
-        # Note since we are just structure grouping we don't need to be careful with energy or correction
-        # All of the energy analysis is left to other builders
-        d_ = {
-            "entry_id": mdoc["material_id"],
-            "structure": mdoc["structure"],
-            "energy": -math.inf,
-            "correction": -math.inf,
-        }
-        return ComputedStructureEntry.from_dict(d_)
-
-    def process_item(self, item: Any) -> Any:
-        entries = [*map(self._entry_from_mat_doc, item["materials"])]
-        s_groups = StructureGroupDoc.from_ungrouped_structure_entries(
-            entries=entries,
-            ignored_species=[self.working_ion],
-            ltol=self.ltol,
-            stol=self.stol,
-            angle_tol=self.angle_tol,
-        )
-        # append the working_ion to the group ids
-        for sg in s_groups:
-            sg.material_id = f"{sg.material_id}_{self.working_ion}"
-        return [sg.dict() for sg in s_groups]
-
-    def _remove_targets(self, rm_ids):
-        self.sgroups.remove_docs({"material_id": {"$in": rm_ids}})
-
-
 class InsertionElectrodeBuilder(MapBuilder):
     def __init__(
         self,
         grouped_materials: MongoStore,
         insertion_electrode: MongoStore,
         thermo: MongoStore,
-        material: MongoStore,
         query: dict = None,
         **kwargs,
     ):
         self.grouped_materials = grouped_materials
         self.insertion_electrode = insertion_electrode
         self.thermo = thermo
-        self.material = material
         qq_ = {} if query is None else query
         qq_.update({"structure_matched": True, "has_distinct_compositions": True})
         super().__init__(
@@ -304,12 +47,12 @@ def __init__(
     def get_items(self):
         """"""
 
-        @lru_cache(None)
+        @lru_cache()
         def get_working_ion_entry(working_ion):
             with self.thermo as store:
                 working_ion_docs = [*store.query({"chemsys": working_ion})]
             best_wion = min(
-                working_ion_docs, key=lambda x: x["thermo"]["energy_per_atom"]
+                working_ion_docs, key=lambda x: x["energy_per_atom"]
             )
             return best_wion
 
@@ -325,35 +68,23 @@ def modify_item(item):
                                 {"material_id": {"$in": item["grouped_ids"]}},
                             ]
                         },
-                        properties=["material_id", "_sbxn", "thermo"],
-                    )
-                ]
-
-            with self.material as store:
-                material_docs = [
-                    *store.query(
-                        {
-                            "$and": [
-                                {"material_id": {"$in": item["grouped_ids"]}},
-                                {"_sbxn": {"$in": ["core"]}},
-                            ]
-                        },
-                        properties=["material_id", "structure"],
+                        properties=["material_id", "_sbxn", "thermo", "entries", "energy_type", "energy_above_hull"],
                     )
                 ]
 
             self.logger.debug(f"Found for {len(thermo_docs)} Thermo Documents.")
+
             if len(item["ignored_species"]) != 1:
                 raise ValueError(
                     "Insertion electrode can only be defined for one working ion species"
                 )
+
             working_ion_doc = get_working_ion_entry(item["ignored_species"][0])
             return {
                 "material_id": item["material_id"],
                 "working_ion_doc": working_ion_doc,
                 "working_ion": item["ignored_species"][0],
                 "thermo_docs": thermo_docs,
-                "material_docs": material_docs,
             }
 
         yield from map(modify_item, super().get_items())
@@ -363,40 +94,26 @@ def unary_function(self, item):
         - Add volume information to each entry to create the insertion electrode document
         - Add the host structure
         """
-        entries = [tdoc_["thermo"]["entry"] for tdoc_ in item["thermo_docs"]]
-        entries = list(map(ComputedEntry.from_dict, entries))
+        entries = [tdoc_["entries"][tdoc_["energy_type"]] for tdoc_ in item["thermo_docs"]]
+        entries = list(map(ComputedStructureEntry.from_dict, entries))
         working_ion_entry = ComputedEntry.from_dict(
-            item["working_ion_doc"]["thermo"]["entry"]
+            item["working_ion_doc"]["entries"][item["working_ion_doc"]['energy_type']]
         )
         working_ion = working_ion_entry.composition.reduced_formula
+
         decomp_energies = {
-            d_["material_id"]: d_["thermo"]["e_above_hull"]
+            d_["material_id"]: d_["energy_above_hull"]
             for d_ in item["thermo_docs"]
         }
-        mat_structures = {
-            mat_d_["material_id"]: Structure.from_dict(mat_d_["structure"])
-            for mat_d_ in item["material_docs"]
-        }
 
         least_wion_ent = min(
             entries, key=lambda x: x.composition.get_atomic_fraction(working_ion)
         )
-        mdoc_ = next(
-            filter(
-                lambda x: x["material_id"] == least_wion_ent.entry_id,
-                item["material_docs"],
-            )
-        )
-        host_structure = Structure.from_dict(mdoc_["structure"])
+        host_structure = least_wion_ent.structure.copy()
         host_structure.remove_species([item["working_ion"]])
 
         for ient in entries:
-            if mat_structures[ient.entry_id].composition != ient.composition:
-                raise RuntimeError(
-                    f"In {item['material_id']}: the compositions for task {ient.entry_id} are matched "
-                    "between the StructureGroup DB and the Thermo DB "
-                )
-            ient.data["volume"] = mat_structures[ient.entry_id].volume
+            ient.data["volume"] = ient.structure.volume
             ient.data["decomposition_energy"] = decomp_energies[ient.entry_id]
 
         ie = InsertionElectrodeDoc.from_entries(
diff --git a/emmet-builders/emmet/builders/materials/structure_groups.py b/emmet-builders/emmet/builders/materials/structure_groups.py
new file mode 100644
index 0000000000..2cd463495e
--- /dev/null
+++ b/emmet-builders/emmet/builders/materials/structure_groups.py
@@ -0,0 +1,269 @@
+import operator
+import math
+from datetime import datetime
+from itertools import chain
+from typing import Iterable, Dict, List, Any
+
+from emmet.core.structure_group import StructureGroupDoc
+from maggma.builders import Builder
+from maggma.stores import MongoStore
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+
+__author__ = "Jimmy Shen"
+__email__ = "jmmshn@lbl.gov"
+
+from pymatgen.entries.computed_entries import ComputedEntry
+
+def s_hash(el):
+    return el.data["comp_delith"]
+
+
+# MatDoc = namedtuple("MatDoc", ["material_id", "structure", "formula_pretty", "framework"])
+
+REDOX_ELEMENTS = [
+    "Ti",
+    "V",
+    "Cr",
+    "Mn",
+    "Fe",
+    "Co",
+    "Ni",
+    "Cu",
+    "Nb",
+    "Mo",
+    "Sn",
+    "Sb",
+    "W",
+    "Re",
+    "Bi",
+    "C",
+    "Hf",
+]
+
+# WORKING_IONS = ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"]
+
+MAT_PROPS = [
+    "structure",
+    "material_id",
+    "formula_pretty",
+]
+
+sg_fields = ["number", "hall_number", "international", "hall", "choice"]
+
+
+def generic_groupby(list_in, comp=operator.eq):
+    """
+    Group a list of unsortable objects
+    Args:
+        list_in: A list of generic objects
+        comp: (Default value = operator.eq) The comparator
+    Returns:
+        [int] list of labels for the input list
+    """
+    list_out = [None] * len(list_in)
+    label_num = 0
+    for i1, ls1 in enumerate(list_out):
+        if ls1 is not None:
+            continue
+        list_out[i1] = label_num
+        for i2, ls2 in list(enumerate(list_out))[i1 + 1 :]:
+            if comp(list_in[i1], list_in[i2]):
+                if list_out[i2] is None:
+                    list_out[i2] = list_out[i1]
+                else:
+                    list_out[i1] = list_out[i2]
+                    label_num -= 1
+        label_num += 1
+    return list_out
+
+
+
+class StructureGroupBuilder(Builder):
+    def __init__(
+            self,
+            materials: MongoStore,
+            sgroups: MongoStore,
+            working_ion: str,
+            query: dict = None,
+            ltol: float = 0.2,
+            stol: float = 0.3,
+            angle_tol: float = 5.0,
+            check_newer: bool = True,
+            **kwargs,
+    ):
+        """
+        Aggregate materials entries into sgroups that are topotactically similar to each other.
+        This is an incremental builder that ensures that each material id belongs to one StructureGroupDoc document
+        Args:
+            materials (Store): Store of materials documents that contains the structures
+            sgroups (Store): Store of grouped material ids
+            query (dict): dictionary to limit materials to be analyzed ---
+                            only applied to the materials when we need to group structures
+                            the phase diagram is still constructed with the entire set
+        """
+        self.materials = materials
+        self.sgroups = sgroups
+        self.working_ion = working_ion
+        self.query = query if query else {}
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.check_newer = check_newer
+        super().__init__(sources=[materials], targets=[sgroups], **kwargs)
+
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:
+        """
+        TODO can implement this for distributed runs by adding filters
+        """
+        pass
+
+    def get_items(self):
+        """
+        Summary of the steps:
+        - query the materials database for different chemical systems that satisfies the base query
+          "contains redox element and working ion"
+        - Get the full chemsys list of interest
+        - The main loop is over all these chemsys.  within the main loop:
+            - get newest timestamp for the material documents (max_mat_time)
+            - get the oldest timestamp for the target documents (min_target_time)
+            - if min_target_time is < max_mat_time then nuke all the target documents
+        """
+
+        # All potentially interesting chemsys must contain the working ion
+        base_query = {
+            "$and": [
+                {"elements": {"$in": REDOX_ELEMENTS + [self.working_ion]}},
+                self.query.copy(),
+            ]
+        }
+        self.logger.debug(f"Initial Chemsys QUERY: {base_query}")
+
+        # Only chemical systems that contain the working ion are of interest, since the
+        # working ion must be present for there to be voltage steps
+        all_chemsys = self.materials.distinct("chemsys", criteria=base_query)
+        # Contains the working ion but not ONLY the working ion
+        all_chemsys = [
+            *filter(
+                lambda x: self.working_ion in x and len(x) > 1,
+                [chemsys_.split("-") for chemsys_ in all_chemsys],
+            )
+        ]
+
+        self.logger.debug(
+            f"Performing initial checks on {len(all_chemsys)} chemical systems containing redox elements with or without the Working Ion."
+        )
+        self.total = len(all_chemsys)
+
+        for chemsys_l in all_chemsys:
+            chemsys = "-".join(sorted(chemsys_l))
+            chemsys_wo = "-".join(sorted(set(chemsys_l) - {self.working_ion}))
+            chemsys_query = {
+                "$and": [
+                    {"chemsys": {"$in": [chemsys_wo, chemsys]}},
+                    self.query.copy(),
+                ]
+            }
+            self.logger.debug(f"QUERY: {chemsys_query}")
+            all_mats_in_chemsys = list(
+                self.materials.query(
+                    criteria=chemsys_query,
+                    properties=MAT_PROPS + [self.materials.last_updated_field],
+                )
+            )
+            self.logger.debug(
+                f"Found {len(all_mats_in_chemsys)} materials in {chemsys_wo}"
+            )
+            if self.check_newer:
+                all_target_docs = list(
+                    self.sgroups.query(
+                        criteria={"chemsys": chemsys},
+                        properties=[
+                            "material_id",
+                            self.sgroups.last_updated_field,
+                            "grouped_ids",
+                        ],
+                    )
+                )
+                self.logger.debug(
+                    f"Found {len(all_target_docs)} Grouped documents in {chemsys_wo}"
+                )
+
+                mat_times = [
+                    mat_doc[self.materials.last_updated_field]
+                    for mat_doc in all_mats_in_chemsys
+                ]
+                max_mat_time = max(mat_times, default=datetime.min)
+                self.logger.debug(
+                    f"The newest material doc was generated at {max_mat_time}."
+                )
+
+                target_times = [
+                    g_doc[self.materials.last_updated_field]
+                    for g_doc in all_target_docs
+                ]
+                min_target_time = min(target_times, default=datetime.max)
+                self.logger.debug(
+                    f"The newest GROUP doc was generated at {min_target_time}."
+                )
+
+                mat_ids = set(
+                    [mat_doc["material_id"] for mat_doc in all_mats_in_chemsys]
+                )
+
+                # If any material id is missing or if any material id has been updated
+                target_mat_ids = set()
+                for g_doc in all_target_docs:
+                    target_mat_ids |= set(g_doc["grouped_ids"])
+
+                self.logger.debug(
+                    f"There are {len(mat_ids)} material ids in the source database vs {len(target_mat_ids)} in the target database."
+                )
+                if mat_ids == target_mat_ids and max_mat_time < min_target_time:
+                    continue
+                else:
+                    self.logger.info(
+                        f"Nuking all {len(target_mat_ids)} documents in chemsys {chemsys} in the target database."
+                    )
+                    self._remove_targets(list(target_mat_ids))
+
+            yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
+
+    def update_targets(self, items: List):
+        items = list(filter(None, chain.from_iterable(items)))
+        if len(items) > 0:
+            self.logger.info("Updating {} sgroups documents".format(len(items)))
+            for struct_group_dict in items:
+                struct_group_dict[self.sgroups.last_updated_field] = datetime.utcnow()
+            self.sgroups.update(docs=items, key=["material_id"])
+        else:
+            self.logger.info("No items to update")
+
+    def _entry_from_mat_doc(self, mdoc):
+        # Note since we are just structure grouping we don't need to be careful with energy or correction
+        # All of the energy analysis is left to other builders
+        d_ = {
+            "entry_id": mdoc["material_id"],
+            "structure": mdoc["structure"],
+            "energy": -math.inf,
+            "correction": -math.inf,
+        }
+        return ComputedStructureEntry.from_dict(d_)
+
+    def process_item(self, item: Any) -> Any:
+        entries = [*map(self._entry_from_mat_doc, item["materials"])]
+        s_groups = StructureGroupDoc.from_ungrouped_structure_entries(
+            entries=entries,
+            ignored_species=[self.working_ion],
+            ltol=self.ltol,
+            stol=self.stol,
+            angle_tol=self.angle_tol,
+        )
+        # append the working_ion to the group ids
+        for sg in s_groups:
+            sg.material_id = f"{sg.material_id}_{self.working_ion}"
+        return [sg.dict() for sg in s_groups]
+
+    def _remove_targets(self, rm_ids):
+        self.sgroups.remove_docs({"material_id": {"$in": rm_ids}})
+
+
diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py
index 4cc41b44f5..514e6fe957 100644
--- a/emmet-builders/emmet/builders/vasp/thermo.py
+++ b/emmet-builders/emmet/builders/vasp/thermo.py
@@ -116,17 +116,17 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
 
         self.logger.debug(f"Procesing {len(entries)} entries for {chemsys}")
 
-        material_entries = defaultdict(lambda: defaultdict(list))
+        material_entries = defaultdict(dict)
         pd_entries = []
         for entry in entries:
-            material_entries[entry.entry_id][entry.data["run_type"]].append(entry)
+            material_entries[entry.entry_id][entry.data["run_type"]] = entry
 
         # TODO: How to make this general and controllable via SETTINGS?
         for material_id in material_entries:
             if "GGA+U" in material_entries[material_id]:
-                pd_entries.extend(material_entries[material_id]["GGA+U"])
+                pd_entries.append(material_entries[material_id]["GGA+U"])
             elif "GGA" in material_entries[material_id]:
-                pd_entries.extend(material_entries[material_id]["GGA"])
+                pd_entries.append(material_entries[material_id]["GGA"])
         pd_entries = self.compatibility.process_entries(pd_entries)
 
         try:
@@ -141,7 +141,7 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
                 elsyms.extend([el.symbol for el in e.composition.elements])
 
             self.logger.warning(
-                f"Phase diagram errorin chemsys {'-'.join(sorted(set(elsyms)))}: {p}"
+                f"Phase diagram error in chemsys {'-'.join(sorted(set(elsyms)))}: {p}"
             )
             return []
         except Exception as e:
diff --git a/emmet-core/emmet/core/electrode.py b/emmet-core/emmet/core/electrode.py
index 26f9ce58d9..e1d643cff9 100644
--- a/emmet-core/emmet/core/electrode.py
+++ b/emmet-core/emmet/core/electrode.py
@@ -117,6 +117,8 @@ class InsertionElectrodeDoc(InsertionVoltagePairDoc):
 
     framework: Composition
 
+    electrode_object: Dict
+
     # Make sure that the datetime field is properly formatted
     @validator("last_updated", pre=True)
     def last_updated_dict_ok(cls, v):
@@ -132,7 +134,7 @@ def from_entries(
     ) -> Union["InsertionElectrodeDoc", None]:
         try:
             ie = InsertionElectrode.from_entries(
-                entries=grouped_entries, working_ion_entry=working_ion_entry
+                entries=grouped_entries, working_ion_entry=working_ion_entry, strip_structures=True
             )
         except IndexError:
             return None
@@ -140,9 +142,10 @@ def from_entries(
         d["num_steps"] = d.pop("nsteps", None)
         d["last_updated"] = datetime.utcnow()
         return cls(
-            task_id=task_id,
+            battery_id=task_id,
             host_structure=host_structure.as_dict(),
             framework=Composition(d["framework_formula"]),
+            electrode_object=ie.as_dict(),
             **d
         )
 

From c23b77b68bb2605f970e646ec26410f7fdec8ce9 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@lbl.gov>
Date: Fri, 12 Mar 2021 07:21:21 -0800
Subject: [PATCH 2/8] combined all electrode builders into one file

---
 .../emmet/builders/materials/electrodes.py    | 287 +++++++++++++++++-
 .../builders/materials/structure_groups.py    | 269 ----------------
 2 files changed, 271 insertions(+), 285 deletions(-)
 delete mode 100644 emmet-builders/emmet/builders/materials/structure_groups.py

diff --git a/emmet-builders/emmet/builders/materials/electrodes.py b/emmet-builders/emmet/builders/materials/electrodes.py
index 73be677873..e8efefe193 100644
--- a/emmet-builders/emmet/builders/materials/electrodes.py
+++ b/emmet-builders/emmet/builders/materials/electrodes.py
@@ -1,36 +1,291 @@
-import operator
 import math
+import operator
 from collections import namedtuple
 from datetime import datetime
 from functools import lru_cache
-from itertools import groupby, chain
+from itertools import chain, groupby
 from pprint import pprint
-from typing import Iterable, Dict, List, Any
+from typing import Any, Dict, Iterable, List
 
-from emmet.core.electrode import InsertionElectrodeDoc
-from emmet.core.structure_group import StructureGroupDoc
-from emmet.core.utils import jsanitize
 from maggma.builders import Builder, MapBuilder
 from maggma.stores import MongoStore
 from monty.json import MontyEncoder
 from numpy import unique
-from pymatgen.core import Composition
-from pymatgen.analysis.structure_matcher import StructureMatcher, ElementComparator
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
 from pymatgen.apps.battery.insertion_battery import InsertionElectrode
-from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry, ComputedEntry
+from pymatgen.core import Composition, Structure
+from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
+
+from emmet.core.electrode import InsertionElectrodeDoc
+from emmet.core.structure_group import StructureGroupDoc
+from emmet.core.utils import jsanitize
 
 __author__ = "Jimmy Shen"
 __email__ = "jmmshn@lbl.gov"
 
+from pymatgen.entries.computed_entries import ComputedEntry
+
+
+def s_hash(el):
+    return el.data["comp_delith"]
+
+
+# MatDoc = namedtuple("MatDoc", ["material_id", "structure", "formula_pretty", "framework"])
+
+REDOX_ELEMENTS = [
+    "Ti",
+    "V",
+    "Cr",
+    "Mn",
+    "Fe",
+    "Co",
+    "Ni",
+    "Cu",
+    "Nb",
+    "Mo",
+    "Sn",
+    "Sb",
+    "W",
+    "Re",
+    "Bi",
+    "C",
+    "Hf",
+]
+
+# WORKING_IONS = ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"]
+
+MAT_PROPS = [
+    "structure",
+    "material_id",
+    "formula_pretty",
+]
+
+sg_fields = ["number", "hall_number", "international", "hall", "choice"]
+
+
+def generic_groupby(list_in, comp=operator.eq):
+    """
+    Group a list of unsortable objects
+    Args:
+        list_in: A list of generic objects
+        comp: (Default value = operator.eq) The comparator
+    Returns:
+        [int] list of labels for the input list
+    """
+    list_out = [None] * len(list_in)
+    label_num = 0
+    for i1, ls1 in enumerate(list_out):
+        if ls1 is not None:
+            continue
+        list_out[i1] = label_num
+        for i2, ls2 in list(enumerate(list_out))[i1 + 1 :]:
+            if comp(list_in[i1], list_in[i2]):
+                if list_out[i2] is None:
+                    list_out[i2] = list_out[i1]
+                else:
+                    list_out[i1] = list_out[i2]
+                    label_num -= 1
+        label_num += 1
+    return list_out
+
+
+
+class StructureGroupBuilder(Builder):
+    def __init__(
+            self,
+            materials: MongoStore,
+            sgroups: MongoStore,
+            working_ion: str,
+            query: dict = None,
+            ltol: float = 0.2,
+            stol: float = 0.3,
+            angle_tol: float = 5.0,
+            check_newer: bool = True,
+            **kwargs,
+    ):
+        """
+        Aggregate materials entries into sgroups that are topotactically similar to each other.
+        This is an incremental builder that ensures that each materials id belongs to one StructureGroupDoc document
+        Args:
+            materials (Store): Store of materials documents that contains the structures
+            sgroups (Store): Store of grouped material ids
+            query (dict): dictionary to limit materials to be analyzed ---
+                            only applied to the materials when we need to group structures
+                            the phase diagram is still constructed with the entire set
+        """
+        self.materials = materials
+        self.sgroups = sgroups
+        self.working_ion = working_ion
+        self.query = query if query else {}
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.check_newer = check_newer
+        super().__init__(sources=[materials], targets=[sgroups], **kwargs)
+
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:
+        """
+        TODO can implement this for distributed runs by adding filters
+        """
+        pass
+
+    def get_items(self):
+        """
+        Yield one work item per chemical system that contains the working ion.
+        - query the materials store for the distinct chemsys that satisfy the base query
+          (documents whose elements include a redox element or the working ion)
+        - keep the chemsys that contain the working ion plus at least one other element
+        - for each chemsys load the materials with and without the working ion; if check_newer:
+            - get the newest timestamp of the material documents (max_mat_time)
+            - get the oldest timestamp of the target documents (min_target_time)
+            - skip if ids match and max_mat_time < min_target_time, else remove the target documents
+        """
+
+        # Broad pre-filter: documents containing a redox element or the working ion
+        base_query = {
+            "$and": [
+                {"elements": {"$in": REDOX_ELEMENTS + [self.working_ion]}},
+                self.query.copy(),
+            ]
+        }
+        self.logger.debug(f"Initial Chemsys QUERY: {base_query}")
+
+        # collect the distinct chemsys of those documents; the working ion
+        # must be present for there to be voltage steps
+        all_chemsys = self.materials.distinct("chemsys", criteria=base_query)
+        # Contains the working ion but not ONLY the working ion
+        all_chemsys = [
+            *filter(
+                lambda x: self.working_ion in x and len(x) > 1,
+                [chemsys_.split("-") for chemsys_ in all_chemsys],
+            )
+        ]
+
+        self.logger.debug(
+            f"Performing initial checks on {len(all_chemsys)} chemical systems containing redox elements with or without the Working Ion."
+        )
+        self.total = len(all_chemsys)
+
+        for chemsys_l in all_chemsys:
+            chemsys = "-".join(sorted(chemsys_l))
+            chemsys_wo = "-".join(sorted(set(chemsys_l) - {self.working_ion}))
+            chemsys_query = {
+                "$and": [
+                    {"chemsys": {"$in": [chemsys_wo, chemsys]}},
+                    self.query.copy(),
+                ]
+            }
+            self.logger.debug(f"QUERY: {chemsys_query}")
+            all_mats_in_chemsys = list(
+                self.materials.query(
+                    criteria=chemsys_query,
+                    properties=MAT_PROPS + [self.materials.last_updated_field],
+                )
+            )
+            self.logger.debug(
+                f"Found {len(all_mats_in_chemsys)} materials in {chemsys_wo}"
+            )
+            if self.check_newer:
+                all_target_docs = list(
+                    self.sgroups.query(
+                        criteria={"chemsys": chemsys},
+                        properties=[
+                            "material_id",
+                            self.sgroups.last_updated_field,
+                            "grouped_ids",
+                        ],
+                    )
+                )
+                self.logger.debug(
+                    f"Found {len(all_target_docs)} Grouped documents in {chemsys_wo}"
+                )
+
+                mat_times = [
+                    mat_doc[self.materials.last_updated_field]
+                    for mat_doc in all_mats_in_chemsys
+                ]
+                max_mat_time = max(mat_times, default=datetime.min)
+                self.logger.debug(
+                    f"The newest material doc was generated at {max_mat_time}."
+                )
+
+                target_times = [
+                    g_doc[self.sgroups.last_updated_field]
+                    for g_doc in all_target_docs
+                ]
+                min_target_time = min(target_times, default=datetime.max)
+                self.logger.debug(
+                    f"The oldest GROUP doc was generated at {min_target_time}."
+                )
+
+                mat_ids = {
+                    mat_doc["material_id"] for mat_doc in all_mats_in_chemsys
+                }
+
+                # Rebuild if any material id is missing or if any material doc is newer than a group doc
+                target_mat_ids = set()
+                for g_doc in all_target_docs:
+                    target_mat_ids |= set(g_doc["grouped_ids"])
+
+                self.logger.debug(
+                    f"There are {len(mat_ids)} material ids in the source database vs {len(target_mat_ids)} in the target database."
+                )
+                if mat_ids == target_mat_ids and max_mat_time < min_target_time:
+                    continue
+                # Group docs are keyed by their own material_id, not by the member ids in grouped_ids
+                self.logger.info(
+                    f"Nuking all {len(all_target_docs)} documents in chemsys {chemsys} in the target database."
+                )
+                self._remove_targets([g_doc["material_id"] for g_doc in all_target_docs])
+
+            yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
+
+    def update_targets(self, items: List):
+        items = list(filter(None, chain.from_iterable(items)))
+        if len(items) > 0:
+            self.logger.info("Updating {} sgroups documents".format(len(items)))
+            for struct_group_dict in items:
+                struct_group_dict[self.sgroups.last_updated_field] = datetime.utcnow()
+            self.sgroups.update(docs=items, key=["material_id"])
+        else:
+            self.logger.info("No items to update")
+
+    def _entry_from_mat_doc(self, mdoc):
+        # Note since we are just structure grouping we don't need to be careful with energy or correction
+        # All of the energy analysis is left to other builders
+        d_ = {
+            "entry_id": mdoc["material_id"],
+            "structure": mdoc["structure"],
+            "energy": -math.inf,
+            "correction": -math.inf,
+        }
+        return ComputedStructureEntry.from_dict(d_)
+
+    def process_item(self, item: Any) -> Any:
+        entries = [*map(self._entry_from_mat_doc, item["materials"])]
+        s_groups = StructureGroupDoc.from_ungrouped_structure_entries(
+            entries=entries,
+            ignored_species=[self.working_ion],
+            ltol=self.ltol,
+            stol=self.stol,
+            angle_tol=self.angle_tol,
+        )
+        # append the working_ion to the group ids
+        for sg in s_groups:
+            sg.material_id = f"{sg.material_id}_{self.working_ion}"
+        return [sg.dict() for sg in s_groups]
+
+    def _remove_targets(self, rm_ids):
+        self.sgroups.remove_docs({"material_id": {"$in": rm_ids}})
+
 class InsertionElectrodeBuilder(MapBuilder):
     def __init__(
-        self,
-        grouped_materials: MongoStore,
-        insertion_electrode: MongoStore,
-        thermo: MongoStore,
-        query: dict = None,
-        **kwargs,
+            self,
+            grouped_materials: MongoStore,
+            insertion_electrode: MongoStore,
+            thermo: MongoStore,
+            query: dict = None,
+            **kwargs,
     ):
         self.grouped_materials = grouped_materials
         self.insertion_electrode = insertion_electrode
diff --git a/emmet-builders/emmet/builders/materials/structure_groups.py b/emmet-builders/emmet/builders/materials/structure_groups.py
deleted file mode 100644
index 2cd463495e..0000000000
--- a/emmet-builders/emmet/builders/materials/structure_groups.py
+++ /dev/null
@@ -1,269 +0,0 @@
-import operator
-import math
-from datetime import datetime
-from itertools import chain
-from typing import Iterable, Dict, List, Any
-
-from emmet.core.structure_group import StructureGroupDoc
-from maggma.builders import Builder
-from maggma.stores import MongoStore
-from pymatgen.entries.computed_entries import ComputedStructureEntry
-
-__author__ = "Jimmy Shen"
-__email__ = "jmmshn@lbl.gov"
-
-from pymatgen.entries.computed_entries import ComputedEntry
-
-def s_hash(el):
-    return el.data["comp_delith"]
-
-
-# MatDoc = namedtuple("MatDoc", ["material_id", "structure", "formula_pretty", "framework"])
-
-REDOX_ELEMENTS = [
-    "Ti",
-    "V",
-    "Cr",
-    "Mn",
-    "Fe",
-    "Co",
-    "Ni",
-    "Cu",
-    "Nb",
-    "Mo",
-    "Sn",
-    "Sb",
-    "W",
-    "Re",
-    "Bi",
-    "C",
-    "Hf",
-]
-
-# WORKING_IONS = ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"]
-
-MAT_PROPS = [
-    "structure",
-    "material_id",
-    "formula_pretty",
-]
-
-sg_fields = ["number", "hall_number", "international", "hall", "choice"]
-
-
-def generic_groupby(list_in, comp=operator.eq):
-    """
-    Group a list of unsortable objects
-    Args:
-        list_in: A list of generic objects
-        comp: (Default value = operator.eq) The comparator
-    Returns:
-        [int] list of labels for the input list
-    """
-    list_out = [None] * len(list_in)
-    label_num = 0
-    for i1, ls1 in enumerate(list_out):
-        if ls1 is not None:
-            continue
-        list_out[i1] = label_num
-        for i2, ls2 in list(enumerate(list_out))[i1 + 1 :]:
-            if comp(list_in[i1], list_in[i2]):
-                if list_out[i2] is None:
-                    list_out[i2] = list_out[i1]
-                else:
-                    list_out[i1] = list_out[i2]
-                    label_num -= 1
-        label_num += 1
-    return list_out
-
-
-
-class StructureGroupBuilder(Builder):
-    def __init__(
-            self,
-            materials: MongoStore,
-            sgroups: MongoStore,
-            working_ion: str,
-            query: dict = None,
-            ltol: float = 0.2,
-            stol: float = 0.3,
-            angle_tol: float = 5.0,
-            check_newer: bool = True,
-            **kwargs,
-    ):
-        """
-        Aggregate materials entries into sgroups that are topotactically similar to each other.
-        This is an incremental builder that makes ensures that each materials id belongs to one StructureGroupDoc document
-        Args:
-            materials (Store): Store of materials documents that contains the structures
-            sgroups (Store): Store of grouped material ids
-            query (dict): dictionary to limit materials to be analyzed ---
-                            only applied to the materials when we need to group structures
-                            the phase diagram is still constructed with the entire set
-        """
-        self.materials = materials
-        self.sgroups = sgroups
-        self.working_ion = working_ion
-        self.query = query if query else {}
-        self.ltol = ltol
-        self.stol = stol
-        self.angle_tol = angle_tol
-        self.check_newer = check_newer
-        super().__init__(sources=[materials], targets=[sgroups], **kwargs)
-
-    def prechunk(self, number_splits: int) -> Iterable[Dict]:
-        """
-        TODO can implement this for distributed runs by adding filters
-        """
-        pass
-
-    def get_items(self):
-        """
-        Summary of the steps:
-        - query the materials database for different chemical systems that satisfies the base query
-          "contains redox element and working ion"
-        - Get the full chemsys list of interest
-        - The main loop is over all these chemsys.  within the main loop:
-            - get newest timestamp for the material documents (max_mat_time)
-            - get the oldest timestamp for the target documents (min_target_time)
-            - if min_target_time is < max_mat_time then nuke all the target documents
-        """
-
-        # All potentially interesting chemsys must contain the working ion
-        base_query = {
-            "$and": [
-                {"elements": {"$in": REDOX_ELEMENTS + [self.working_ion]}},
-                self.query.copy(),
-            ]
-        }
-        self.logger.debug(f"Initial Chemsys QUERY: {base_query}")
-
-        # get a chemsys that only contains the working ion since the working ion
-        # must be present for there to be voltage steps
-        all_chemsys = self.materials.distinct("chemsys", criteria=base_query)
-        # Contains the working ion but not ONLY the working ion
-        all_chemsys = [
-            *filter(
-                lambda x: self.working_ion in x and len(x) > 1,
-                [chemsys_.split("-") for chemsys_ in all_chemsys],
-            )
-        ]
-
-        self.logger.debug(
-            f"Performing initial checks on {len(all_chemsys)} chemical systems containing redox elements with or without the Working Ion."
-        )
-        self.total = len(all_chemsys)
-
-        for chemsys_l in all_chemsys:
-            chemsys = "-".join(sorted(chemsys_l))
-            chemsys_wo = "-".join(sorted(set(chemsys_l) - {self.working_ion}))
-            chemsys_query = {
-                "$and": [
-                    {"chemsys": {"$in": [chemsys_wo, chemsys]}},
-                    self.query.copy(),
-                ]
-            }
-            self.logger.debug(f"QUERY: {chemsys_query}")
-            all_mats_in_chemsys = list(
-                self.materials.query(
-                    criteria=chemsys_query,
-                    properties=MAT_PROPS + [self.materials.last_updated_field],
-                )
-            )
-            self.logger.debug(
-                f"Found {len(all_mats_in_chemsys)} materials in {chemsys_wo}"
-            )
-            if self.check_newer:
-                all_target_docs = list(
-                    self.sgroups.query(
-                        criteria={"chemsys": chemsys},
-                        properties=[
-                            "material_id",
-                            self.sgroups.last_updated_field,
-                            "grouped_ids",
-                        ],
-                    )
-                )
-                self.logger.debug(
-                    f"Found {len(all_target_docs)} Grouped documents in {chemsys_wo}"
-                )
-
-                mat_times = [
-                    mat_doc[self.materials.last_updated_field]
-                    for mat_doc in all_mats_in_chemsys
-                ]
-                max_mat_time = max(mat_times, default=datetime.min)
-                self.logger.debug(
-                    f"The newest material doc was generated at {max_mat_time}."
-                )
-
-                target_times = [
-                    g_doc[self.materials.last_updated_field]
-                    for g_doc in all_target_docs
-                ]
-                min_target_time = min(target_times, default=datetime.max)
-                self.logger.debug(
-                    f"The newest GROUP doc was generated at {min_target_time}."
-                )
-
-                mat_ids = set(
-                    [mat_doc["material_id"] for mat_doc in all_mats_in_chemsys]
-                )
-
-                # If any material id is missing or if any material id has been updated
-                target_mat_ids = set()
-                for g_doc in all_target_docs:
-                    target_mat_ids |= set(g_doc["grouped_ids"])
-
-                self.logger.debug(
-                    f"There are {len(mat_ids)} material ids in the source database vs {len(target_mat_ids)} in the target database."
-                )
-                if mat_ids == target_mat_ids and max_mat_time < min_target_time:
-                    continue
-                else:
-                    self.logger.info(
-                        f"Nuking all {len(target_mat_ids)} documents in chemsys {chemsys} in the target database."
-                    )
-                    self._remove_targets(list(target_mat_ids))
-
-            yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
-
-    def update_targets(self, items: List):
-        items = list(filter(None, chain.from_iterable(items)))
-        if len(items) > 0:
-            self.logger.info("Updating {} sgroups documents".format(len(items)))
-            for struct_group_dict in items:
-                struct_group_dict[self.sgroups.last_updated_field] = datetime.utcnow()
-            self.sgroups.update(docs=items, key=["material_id"])
-        else:
-            self.logger.info("No items to update")
-
-    def _entry_from_mat_doc(self, mdoc):
-        # Note since we are just structure grouping we don't need to be careful with energy or correction
-        # All of the energy analysis is left to other builders
-        d_ = {
-            "entry_id": mdoc["material_id"],
-            "structure": mdoc["structure"],
-            "energy": -math.inf,
-            "correction": -math.inf,
-        }
-        return ComputedStructureEntry.from_dict(d_)
-
-    def process_item(self, item: Any) -> Any:
-        entries = [*map(self._entry_from_mat_doc, item["materials"])]
-        s_groups = StructureGroupDoc.from_ungrouped_structure_entries(
-            entries=entries,
-            ignored_species=[self.working_ion],
-            ltol=self.ltol,
-            stol=self.stol,
-            angle_tol=self.angle_tol,
-        )
-        # append the working_ion to the group ids
-        for sg in s_groups:
-            sg.material_id = f"{sg.material_id}_{self.working_ion}"
-        return [sg.dict() for sg in s_groups]
-
-    def _remove_targets(self, rm_ids):
-        self.sgroups.remove_docs({"material_id": {"$in": rm_ids}})
-
-

From 7beca8c359e327bf9b39edc0079b0d8a67d74c0e Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@lbl.gov>
Date: Fri, 12 Mar 2021 12:27:18 -0800
Subject: [PATCH 3/8] wip

wip
---
 emmet-core/emmet/core/migration_graph.py | 117 +++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 emmet-core/emmet/core/migration_graph.py

diff --git a/emmet-core/emmet/core/migration_graph.py b/emmet-core/emmet/core/migration_graph.py
new file mode 100644
index 0000000000..e96e783f81
--- /dev/null
+++ b/emmet-core/emmet/core/migration_graph.py
@@ -0,0 +1,117 @@
+from typing import Dict, List, Union, Tuple
+
+from pydantic import BaseModel, Field, validator
+from pymatgen.analysis.diffusion.neb.full_path_mapper import MigrationGraph
+from pymatgen.analysis.diffusion.neb.pathfinder import MigrationHop
+from pymatgen.analysis.graphs import StructureGraph
+from pymatgen.core import Composition, Structure, PeriodicSite
+
+
+class Hop(BaseModel):
+    """
+    Data for a particular hop, this is distinct from the Migration Hop object since this document
+    only stores the data related to a particular hop but not the symmetrized structure itself.
+    """
+    iindex: int = Field(None, description="")
+    eindex: int = Field(None, description="")
+    ipos: Tuple[float, float, float] = Field(None, description="")
+    epos: Tuple[float, float, float] = Field(None, description="")
+    ipos_cart: Tuple[float, float, float] = Field(None, description="")
+    epos_cart: Tuple[float, float, float] = Field(None, description="")
+    to_jimage: Tuple[int, int, int] = Field(None, description="")
+    distance: float = Field(None, description="")
+    hop_label: int = Field(None, description="")
+
+
+class MigrationGraphDoc(BaseModel):
+    """
+    Data for MigrationGraph objects from pymatgen-diffusion.
+    Note:
+        This will just be used to construct the object for each material.
+        The only data we will use are the "site energies" defined at each meta-stable migrating ion site.
+        In the future more advanced query capabilities should be introduced with fields in the document model.
+    """
+
+    structure: Structure = Field(
+        None,
+        description="The atomic structure with all migting ion sites represented as atoms of the same species."
+    )
+
+    m_graph: StructureGraph = Field(
+        None,
+        description="The structure graph that represents the migration network."
+    )
+
+    hops: Dict[int, Hop] = Field(
+        None,
+        description="All of the hops in the system given as a list."
+    )
+
+    unique_hops: Dict[int, Hop] = Field(
+        None,
+        description="The unique hops dictionary keyed by the hop label {0: {=Dictionary of properties=}}"
+    )
+
+    host_structure: Structure = Field(
+        None,
+        description="The empty host lattice without the migrating ion."
+    )
+
+    symprec: float = Field(None, description="Parameter used by pymatgen to determin equivalent hops.")
+
+    vac_mode: bool = Field(None, description="Indicates whether vacancy mode should be used [currently under-supported].")
+
+    @classmethod
+    def from_migration_graph(cls, migration_graph: MigrationGraph):
+        """
+        Construct the document using a MigrationGraph object
+        """
+        summary_dict = migration_graph.get_summary_dict()
+
+        return cls(
+            structure=migration_graph.structure,
+            m_graph=migration_graph.m_graph,
+            hops = summary_dict["hops"],
+            unique_hops=summary_dict["unique_hops"],
+            host_structure=migration_graph.host_structure,
+            symprec=migration_graph.symprec,
+            vac_mode=migration_graph.vac_mode
+        )
+
+    def as_migration_graph(self):
+        """
+        Get a migration graph object from this document
+        """
+        mg = MigrationGraph(
+            structure=self.structure,
+            m_graph=self.m_graph,
+            symprec=self.symprec,
+            vac_mode=self.vac_mode
+        )
+
+        # make sure there is a one-to-one mapping between the unique hops dictionary
+        def get_mg_uhop_key(ipos, epos):
+            """Return the key in ``mg.unique_hops`` whose hop equals ipos -> epos, or None if absent."""
+            # Build the sites from the arguments; the document has no ``ipos``/``epos`` attributes.
+            # NOTE(review): pymatgen's PeriodicSite also expects a species argument -- confirm.
+            isite = PeriodicSite(coords=ipos, lattice=self.structure.lattice)
+            esite = PeriodicSite(coords=epos, lattice=self.structure.lattice)
+            hop = MigrationHop(isite, esite, symm_structure=mg.symm_structure)
+            return next((k for k, v in mg.unique_hops.items() if hop == v["hop"]), None)
+
+        for k, v in self.unique_hops.items():
+            mg_k = get_mg_uhop_key(v["ipos"], v["epos"])
+            if k != mg_k:
+                raise RuntimeError("The unique hops in the reconstructed migration graph is different than the one in the document"
+                                   f"MigrationGraphDoc ({k}) MigrationGraph ({mg_k})")
+
+        # TODO add any datamapping from the DB to reconstructed object here.
+        return mg
+
+
+
+
+
+
+
+

From 29bb3dbfda6e071162126178c79182cc4b7413d8 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Sun, 14 Mar 2021 20:58:25 -0700
Subject: [PATCH 4/8] delete spaces

---
 emmet-core/emmet/core/migration_graph.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/emmet-core/emmet/core/migration_graph.py b/emmet-core/emmet/core/migration_graph.py
index e96e783f81..30eb60a80d 100644
--- a/emmet-core/emmet/core/migration_graph.py
+++ b/emmet-core/emmet/core/migration_graph.py
@@ -106,12 +106,5 @@ def get_mg_uhop_key(ipos, epos):
                                    f"MigrationGraphDoc ({k}) MigrationGraph ({mg_k})")
 
         # TODO add any datamapping from the DB to reconstructed object here.
-        return mg
-
-
-
-
-
-
-
 
+        return mg
\ No newline at end of file

From 6cee3ff9c4a0991e67b10ee49949cb6e6ed50a9a Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Mon, 15 Mar 2021 18:14:16 -0700
Subject: [PATCH 5/8] changed task_id -> material_id

---
 emmet-builders/emmet/builders/vasp/thermo.py | 2 +-
 emmet-builders/requirements.txt              | 4 ++--
 emmet-core/emmet/core/migration_graph.py     | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py
index 514e6fe957..3c65c5bc7a 100644
--- a/emmet-builders/emmet/builders/vasp/thermo.py
+++ b/emmet-builders/emmet/builders/vasp/thermo.py
@@ -249,7 +249,7 @@ def get_new_chemsys(self) -> Set:
         thermo_mat_ids = self.thermo.distinct(self.thermo.key)
         mat_ids = self.materials.distinct(self.materials.key, self.query)
         dif_task_ids = list(set(mat_ids) - set(thermo_mat_ids))
-        q = {"task_id": {"$in": dif_task_ids}}
+        q = {"material_id": {"$in": dif_task_ids}}
         new_mat_chemsys = set(self.materials.distinct("chemsys", q))
         self.logger.debug(f"Found {len(new_mat_chemsys)} new chemical systems")
 
diff --git a/emmet-builders/requirements.txt b/emmet-builders/requirements.txt
index fcc8b3e924..74cfa2f591 100644
--- a/emmet-builders/requirements.txt
+++ b/emmet-builders/requirements.txt
@@ -1,3 +1,3 @@
-pymatgen==2020.12.31
-maggma==0.25.0
+pymatgen>=2020.12.31
+maggma>=0.25.0
 emmet-core
diff --git a/emmet-core/emmet/core/migration_graph.py b/emmet-core/emmet/core/migration_graph.py
index 30eb60a80d..80c1d72f30 100644
--- a/emmet-core/emmet/core/migration_graph.py
+++ b/emmet-core/emmet/core/migration_graph.py
@@ -102,8 +102,8 @@ def get_mg_uhop_key(ipos, epos):
         for k, v in self.unique_hops.items():
             mg_k = get_mg_uhop_key(v["ipos"], v["epos"])
             if k != mg_k:
-                raise RuntimeError("The unique hops in the reconstructed migration graph is different than the one in the document"
-                                   f"MigrationGraphDoc ({k}) MigrationGraph ({mg_k})")
+                raise RuntimeError("The unique hops in the reconstructed migration graph is different than the one "
+                                   f"in the document MigrationGraphDoc ({k}) MigrationGraph ({mg_k})")
 
         # TODO add any datamapping from the DB to reconstructed object here.
 

From 13d21b1ed93e8620f74e37ca4178b4d290084fb7 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Mon, 15 Mar 2021 22:04:56 -0700
Subject: [PATCH 6/8] updated error message

updated error message


update
---
 emmet-builders/emmet/builders/vasp/thermo.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py
index 3c65c5bc7a..850b3922c6 100644
--- a/emmet-builders/emmet/builders/vasp/thermo.py
+++ b/emmet-builders/emmet/builders/vasp/thermo.py
@@ -114,7 +114,7 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
         )
         chemsys = "-".join(elements)
 
-        self.logger.debug(f"Procesing {len(entries)} entries for {chemsys}")
+        self.logger.debug(f"Processing {len(entries)} entries for {chemsys}")
 
         material_entries = defaultdict(dict)
         pd_entries = []
@@ -128,6 +128,7 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
             elif "GGA" in material_entries[material_id]:
                 pd_entries.append(material_entries[material_id]["GGA"])
         pd_entries = self.compatibility.process_entries(pd_entries)
+        self.logger.debug(f"{len(pd_entries)} remain in {chemsys} after filtering")
 
         try:
             docs = ThermoDoc.from_entries(pd_entries)
@@ -145,7 +146,7 @@ def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
             )
             return []
         except Exception as e:
-            self.logger.error(f"Got unexpected error: {e}")
+            self.logger.error(f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}")
             return []
 
         return [d.dict() for d in docs]

From 4f5a0f6ffe670ee5c8233e96413fb6a85dcffc61 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Mon, 15 Mar 2021 22:07:52 -0700
Subject: [PATCH 7/8] return nothing

---
 emmet-builders/emmet/builders/vasp/thermo.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py
index 850b3922c6..a04eed5b67 100644
--- a/emmet-builders/emmet/builders/vasp/thermo.py
+++ b/emmet-builders/emmet/builders/vasp/thermo.py
@@ -106,6 +106,8 @@ def get_items(self) -> Iterator[List[Dict]]:
     def process_item(self, item: Tuple[List[str], List[ComputedEntry]]):
 
         entries = item
+        if len(entries) == 0:
+            return []
 
         entries = [ComputedStructureEntry.from_dict(entry) for entry in entries]
         # determine chemsys

From b5e2c7978e5505e1aba57638d8be97b107548f62 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Tue, 16 Mar 2021 14:27:13 -0700
Subject: [PATCH 8/8] changed pymatgen version

---
 emmet-builders/emmet/builders/vasp/thermo.py |  7 +------
 emmet-builders/requirements.txt              |  2 +-
 emmet-builders/setup.py                      |  6 +++++-
 emmet-core/emmet/core/thermo.py              |  7 +------
 emmet-core/emmet/core/vasp/material.py       | 14 ++++++--------
 emmet-core/requirements.txt                  |  2 +-
 6 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/emmet-builders/emmet/builders/vasp/thermo.py b/emmet-builders/emmet/builders/vasp/thermo.py
index a04eed5b67..124c83222d 100644
--- a/emmet-builders/emmet/builders/vasp/thermo.py
+++ b/emmet-builders/emmet/builders/vasp/thermo.py
@@ -6,7 +6,7 @@
 from maggma.core import Builder, Store
 from monty.json import MontyDecoder
 from pymatgen.core import Structure
-from pymatgen.analysis.phase_diagram import PhaseDiagram
+from pymatgen.analysis.phase_diagram import PhaseDiagramError
 from pymatgen.analysis.structure_analyzer import oxide_type
 from pymatgen.entries.compatibility import MaterialsProjectCompatibility
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
@@ -18,11 +18,6 @@
 )
 from emmet.core.thermo import ThermoDoc
 from emmet.core.vasp.calc_types import run_type
-class PhaseDiagramError(Exception):
-    """
-    An exception class for Phase Diagram generation.
-    """
-    pass
 
 class Thermo(Builder):
     def __init__(
diff --git a/emmet-builders/requirements.txt b/emmet-builders/requirements.txt
index 74cfa2f591..15986ed13e 100644
--- a/emmet-builders/requirements.txt
+++ b/emmet-builders/requirements.txt
@@ -1,3 +1,3 @@
-pymatgen>=2020.12.31
 maggma>=0.25.0
+-e git://github.com/materialsproject/pymatgen.git@master#egg=pymatgen
 emmet-core
diff --git a/emmet-builders/setup.py b/emmet-builders/setup.py
index 0ab2ceb2f9..ae5d444266 100644
--- a/emmet-builders/setup.py
+++ b/emmet-builders/setup.py
@@ -1,9 +1,13 @@
 import datetime
 from pathlib import Path
 from setuptools import setup, find_namespace_packages
+required = []
 
 with open(Path(__file__).parent / "requirements.txt") as f:
-    required = f.read().splitlines()
+    for line in f.readlines():
+        if "#egg=" in line:
+            continue
+        required.append(line)
 
 setup(
     name="emmet-builders",
diff --git a/emmet-core/emmet/core/thermo.py b/emmet-core/emmet/core/thermo.py
index 239b2fdcaf..e607e795df 100644
--- a/emmet-core/emmet/core/thermo.py
+++ b/emmet-core/emmet/core/thermo.py
@@ -4,7 +4,7 @@
 from typing import ClassVar, Dict, List, Union
 
 from pydantic import BaseModel, Field
-from pymatgen.analysis.phase_diagram import PhaseDiagram
+from pymatgen.analysis.phase_diagram import PhaseDiagram, PhaseDiagramError
 from pymatgen.core import Composition
 from pymatgen.core.periodic_table import Element
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
@@ -12,11 +12,6 @@
 from emmet.core.material_property import PropertyDoc
 from emmet.core.mpid import MPID
 from emmet.core.structure import StructureMetadata
-class PhaseDiagramError(Exception):
-    """
-    An exception class for Phase Diagram generation.
-    """
-    pass
 
 class DecompositionProduct(BaseModel):
     """
diff --git a/emmet-core/emmet/core/vasp/material.py b/emmet-core/emmet/core/vasp/material.py
index e9fed2fc2d..08f35d1a50 100644
--- a/emmet-core/emmet/core/vasp/material.py
+++ b/emmet-core/emmet/core/vasp/material.py
@@ -54,6 +54,12 @@ def from_tasks(
             quality_scores: quality scores for various calculation types
             use_statics: Use statics to define a material
         """
+        if task_group == 0:
+            raise Exception(f"Must have more than one task in the group.")
+
+        # Material ID
+        possible_mat_ids = [task.task_id for task in task_group]
+        material_id = min(possible_mat_ids)
 
         # Metadata
         last_updated = max(task.last_updated for task in task_group)
@@ -77,14 +83,6 @@ def from_tasks(
             else structure_optimizations
         )
 
-        # Material ID
-        possible_mat_ids = [task.task_id for task in structure_calcs]
-        possible_mat_ids = sorted(possible_mat_ids)
-
-        if len(possible_mat_ids) == 0:
-            raise Exception(f"Could not find a material ID for {task_ids}")
-        else:
-            material_id = possible_mat_ids[0]
 
         def _structure_eval(task: TaskDocument):
             """
diff --git a/emmet-core/requirements.txt b/emmet-core/requirements.txt
index 4dc90ff43f..946c60786b 100644
--- a/emmet-core/requirements.txt
+++ b/emmet-core/requirements.txt
@@ -1,4 +1,4 @@
-pymatgen==2021.2.16
+git+git://github.com/materialsproject/pymatgen@master#egg=pymatgen
 https://github.com/materialsvirtuallab/monty/archive/8d67c335bd5d8bb71ecc8ac732c82a53e0def4a1.zip
 pydantic==1.8.1
 pybtex==0.24.0