In [None]:
%env MPRESTER_MUTE_PROGRESS_BARS 1
import os
from pathlib import Path
from pandas import read_excel
from mpcontribs.client import Client, Attachment
from mp_api.client import MPRester
from flatten_dict import unflatten

In [None]:
# set up the MPContribs client for this project and the Materials Project client
client = Client(project="Broberg_benchmark_defects")
# MPRester is given the key stored in MPCONTRIBS_API_KEY (KeyError if it is unset)
mp_api_key = os.environ["MPCONTRIBS_API_KEY"]
mpr = MPRester(api_key=mp_api_key)

In [None]:
# load data
# The drive location defaults to the original hard-coded path; set the
# MPCONTRIBS_DRIVE_DIR environment variable to run the notebook on another machine.
drivedir = Path(os.environ.get("MPCONTRIBS_DRIVE_DIR", "/Users/patrick/GoogleDriveLBNL/My Drive/"))
datadir = drivedir / Path("MaterialsProject/gitrepos/mpcontribs-data/Broberg_benchmark_defects")
if not datadir.is_dir():
    # fail early with the resolved path rather than on the first spreadsheet read
    raise FileNotFoundError(f"data directory not found: {datadir}")
df_bulk = read_excel(datadir / "bulk_data.xlsx")
df_defect = read_excel(datadir / "defect_level_data.xlsx")
df_transition = read_excel(datadir / "transition_level_data.xlsx")

In [None]:
# clean DOIs column
def clean_dois(s):
    """Normalize a comma-separated list of DOIs.

    Each entry is stripped of surrounding whitespace and of the
    "https://doi.org/" prefix; empty entries (e.g. from a trailing comma)
    are dropped.

    Args:
        s: cell value of the DOI column. Non-string values (such as the NaN
            that stands for an empty cell) are treated as "no DOIs".

    Returns:
        The cleaned DOIs joined by "," (no spaces), or "" if there are none.
    """
    # a non-string cell has no .split(); treat it as missing instead of crashing
    if not isinstance(s, str):
        return ""

    cleaned = (
        doi.strip().replace("https://doi.org/", "")
        for doi in s.split(",")
    )
    return ",".join(doi for doi in cleaned if doi)

# normalize every entry of the DOI column with clean_dois
df_bulk["DOI"] = df_bulk["DOI"].map(clean_dois)

In [None]:
# fill in mp-ids
# For every bulk, query the summary endpoint by formula and crystal system and
# take the first hit of the results sorted by energy above hull.
mpids = []

for name in df_bulk["Bulk Name"]:
    # bulk names are split into exactly three "_"-separated parts
    formula, system, symmetry = name.split("_")
    print(formula, system, symmetry)
    docs = mpr.summary.search(
        formula=formula, crystal_system=system.capitalize(),
        fields=["material_id", "formula_pretty", "crystal_system", "symmetry"],
        sort_fields="energy_above_hull"
    )
    if not docs:
        # an empty result would otherwise surface as a bare IndexError
        raise ValueError(
            f"no material found for bulk {name!r} "
            f"(formula={formula!r}, crystal_system={system.capitalize()!r})"
        )
    mpids.append(docs[0].material_id)

df_bulk["mp-id"] = mpids

In [None]:
# Column specification used to reindex and rename the bulk table.
# Keys are the source column names; "name" is the new (dotted) column name and
# "unit" the unit applied to the column's cells (None for columns without one).
# "mp-id" becomes the "identifier" column. The order of the entries matters:
# the order of columns is respected during data submission.
columns_map_bulk = {
    "mp-id": dict(name="identifier"),
    "Bulk Name": dict(name="info.bulk", unit=None),
    "DOI": dict(name="info.DOIs", unit=None),
    # GGA-PBE
    "GGA-PBE gap": dict(name="PBE.gap", unit="eV"),
    "GGA-PBE vbm": dict(name="PBE.vbm", unit="eV"),
    "GGA-PBE Elt A": dict(name="PBE.elementA.name", unit=None),
    "GGA-PBE Elt A chemical potential": dict(name="PBE.elementA.chempot", unit="eV"),
    "GGA-PBE Elt B": dict(name="PBE.elementB.name", unit=None),
    "GGA-PBE Elt B chemical potential": dict(name="PBE.elementB.chempot", unit="eV"),
    "GGA-PBE Elt C": dict(name="PBE.elementC.name", unit=None),
    "GGA-PBE Elt C chemical potential": dict(name="PBE.elementC.chempot", unit="eV"),
    # HSE06
    "Auto HSE06 gap": dict(name="HSE06.gap", unit="eV"),
    "Auto HSE06 vbm": dict(name="HSE06.vbm", unit="eV"),
    # "FL" and dopability columns
    "FL no_bes": dict(name="fermi.noBES", unit="eV"),
    "FL bes": dict(name="fermi.BES", unit="eV"),
    "FL bes_free": dict(name="fermi.freeBES", unit="eV"),
    "lower dopability no_bes": dict(name="fermi.dopability.noBES.lower", unit="eV"),
    "upper dopability no_bes": dict(name="fermi.dopability.noBES.upper", unit="eV"),
    "lower dopability bes": dict(name="fermi.dopability.BES.lower", unit="eV"),
    "upper dopability bes": dict(name="fermi.dopability.BES.upper", unit="eV"),
    "lower dopability bes_free": dict(name="fermi.dopability.freeBES.lower", unit="eV"),
    "upper dopability bes_free": dict(name="fermi.dopability.freeBES.upper", unit="eV"),
    # hybrid-published
    "hybrid-published gap": dict(name="hybrid.gap", unit="eV"),
    "hybrid-published vbm": dict(name="hybrid.vbm", unit="eV"),
    "FL hybrid-published": dict(name="hybrid.fermi", unit="eV"),
    "lower dopability hybrid-published": dict(name="hybrid.dopability.lower", unit="eV"),
    "upper dopability hybrid-published": dict(name="hybrid.dopability.upper", unit="eV"),
    "hybrid-published Elt A": dict(name="hybrid.elementA.name", unit=None),
    "hybrid-published Elt A chemical potential": dict(name="hybrid.elementA.chempot", unit="eV"),
    "hybrid-published Elt B": dict(name="hybrid.elementB.name", unit=None),
    "hybrid-published Elt B chemical potential": dict(name="hybrid.elementB.chempot", unit="eV"),
    "hybrid-published Elt C": dict(name="hybrid.elementC.name", unit=None),
    "hybrid-published Elt C chemical potential": dict(name="hybrid.elementC.chempot", unit="eV"),
}

In [None]:
# keep only the mapped columns, in the order given by columns_map_bulk
new_index = list(columns_map_bulk)
df_bulk = df_bulk.reindex(columns=new_index)

In [None]:
# apply units to all cells
def apply_unit(cell, unit):
    """Attach a unit to a single cell value.

    Args:
        cell: the raw cell value.
        unit: unit string to append, or None/"" for columns without a unit.

    Returns:
        "" for missing values (None, NaN, blank strings, or the "-"
        placeholder), "<cell> <unit>" when a unit is given, and the cell
        unchanged otherwise. A numeric zero is a real value and gets its unit.
    """
    if cell is None:
        return ""

    # NaN is the only float that compares unequal to itself
    if isinstance(cell, float) and cell != cell:
        return ""

    if isinstance(cell, str) and cell.strip() in ("", "-"):
        return ""

    # test only the unit here: checking the cell's truthiness would treat 0 as missing
    return f"{cell} {unit}" if unit else cell

def apply_units(column):
    """Apply the unit configured for `column` to each of its cells.

    The unit is looked up in `columns_map_bulk` by the column's name; columns
    without a "unit" entry are processed with a unit of None.
    """
    spec = columns_map_bulk[column.name]
    return column.apply(apply_unit, unit=spec.get("unit"))

# run column by column over the whole table
df_bulk = df_bulk.apply(apply_units, axis=0)

In [None]:
# map each source column name to its target name and rename the table accordingly
new_column_names = {
    source_name: spec["name"]
    for source_name, spec in columns_map_bulk.items()
}
df_bulk = df_bulk.rename(new_column_names, axis="columns")

In [None]:
# compile contributions: one per bulk row, each with a "defects" and a
# "transitions" attachment holding the matching rows of the other two tables
def _without_unnamed(row):
    """Return `row` without the keys that start with "Unnamed"."""
    return {key: value for key, value in row.items() if not key.startswith("Unnamed")}


def _attachment_for(name, frame, bulk_name):
    """Build the attachment `name` from the rows of `frame` belonging to `bulk_name`."""
    matching = frame[frame["Bulk Name"] == bulk_name]
    rows = [_without_unnamed(row) for row in matching.to_dict(orient="records")]
    return Attachment.from_data(name, rows)


contributions = []

for record in df_bulk.to_dict(orient="records"):
    # keep only the fields that carry a (truthy) value
    data = {field: value for field, value in record.items() if value}
    identifier = str(data.pop("identifier"))
    # turn the dotted keys into a nested dict
    nested = unflatten(data, splitter="dot")

    # expose the three parts of the bulk name as separate info fields
    bulk_name = data["info.bulk"]
    formula, system, symmetry = bulk_name.split("_")
    nested["info"].update(
        {"formula": formula, "system": system, "symmetry": symmetry}
    )

    contributions.append({
        "identifier": identifier,
        "data": nested,
        "attachments": [
            _attachment_for("defects", df_defect, bulk_name),
            _attachment_for("transitions", df_transition, bulk_name),
        ],
    })


# show the first contribution as a sanity check
contributions[0]

In [None]:
# initialize columns (including units): the info fields come first without a
# unit, followed by every mapped column with a dotted name and its unit
columns = dict.fromkeys(["info.bulk", "info.formula", "info.system", "info.symmetry"])
columns.update(
    (spec["name"], spec.get("unit"))
    for spec in columns_map_bulk.values()
    if "." in spec["name"]
)

In [None]:
# Upload the compiled contributions; the order of the calls below matters.
# NOTE(review): delete_contributions() is called without arguments, so it presumably
# removes all existing contributions of the client's project — confirm before running.
client.delete_contributions()
# set up the columns and their units (see `columns`) before submitting
client.init_columns(columns)
client.submit_contributions(contributions)
# Re-initializing the columns should not be necessary, but is needed after
# submission, likely due to a bug in the API server.
client.init_columns(columns)