In [None]:
%env MPRESTER_MUTE_PROGRESS_BARS 1
import os
from pathlib import Path

from flatten_dict import unflatten
from mp_api.client import MPRester
from mpcontribs.client import Client
from pandas import isna, read_csv
from pymatgen.core import Structure

In [None]:
# init clients
# `client` talks to MPContribs and is bound to the project named below;
# `mpr` is the MPRester handle, keyed with the MPCONTRIBS_API_KEY environment variable.
project_name = "deltaHvacancy"
client = Client(project=project_name)
mpr = MPRester(
    api_key=os.environ["MPCONTRIBS_API_KEY"],
)

In [None]:
# Switch off the project's `unique_identifiers` flag so non-unique identifiers
# are allowed (disables duplicate checking).
uniqueness_settings = {"unique_identifiers": False}
uniqueness_request = client.projects.updateProjectByName(
    pk=client.project,
    project=uniqueness_settings,
)
# resolve the request; the response is shown as the cell output
uniqueness_request.result()

In [None]:
# Human-readable explanations of the data columns, stored in the project's
# "other" field. They appear on hover in the contribution section on materials
# details pages.
column_descriptions = {
    "dH": "vacancy formation enthalpy in eV",
    "dH|atom": "vacancy formation enthalpy in eV/atom",
    "m": "electron effective mass in mₑ",
    # TODO add more as needed
}
descriptions_request = client.projects.updateProjectByName(
    pk=client.project,
    project={"other": column_descriptions},
)
# resolve the request; the response is shown as the cell output
descriptions_request.result()

In [None]:
# load data
# The drive root defaults to the previously hard-coded local path and can be
# overridden through the MPCONTRIBS_DRIVE_DIR environment variable, so the
# notebook is not tied to a single machine.
drivedir = Path(os.environ.get("MPCONTRIBS_DRIVE_DIR", "/Users/patrick/GoogleDriveLBNL/My Drive/"))
datadir = drivedir / "MaterialsProject/gitrepos/mpcontribs-data/deltaHvacancy/nrel_matdb"

# Per input column: the name used in the contribution data and, optionally, the
# unit attached to its values. A missing "unit" key means no unit is configured;
# an empty string marks a dimensionless quantity.
columns_map = {
    "formula": {"name": "formula"},
    "defectname": {"name": "defect"}, # string
    "site": {"name": "site", "unit": ""}, # dimensionless
    "charge": {"name": "charge", "unit": ""},
    "dH_eV": {"name": "dH", "unit": "eV"},
    "dH_eV_per_atom": {"name": "dH|atom", "unit": "eV/atom"},
    "bandgap_eV": {"name": "bandgap", "unit": "eV"},
    "electron_effective_mass": {"name": "m", "unit": "mₑ"},
    "level_theory": {"name": "theory"}
}
# input column name -> contribution column name
new_column_names = {k: v["name"] for k, v in columns_map.items()}

def _is_missing(value):
    """Return True for cells that carry no data: None, NaN/NA or an empty string."""
    return bool(isna(value)) or value == ""


def apply_unit(cell, unit):
    """Format a single table cell as "<value> <unit>" when a unit applies.

    Args:
        cell: a single cell value.
        unit: unit string of the cell's column; None or "" means no unit is attached.

    Returns:
        The string f"{cell} {unit}" for present values of a column with a unit,
        otherwise `cell` unchanged. Missing cells (None, NaN, empty string) are
        passed through untouched so they can be dropped later instead of being
        turned into strings such as "nan eV".
    """
    if not unit or _is_missing(cell):
        return cell
    # zero is a valid value and must keep its unit; a plain truthiness test on
    # `cell` would skip it
    return f"{cell} {unit}"


def apply_units(column):
    """Attach the unit configured in `columns_map` to every cell of `column`.

    Args:
        column: pandas Series whose name is a key of `columns_map`
            (an unknown name raises KeyError).

    Returns:
        Series with `apply_unit` applied to each cell.
    """
    unit = columns_map[column.name].get("unit")
    return column.apply(apply_unit, args=(unit,))


contributions = []

# NOTE make sure all `_oxstate` and `_POSCAR_wyck` files are gzipped

# sorted so the contribution order (and the preview below) is reproducible
for path in sorted(datadir.glob("*.csv")):
    # file names are expected to look like `<prefix>.<nrel_matdb_id>.csv`
    prefix, nrel_matdb_id, _ = path.name.split(".")
    stem = f"{path.parent}{os.sep}{prefix}.{nrel_matdb_id}"
    poscar_file = f"{stem}_POSCAR_wyck.gz"
    # The POSCAR format is detected from the file name. The second positional
    # argument of `Structure.from_file` is `primitive`, so passing 'POSCAR' there
    # (truthy) silently reduced the structure to its primitive cell.
    structure = Structure.from_file(poscar_file)
    mpid = mpr.find_structure(structure)
    # fall back to the NREL MatDB id when no matching MP structure is found
    identifier = mpid if mpid else nrel_matdb_id
    attachments = [Path(poscar_file), Path(f"{stem}_oxstate.gz")]
    df = (
        read_csv(path)
        .dropna(axis=1, how="all")  # drop columns without any value
        .apply(apply_units)
        .rename(columns=new_column_names)
    )

    for record in df.to_dict(orient="records"):
        # clean record: drop missing cells only, keep falsy-but-valid values
        # such as a charge of 0
        flat = {k: v for k, v in record.items() if not _is_missing(v)}
        data = unflatten(flat, splitter="dot")
        data["nrel|id"] = nrel_matdb_id
        contributions.append({
            "identifier": identifier,
            "data": data,
            "structures": [structure], "attachments": attachments, # duplicates linked internally
        })

# preview the first contribution as the cell output
contributions[0]

In [None]:
# initialize columns (including units)
# "nrel|id" comes first without a unit, followed by every configured column
# name mapped to its unit (None when `columns_map` defines no unit for it)
columns = {
    "nrel|id": None,
    **{spec["name"]: spec.get("unit") for spec in columns_map.values()},
}

In [None]:
# Re-upload the project's contributions from scratch. The order of the calls
# matters: delete, declare columns, submit, then declare columns once more.
client.delete_contributions() # easier to delete everything for small projects
# declare the columns (names and units) collected in `columns`
client.init_columns(columns)
# NOTE(review): `ignore_dupes=True` presumably skips duplicate checks on submission — confirm against the client docs
client.submit_contributions(contributions, ignore_dupes=True)
# this shouldn't be necessary but need to re-init columns likely due to bug in API server
client.init_columns(columns) 