In [None]:
%env MPRESTER_MUTE_PROGRESS_BARS 1
# pip install mpcontribs-client mp_api pandas flatten_dict
import os
import gzip
import json

from pathlib import Path
from mpcontribs.client import Client
from mp_api.client import MPRester
from pymatgen.core import Structure
from pandas import read_csv
from flatten_dict import flatten, unflatten

In [None]:
# mpr = MPRester(api_key=os.environ["MPCONTRIBS_API_KEY"])

In [None]:
# Parse the exported task documents from disk in one go; the result is iterated
# later to build the formula/space-group lookup.
tasks = json.loads(Path("/Users/patrick/Downloads/usf_formates_tasks.json").read_text())

In [None]:
# Map "<formula_pretty>_<space group number>" to the task id of each task
# document; if two documents share a key, the later one wins.
lookup = dict(
    ("_".join((doc["formula_pretty"], str(doc["symmetry"]["number"]))), doc["task_id"])
    for doc in tasks
)

In [None]:
# Base directory under which the data repository lives.
drivedir = Path("/Users/patrick/GoogleDriveLBNL/My Drive/")
# HFP2023 data directory below `drivedir`; its sub-directories are scanned later.
datadir = drivedir.joinpath("MaterialsProject/gitrepos/mpcontribs-data/HFP2023")

In [None]:
# function to search MP via its summary API endpoint
# def search(formula=None, spacegroup_number=None, chemsys=None):
#     return mpr.summary.search(
#         formula=formula, chemsys=chemsys, spacegroup_number=spacegroup_number,
#         fields=["material_id"], sort_fields="energy_above_hull"
#     )

In [None]:
def make_gzip(p_in):
    """Write a gzip-compressed copy of ``p_in`` next to it as ``<p_in>.gz``.

    The call is a no-op when the ``.gz`` file already exists, so it can be
    repeated safely; an existing archive is never overwritten.

    Args:
        p_in: path of the file to compress (``str`` or ``pathlib.Path``).

    Raises:
        OSError: if ``p_in`` cannot be read or the archive cannot be written.
    """
    import shutil  # local import keeps this helper self-contained

    src = Path(p_in)  # accept plain strings as well as Path objects
    dst = Path(str(p_in) + ".gz")
    if dst.exists():
        return

    try:
        with src.open('rb') as f_in, gzip.open(dst, 'wb') as f_out:
            # chunked copy; iterating "lines" of binary data is not meaningful
            shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # Do not leave a truncated archive behind: the exists() check above
        # would otherwise skip it forever on subsequent runs.
        dst.unlink(missing_ok=True)
        raise

In [None]:
# Units for every data column, grouped by property ("polarization", "mechanic").
# Key order matters: make_data() pairs the keys of a group positionally with the
# whitespace-separated values it is given, so the i-th key labels the i-th value.
# An empty unit string means the value is stored as is, without a unit suffix.
columns = {
    # four values per polarization file line
    "polarization": {
        "v1": "C/m²",
        "v2": "C/m²",
        "v3": "C/m²",
        "mag": "C/m²"
    },
    # seven values per stiffness file line; dotted names become nested keys
    "mechanic": {
        "moduli.bulk": "N/m²",
        "moduli.young": "N/m²",
        "moduli.shear": "N/m²",
        "ratios.pugh": "", # dimensionless number
        "ratios.poisson": "",
        "compressibility": "m²/N",
        # NOTE(review): placeholder name for the seventh value -- meaning to be confirmed
        "unknown": ""
    }
}

def make_data(key, vals):
    """Label the values of one property group with their units.

    Args:
        key: name of a group in the module-level ``columns`` mapping.
        vals: values in the same order as the group's columns; pairing stops
            at the shorter of the two sequences.

    Returns:
        The result of un-flattening the dotted column names into a nested
        dict. Values whose column has a non-empty unit become
        ``"<value> <unit>"`` strings, all others are passed through unchanged.
    """
    units_by_column = columns[key]
    labelled = {}

    for (column, unit), value in zip(units_by_column.items(), vals):
        if unit:
            labelled[column] = f"{value} {unit}" # 5.5 eV, 100 N/m2
        else:
            labelled[column] = value

    return unflatten(labelled, splitter="dot")


contributions = []

# Build one contribution per directory found anywhere below `datadir`
# (e.g. DMP-Co). Plain files are skipped; only directories are processed.
for subdir in datadir.glob('**/*'):
    if subdir.is_file():
        continue

    # Reset per-directory state. Without this, a directory that has no CIF
    # would either hit a NameError (first iteration) or silently reuse the
    # formula/structure of the previously processed directory.
    identifier = subdir.name # default to subdir as identifier
    formula = None
    structure = None
    # sorted() makes the choice of CIF deterministic if there is more than one
    cifs = sorted(subdir.glob("*.cif"))

    if cifs:
        # assuming there's only one CIF per directory
        structure = Structure.from_file(cifs[0])

        # # try to match CIF against MP
        # matching_mpids = mpr.find_structure(structure)

        # if matching_mpids:
        #     identifier = matching_mpids[0]
        # else:
        #     # structure not in MP (yet)
        #     # get composition, formula, space group, and chemical system
        #     composition = structure.composition
        #     formula, _ = composition.get_reduced_formula_and_factor()
        #     _, spacegroup_number = structure.get_space_group_info()
        #     chemsys = composition.chemical_system

        #     # 1) try formula and space group 
        #     docs = search(formula=formula, spacegroup_number=spacegroup_number)
        #     if not docs:
        #         # 2) try formula
        #         docs = search(formula=formula)
        #         if not docs:
        #             # 3) try chemical system
        #             docs = search(chemsys=chemsys)

        #     if docs:
        #         # always use material with lowest energy above hull
        #         identifier = docs[0].material_id

        composition = structure.composition
        formula, _ = composition.get_reduced_formula_and_factor()
        _, spacegroup_number = structure.get_space_group_info()
        # Resolve "<formula>_<space group number>" to a task id; keep the
        # directory name as identifier when the structure is not in `lookup`
        # instead of aborting the whole run with a KeyError.
        identifier = lookup.get(f"{formula}_{spacegroup_number}", identifier)
        print(identifier) # "link to MP"

    # make sure everything's gzipped
    for p in subdir.glob("*.*"):
        if p.suffix in {".txt", ".vasp", ".cif"}:
            make_gzip(p)

    # init contribution; add all files as attachments; add structure
    contrib = {"identifier": identifier}
    if formula is not None:
        # only known when a CIF was found in this directory
        contrib["formula"] = formula
    contrib["data"] = {}
    contrib["attachments"] = list(subdir.glob("*.gz"))
    if structure is not None and identifier.startswith("mp-"):
        contrib["structures"] = [structure]

    # load properties from polarization and stiffness tensor
    # (a missing file raises FileNotFoundError, as before)
    for fn in ["Polarization.txt", "Stiffness_tensor.txt"]:
        with (subdir / fn).open() as f:
            # ignore blank lines so a trailing newline does not hide the data row
            rows = [line.split() for line in f if line.strip()]
        if not rows:
            raise ValueError(f"{subdir / fn} contains no data")

        # the summary values live on the last non-blank line of the file
        values = rows[-1]
        if len(values) == 4:
            contrib["data"]["polarization"] = make_data("polarization", values)
        elif len(values) == 7:
            contrib["data"]["mechanic"] = make_data("mechanic", values)

#     # option to add tensors to `data`    
#     for fn in subdir.glob("*.txt"):
#         stem = fn.stem.lower()
#         if stem.endswith("_tensor"):
#             field = ".".join(stem.split("_")[:-1])
#             df = read_csv(fn, sep="\t", header=0, names=range(1, 7))
#             df.index = range(1,4)
#             contrib["data"][field] = df.T.to_dict()

    contributions.append(contrib)

In [None]:
# MPContribs client for the "HFP2023" project; the following cell uses it to
# delete, initialise and submit contributions.
client = Client(project="HFP2023")

In [None]:
# Flattened (dot-joined) view of the nested `columns` mapping. It is not used by
# the remaining statements in this cell, which pass the nested `columns` instead.
flat_columns = flatten(columns, reducer="dot")
# NOTE(review): called without arguments -- presumably removes the contributions
# already stored for the client's project so the upload starts from a clean
# slate; this looks destructive, confirm before running.
client.delete_contributions()
# Statement order matters here: columns are initialised before the submission
# and once more afterwards (see the comment below).
client.init_columns(columns)
client.submit_contributions(contributions)
# this shouldn't be necessary but need to re-init columns likely due to bug in API server
client.init_columns(columns) 