In [1]:
import gzip
from monty.json import MontyDecoder
from ujson import load
from pymatgen.core.structure import Molecule, Structure
from pathlib import Path
from time import time
from mpcontribs.client import Client
from tqdm.auto import tqdm

In [2]:
# MPContribs project name; also used below to look up the project's existing identifiers.
project_name = "open_catalyst_project"
# Client bound to that project; the later cells query and submit through it.
client = Client(project=project_name)

In [3]:
# Directory scanned for input files (absolute, machine-specific path).
p = Path("/global/cscratch1/sd/huck/is2res_train/is2res_train")
# All "*.json.gz" files directly inside that directory (the pattern is not recursive).
jsons = list(p.glob("*.json.gz"))

In [4]:
# Shared decoder instance; get_contribution passes the parsed JSON through it.
decoder = MontyDecoder()

def get_contribution(path, max_size_mb=15):
    """Build one contribution dict from a gzipped JSON file.

    Args:
        path: ``pathlib.Path`` of a ``*.json.gz`` file. The decoded content must
            provide the keys read below (``trajectory``, ``tags``, ``bulk_id``,
            ``adsorption_energy``, ``adsorbate_smiles``,
            ``surface_miller_indices``, ``surface_top``, ``surface_shift``,
            ``id``).
        max_size_mb: Largest on-disk (compressed) file size, in MiB, that is
            still processed. Files strictly larger than this are skipped.
            Defaults to 15, the previously hard-coded limit. Pass ``None`` to
            disable the size check.

    Returns:
        ``None`` if the file exceeds ``max_size_mb``; otherwise a dict with the
        keys ``formula``, ``identifier``, ``data``, ``structures`` and
        ``attachments``.
    """
    # Check the size first so oversized files are never decompressed or parsed.
    if max_size_mb is not None and path.stat().st_size / 1024 / 1024 > max_size_mb:
        return None

    with gzip.open(path) as f:
        data = decoder.process_decoded(load(f))

    # Only the last trajectory frame is used.
    struct = data['trajectory'][-1]
    struct.add_site_property('tags', [int(t) for t in data['tags']])

    # Sites tagged 2 are treated as the adsorbate; all remaining sites form the
    # structure from which the bulk formula is derived.
    mol = Molecule.from_sites([site for site in struct if site.properties['tags'] == 2])
    iupac_formula = mol.composition.iupac_formula
    bulk_struct = Structure.from_sites([site for site in struct if site.properties['tags'] != 2])
    bulk_formula = bulk_struct.composition.reduced_formula

    miller = data["surface_miller_indices"]
    search_data = {
        "mpid": data['bulk_id'],
        "adsorptionEnergy": data["adsorption_energy"],
        # TODO systemEnergy?
        "adsorbateSmiles": data["adsorbate_smiles"],
        "adsorbateIUPACFormula": iupac_formula,
        "bulkFormula": bulk_formula,
        "h": miller[0],
        "k": miller[1],
        "l": miller[2],
        "surfaceTop": data["surface_top"],
        "surfaceShift": data["surface_shift"]
    }

    return {
        "formula": struct.composition.reduced_formula,
        "identifier": data["id"],
        "data": search_data,
        "structures": [struct],
        # The source file itself is attached to the contribution.
        "attachments": [path]
    }

In [5]:
# Show the totals the client currently reports (see the cell output below).
client.get_totals()

Totals:   0%|          | 0/1 [00:00<?, ?it/s]

(460328, 921)

In [None]:
# Identifiers already stored for this project; files matching one of them are skipped.
all_ids = client.get_all_ids().get(project_name, {}).get("identifiers", set())
print(len(all_ids))

contributions = []
cnt = 0
kwargs = dict(per_request=50, ignore_dupes=True, skip_dupe_check=True)

for path in tqdm(jsons):
    # "<id>.json.gz" -> "<id>": drop both suffixes before the lookup.
    if Path(path.stem).stem in all_ids:
        continue

    contrib = get_contribution(path)
    if not contrib:
        # get_contribution returned nothing for this file.
        continue

    contributions.append(contrib)
    cnt += 1

    # Flush the buffer each time another 2000 contributions have been collected.
    if cnt % 2000 == 0:
        client.submit_contributions(contributions, **kwargs)
        contributions.clear()

# Submit whatever is left over from the last, incomplete batch.
if contributions:
    client.submit_contributions(contributions, **kwargs)

print(cnt)

In [None]:
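# Optional cleanup: delete contributions with missing attachments.
# NOTE(review): `name` is not defined anywhere in this notebook - presumably `project_name`; confirm before uncommenting.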
# client.delete_contributions(name, query={"attachments__size": 0})