In [None]:
import wget, json, os, math
from pathlib import Path
from string import capwords
from pybtex.database import parse_string
import pybtex.errors
from mpcontribs.client import Client
from bravado.exception import HTTPNotFound
from pymatgen.core import Structure
from pymatgen.ext.matproj import MPRester
from tqdm.notebook import tqdm
from matminer.datasets import load_dataset
from monty.json import MontyEncoder, MontyDecoder

### Configuration and Initialization

In [None]:
# Benchmark task table, one row per task, in processing order:
#   (short name, target column, clf_pos_label, unit, has_structure)
# * "matbench_" + short name is the dataset name used by the cells below, and
#   every data file is named "matbench_<short name>.json.gz".
# * clf_pos_label is presumably the positive class label of classification
#   tasks (None elsewhere) -- it is not read by the visible cells; confirm.
# * unit is appended to each target value when contributions are built
#   (None means no unit suffix).
# * has_structure selects "structure" vs "composition" as the input column.
_BENCHMARK_FIELDS = ("name", "data_file", "target", "clf_pos_label", "unit", "has_structure")
_BENCHMARK_ROWS = (
    ("log_kvrh", "log10(K_VRH)", None, None, True),
    ("log_gvrh", "log10(G_VRH)", None, None, True),
    ("dielectric", "n", None, None, True),
    ("jdft2d", "exfoliation_en", None, "meV/atom", True),
    ("mp_gap", "gap pbe", None, "eV", True),
    ("mp_is_metal", "is_metal", True, None, True),
    ("mp_e_form", "e_form", None, "eV/atom", True),
    ("perovskites", "e_form", None, "eV", True),
    ("glass", "gfa", True, None, False),
    ("expt_is_metal", "is_metal", True, None, False),
    ("expt_gap", "gap expt", None, "eV", False),
    ("phonons", "last phdos peak", None, "cm^-1", True),
    ("steels", "yield strength", None, "MPa", False),
)
BENCHMARK_FULL_SET = [
    dict(zip(
        _BENCHMARK_FIELDS,
        (short_name, f"matbench_{short_name}.json.gz", target_col, pos_label, unit_str, structured),
    ))
    for short_name, target_col, pos_label, unit_str, structured in _BENCHMARK_ROWS
]

In [None]:
# Map of canonical, yet non-MPContribs-compatible, target names to compatible
# (unicode, no punctuation) target names. Keys are the "target" values of
# BENCHMARK_FULL_SET; values are used as the column/data keys sent to MPContribs.
# Note that "gap pbe" and "gap expt" intentionally share the same label.
target_map = {
    "yield strength": "σᵧ",
    "log10(K_VRH)": "log₁₀Kᵛʳʰ",
    "log10(G_VRH)": "log₁₀Gᵛʳʰ",
    "n": "𝑛",
    "exfoliation_en": "Eˣ",
    "gap pbe": "Eᵍ",
    "is_metal": "metallic",
    "e_form": "Eᶠ",
    "gfa": "glass",
    "gap expt": "Eᵍ",
    "last phdos peak": "ωᵐᵃˣ",
}

In [None]:
# Switch pybtex to non-strict mode before any BibTeX reference is parsed below
# (presumably so imperfect entries in the metadata do not abort parsing -- confirm).
pybtex.errors.set_strict_mode(False)
# NOTE(review): `mprester` is not referenced by any other visible cell -- confirm it is still needed.
mprester = MPRester()
# MPContribs client bound to the ML deployment; the cells below use it to
# create/update projects and to submit contributions.
client = Client(host='ml-api.materialsproject.org')

In [None]:
# Directory that caches the matminer metadata file and the serialized
# contributions. It can be overridden through the MPCONTRIBS_DATA_DIR
# environment variable; the default is the original author's local checkout.
datadir = Path(os.environ.get('MPCONTRIBS_DATA_DIR', '/Users/patrick/gitrepos/mp/mpcontribs-data/'))
fn = Path('dataset_metadata.json')
fp = datadir / fn
if not fp.exists():
    prefix = "https://raw.githubusercontent.com/hackingmaterials/matminer"
    url = f'{prefix}/master/matminer/datasets/{fn}'
    # Make sure the cache directory exists before moving the download into it.
    datadir.mkdir(parents=True, exist_ok=True)
    # wget.download returns the path it actually wrote. Use that rather than
    # assuming it equals `fn`: if a file with that name already exists in the
    # working directory the download is saved under a de-duplicated name, and
    # renaming `fn` would move the stale file instead of the fresh one.
    downloaded = Path(wget.download(url))
    downloaded.rename(fp)

# Context manager so the file handle is closed as soon as parsing is done.
with open(fp, 'r') as f:
    metadata = json.load(f)

### Prepare and create/update Projects

In [None]:
# Build the MPContribs project definition for every benchmark task, then create
# the project or, if a project with that name already exists, update it.
for ds in BENCHMARK_FULL_SET:
    name = "matbench_" + ds["name"]
    # Tasks without structures use the "composition" column as their input.
    primitive_key = "structure" if ds["has_structure"] else "composition"
    target = ds["target"]
    # Column descriptions come from the matminer metadata; the target is keyed
    # by its MPContribs-compatible label.
    columns = {
        target_map[target]: metadata[name]["columns"][target],
        primitive_key: metadata[name]["columns"][primitive_key]
    }
    # The first literal ends with a space so the two sentence fragments do not
    # run together ("... of the identifiers ...").
    description = (
        metadata[name]['description']
        + " If you are viewing this on MPContribs-ML interactively, please ensure the order of the "
        f"identifiers is sequential (mb-{ds['name']}-0001, mb-{ds['name']}-0002, etc.) before benchmarking."
    )
    project = {
        'name': name,
        'is_public': True,
        'owner': 'ardunn@lbl.gov',
        'title': name, # TODO update and set long_title
        'authors': 'A. Dunn, A. Jain',
        'description': description,
        'other': {
            'columns': columns,
            'entries': metadata[name]['num_entries']
        },
        'references': [
            # Look up the URL of *this* dataset (variable `name`), not a
            # metadata entry literally called "name".
            {'label': 'RawData', 'url': metadata[name]["url"]}
        ]
    }

    # Turn every BibTeX reference of the dataset into a short {label, url} pair.
    for ref in metadata[name]['bibtex_refs']:
        if name == "matbench_phonons":
            # The original citation key is very long and contains a space
            # ("van setten"); swap in a short key before parsing.
            ref = ref.replace(
                "petretto_dwaraknath_miranda_winston_giantomassi_rignanese_van setten_gonze_persson_hautier_2018",
                "petretto2018"
            )

        bib = parse_string(ref, 'bibtex')
        for key, entry in bib.entries.items():
            key_is_doi = key.startswith('doi:')
            if key_is_doi:
                # Keys of the form "doi:<id>" are resolved through doi.org.
                url = 'https://doi.org/' + key.split(':', 1)[-1]
                k = 'Zhuo2018'
            else:
                # May be None when the entry carries no "url" field.
                url = entry.fields.get('url')
                k = capwords(key.replace('_', ''))

            # Hand-tuned labels/URLs for specific references.
            if k.startswith('C2'):
                k = 'Castelli2012'
            elif k.startswith('Landolt'):
                k = 'LB1997'
            elif k == 'Citrine':
                url = 'https://www.citrination.com'

            # Shorten labels longer than 8 characters to first 4 + last 4.
            if len(k) > 8:
                k = k[:4] + k[-4:]
            project['references'].append(
                {'label': k, 'url': url}
            )

    try:
        # Existence probe: only the "name" field is requested.
        client.projects.getProjectByName(pk=name, _fields=["name"]).result()
    except HTTPNotFound:
        client.projects.create_entry(project=project).result()
        print(name, "created")
    else:
        # On update the name is passed as `pk`, so it is dropped from the payload.
        project.pop("name")
        client.projects.update_entry(pk=name, project=project).result()
        print(name, "updated")

### Prepare Contributions

In [None]:
# Scratch file used to round-trip each structure through CIF. It can be
# overridden with the MPCONTRIBS_SCRATCH_CIF environment variable; the default
# is the original author's local path.
structure_filename = os.environ.get("MPCONTRIBS_SCRATCH_CIF", "/Users/patrick/Downloads/outfile.cif")

# Serialize the contributions of every benchmark dataset to <datadir>/<name>.json.
for ds in BENCHMARK_FULL_SET:
    name = "matbench_" + ds["name"]
    fn = datadir / f"{name}.json"
    # The JSON file doubles as a cache: datasets already written are skipped.
    if fn.exists():
        continue

    target = ds["target"]
    # Unit suffix appended to every target value (empty when the task has no unit).
    unit = f" {ds['unit']}" if ds["unit"] else ""
    df = load_dataset(name)
    contributions = []
    n_entries = df.shape[0]
    # Zero-padding width = number of decimal digits of the entry count.
    # Counting digits of the integer avoids floating-point error in
    # floor(log(n, 10)) + 1 (math.log(1000, 10) is 2.9999999999999996, which
    # would give width 3 for 1000 entries) and also works for an empty dataset.
    id_n_zeros = len(str(n_entries))

    for i, (_, entry) in tqdm(enumerate(df.iterrows()), total=n_entries):
        contrib = {'project': name, 'is_public': True}

        if "structure" in entry.index:
            # Write the structure to CIF and read it back before attaching it;
            # the formula is taken from the re-read structure.
            s = entry.loc["structure"]
            s.to("cif", structure_filename)
            s = Structure.from_file(structure_filename)
            c = s.composition.get_integer_formula_and_factor()[0]
            contrib["structures"] = [s]
        else:
            c = entry["composition"]

        # Identifiers are 1-based and zero-padded, e.g. mb-<task>-0001.
        id_number = f"{i+1:0{id_n_zeros}d}"
        identifier = f"mb-{ds['name']}-{id_number}"
        contrib["identifier"] = identifier
        contrib["data"] = {target_map[target]: f"{entry.loc[target]}{unit}"}
        contrib["formula"] = c
        contributions.append(contrib)

    # MontyEncoder is used so non-plain-JSON objects (the Structure instances)
    # can be serialized.
    with open(fn, "w") as f:
        json.dump(contributions, f, cls=MontyEncoder)

    print("saved to", fn)

### Submit Contributions

In [None]:
# Choose the dataset to submit and read its cached contributions back from
# disk; MontyDecoder is the counterpart of the encoder used when writing.
name = "matbench_log_gvrh"
fn = datadir / (name + ".json")
with fn.open("r") as f:
    contributions = json.load(f, cls=MontyDecoder)

In [None]:
# Optional reset step, disabled by default: presumably removes the project's
# existing contributions before a fresh submission -- confirm before enabling.
# client.delete_contributions(name)
# Submit the contributions loaded in the previous cell.
# NOTE(review): ignore_dupes=True presumably skips entries that already exist
# on the server -- confirm against the MPContribs client documentation.
client.submit_contributions(contributions, ignore_dupes=True)