# Submit Jarvis to the MDF Index
Downloads the latest copies of the JARVIS data and submits them to the MDF search index. The JSON files stored with each JARVIS calculation must first be converted to a form compatible with the MDF index, which requires standardizing data types and mapping fields in the JARVIS data schema onto ones defined in the MDF schema. Once complete, this notebook submits the data to MDF Connect.

This notebook is designed to be run on Linux with Globus Connect Personal installed.

In [1]:
from mdf_connect_client import MDFConnectClient
from monty.json import MontyEncoder, MontyDecoder
from monty.serialization import loadfn, dumpfn
from tqdm import tqdm_notebook as tqdm
from glob import glob
import requests
import shutil
import json
import math
import cgi
import os

Proceeding without mdf_connect_client


Settings to change

In [2]:
# Read this machine's endpoint ID from ~/.globusonline/lta/client-id.txt
# (presumably written by Globus Connect Personal -- confirm for your installation)
client_id_path = os.path.join('~', '.globusonline', 'lta', 'client-id.txt')
with open(os.path.expanduser(client_id_path)) as fp:
    SOURCE_UUID = fp.readline().strip()
print('My Endpoint Name:', SOURCE_UUID)

# Absolute path of the 'feedstock' directory inside the current working directory
metadata_path = os.path.join(os.getcwd(), 'feedstock')

My Endpoint Name: 757daaa4-e697-11e8-8c9a-0a1d4c5c824a


URLs of the JARVIS data files to download from Figshare

In [3]:
# Figshare download links for the JARVIS data files, in the order they are processed.
# Per the progress-bar output further down, the first resolves to
# 'jdft_3d-7-7-2018.json' and the second to 'jdft_2d-7-7-2018.json'.
data_urls = [
    'https://ndownloader.figshare.com/files/12468083',
    'https://ndownloader.figshare.com/files/12394556'
]

## Download the Data
Get the data from Figshare, if it hasn't been downloaded already

In [4]:
# Make sure the local download directory exists; a no-op when it is already there
os.makedirs('data', exist_ok=True)

In [5]:
# Names of the data files, in the same order as `data_urls`
filenames = []
for url in data_urls:
    # Open a streaming request so the headers can be read before the body is fetched
    req = requests.get(url, stream=True)
    try:
        # Fail loudly on an HTTP error rather than saving an error page as data,
        # which would afterwards be mistaken for an already-downloaded file
        req.raise_for_status()

        # Get the file name. It is supplied by the server, so keep only the
        # final path component to guarantee the file lands inside 'data'
        filename = cgi.parse_header(req.headers['Content-Disposition'])[1]['filename']
        filename = os.path.basename(filename)
        filenames.append(filename)

        # Check if file already download, and skip if it is
        data_path = os.path.join('data', filename)
        if os.path.isfile(data_path):
            continue

        # If not, download the file. Write to a temporary name and rename once
        # complete, so an interrupted download never leaves a truncated file
        # that the check above would accept on the next run
        partial_path = data_path + '.part'
        with open(partial_path, 'wb') as fp:
            for chunk in req.iter_content(chunk_size=1024):
                fp.write(chunk)
        os.replace(partial_path, data_path)
    finally:
        # Release the connection, including when the download was skipped
        req.close()

## Extract Additional Information Out of Records
Records need some additional tuning to be the most useful for the MDF index

In [6]:
def construct_mdf_record(d, is_2d):
    """Given a JSON object from Jarvis, convert to an MDF format.

    Schema is designed to be similar to that of the OQMD.

    Optional properties (band gaps, formation enthalpy and, for 3D materials,
    elastic moduli) are only written when the JARVIS record holds a value that
    converts to a float; absent keys and placeholders such as ``"na"`` are
    skipped. Any NaN or infinite number is removed from the ``jarvis`` block
    before the record is returned.

    Args:
        d (dict): Record to be converted
        is_2d (bool): Whether we are converting the 2d dataset
    Returns: (dict) MDF record"""

    def optional_float(key):
        """Return ``d[key]`` as a float, or None if it is missing or not numeric."""
        try:
            return float(d[key])
        except (KeyError, TypeError, ValueError, OverflowError):
            # Only "no usable value" failures are swallowed; anything else
            # (e.g. KeyboardInterrupt) propagates instead of being hidden
            return None

    # Records that are used multiple times
    comp = d['initial_str'].composition.get_integer_formula_and_factor()[0]

    # Make the main part of the output
    r = {
        "mdf": {
            "title": 'JARVIS - %s - %s' % (d['jid'], comp),
            "acl": "public",
        },
        "dft": {
            "converged": True,
            "cutoff_energy": float(d['incar']['ENCUT'])
        },
        "material": {
            "composition": comp,
            "elements": [e.symbol for e in d['initial_str'].composition.element_composition.keys()]
        },
        "crystal_structure": {
            "space_group_number": d['final_str'].get_space_group_info()[1],
            "number_of_atoms": len(d['final_str']),
            "volume": d['final_str'].volume,
        },
        "origin": {
            "name": "VASP",
            "creator": "University of Vienna",
            "type": "computation"
        },
        "jarvis": {
            "id": d['jid'],
            "crossreference": {
                "materials_project": d['mpid'],
            },
            "dimensionality": '2d' if is_2d else '3d',
            "band_gap": {},
            "landing_page": "https://www.ctcms.nist.gov/~knc6/jsmol/%s.html" % d['jid'],
            "total_energy": float(d['fin_en']),  # LW 17Oct17: Are the units "ev/atom"
        },
        # Full original record, serialized to a JSON string
        "jarvis_record": MontyEncoder().encode(d)
    }
    jarvis = r["jarvis"]

    # Add in fields that may or may not have valid values
    mbj_gap = optional_float("mbj_gap")
    if mbj_gap is not None:
        jarvis["band_gap"]["mbj"] = mbj_gap

    optb88vdw_gap = optional_float("op_gap")
    if optb88vdw_gap is not None:
        jarvis["band_gap"]["optb88vdw"] = optb88vdw_gap

    formation_enthalpy = optional_float("form_enp")
    if formation_enthalpy is not None:
        jarvis["formation_enthalpy"] = formation_enthalpy

    # Add in fields that are only good for some materials
    if not is_2d:
        bulk = optional_float("kv")   # Units GPa
        shear = optional_float("gv")  # Units GPa
        # Both moduli must be present; a partial pair is not recorded
        if bulk is not None and shear is not None:
            jarvis["elastic_moduli"] = {"bulk": bulk, "shear": shear}

        # LW 17Oct17: Are dielectric functions only valid for 3D materials?
        # LW 17Oct17: What are the units?
        # TODO: once the questions above are answered, add
        #   jarvis["dielectric_function"]["optb88vdw"] from d["epsx"] and
        #   jarvis["dielectric_function"]["mbj"] from d["mepsx"],
        #   each stored as {"value": <float>, "units": <units>}

    # Check if any values are NaNs or infinity. If so, delete them.
    # Iterate over snapshots (list(...)) because entries are deleted while looping
    for key, value in list(jarvis.items()):
        if isinstance(value, dict):
            # Check the values of dictionaries. Assumes we do not have nested dicts
            for key2, value2 in list(value.items()):
                if isinstance(value2, (int, float)) and not math.isfinite(value2):
                    del value[key2]
        elif isinstance(value, (int, float)) and not math.isfinite(value):
            # Check non-dicts
            del jarvis[key]

    return r

Process all of the data

In [7]:
# Start every run from an empty 'feedstock' directory:
# remove whatever an earlier run left behind, then recreate it
if os.path.isdir('feedstock'):
    shutil.rmtree('feedstock')
os.makedirs('feedstock')

In [8]:
# Every key seen under "jarvis" in any converted record
jarvis_keys = set()
for f in filenames:
    # Load all records of this data file, decoded with MontyDecoder
    records = loadfn(os.path.join('data', f), cls=MontyDecoder)

    # Each data file gets its own sub-directory of the feedstock, named after the file
    dirname = f
    out_dir = os.path.join('feedstock', dirname)
    os.mkdir(out_dir)

    # The 2D dataset is recognised by '2d' appearing in the file name
    is_2d = '2d' in f

    for record in tqdm(records, leave=True, desc=f):
        # Convert JARVIS json into MDF-friendly JSON and remember which keys it carries
        record_metadata = construct_mdf_record(record, is_2d)
        jarvis_keys.update(record_metadata["jarvis"])

        # One output file per record, named after its JARVIS ID
        out_path = os.path.join(out_dir, '{}.json'.format(record_metadata['jarvis']['id']))
        dumpfn(record_metadata, out_path, cls=MontyEncoder)

HBox(children=(IntProgress(value=0, description='jdft_3d-7-7-2018.json', max=25923, style=ProgressStyle(descri…

  % self.symbol)
  % self.symbol)
  % self.symbol)





HBox(children=(IntProgress(value=0, description='jdft_2d-7-7-2018.json', max=636, style=ProgressStyle(descript…




## Create the Mapping
Define how fields in the converted records map onto fields in the MDF index

In [9]:
# Field mapping for the standard (non-JARVIS-specific) fields.
# Key: dotted path of the field in the output; value: dotted path of the same
# field inside a converted record. Commented-out entries are fields that
# are not mapped yet.
mapping = {
    'crystal_structure.space_group_number': 'crystal_structure.space_group_number',
    'crystal_structure.number_of_atoms': 'crystal_structure.number_of_atoms',
    'crystal_structure.volume': 'crystal_structure.volume',
#    'crystal_structure.cross_reference.icsd': 'entry.crossreference.icsd',
    'dft.converged': 'dft.converged',
    'dft.cutoff_energy': 'dft.cutoff_energy',
    #'dft.exchange_correlation_functional': 'xc_functional', # LW 16Apr18: Can I just assume PBE?
    'material.composition': 'material.composition',
    'material.elements': 'material.elements',
}

In [10]:
# Expose every JARVIS-specific field "jarvis.<key>" under "__custom.<key>"
mapping.update({"__custom." + key: "jarvis." + key for key in jarvis_keys})

In [11]:
mapping

{'crystal_structure.space_group_number': 'crystal_structure.space_group_number',
 'crystal_structure.number_of_atoms': 'crystal_structure.number_of_atoms',
 'crystal_structure.volume': 'crystal_structure.volume',
 'dft.converged': 'dft.converged',
 'dft.cutoff_energy': 'dft.cutoff_energy',
 'material.composition': 'material.composition',
 'material.elements': 'material.elements',
 '__custom.formation_enthalpy': 'jarvis.formation_enthalpy',
 '__custom.elastic_moduli': 'jarvis.elastic_moduli',
 '__custom.total_energy': 'jarvis.total_energy',
 '__custom.crossreference': 'jarvis.crossreference',
 '__custom.id': 'jarvis.id',
 '__custom.landing_page': 'jarvis.landing_page',
 '__custom.band_gap': 'jarvis.band_gap',
 '__custom.dimensionality': 'jarvis.dimensionality'}

Make the descriptions for the JARVIS-specific fields

In [12]:
custom_desc = {
    '__custom.band_gap': 'Band gap energies (eV)',
    '__custom.crossreference': 'Cross-references to other DFT databases',
    '__custom.dimensionality': 'Dimensionality of the structure',
    '__custom.elastic_moduli': 'Elastic moduli (GPa)',
    '__custom.formation_enthalpy': 'Formation enthalpy (eV/atom)',
    '__custom.id': 'ID number in jarvis database',
    '__custom.landing_page': 'URL of landing page in Jarvis website',
    '__custom.total_energy': 'Total energy of the structure (eV/atom) '
}

### Test the Mapping
Apply the mapping to one record to make sure it works

In [13]:
# Pick one converted record to test with. Without recursive=True, '**' matches a
# single directory level, i.e. the per-file folders inside 'feedstock'.
sample_matches = glob(os.path.join('feedstock', '**', 'JVASP-28754.json'))
with open(sample_matches[0]) as fp:
    f = json.load(fp)

In [14]:
f

{'mdf': {'title': 'JARVIS - JVASP-28754 - TeMoWSeS2', 'acl': 'public'},
 'dft': {'converged': True, 'cutoff_energy': 650.0},
 'material': {'composition': 'TeMoWSeS2',
  'elements': ['Te', 'Mo', 'W', 'Se', 'S']},
 'crystal_structure': {'space_group_number': 156,
  'number_of_atoms': 12,
  'volume': 358.2165572042844},
 'origin': {'name': 'VASP',
  'creator': 'University of Vienna',
  'type': 'computation'},
 'jarvis': {'id': 'JVASP-28754',
  'crossreference': {'materials_project': 'mp-1027609'},
  'dimensionality': '3d',
  'band_gap': {'optb88vdw': 0.0607},
  'landing_page': 'https://www.ctcms.nist.gov/~knc6/jsmol/JVASP-28754.html',
  'total_energy': -58.427629,
  'formation_enthalpy': -0.538},
 'jarvis_record': '{"gv": "na", "mpid": "mp-1027609", "encut": 650, "icsd": "None", "form_enp": -0.538, "final_str": {"@module": "pymatgen.core.structure", "@class": "Structure", "charge": null, "lattice": {"matrix": [[3.317942086959519, 0.0, 0.0], [-1.6589710434797595, 2.873431366402459, 4.95442

In [15]:
def apply_mapping(data, mapping):
    """Given a dictionary, return the MDF-formatted mapping

    Args:
        data (dict): Record to read values from
        mapping (dict): Maps output field -> input field. Both are
            dot-separated paths into nested dictionaries
    Returns:
        (dict) Nested dictionary holding every mapped value that was found.
        An input path is skipped when a key along it is missing, when one of
        its parents is not a dictionary, or when the value it holds is None.
    """

    output = {}
    for out_field, in_field in mapping.items():
        # Get the value from the input data
        val = data
        for field in in_field.split("."):
            # A non-dict parent means the path does not exist; treat it like a
            # missing key instead of failing on `.get`
            val = val.get(field) if isinstance(val, dict) else None
            if val is None:
                break
        if val is None:
            continue

        # Add it to the output structure, creating parent dictionaries as needed
        *parents, leaf = out_field.split(".")
        target = output
        for field in parents:
            target = target.setdefault(field, {})
        target[leaf] = val
    return output

In [16]:
# Show the sample record after the mapping has been applied, as indented JSON
print(json.dumps(apply_mapping(f, mapping), indent=2))

{
  "crystal_structure": {
    "space_group_number": 156,
    "number_of_atoms": 12,
    "volume": 358.2165572042844
  },
  "dft": {
    "converged": true,
    "cutoff_energy": 650.0
  },
  "material": {
    "composition": "TeMoWSeS2",
    "elements": [
      "Te",
      "Mo",
      "W",
      "Se",
      "S"
    ]
  },
  "__custom": {
    "formation_enthalpy": -0.538,
    "total_energy": -58.427629,
    "crossreference": {
      "materials_project": "mp-1027609"
    },
    "id": "JVASP-28754",
    "landing_page": "https://www.ctcms.nist.gov/~knc6/jsmol/JVASP-28754.html",
    "band_gap": {
      "optb88vdw": 0.0607
    },
    "dimensionality": "3d"
  }
}


## Submit the Data to the MDF
We will use the MDF Connect Client to describe the dataset (e.g., who made it and where it is located) and then send it to MDF Connect via Globus

In [17]:
# Client object on which the submission is assembled and later sent
client = MDFConnectClient()

In [18]:
# Name of this data source.
# NOTE(review): the submit response further down reports source_id 'jarvis_v1',
# presumably derived from this name -- confirm against the client docs
client.set_source_name('jarvis')

In [19]:
# Describe the dataset itself: title, authors and their affiliation, publisher,
# year, related paper and a free-text description. This becomes the 'dc' section
# shown in the printed submission.
client.create_dc_block(
    title="JARVIS - Joint Automated Repository for Various Integrated Simulations",
    authors=["Choudhary, Kamal", "Kalish, Irena", "Beams, Ryan", "Tavazza, Francesca"],
    affiliations="National Institute of Standards and Technology",
    publisher='Figshare',
    publication_year=2017,
    related_dois=['10.1038/s41598-017-05402-0'],
    description="JARVIS (Joint Automated Repository for Various Integrated Simulations) is a repository designed to automate materials discovery using classical force-field, density functional theory, machine learning calculations and experiments. The Force-field section of JARVIS (JARVIS-FF) consists of thousands of automated LAMMPS based force-field calculations on DFT geometries. Some of the properties included in JARVIS-FF are energetics, elastic constants, surface energies, defect formations energies and phonon frequencies of materials. The Density functional theory section of JARVIS (JARVIS-DFT) consists of thousands of VASP based calculations for 3D-bulk, single layer (2D), nanowire (1D) and molecular (0D) systems. Most of the calculations are carried out with optB88vDW functional. JARVIS-DFT includes materials data such as: energetics, diffraction pattern, radial distribution function, band-structure, density of states, carrier effective mass, temperature and carrier concentration dependent thermoelectric properties, elastic constants and gamma-point phonons. The Machine-learning section of JARVIS (JARVIS-ML) consists of machine learning prediction tools, trained on JARVIS-DFT data. Some of the ML-predictions focus on energetics, heat of formation, GGA/METAGGA bandgaps, bulk and shear modulus."
)

In [20]:
# Attach the field mapping to the submission for data of type 'json'.
# NOTE(review): the exact meaning of the first argument is defined by
# mdf_connect_client -- confirm against its documentation
client.add_index('json', mapping)

In [21]:
# Location of the converted records as a Globus URL: endpoint ID followed by
# the absolute path of the local 'feedstock' directory
feedstock_location = os.path.abspath('feedstock')
client.add_data('globus://{}{}/'.format(SOURCE_UUID, feedstock_location))

Print out the submission, for record keeping

In [22]:
# Display the assembled submission so the notebook keeps a record of what was sent
client.get_submission()

{'dc': {'titles': [{'title': 'JARVIS - Joint Automated Repository for Various Integrated Simulations'}],
  'creators': [{'creatorName': 'Choudhary, Kamal',
    'familyName': 'Choudhary',
    'givenName': 'Kamal',
    'affiliations': ['National Institute of Standards and Technology']},
   {'creatorName': 'Kalish, Irena',
    'familyName': 'Kalish',
    'givenName': 'Irena',
    'affiliations': ['National Institute of Standards and Technology']},
   {'creatorName': 'Beams, Ryan',
    'familyName': 'Beams',
    'givenName': 'Ryan',
    'affiliations': ['National Institute of Standards and Technology']},
   {'creatorName': 'Tavazza, Francesca',
    'familyName': 'Tavazza',
    'givenName': 'Francesca',
    'affiliations': ['National Institute of Standards and Technology']}],
  'publisher': 'Figshare',
  'publicationYear': '2017',
  'resourceType': {'resourceTypeGeneral': 'Dataset',
   'resourceType': 'Dataset'},
  'descriptions': [{'description': 'JARVIS (Joint Automated Repository for Var

Send it in to MDF

In [23]:
# Send the submission; the returned dict (shown below) carries
# 'source_id', 'success' and 'error'
client.submit_dataset()

{'source_id': 'jarvis_v1', 'success': True, 'error': None}

In [24]:
# Print the current processing status of the submission
client.check_status()


Status of convert submission jarvis_v1 (JARVIS - Joint Automated Repository for Various Integrated Simulations)
Submitted by Logan Ward at 2018-11-13T16:26:32.295979Z

Conversion initialization is in progress.
Conversion data download has not started yet.
Data conversion has not started yet.
Ingestion preparation has not started yet.
Ingestion initialization has not started yet.
Ingestion data download has not started yet.
Integration data download has not started yet.
Globus Search ingestion has not started yet.
Globus Publish publication has not started yet.
Citrine upload has not started yet.
Materials Resource Registration has not started yet.
Post-processing cleanup has not started yet.

