# Get and Hash all VASP data in the MDF

In [1]:
import globus_sdk
from mdf_forge.forge import Forge

import dfttopif

from ase.io import vasp
from ase import Atoms

import codecs

from joblib import Parallel, delayed

from pypif import pif
import gzip
import os
import json
import io
import requests
from tempfile import TemporaryDirectory

In [2]:
DEFAULT_DIR = "pifs"

## Get All VASP Records
This section gathers all VASP records from the MDF

In [3]:
forge = Forge()

Find all applicable data

In [4]:
query_string = "mdf.tags:outcar AND mdf.resource_type:record"
#query_string += " AND mdf.source_name:trinkle_elastic_fe_bcc"
query_string += " AND (mdf.source_name:ti_o_meam_model OR mdf.source_name:khazana_vasp)"

all_results = forge.aggregate(query_string, scroll_size=1000)

result_records = {}
for result in all_results:
    source = result["mdf"]["source_name"]
    if source not in result_records.keys():
        result_records[source] = []
    result_records[source].append(result)

print('Found %d sources'%len(result_records.keys()))
print('Found %d records'%len(all_results))
#result_records[0]
result_records

100%|██████████| 40/40 [00:00<00:00, 83.20it/s]

Found 2 sources
Found 40 records





{'khazana_vasp': [{'mdf': {'collection': 'Khazana',
    'composition': 'Al128',
    'elements': ['Al'],
    'ingest_date': '2017-08-04T19:27:06.214936Z',
    'links': {'landing_page': 'http://khazana.uconn.edu/module_search/search.php?m=2#7',
     'outcar': {'globus_endpoint': '82f1b5c6-6e9b-11e5-ba47-22000b92c6ec',
      'http_host': 'https://data.materialsdatafacility.org',
      'path': '/collections/khazana/OUTCARS/OUTCAR_bcc_large'},
     'parent_id': '5984ca11f2c00438032d6a94'},
    'mdf_id': '5984ca8af2c00438032d6a9b',
    'metadata_version': '0.3.2',
    'resource_type': 'record',
    'scroll_id': 7,
    'source_name': 'khazana_vasp',
    'tags': ['outcar'],
    'title': 'Khazana VASP - Al128'}},
  {'mdf': {'collection': 'Khazana',
    'composition': 'Al32',
    'elements': ['Al'],
    'ingest_date': '2017-08-04T19:29:43.954072Z',
    'links': {'landing_page': 'http://khazana.uconn.edu/module_search/search.php?m=2#11',
     'outcar': {'globus_endpoint': '82f1b5c6-6e9b-11e5-ba47

A utility operation for getting files from the MDF. 

In [5]:
def file_request(url, **kwargs):
    """Opens a file at the MDF
    
    :param url: str, path to file
    :param kwargs: these get passed to `requests.get`"""
    
    headers = {}
    forge.mdf_authorizer.set_authorization_header(headers)
    return requests.get(url, headers=headers, **kwargs)
# Test getting a single file
#assert file_request(
#    result_records[0]['mdf']['links']['outcar']['http_host'] +
#    result_records[0]['mdf']['links']['outcar']['path']
#).status_code == 200

For each entry, read the structure from the data stored in the MDF. The MDF has a complete record of the data for each OQMD calculation that, while useful, does not make sense to include in a search index.

In [6]:
def get_file(url, directory, name):
    """Download files from url to a directory
    
    :param directory: str, path to download directory
    :param url: str, url to download
    :param name: str, new name of file
    
    :return: int, 200 on success, url status code otherwise"""
    
    req = file_request(url, stream=True)
    
    if req.status_code != 200:
        return req.status_code
    
    # If the URL ends with ".gz" uncompress it
    with open(os.path.join(directory, name), 'wb') as fp:
        if url.endswith('.gz'):
            with gzip.open(req.raw) as go:
                for data in go:
                    fp.write(data)
        else:
            for data in req.iter_content():
                fp.write(data)
    return 200

#with TemporaryDirectory() as d:
#    url = result_records[0]['mdf']['links']['outcar']['http_host'] + \
#        result_records[0]['mdf']['links']['outcar']['path']
#    get_file(url, d, 'OUTCAR')
#    assert os.path.isfile(os.path.join(d,'OUTCAR'))
#    assert isinstance(vasp.read_vasp_out(os.path.join(d,'OUTCAR')), Atoms)

In [7]:
def convert_to_pif(entry, directory='pifs', overwrite=False):
    """Get the composition, structure, band gap, and stability of an entry given its search result
    
    :param entry: dict, metadata for an entry as returned by Globus search
    :param overwrite: boolean, whether to overwrite existing file, if present
    :return: 
        - bool, whether successful
        - if successful, - str, path to file
        - if not successful, - str, error message"""
    
    if not os.path.isdir(directory):
        if not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)
        else:
            raise IOError("Path " + directory + " exists and is not a directory.")
    filename = os.path.join(directory, '%s.json'%entry['mdf']['mdf_id'])
    if os.path.isfile(filename) and not overwrite:
        return True, filename
    
    # Get the path to the OUTCAR
    url = entry['mdf']['links']['outcar']['http_host'] + entry['mdf']['links']['outcar']['path']
        
    # If doesn't end with OUTCAR, skip this calculation (assuming that if OUTCAR renamed, INCAR gone)
    #   You might want to adjust this later
    if not 'OUTCAR' in os.path.basename(url):
        return False, 'Not an OUTCAR'
    
    base_url = '/'.join(url.split('/')[:-1]) + '/'
    
    # Download the files
    with TemporaryDirectory() as d:
        try:
            # Download OUTCAR
            get_file(url, d, 'OUTCAR')
                
            # Download optional files
            for f in ['INCAR','POSCAR','KPOINTS','DOSCAR']:
                result = get_file(base_url+f,d,f)
                
            # Run the DFT to pif
            try:
                mypif = dfttopif.drivers.directory_to_pif(d, quality_report=False)
            except:
                return False, 'dftotopif failed'
            
            # Convert (or [debug], return the pif)
            pif.dump(mypif, open(filename, 'w'))
            return True, filename
        except:
            raise
#result, path = convert_to_pif(result_records[0], 'pifs')
#assert result
#assert os.path.isfile(path)

Process all of the records in parallel

In [8]:
with Parallel(n_jobs=20, verbose=11) as p:
    f = delayed(convert_to_pif)
    pif_results = []
    for source, results in result_records.items():
        output_dir = os.path.join(DEFAULT_DIR, source)
        pif_results.extend(p([f(e, directory=output_dir, overwrite=True) for e in results]))
print('%d of %d successful'%(sum([x[0] for x in pif_results]), len(pif_results)))

[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:   20.8s
[Parallel(n_jobs=20)]: Done   2 out of  10 | elapsed:   27.7s remaining:  1.8min
[Parallel(n_jobs=20)]: Done   3 out of  10 | elapsed:   37.3s remaining:  1.5min
[Parallel(n_jobs=20)]: Done   4 out of  10 | elapsed:   49.9s remaining:  1.2min
[Parallel(n_jobs=20)]: Done   5 out of  10 | elapsed:  1.3min remaining:  1.3min
[Parallel(n_jobs=20)]: Done   6 out of  10 | elapsed:  3.5min remaining:  2.3min
[Parallel(n_jobs=20)]: Done   7 out of  10 | elapsed:  3.5min remaining:  1.5min
[Parallel(n_jobs=20)]: Done   8 out of  10 | elapsed:  5.1min remaining:  1.3min
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:  8.2min remaining:    0.0s
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:  8.2min finished
[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:  1.3min
[Parallel(n_jobs=20)]: Done   3 out of  30 | elapsed: 14.5min remaining: 130.9min
[Parallel(n_jobs=20)]: Done   6 out of  30 | elapsed: 42.2min remaining: 16

KeyboardInterrupt: 