In [1]:
import json
import pandas as pd
import requests
import tqdm

from io import StringIO, BytesIO
from multiprocessing.pool import Pool

# Materials specific packages
from ase.io import read as ase_read
from ase.visualize import view

# MDF imports
from mdf_dataworks.forge import Forge

# Instantiate Base Class

In [2]:
# Create the Forge client; the search and download cells below all go through `mdf`.
mdf = Forge()

### Perform a raw search

In [3]:
source_name = "cytochrome_qsar"
element = "Cl"

# Assemble the query from adjacent string literals, one clause per literal.
# A backslash continuation *inside* a single literal keeps the next line's
# leading indentation as part of the string, which padded the query with a
# run of stray spaces between clauses.
q = (
    "mdf.source_name:{source_name} AND "
    "mdf.resource_type:record AND "
    "mdf.elements:{element}"
).format(source_name=source_name, element=element)

# Submit the hand-written query string directly.
res = mdf.search(q, advanced=True)

### Perform the same query with a helper function

In [4]:
# Search through the Forge helper instead of a hand-written query string.
elements = ["Cl"]
sources = ["cytochrome_qsar"]
res = mdf.search_by_elements(sources=sources, elements=elements)

# Report how many records matched, then pretty-print the first two as a preview.
match_summary = "Total Matching Records: {n_records}".format(n_records=len(res))
print(match_summary)

preview_json = json.dumps(
    res[:2],
    indent=4,
    sort_keys=True,
    separators=(',', ': '),
)
print(preview_json)

Total Matching Records: 26
[
    {
        "mdf": {
            "collection": "Cytochrome QSAR",
            "composition": "C18Cl3N2O",
            "elements": [
                "O",
                "N",
                "C",
                "Cl"
            ],
            "ingest_date": "2017-07-12T17:50:18.726506Z",
            "links": {
                "landing_page": "ftp://ftp.ics.uci.edu/pub/baldig/learning/Cytochrome/#5",
                "parent_id": "5966615a1d27540f4d501f76",
                "sdf": {
                    "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                    "http_host": "https://data.materialsdatafacility.org",
                    "path": "/collections/cytochrome_qsar/talele/talele_record-13.sdf"
                }
            },
            "mdf-id": "5966615a1d27540f4d501f7b",
            "metadata_version": "0.3.0",
            "resource_type": "record",
            "scroll_id": 5,
            "source_name": "cytochrome_qsar",
      

## Retrieve SDF file contents and load them into a pandas DataFrame

In [5]:
def get_sdf(r, timeout=30):
    """Download the SDF file linked from one search record and parse it.

    Parameters
    ----------
    r : dict
        A search result record. It must provide ``r['mdf']['links']['sdf']``
        with ``http_host`` and ``path`` entries; the two are concatenated to
        form the download URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
        Without a timeout a stalled download would block its worker forever.

    Returns
    -------
    object
        Whatever ``ase_read`` produces for the downloaded text when asked to
        parse it with ``format="sdf"``.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    """
    sdf_link = r['mdf']['links']['sdf']
    url = sdf_link['http_host'] + sdf_link['path']

    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of handing an error page to the SDF parser.
    response.raise_for_status()

    return ase_read(StringIO(response.text), format="sdf")

tasks = res
n_workers = 5

# Download and parse every record in parallel. The context manager shuts the
# pool down even when a download raises, so worker processes are not left
# behind; map() only returns (or raises) once all tasks have finished.
with Pool(n_workers) as mp:
    mdf_data = mp.map(get_sdf, tasks)

# Tabulate each parsed structure alongside its chemical formula.
formulae = [d.get_chemical_formula() for d in mdf_data]
df_cyto = pd.DataFrame({"formula": formulae, "data": mdf_data})


## Visualize a retrieved molecule

In [6]:
# Show the first parsed structure: its formula as text, then a render via the x3d viewer.
first_structure = mdf_data[0]
print(first_structure.get_chemical_formula())
view(first_structure, viewer='x3d')

C18Cl3N2O


## Get records via Globus for a larger, mixed-source result set

In [9]:
# Transfer destination.
# NOTE(review): both values are specific to one machine; edit them before running.
my_ep = "c8ee7e5c-6d04-11e5-ba46-22000b92c6ec"
my_path = "/Users/ben/Desktop/blaiszik-macbookpro/Ti"  # Globus must be able to write here

# Search criteria: two sources, filtered by `elements`.
sources = ["trinkle_elastic_fe_bcc", "dilute_mg_alloys_dft"]
elements = ["Al", "Ti"]

mdf = Forge()
res = mdf.search_by_elements(sources=sources, elements=elements, limit=10)

# Hand the matching records to Globus for transfer to the destination above.
mdf.globus_download(
    res,
    dest=my_path,
    local_ep=my_ep,
    preserve_dir=True,
)

Processing records: 100%|██████████| 20/20 [00:00<00:00, 327.47it/s]
Submitting transfers: 100%|██████████| 1/1 [00:00<00:00,  3.04it/s]

All transfers submitted
Submission IDs: f80bf525-75fb-11e7-8b5e-22000b9923ef



