In [1]:
import json
import search_client
from gmeta_utils import gmeta_pop, format_gmeta

In [2]:
client = search_client.SearchClient("https://search.api.globus.org/", "mdf")

In [19]:
query = {
    "q": "mdf.source_name:fe_cr_al_oxidation",
    "advanced": True,
    "limit": 9999
}
res = client.structured_search(query)
res

GlobusHTTPResponse({'@datatype': 'GSearchResult', '@version': '2016-11-09', 'count': 1247, 'gmeta': [{'@datatype': 'GMetaResult', '@version': '2016-11-09', 'content': [{'fe_cr_al_oxidation': {'atomic_composition_percent': {'Al': '3.1', 'Cr': '23.5', 'Fe': '73.4'}, 'temperature_k': '690'}, 'mdf': {'collection': 'Fe-Cr-Al Oxidation Studies', 'composition': 'FeCrAl', 'elements': ['Al', 'Fe', 'Cr'], 'ingest_date': '2017-07-10T21:35:38.356184Z', 'links': {'csv': {'globus_endpoint': '82f1b5c6-6e9b-11e5-ba47-22000b92c6ec', 'http_host': 'https://data.materialsdatafacility.org', 'path': '/collections/Fe_Cr_Al_data/690 K Hour 1/690 K Hour 1 Point 1_088.txt'}, 'landing_page': 'https://materialsdata.nist.gov/dspace/xmlui/handle/11256/836#800', 'parent_id': '5963f323d0a0d3082468bdea'}, 'mdf-id': '5963f32ad0a0d3082468c10a', 'metadata_version': '0.3.0', 'resource_type': 'record', 'scroll_id': 800, 'source_name': 'fe_cr_al_oxidation', 'title': 'Fe-Cr-Al Oxidation - 690 K Hour 1 Point 1_088'}}], 'subje

In [20]:
for entry in res["gmeta"]:
    client.remove(entry["subject"])

In [5]:
from urllib.parse import quote
quote("https://figshare.com/articles/HOPV15_Dataset/1610063/4#1")

'https%3A//figshare.com/articles/HOPV15_Dataset/1610063/4%231'

# General Search

In [3]:
query1 = {
    "q": "nist",        # Fulltext search term
    "advanced": False,  # Not using advanced query structure
    "limit": 10         # Max number of results to return
}
search_result1 = client.search(query1)
cleaned_search_result1 = gmeta_pop(search_result1)
print(len(cleaned_search_result1))
print(cleaned_search_result1)

10
[{'mdf-author': [{'email': 'robert.zarr@nist.gov', 'family_name': 'Zarr', 'given_name': 'Robert', 'institution': 'National Institute of Standards and Technology'}], 'mdf-citation': ['Robert R. Zarr, Josue A. Chavez, Angela Y. Lee, Geraldine Dalton, and Shari L. Young, NIST Heat Transmission Properties of Insulating and Building Materials, NIST Standard Reference Database Number 81, National Institute of Standards and Technology, Gaithersburg MD, 20899, http://srdata.nist.gov/Insulation/.'], 'mdf-collection': 'NIST Heat Transmission Materials', 'mdf-composition': 'Cellular Polystyrene, Extruded', 'mdf-data_contact': {'email': 'robert.zarr@nist.gov', 'family_name': 'Zarr', 'given_name': 'Robert', 'institution': 'National Institute of Standards and Technology'}, 'mdf-data_contributor': [{'email': 'jgaff@uchicago.edu', 'family_name': 'Gaff', 'github': 'jgaff', 'given_name': 'Jonathon', 'institution': 'The University of Chicago'}], 'mdf-data_format': ['json'], 'mdf-data_type': ['text'], 

In [4]:
query2 = {
    "q": "mdf.source_name:hopv",  # Specific field search
    "advanced": True,                    # Using advanced/structured query
    "limit": 10                          # Max number of results to return
}
search_result2 = client.structured_search(query2)
cleaned_search_result2 = gmeta_pop(search_result2)
print(len(cleaned_search_result2))
print(json.dumps(cleaned_search_result2, sort_keys=True, indent=4, separators=(',', ': ')))

10
[
    {
        "mdf": {
            "collection": "Harvard Organic Photovoltaic Dataset",
            "composition": "CN1c2ccccc2C(=C2c3ccc(-c4cccs4)cc3N(C)C2=O)C1=O",
            "ingest_date": "2017-07-10T13:58:26.432034Z",
            "links": {
                "landing_page": "https://figshare.com/articles/HOPV15_Dataset/1610063/4#2",
                "molecule": {
                    "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                    "http_host": "https://data.materialsdatafacility.org",
                    "path": "/collections/hopv/hopv_2.txt"
                },
                "original": {
                    "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                    "http_host": "https://data.materialsdatafacility.org",
                    "path": "/collections/hopv/HOPV_15_revised_2.data"
                },
                "parent_id": "59638801d0a0d30327e29128"
            },
            "mdf-id": "59638802d0a0d30327e2912a"

# Search for specific elements

In [5]:
all_elements = {"Actinium": "Ac", "Silver": "Ag", "Aluminum": "Al", "Americium": "Am", "Argon": "Ar", "Arsenic": "As", "Astatine": "At", "Gold": "Au", "Boron": "B", "Barium": "Ba", "Beryllium": "Be", "Bohrium": "Bh", "Bismuth": "Bi", "Berkelium": "Bk", "Brom    ine": "Br", "Carbon": "C", "Calcium": "Ca", "Cadmium": "Cd", "Cerium": "Ce", "Californium": "Cf", "Chlorine": "Cl", "Curium": "Cm", "Copernicium": "Cn", "Cobalt": "Co", "Chromium": "Cr", "Cesium": "Cs", "Copper": "Cu", "Dubnium": "Db", "Darmstadtium": "Ds", "Dyspros    ium": "Dy", "Erbium": "Er", "Einsteinium": "Es", "Europium": "Eu", "Fluorine": "F", "Iron": "Fe", "Flerovium": "Fl", "Fermium": "Fm", "Francium": "Fr", "Gallium": "Ga", "Gadolinium": "Gd", "Germanium": "Ge", "Hydrogen": "H", "Helium": "He", "Hafnium": "Hf", "Mercury": "Hg", "Holmium": "Ho", "Hassium": "Hs", "Iodine": "I", "Indium": "In", "Iridium": "Ir", "Potassium": "K", "Krypton": "Kr", "Lanthanum": "La", "Lithium": "Li", "Lawrencium": "Lr", "Lutetium": "Lu", "Livermorium": "Lv", "Mendelevium": "Md", "Magnesium": "Mg", "Mang    anese": "Mn", "Molybdenum": "Mo", "Meitnerium": "Mt", "Nitrogen": "N", "Sodium": "Na", "Niobium": "Nb", "Neodymium": "Nd", "Neon": "Ne", "Nickel": "Ni", "Nobelium": "No", "Neptunium": "Np", "Oxygen": "O", "Osmium": "Os", "Phosphorus": "P", "Protactinium": "Pa", "Lea    d": "Pb", "Palladium": "Pd", "Promethium": "Pm", "Polonium": "Po", "Praseodymium": "Pr", "Platinum": "Pt", "Plutonium": "Pu", "Radium": "Ra", "Rubidium": "Rb", "Rhenium": "Re", "Rutherfordium": "Rf", "Roentgenium": "Rg", "Rhodium": "Rh", "Radon": "Rn", "Ruthenium":     "Ru", "Sulfur": "S", "Antimony": "Sb", "Scandium": "Sc", "Selenium": "Se", "Seaborgium": "Sg", "Silicon": "Si", "Samarium": "Sm", "Tin": "Sn", "Strontium": "Sr", "Tantalum": "Ta", "Terbium": "Tb", "Technetium": "Tc", "Tellurium": "Te", "Thorium": "Th", "Titanium": "Ti", "Thallium": "Tl", "Thulium": "Tm", "Uranium": "U", "Ununoctium": "Uuo", "Ununpentium": "Uup", "Ununseptium": "Uus", "Ununtrium": "Uut", "Vanadium": "V", "Tungsten": "W", "Xenon": "Xe", "Yttrium": "Y", "Ytterbium": "Yb", "Zinc": "Zn", "Zirconium": "Zr"}

In [6]:
elems = all_elements.values()

In [13]:
query = {
    "q": "mdf.source_name:khazana_polymer AND mdf.resource_type:record",
    "advanced": True,
    "limit": 2
}
must = ["H", "Cl"]
query2 = {
    "q": "",
    "advanced": True,
    "limit": 9999
}
for elem in elems:
    if elem in must:
        query2["q"] += " AND elements:" + elem
    else:
        query2["q"] += " AND NOT elements:" + elem
if query2["q"].startswith(" AND "):
    query2["q"] = query2["q"].replace(" AND ", "", 1)
else:
    print("No replace")

In [14]:
#search_res = gmeta_pop(client.structured_search(query))
raw_res = client.structured_search(query)
#raw_res2 = client.structured_search(query2)
search_res = gmeta_pop(raw_res)
#search_res2 = gmeta_pop(raw_res2)

In [15]:
print(raw_res["total"])
#print(raw_res2["total"])
print(len(search_res))
#print(raw_res)
search_res[0]

0
0


IndexError: list index out of range

In [None]:
#elements = set()
#for res in search_res:
#    for elem in res.get("http://globus.org/publication-schemas/mdf-base/0.1#elements", []):
#        elements.add(elem)
elements2 = set()
for res2 in search_res2:
    for elem2 in res2.get("mdf-elements", []):
        elements2.add(elem2)

In [None]:
print(("Al" not in elements))
print(("Al" in elements2))
print(("Cu" not in elements2))
#print(str(elements))
print(str(elements2))

In [None]:
search_res2[0]

# Search for >10k records

In [None]:
count = 0
saved = 0
while True:
    query = {
        "q": ("mdf-source_name:nist_xps_db AND mdf-node_type:record AND "
        "mdf-scroll_id:(>=" + str(count) + " AND <" + str(count + 10000) + ")"),
        "advanced": True,
        "limit": 10000
    }
    raw_res = client.structured_search(query)
    num_ret = len(gmeta_pop(raw_res))
    if num_ret:
        saved += num_ret
        count += 10000
    else:
        break

In [None]:
print(saved)

# Get GMeta

In [None]:
feed_path = "/Volumes/Seagate Backup Plus Drive/mdf_data/feedstock/hopv_all.json"
out_path = "/Users/jonathongaff/Downloads/records.gmeta"
with open(feed_path) as feedstock:
    glist = []
    for line in feedstock:
        glist.append(format_gmeta(json.loads(line)))
gmeta = format_gmeta(glist)
#print(gmeta)
with open(out_path, "w") as outfile:
    json.dump(gmeta, outfile)