# Agent prototype: nist_xps_db aggregation

Agent to aggregate binding energies from the NIST XPS database, combine the data into a histogram, and update the nist_xps_db dataset entry with the histogram.

In [1]:
import json
import pandas as pd
from io import StringIO
import matplotlib as plt
import search_client
from gmeta_utils import gmeta_pop, format_gmeta

%matplotlib inline

In [2]:
# Search client shared by every query cell and by the ingest call in this notebook.
# NOTE(review): the second argument ("mdf") presumably names the search index — confirm against search_client.
client = search_client.SearchClient("https://search.api.globus.org/", "mdf")

# Fetch and aggregate records

In [3]:
# Page through every nist_xps_db record in fixed-width windows of mdf.scroll_id,
# keeping the composition and binding energy of each record that reports one.
PAGE_SIZE = 10000  # width of each scroll_id window and the per-query result limit

count = 0          # inclusive lower bound of the current scroll_id window
num_processed = 0  # records returned by the search, with or without a binding energy
data_list = []
while True:
    query = {
        "q": ("mdf.source_name:nist_xps_db AND mdf.resource_type:record AND "
        "mdf.scroll_id:(>=" + str(count) + " AND <" + str(count + PAGE_SIZE) + ")"),
        "advanced": True,
        "limit": PAGE_SIZE
    }
    raw_res = client.structured_search(query)
    search_res = gmeta_pop(raw_res)
    for res in search_res:
        # "raw" is a JSON string; a record without it is treated as an empty dict.
        data_dict = json.loads(res["mdf"].get("raw", "{}"))
        binding_energy = data_dict.get("Binding Energy (eV)")
        # Records whose binding energy is missing or empty are skipped.
        if binding_energy:
            data_list.append({
                "composition": res["mdf"]["composition"],
                "binding_energy": float(binding_energy)
    #            "excitation_energy": float(data_dict.get("Excitation Energy", "nan"))
            })
    num_ret = len(search_res)
    # The first window that returns no records ends the scan.
    if not num_ret:
        break
    num_processed += num_ret
    count += PAGE_SIZE
# Kept / total | (negative) number of records skipped for lacking a binding energy.
print(len(data_list), "/", num_processed, "|", len(data_list) - num_processed)
bind_en_list = [d["binding_energy"] for d in data_list]
# max()/min() raise ValueError on an empty sequence, so report that case explicitly.
if bind_en_list:
    print("Max:", max(bind_en_list), "\nMin:", min(bind_en_list))
else:
    print("No binding energies found; nothing to aggregate.")

23302 / 29189 | -5887
Max: 3939.9 
Min: 2.4


In [4]:
# Bin the binding energies into 100 eV wide buckets and count the records in each.
df = pd.DataFrame(data_list)
# Bin edges are 0, 100, ..., 4000. The stop value must be greater than 4000 so
# that the final (3900, 4000] bin exists; with a last edge of 3900, energies
# above 3900 eV (the recorded maximum is 3939.9) match no bin, are assigned NaN
# by pd.cut, and silently vanish from the counts.
buckets = list(range(0, 4001, 100))
# Each bucket is labelled with its lower edge; pd.cut bins are right-closed by default.
df['bucket'] = pd.cut(df['binding_energy'], buckets, labels=buckets[:-1])
hist = df.sort_values("bucket").groupby("bucket").bucket.count()

In [5]:
# Convert the per-bucket counts into a plain [bucket, count] row list plus
# column headers, the form stored on the dataset entry.
headers = ["binding energy (eV)", "count"]
df2 = pd.DataFrame(hist)
# The Series and its index are both named "bucket"; rename the value column
# first so reset_index() can add the index as a column without a name clash.
df2.columns = ["count"]
df2 = df2.reset_index()
df2.columns = headers
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and is also available in older pandas releases.
data = df2.values.tolist()

# Update dataset entry

In [6]:
# Look up the dataset-level entry for nist_xps_db; exactly one must exist.
query = {
    "q": "mdf.source_name:nist_xps_db AND mdf.resource_type:dataset",
    "advanced": True
}
raw_res = client.structured_search(query)
search_res = gmeta_pop(raw_res)
num_found = len(search_res)
if num_found != 1:
    raise ValueError("Incorrect number of results: " + str(num_found))

# Reuse the existing entry, keyed by the subject of the first raw result, and
# attach the histogram under the dataset-specific "nist_xps_db" block.
ingest = search_res[0]
ingest["globus_subject"] = raw_res["gmeta"][0]["subject"]
ingest["mdf"]["acl"] = ["public"]
ingest["nist_xps_db"] = {
    "histogram": {
        "headers": headers,
        "matrix": data
    }
}

# format_gmeta is applied to the single entry, then to a one-element list
# holding the formatted entry.
formatted_entry = format_gmeta(ingest)
gmeta = format_gmeta([formatted_entry])

In [7]:
# Submit the formatted dataset entry (with the histogram attached) through the search client.
# NOTE(review): the recorded output of this cell is a 403 "Ingest request denied by service",
# so this client was not permitted to ingest — presumably write credentials are required; confirm.
client.ingest(gmeta)

GlobusAPIError: (403, 'Forbidden.Generic', 'Ingest request denied by service')

# Check ingest

In [None]:
# Re-fetch the nist_xps_db dataset entry so the ingested histogram can be inspected.
# The field names match the executed queries in this notebook (mdf.source_name,
# mdf.resource_type); the earlier "mdf-source_name" / "mdf-node_type" spelling
# does not appear in any query that was actually run.
query = {
    "q": "mdf.source_name:nist_xps_db AND mdf.resource_type:dataset",
    "advanced": True
}
raw_res = client.structured_search(query)
search_res = gmeta_pop(raw_res)
#print(json.dumps(search_res[0], sort_keys=True, indent=4, separators=(',', ': ')))

In [None]:
# Read the histogram back from the dataset entry. The ingest cell stores it at
# entry["nist_xps_db"]["histogram"] as {"headers": [...], "matrix": [...]},
# so the same nested path is used here.
hist = search_res[0]["nist_xps_db"]["histogram"]
df2 = pd.DataFrame(hist["matrix"], columns=hist["headers"])

In [None]:
# Display the histogram table rebuilt from the search result.
df2