In [None]:
import json
import re
from glob import glob
from mpcontribs.client import Client
from flatten_dict import unflatten, flatten

In [None]:
# create a project - only needed once
# client = Client()
# client.create_project(
#     name="springer_materials",
#     title="Springer Materials",
#     authors="S. Scherer, S. George, P. Huck",
#     description="Linus Pauling Files from Springer Materials",
#     url="https://materials.springer.com"
# )

In [None]:
# init client and update project info if needed
# `client` is bound to the "springer_materials" project and is reused by every cell below.
client = Client(project="springer_materials")
# The calls below are project-level settings; they are kept commented out
# (presumably one-off operations — uncomment only when the project info must change).
# client.make_public()  # needs approval
# client.update_project(update={"unique_identifiers": False})  # allow multiple contributions per identifier/mpid
# client.update_project(update={"other": {  # functions as a legend for root-level fields
#     "springer": "main info about springer entry",
#     "properties": "meta data and availability of property entries",
#     "phasediagram": "meta data about phase diagram entries"
# }})

In [None]:
# load data
# NOTE: machine-specific absolute path; adjust it when running on another machine.
data_dir = "/Users/patrick/GoogleDriveLBNL/My Drive/MaterialsProject/gitrepos/mpcontribs-data/springer_materials"

# Map file name -> list of documents for every `link_full_*.json` file in
# `data_dir`, skipping the `*_example.json` samples.
data = {}
# `glob` returns paths in arbitrary order; sort them so that the order of the
# loaded files (and of the contributions built from them) is reproducible.
for p in sorted(glob(f"{data_dir}/link_full_*.json")):
    if p.endswith("_example.json"):
        continue
    print(p)
    k = p.rsplit("/", 1)[-1]  # bare file name is used as the key
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(p, encoding="utf-8") as f:
        data[k] = json.load(f)

# union of all field names appearing in any document
keys = {k for docs in data.values() for doc in docs for k in doc}
len(data), len(keys)

In [None]:
# define map for column names and their units
# Keys are the field names found in the source documents; each value gives the
# dotted target column name ("name") and, optionally, its "unit".
# The contribution-building loop treats the two cases differently:
#   - no "unit" key  -> value is stored as text (HTML tags are stripped first)
#   - "unit": ""     -> value is stored as-is, without a unit suffix
#     (presumably marks a dimensionless numeric column — TODO confirm)
# Insertion order matters: it determines the order of the `columns` dict built later.
columns_map = {
    # common fields/columns
    "Document_ID": {"name": "springer.id"},
    "Document_Title": {"name": "springer.title"},
    "Element_System": {"name": "springer.chemsys"},
    "ISP_Distinct_Solid_Phase": {"name": "springer.phase"},
    "Release_Year": {"name": "springer.released", "unit": ""},
    "URL": {"name": "springer.url"},
    # properties
    "Prototype": {"name": "properties.prototype"},
    "Pearson_Symbol": {"name": "properties.pearson"},
    "Space_Group_Symbol": {"name": "properties.spacegroup"},
    "Sample_Form": {"name": "properties.sample"},
    "Main_Physical_Property": {"name": "properties.main"},
    "Number_of_DataPoints": {"name": "properties.stats.datapoints", "unit": ""},
    "Number_of_Samples": {"name": "properties.stats.samples", "unit": ""},
    "Number_of_References": {"name": "properties.stats.references", "unit": ""},
    # phase diagram (currently disabled)
    # "Composition": {"name": "phasediagram.composition"},
    # "Temperature": {"name": "phasediagram.temperature", "unit": "K"},
    # "Status_of_Phase_Diagram": {"name": "phasediagram.status"}
}

# Sanity check: displays the document fields that have no entry in `columns_map`.
# Requires `keys` from the load-data cell.
keys - set(columns_map.keys()) # just making sure I didn't miss a key

In [None]:
# prep contributions
contributions = []  # filled by the loop below: one dict per source document
prop_set = set()  # only needed by the (currently commented-out) property-availability code
# transliteration table for `str.translate`; covers lower-case characters only
special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}
CLEANR = re.compile('<.*?>')  # non-greedy match of a single `<...>` tag


def convert_prop(s):
    """Convert a free-text property name into an ASCII-friendly CamelCase token.

    Every non-alphanumeric character acts as a word separator. Each word is
    lower-cased, transliterated via `special_char_map`, and then capitalized,
    e.g. ``"x-ray diffraction"`` -> ``"XRayDiffraction"``.

    Transliteration has to happen *before* capitalizing: the map only knows
    lower-case characters, so capitalizing first would turn a leading ``ü``
    into ``Ü`` and leave it untranslated.
    """
    cleaned = "".join(c if c.isalnum() else " " for c in s)
    return "".join(
        w.lower().translate(special_char_map).capitalize() for w in cleaned.split()
    )


def cleanhtml(raw_html):
    """Return `raw_html` with every ``<...>`` tag (see `CLEANR`) removed."""
    return CLEANR.sub('', raw_html)

# Build one contribution per document of every loaded file.
for fn, docs in data.items():
    print(fn)
    # The category depends on the file name only, so derive it once per file:
    # the last two "_"-separated parts joined by "-", minus the extension
    # (e.g. "link_full_physical_properties.json" -> "physical-properties").
    category = "-".join(fn.rsplit("_", 2)[1:]).replace(".json", "")

    for doc in docs:
        identifier = doc["MaterialsProject_ID"]
        formula = doc["Molecular_Formula"]
        # properties = [
        #     convert_prop(prop)
        #     for prop in sorted(doc["List_of_Physical_Properties"])
        # ] if category == "physical-properties" else []
        contrib = {
            "identifier": identifier, "formula": formula,
            "data": {"springer.category": category},
        }

        # if properties:
        #     prop_set |= set(properties)
        #     for prop in properties:
        #         contrib["data"][f"properties.available.{prop}"] = "Yes"

        for k, v in doc.items():
            col = columns_map.get(k)
            # skip empty values and fields without a column mapping
            if not v or not col:
                continue

            name = col.get("name")
            if not name:
                continue

            unit = col.get("unit")
            # lists are collapsed into one comma-separated string;
            # `map(str, ...)` keeps this safe for non-string items
            val = ",".join(map(str, v)) if isinstance(v, list) else v
            # Only unit-less (text) columns get their HTML tags stripped. The
            # `isinstance` guard avoids a TypeError from `"<" in val` when such
            # a column unexpectedly holds a non-string value.
            if unit is None and isinstance(val, str) and "<" in val:
                val = cleanhtml(val)

            # a non-empty unit is appended; `unit == ""` stores the raw value
            contrib["data"][name] = f"{val} {unit}" if unit else val

        # turn the dotted column names into a nested dict
        contrib["data"] = unflatten(contrib["data"], splitter="dot")
        contributions.append(contrib)

len(contributions)

In [None]:
# init columns
# Column name -> unit (None where `columns_map` declares no unit), in the same
# order as `columns_map`, followed by the category column set on every contribution.
columns = dict(
    (spec["name"], spec.get("unit")) for spec in columns_map.values()
)
columns.update({"springer.category": None})

# for prop in sorted(prop_set):
#     columns[f"properties.available.{prop}"] = None

#client.init_columns(columns)

In [None]:
# submit everything
# WARNING: this cell starts with `delete_contributions()` called without arguments
# (presumably removes all existing contributions of the project — confirm before running).
# Requires `columns` and `contributions` from the cells above.
client.delete_contributions()  # need to delete first due to `unique_identifiers=False`
client.init_columns(columns)  # good practice :)
client.submit_contributions(contributions)
client.init_columns(columns) # just to make sure that all columns show up in the intended order

In [None]:
# list of available query parameters for this project
# NOTE(review): `_reinit` is underscore-prefixed, i.e. not part of the client's
# public API by Python convention — it may change between client versions.
client._reinit()  # might be needed if new data was just submitted
# show only the parameters whose names start with one of the given prefixes
client.available_query_params(startswith=("data__springer__released", "formula"))

In [None]:
# count contributions for query:
# - "physical-properties" category
# - "elasticity" as main property
# - more than 5 samples
# Keys use "__"-separated paths into the contribution data plus a filter suffix
# (`exact`, `value__gt`). `query` is reused by the next cell.
query = {
    "data__springer__category__exact": "physical-properties",
    "data__properties__main__exact": "elasticity",
    "data__properties__stats__samples__value__gt": 5
}
client.count(query=query)

In [None]:
# retrieve contributions for query and project out Springer ID and spacegroup fields
# NOTE: relies on `query` as defined in the previous (count) cell.
fields = ["id", "identifier", "data.springer.id", "data.properties.spacegroup"]
client.query_contributions(query=query, fields=fields)

In [None]:
# get mp-id (and other info if needed) from Springer ID
# The contribution's `identifier` field holds the mp-id (it is filled from
# "MaterialsProject_ID" when the contributions are built).
springer_id = "ppp_350781a8aa14dc0b19c6c879daff3be2"
client.query_contributions(
    query={"data__springer__id__exact": springer_id},
    fields=["id", "identifier", "data.springer.id", "data.properties.pearson"]
)

In [None]:
# count all entries for a list of formulas released before 2023
# `formula__in` matches any of the listed formulas; `released__value__lt` keeps
# entries with a release year strictly below 2023.
# Formula strings are case-sensitive chemical formulas: gallium arsenide is
# "GaAs" (the previous "GaAS" is not a valid formula and could never match).
client.count(query={
    "formula__in": ["Fe2O3", "GaAs"], "data__springer__released__value__lt": 2023
})

In [None]:
# get all entries containing all selected properties
# properties = ["XRayDiffraction", "IsotropicDisplacementParameter", "AnisotropicDisplacementParameter"]
# query = {f"data__properties__available__{prop}__exact": "Yes" for prop in properties}

# client.query_contributions(
#     query=query,
#     fields=["id", "identifier", "data.springer.id"]
# )

In [None]:
# query/code to show Springer URLs and available properties under "External Links" on MP Details Page
query = {
    "identifier": "mp-2534",
    "data__springer__category__exact": "physical-properties",
}
fields = ["data.springer.url", "data.properties.main"]
# `.get("data")` yields None when the response carries no "data" key; fall back
# to an empty list so the loop below prints nothing instead of raising TypeError.
entries = client.query_contributions(query=query, fields=fields).get("data") or []

# mimic table: one "<main property> <url>" line per entry
for entry in entries:
    prop = entry["data"]["properties"]["main"]
    url = entry["data"]["springer"]["url"]
    print(prop, url)