Fetch material documents that include keys not currently accounted for by the API repository, and write each one to its own YAML file.

In [2]:
import json
import os
import yaml
from bson.json_util import dumps as bson_dumps
from mongogrant import Client

# Database alias handed to mongogrant.
# NOTE(review): the "ro:" prefix presumably selects a read-only role — confirm
# against the local mongogrant configuration.
MATERIALS_DB_ALIAS = "ro:prod/mp_emmet_prod"

client = Client()
db = client.db(MATERIALS_DB_ALIAS)

In [22]:
# How many material documents carry no "has" field at all?
missing_has_filter = {"has": {"$exists": False}}
db.materials.count_documents(missing_has_filter)

64902

In [29]:
# Greedily select a small set of material documents whose combined "has"
# lists cover every distinct property name present in the collection.
props = list(filter(None, db.materials.distinct("has")))  # drop falsy entries
props_remaining = list(props)
docs = []

# Aggregation expression that views "has" as an array even when a document
# stores a scalar there, so the set operator below always receives an array.
has_as_array = {"$cond": {"if": {"$isArray": "$has"},
                          "then": "$has",
                          "else": ["$has"]}}

while props_remaining:
    doc_with_most_props = list(db.materials.aggregate([
        # Only documents that contribute at least one uncovered property.
        {"$match": {"has": {"$in": props_remaining}}},
        {"$project": {"task_id": 1,
                      "has": 1,
                      "nhas": {"$cond":
                               {"if": {"$isArray": "$has"},
                                "then": {"$size": "$has"},
                                "else": 0}},
                      # Number of still-uncovered properties this document
                      # would add; $literal keeps the names from being parsed
                      # as field paths or operators.
                      "ncovered": {"$size": {"$setIntersection": [
                          has_as_array,
                          {"$literal": props_remaining}]}}}},
        # Rank by new coverage first, then by overall richness (the original
        # criterion); "_id" makes the choice deterministic when both tie.
        {"$sort": {"ncovered": -1, "nhas": -1, "_id": 1}},
        {"$limit": 1}
    ]))[0]
    # "ncovered" is only a ranking helper; keep the stored document shape
    # (_id, task_id, has, nhas) unchanged for the cells that consume `docs`.
    doc_with_most_props.pop("ncovered", None)
    docs.append(doc_with_most_props)

    # A scalar "has" must count as one property; calling set() directly on a
    # string would split it into characters and never shrink props_remaining.
    has_value = doc_with_most_props["has"]
    covered = has_value if isinstance(has_value, list) else [has_value]
    props_remaining = list(set(props_remaining) - set(covered))

In [30]:
docs

[{'_id': ObjectId('5c08c4c2277e2b8e3aa9189c'),
  'task_id': 'mp-20351',
  'has': ['xas',
   'elasticity',
   'piezo',
   'diel',
   'phonons',
   'bandstructure',
   'eos'],
  'nhas': 7},
 {'_id': ObjectId('5c08c4f1277e2b8e3aa933f6'),
  'task_id': 'mp-19',
  'has': ['xas',
   'elasticity',
   'piezo',
   'surfaces',
   'diel',
   'bandstructure',
   'eos'],
  'nhas': 7}]

In [31]:
# Dump the full stored document for every selected material to "<task_id>.yaml".
# `docs` only holds the projected fields, so each document is fetched again.
selected_task_ids = [d["task_id"] for d in docs]

for mid in selected_task_ids:
    raw_doc = db.materials.find_one({'task_id': mid})
    # Round-trip through bson's JSON serializer so the document is reduced to
    # plain JSON-compatible values before it is handed to yaml.safe_dump.
    doc = json.loads(bson_dumps(raw_doc))

    yaml_path = f"{mid}.yaml"
    with open(yaml_path, "w") as f:
        yaml.safe_dump(doc, f, default_flow_style=False)