In [1]:
import concurrent.futures
import json
import jsonschema
import os
from pprint import pprint
import requests
from time import time

from mongospawn.schema import dbschema_from_file, collschemas_for
from pymongo import MongoClient, ReplaceOne
from toolz import keyfilter
from tqdm.notebook import tqdm

In [2]:
# Record the wall-clock start time; the last cell subtracts it from `toc` to report total runtime.
tic = time()

In [3]:
# Connect to MongoDB. Host and password are read from the environment so that no
# secret is stored in the notebook. The username can be overridden through
# NMDC_MONGO_RW_USERNAME and falls back to the original account name, so the
# notebook is runnable by other users without editing this cell.
# NOTE(review): os.getenv returns None for an unset variable; confirm that a
# missing NMDC_MONGO_HOST / NMDC_MONGO_RW_PWD should not instead fail fast here.
client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username=os.getenv("NMDC_MONGO_RW_USERNAME", "dwinston_rw"),
    password=os.getenv("NMDC_MONGO_RW_PWD"),
)

# The target database is deliberately hard-coded: reset_database() drops
# collections in it, so it must never be pointed at shared data by accident.
dbname = "dwinston_scratch"
db = client[dbname]

# Location of the JSON Schema file, relative to this notebook's working directory.
nmdc_schema_json_path = "../../schema/nmdc.schema.json"

In [4]:
# Load the database-level schema from the JSON file at nmdc_schema_json_path.
dbschema = dbschema_from_file(nmdc_schema_json_path)
# Per-collection schemas, indexed by collection name; reset_database() installs
# each one as that collection's $jsonSchema validator.
collschemas = collschemas_for(dbschema)

def reset_database(db):
    """Drop and recreate every schema-defined collection in ``db``.

    For each collection name in the module-level ``collschemas``, the
    collection is dropped (discarding its documents), recreated with its
    schema installed as a ``$jsonSchema`` validator, and given a unique
    index on the ``id`` field.
    """
    for name in collschemas:
        db.drop_collection(name)
        validator = {"$jsonSchema": collschemas[name]}
        db.create_collection(name, validator=validator)
        recreated = db[name]
        recreated.create_index("id", unique=True)

# Destructive: drops and recreates every collection named in collschemas before loading.
reset_database(db)
# Show the collections that now exist in the database.
db.list_collection_names()

['genome_feature_set',
 'mags_activity_set',
 'data_object_set',
 'biosample_set',
 'functional_annotation_set',
 'study_set',
 'activity_set',
 'omics_processing_set']

In [5]:
def jsonschema_for(collection_name=None):
    """Return the schema definition for the documents of one collection.

    ``collection_name`` must be a top-level property of ``dbschema``. That
    property's ``items.$ref`` names an entry in ``dbschema["definitions"]``;
    the entry is what gets returned.

    Raises:
        ValueError: If ``collection_name`` is not a property of ``dbschema``.
    """
    collections = dbschema["properties"]
    if collection_name not in set(collections):
        raise ValueError(f'collection_name must be one of {set(dbschema["properties"])}')
    item_ref = collections[collection_name]["items"]["$ref"]
    # The definition name is the final path segment of the "$ref" pointer.
    _, _, definition_name = item_ref.rpartition("/")
    return dbschema["definitions"][definition_name]

def validator_for(collection):
    """Return the ``$jsonSchema`` document from a collection's validator option.

    ``collection`` must expose ``options()`` returning a mapping with a
    ``"validator"`` entry that contains ``"$jsonSchema"``; a missing entry
    raises ``KeyError``.
    """
    collection_options = collection.options()
    validator = collection_options['validator']
    return validator['$jsonSchema']

def pick(whitelist, d):
    """Filter mapping ``d`` down to the entries whose key is in ``whitelist``.

    Filtering is delegated to ``toolz.keyfilter``; membership is tested with
    ``key in whitelist``.
    """
    def is_whitelisted(key):
        return key in whitelist

    return keyfilter(is_whitelisted, d)

def conform(doc, collection_name=None):
    """Provide limited, conservative conformance of a document to its schema.

    - If the schema sets ``additionalProperties`` to False, omit any fields
      the schema does not declare.
    - If a field must be a list of strings and a lone string is supplied,
      wrap it in a list.

    The supplied ``doc`` is never modified; a new dict is always returned.

    Args:
        doc: The document (a dict) to conform.
        collection_name: Name of the collection whose item schema applies;
            must be a top-level property of ``dbschema``.

    Returns:
        A new dict containing the conformed document.

    Raises:
        ValueError: If ``collection_name`` is not a property of ``dbschema``.
    """
    # jsonschema_for performs the collection_name check and the definition lookup.
    schema = jsonschema_for(collection_name)
    # A definition is not required to declare "properties" (e.g. one built only
    # from combinators), so default to none rather than raising KeyError.
    properties = schema.get("properties", {})

    if schema.get("additionalProperties") is False:
        conformed = pick(list(properties), doc)
    else:
        # Shallow-copy so that wrapping values below cannot alter the caller's dict.
        conformed = dict(doc)

    for key, value in list(conformed.items()):
        if not isinstance(value, str):
            continue
        field_schema = properties.get(key)
        if not isinstance(field_schema, dict) or field_schema.get("type") != "array":
            continue
        # "items" may be absent, a list, or lack "type" (for instance a "$ref");
        # only wrap when it explicitly declares string elements.
        item_schema = field_schema.get("items")
        if isinstance(item_schema, dict) and item_schema.get("type") == "string":
            conformed[key] = [value]
    return conformed

def validate(doc, collection_name=None, conform_doc=False):
    """Validate a document as a member of a collection in ``dbschema``.

    Args:
        doc: The document (a dict) to validate.
        collection_name: Name of the collection the document belongs to; must
            be a top-level property of ``dbschema``.
        conform_doc: When true, pass the document through ``conform`` first and
            validate (and return) the conformed version.

    Returns:
        The validated document (the conformed one when ``conform_doc`` is true).

    Raises:
        ValueError: If ``collection_name`` is not a property of ``dbschema``.
        jsonschema.ValidationError: If the document violates the schema.
    """
    if collection_name not in set(dbschema["properties"]):
        raise ValueError(f'collection_name must be one of {set(dbschema["properties"])}')
    if conform_doc:
        doc = conform(doc, collection_name=collection_name)
    # Validate the document in its place inside the database schema rather than
    # as the root instance. The root schema's properties are collection names,
    # so a lone document checked against it never has its own fields inspected.
    # Wrapping it as {collection_name: [doc]} applies that collection's item
    # schema while keeping "#/definitions/..." references resolvable.
    jsonschema.validate({collection_name: [doc]}, schema=dbschema)
    return doc
    
def fetch_json(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the response body parsed as JSON.

    Args:
        url: Address of the JSON resource.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up, so a stalled server cannot hang the notebook.

    Returns:
        The decoded JSON value (typically a list or dict).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as data.
    response.raise_for_status()
    return response.json()
    
def fetch_and_validate_json(resource, collection_name=None, conform_doc=False):
    """Validate every document in a JSON payload and return them as a flat list.

    ``resource`` is either a URL string (fetched with ``fetch_json``) or an
    already-loaded payload. Three payload shapes are accepted:

    - a list: each element is validated against ``collection_name``;
    - a dict sharing at least one key with ``dbschema["properties"]``: every
      key is treated as a collection name and its value as that collection's
      documents;
    - any other dict: validated as a single ``collection_name`` document.

    NOTE(review): for the multi-collection shape the result is still one flat
    list, so which collection each document came from is not preserved.

    Raises:
        ValueError: If the payload is neither a list nor a dict (``validate``
            raises its own errors for bad collection names or documents).
    """
    if isinstance(resource, str):
        payload = fetch_json(resource)
    else:
        payload = resource

    def checked(doc, target):
        return validate(doc, collection_name=target, conform_doc=conform_doc)

    if isinstance(payload, list):
        return [checked(doc, collection_name) for doc in tqdm(payload)]

    if not isinstance(payload, dict):
        raise ValueError("Fetched JSON must be a JSON array or object")

    if set(payload) & set(dbschema["properties"]):
        validated = []
        for target, docs in payload.items():
            validated.extend(checked(doc, target) for doc in tqdm(docs, desc=target))
        return validated

    return [checked(payload, collection_name)]

def add_to_db(validated, db, collection_name=None):
    """Upsert validated documents into MongoDB, matching on each document's ``id``.

    Args:
        validated: A list of documents, a dict mapping collection names to
            lists of documents, or a single document dict.
        db: Database object indexable by collection name.
        collection_name: Target collection for the list and single-document
            forms; must be a top-level property of ``dbschema``.

    Raises:
        ValueError: If ``collection_name`` is not a property of ``dbschema``,
            or ``validated`` is neither a list nor a dict.
    """
    if collection_name not in set(dbschema["properties"]):
        raise ValueError(f'collection_name must be one of {set(dbschema["properties"])}')

    def upsert_all(target, docs):
        """Replace-or-insert each of ``docs`` in collection ``target``."""
        operations = [ReplaceOne({"id": doc["id"]}, doc, upsert=True) for doc in docs]
        # bulk_write rejects an empty operation list with InvalidOperation,
        # so an empty batch is treated as a no-op.
        if operations:
            db[target].bulk_write(operations)

    if isinstance(validated, list):
        upsert_all(collection_name, validated)
    elif isinstance(validated, dict):
        if set(validated) & set(dbschema["properties"]):
            # Keys are collection names; each value is that collection's documents.
            for target, docs in validated.items():
                upsert_all(target, docs)
        else:
            # A plain dict is a single document for collection_name.
            upsert_all(collection_name, [validated])
    else:
        raise ValueError("payload must be a list or dict")


In [6]:
# Resources to load, in order: each entry gives the URL of a JSON file and the
# collection ("type") its documents belong to. All files share one base URL, so
# only the file name and target collection are listed per entry.
to_fetch = [
    {"url": "https://portal.nersc.gov/cfs/m3408/meta/" + filename, "type": collection}
    for filename, collection in (
        ("img_mg_annotation_objects.json", "activity_set"),
        ("img_mg_annotation_data_objects.json", "data_object_set"),
        ("mt_annotation_objects.json", "activity_set"),
        ("mt_annotation_data_objects.json", "data_object_set"),
        ("ReadbasedAnalysis_activity.json", "activity_set"),
        ("ReadbasedAnalysis_data_objects.json", "data_object_set"),
        ("MAGs_activity.json", "mags_activity_set"),
        ("MAGs_data_objects.json", "data_object_set"),
    )
]

In [7]:
# Load the listed resources one after another: fetch, conform and validate, then upsert.
# There is no error handling here, so the first failing resource stops the cell.
for spec in to_fetch:
    url, collection_name = spec["url"], spec["type"]
    print(f"fetching {url} ({collection_name})")
    payload = fetch_and_validate_json(url, collection_name=collection_name, conform_doc=True)
    add_to_db(payload, db, collection_name=collection_name)

fetching https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json (activity_set)


  0%|          | 0/114 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json (data_object_set)


  0%|          | 0/570 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json (activity_set)


  0%|          | 0/42 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json (data_object_set)


  0%|          | 0/210 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json (activity_set)


  0%|          | 0/274 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json (data_object_set)


  0%|          | 0/2740 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json (mags_activity_set)


  0%|          | 0/114 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json (data_object_set)


  0%|          | 0/2443 [00:00<?, ?it/s]

In [8]:
# Fetch a manifest (a JSON list of URLs), then fetch and validate the listed
# resources concurrently and upsert each one into "activity_set".
# `tqdm` comes from the import cell at the top of the notebook.

# (url, error message) for every resource whose fetch or validation failed.
error_urls = []

collection_name = "activity_set"
url_manifest = "https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_metadata_products.json"
urls = fetch_json(url_manifest)

# Both resources are context-managed so that the worker threads are joined and
# the progress bar is closed even if a database write raises part-way through.
with tqdm(total=len(urls)) as pbar, concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_url = {
        executor.submit(fetch_and_validate_json, url, collection_name, conform_doc=True): url
        for url in urls
    }
    for future in concurrent.futures.as_completed(future_to_url):
        pbar.update(1)
        url = future_to_url[future]
        try:
            payload = future.result()
        except Exception as e:
            # Record the failure and keep going; failures are summarised below.
            error_urls.append((url, str(e)))
        else:
            # Database writes happen here on the main thread, one payload at a time.
            add_to_db(payload, db, collection_name)

# Number of resources that failed; inspect error_urls for details.
len(error_urls)

  0%|          | 0/209 [00:00<?, ?it/s]

0

In [9]:
# Take the end timestamp and report the wall-clock seconds elapsed since `tic` was recorded.
toc = time()

print(f"{toc - tic} seconds")

80.27096390724182 seconds
