In [58]:
import concurrent
import os
from pathlib import Path
from time import time

import jsonschema
import requests
from mongospawn.schema import dbschema_from_file, collschemas_for
from pymongo import MongoClient, ReplaceOne
from toolz import keyfilter
from tqdm.notebook import tqdm

from collections import defaultdict
import csv
from datetime import datetime
from functools import partial, reduce
import json
import os
import re
from pprint import pprint
from zipfile import ZipFile

from dictdiffer import diff
from pymongo import DeleteMany, DeleteOne, InsertOne, MongoClient, ReplaceOne, UpdateOne
from toolz import assoc_in, compose, concat, dissoc, keyfilter, get_in, merge, merge_with
from tqdm.notebook import tqdm


def reset_database(db):
    """Drop and recreate every collection named in ``collschemas``.

    Each collection is recreated with its schema from ``collschemas``
    installed as a ``$jsonSchema`` validator, plus a unique index on ``id``.
    Any documents already stored in those collections are lost.
    """
    for name in collschemas:
        schema = collschemas[name]
        db.drop_collection(name)
        db.create_collection(name, validator={"$jsonSchema": schema})
        db[name].create_index("id", unique=True)


def jsonschema_for(collection_name=None):
    """Return the schema definition for items of ``collection_name``.

    The definition is found by following the ``items.$ref`` of the
    matching top-level property in ``dbschema``.

    Raises:
        ValueError: if ``collection_name`` is not a top-level property.
    """
    known = set(dbschema["properties"])
    if collection_name not in known:
        raise ValueError(f'collection_name must be one of {known}')
    ref = dbschema["properties"][collection_name]["items"]["$ref"]
    definition_name = ref.split("/")[-1]
    return dbschema["definitions"][definition_name]


def validator_for(collection):
    """Return the ``$jsonSchema`` validator reported by ``collection.options()``."""
    options = collection.options()
    validator = options["validator"]
    return validator["$jsonSchema"]


def pick(whitelist, d):
    """Return a copy of ``d`` restricted to the keys found in ``whitelist``."""

    def is_allowed(key):
        return key in whitelist

    return keyfilter(is_allowed, d)


def conform(doc, collection_name=None):
    """Provide limited, conservative conformance on a document.

    - If additionalProperties is False, omit any properties the schema does
      not declare.
    - If a field must be a list of strings, and a lone string is supplied,
      wrap it in a list.

    The input ``doc`` is never modified; a new dict is always returned.

    Args:
        doc: the document (a mapping) to conform.
        collection_name: top-level set property of ``dbschema`` whose item
            schema ``doc`` should conform to.

    Returns:
        A new dict holding the conformed document.

    Raises:
        ValueError: if ``collection_name`` is not a top-level property of
            ``dbschema`` (raised by ``jsonschema_for``).
    """
    schema = jsonschema_for(collection_name)
    properties = schema.get("properties", {})

    if schema.get("additionalProperties") is False:
        kept = pick(properties, doc)
    else:
        kept = doc

    conformed = {}
    for key, value in kept.items():
        spec = properties.get(key, {})
        # `items` may be a `$ref` (no "type" key) or absent altogether, so
        # look it up defensively instead of indexing.
        items = spec.get("items")
        wrap = (
            isinstance(value, str)
            and spec.get("type") == "array"
            and isinstance(items, dict)
            and items.get("type") == "string"
        )
        conformed[key] = [value] if wrap else value
    return conformed


def validate(doc, collection_name=None, conform_doc=False):
    """Validate ``doc`` as a member of ``collection_name`` against ``dbschema``.

    When ``conform_doc`` is true the document is first passed through
    ``conform``. The (possibly conformed) document is returned.

    Raises:
        ValueError: if ``collection_name`` is not a top-level property of
            ``dbschema``.
    """
    allowed = set(dbschema["properties"])
    if collection_name not in allowed:
        raise ValueError(f'collection_name must be one of {allowed}')
    candidate = conform(doc, collection_name=collection_name) if conform_doc else doc
    # Validate as a one-element set so the whole database schema applies.
    jsonschema.validate({collection_name: [candidate]}, schema=dbschema)
    return candidate


def fetch_json(url, timeout=60):
    """GET ``url`` and return its body parsed as JSON.

    Args:
        url: the resource to fetch.
        timeout: seconds to wait for the server to connect / send data
            before giving up, passed straight to ``requests.get``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never mistaken for a payload.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def fetch_and_validate_json(resource, collection_name=None, conform_doc=False):
    """Validate a JSON resource, given as a URL or as pre-fetched data.

    A string ``resource`` is fetched with ``fetch_json``; a list or dict is
    used as-is. Three payload shapes are handled:

    - a list: each element is validated against ``collection_name``;
    - a dict sharing at least one key with ``dbschema["properties"]``: every
      key is treated as a collection name and each of its docs is validated
      against that collection;
    - any other dict: validated as a single doc of ``collection_name``.

    Returns the list of validated docs.
    """
    if isinstance(resource, str):
        payload = fetch_json(resource)
    else:
        payload = resource

    if isinstance(payload, list):
        return [
            validate(item, collection_name=collection_name, conform_doc=conform_doc)
            for item in tqdm(payload)
        ]

    if not isinstance(payload, dict):
        raise ValueError("Fetched JSON must be a JSON array or object")

    if not (set(payload) & set(dbschema["properties"])):
        # A lone document rather than a {set_name: [docs]} mapping.
        single = validate(
            payload, collection_name=collection_name, conform_doc=conform_doc
        )
        return [single]

    results = []
    for set_name, members in payload.items():
        for item in tqdm(members, desc=set_name):
            checked = validate(item, collection_name=set_name, conform_doc=conform_doc)
            results.append(checked)
    return results


def _upsert_by_id(collection, docs):
    """Upsert each of ``docs`` into ``collection``, matching on ``id``."""
    operations = [ReplaceOne({"id": d["id"]}, d, upsert=True) for d in docs]
    # bulk_write rejects an empty operation list, so skip the call instead
    # of failing when there is nothing to write.
    if operations:
        collection.bulk_write(operations)


def add_to_db(validated, db, collection_name=None):
    """Upsert validated documents into ``db``, keyed on their ``id`` field.

    Args:
        validated: one of
            - a list of docs, written to ``collection_name``;
            - a dict sharing at least one key with ``dbschema["properties"]``,
              treated as ``{collection_name: [docs]}`` and written per key;
            - any other dict, written as a single doc to ``collection_name``.
        db: database object indexable by collection name.
        collection_name: target collection; must be a top-level property of
            ``dbschema`` even when ``validated`` names its own collections.

    Raises:
        ValueError: if ``collection_name`` is unknown, or ``validated`` is
            neither a list nor a dict.
    """
    allowed = set(dbschema["properties"])
    if collection_name not in allowed:
        raise ValueError(f'collection_name must be one of {allowed}')

    if isinstance(validated, list):
        _upsert_by_id(db[collection_name], validated)
    elif isinstance(validated, dict):
        if set(validated) & allowed:
            for set_name, docs in validated.items():
                _upsert_by_id(db[set_name], docs)
        else:
            _upsert_by_id(db[collection_name], [validated])
    else:
        raise ValueError("payload must be a list or dict")


def fetch_conform_and_persist(spec, db):
    """Fetch ``spec["url"]``, conform and validate it, then upsert into ``db``.

    ``spec["type"]`` names the collection the documents belong to.
    """
    url, collection_name = spec["url"], spec["type"]
    print(f"fetching {url} ({collection_name})")
    add_to_db(
        fetch_and_validate_json(url, collection_name, conform_doc=True),
        db,
        collection_name,
    )


def fetch_conform_and_persist_from_manifest(spec, db):
    """Fetch every URL listed in a manifest concurrently and persist the docs.

    The manifest at ``spec["url_manifest"]`` is fetched with ``fetch_json``
    and iterated as a collection of URLs. Each URL is fetched, conformed and
    validated on a thread pool; each successful result is upserted into
    ``db`` under ``spec["type"]`` as soon as it completes.

    Args:
        spec: dict with keys ``"url_manifest"`` and ``"type"``.
        db: database object passed through to ``add_to_db``.

    Returns:
        A list of ``(url, error_message)`` tuples for URLs whose fetch or
        validation raised. Errors raised by ``add_to_db`` are not captured
        and propagate to the caller.
    """
    # `import concurrent` alone does not load the `futures` submodule, so
    # import it explicitly here rather than relying on another package
    # having loaded it already.
    import concurrent.futures

    url_manifest = spec["url_manifest"]
    collection_name = spec["type"]
    urls = fetch_json(url_manifest)
    error_urls = []

    # Context managers ensure the progress bar is closed and the pool is
    # shut down even if persisting a payload raises.
    with tqdm(total=len(urls)) as pbar:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_url = {
                executor.submit(
                    fetch_and_validate_json, url, collection_name, conform_doc=True
                ): url
                for url in urls
            }
            for future in concurrent.futures.as_completed(future_to_url):
                pbar.update(1)
                url = future_to_url[future]
                try:
                    payload = future.result()
                except Exception as e:
                    # Best effort: record the failure and keep going.
                    error_urls.append((url, str(e)))
                else:
                    add_to_db(payload, db, collection_name)

    return error_urls

In [2]:
from itertools import tee

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    leading, trailing = tee(iterable, 2)
    # Advance the second copy by one item; an empty input is fine.
    try:
        next(trailing)
    except StopIteration:
        pass
    return zip(leading, trailing)

def coalesce_acronyms(set_name):
    """Merge runs of single-character parts of a snake_case name.

    The name is split on ``"_"``; adjacent one-character parts are joined
    without a separator (so ``"q_c"`` becomes ``"qc"``) and all remaining
    parts are re-joined with ``"_"``.

    Every part of the input is kept: a name without a trailing ``"_set"``
    no longer loses its final part, and a ``"set"`` part in the middle of
    the name is not duplicated.

    Args:
        set_name: a snake_case name, e.g. ``"read_q_c_analysis_activity_set"``.

    Returns:
        The coalesced name, e.g. ``"read_qc_analysis_activity_set"``.
    """
    merged = []
    previous_was_single = False
    for part in set_name.split("_"):
        is_single = len(part) == 1
        if is_single and previous_was_single:
            # Continue the current acronym instead of starting a new part.
            merged[-1] += part
        else:
            merged.append(part)
        previous_was_single = is_single
    return "_".join(merged)

def snake_case_set_name(object_name):
    """Derive a snake_case ``*_set`` name from a CamelCase object name.

    An underscore is inserted before every capital letter except one at the
    very start, the result is lower-cased and suffixed with ``_set``, and
    the name is then passed through ``coalesce_acronyms``.
    """
    underscored = re.sub(r'(?<!^)(?=[A-Z])', '_', object_name)
    candidate = underscored.lower() + "_set"
    return coalesce_acronyms(candidate)

In [3]:
# Load the database schema from schema/nmdc.schema.json, located two
# directories above the current working directory.
nmdc_schema_json_path = str(
    Path.cwd().parent.parent.joinpath("schema", "nmdc.schema.json")
)
dbschema = dbschema_from_file(nmdc_schema_json_path)

###########################
# Adjustments for GSP below
###########################

defined_object_names = set(dbschema["definitions"])

# Map each definition name (the tail of a set property's `items.$ref`) to the
# top-level set property whose items reference it.
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

existing_set_names = set(dbschema["properties"])

# For every definition that no set property references, add an array-typed
# `<snake_case_name>_set` property, unless a property of that name exists.
for object_without_set in (defined_object_names - set(set_for_object_name.keys())):
    proposed_set_name = snake_case_set_name(object_without_set)
    if proposed_set_name not in existing_set_names:
        dbschema["properties"][proposed_set_name] = {
            "description": (f"This property links a database object to the set of"
                            f" {object_without_set} objects within it."),
            "items": {"$ref": f"#/definitions/{object_without_set}"},
            "type": "array",
        }

# Remove the `additionalProperties` key from these definitions. With the key
# gone, `conform` no longer strips undeclared fields from such documents.
# NOTE(review): presumably done so documents carrying extra fields still
# validate — confirm.
del dbschema["definitions"]["OmicsProcessing"]["additionalProperties"]
del dbschema["definitions"]["Biosample"]["additionalProperties"]
del dbschema["definitions"]["ReadQCAnalysisActivity"]["additionalProperties"]
del dbschema["definitions"]["MetaproteomicsAnalysisActivity"]["additionalProperties"]
# Declare ControlledTermValue.term as a plain string and drop its `$ref`.
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
del dbschema["definitions"]["ControlledTermValue"]["properties"]["term"]["$ref"]
# Declare MetagenomeAssembly.scaf_l_gt50k as a number.
dbschema = assoc_in(dbschema, ["definitions", "MetagenomeAssembly", "properties", "scaf_l_gt50k", "type"], "number")

In [4]:
# Derive the per-collection schemas from the adjusted database schema; the
# result is indexed by collection name in reset_database and
# reset_database_schema.
collschemas = collschemas_for(dbschema)

# Reconstruct
# (rebuilt from the current `dbschema["properties"]`, so it reflects any set
# properties added since the mapping was first computed)
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

In [5]:
# Connect with the read/write account. Host and password come from the
# NMDC_MONGO_HOST and NMDC_MONGO_RW_PWD environment variables; the username
# is hard-coded.
client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="dwinston_rw",
    password=os.getenv("NMDC_MONGO_RW_PWD")
)

# Working database used by the loading cells below.
dbname = "dwinston_dev"
db = client[dbname]

In [6]:
# Destructive: drops and recreates every collection named in `collschemas`
# inside `db`, then lists the resulting collection names.
reset_database(db)
sorted(db.list_collection_names())

['activity_set',
 'agent_set',
 'attribute_value_set',
 'biosample_processing_set',
 'biosample_set',
 'boolean_value_set',
 'chemical_entity_set',
 'controlled_term_value_set',
 'data_object_set',
 'database_set',
 'environmental_material_term_set',
 'functional_annotation_set',
 'gene_product_set',
 'genome_feature_set',
 'geolocation_value_set',
 'instrument_set',
 'integer_value_set',
 'mag_bin_set',
 'mags_activity_set',
 'metabolite_quantification_set',
 'metabolomics_analysis_activity_set',
 'metagenome_annotation_activity_set',
 'metagenome_assembly_set',
 'metaproteomics_analysis_activity_set',
 'nom_analysis_activity_set',
 'omics_processing_set',
 'ontology_class_set',
 'orthology_group_set',
 'pathway_set',
 'peptide_quantification_set',
 'person_set',
 'person_value_set',
 'protein_quantification_set',
 'quantity_value_set',
 'reaction_participant_set',
 'reaction_set',
 'read_QC_analysis_activity_set',
 'read_based_analysis_activity_set',
 'study_set',
 'text_value_set',


In [9]:
# Resources to load: each entry gives the URL of a JSON resource and, under
# "type", the name of the collection its documents are validated against and
# stored in.
to_fetch = [{
    "url": "https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json",
    "type": "metagenome_annotation_activity_set",
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json",
    "type": "data_object_set",
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json",
    "type": "metagenome_annotation_activity_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json",
    "type": "data_object_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json",
    "type": "read_based_analysis_activity_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json",
    "type": "data_object_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json",
    "type": "mags_activity_set",
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json",
    "type": "data_object_set"
}, {
    "url": "https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json",
    "type": "data_object_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json",
    "type": "metaproteomics_analysis_activity_set",
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json",
    "type": "data_object_set"
}, {
    "url": "https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_data_products.json",
    "type": "data_object_set"
}]

In [47]:
from toolz import identity

# Field renames applied to each peptide quantification by correct_metaP_doc:
# source field name -> (new field name, converter applied to the value).
metaP_field_map = {
    'PeptideSequence': ('peptide_sequence', identity),
    'sum(MASICAbundance)': ('peptide_sum_masic_abundance', int),
    'SpectralCount': ('peptide_spectral_count', int),
    'BestProtein': ('best_protein', identity),
    'min(QValue)': ('min_q_value', float),
}

def map_fields(doc, field_map=None):
    """Return a copy of ``doc`` with fields renamed and converted.

    Args:
        doc: the source mapping; it is not modified.
        field_map: mapping of ``old_key -> (new_key, fn)``. For every
            ``old_key`` present in ``doc`` the value is passed through
            ``fn`` and stored under ``new_key``, and ``old_key`` is removed.
            ``None`` (the default) means "no renames".

    Returns:
        A new dict with the renames applied.
    """
    result = dict(doc)
    if not field_map:
        return result
    for old_key, (new_key, convert) in field_map.items():
        if old_key in result:
            # Pop before assigning so a mapping onto the same key keeps the
            # converted value instead of deleting it.
            result[new_key] = convert(result.pop(old_key))
    return result

def correct_metaP_doc(doc):
    """Rename the fields of each peptide quantification in ``doc``.

    Documents without a ``has_peptide_quantifications`` key are returned
    unchanged; otherwise every item under that key is passed through
    ``map_fields`` with ``metaP_field_map`` and a new document is returned.
    """
    key = "has_peptide_quantifications"
    if key in doc:
        renamed = [map_fields(entry, metaP_field_map) for entry in doc[key]]
        return assoc_in(doc, [key], renamed)
    return doc

In [50]:
# For each resource: fetch it, wrap a lone JSON object in a list, rename the
# metaproteomics peptide fields, validate (without conforming) and upsert.
# NOTE(review): the enumerate index `i` is not used in the loop body.
for i, spec in enumerate(to_fetch):
    url = spec["url"]
    collection_name = spec["type"]
    print(f"fetching {url} ({collection_name})")
    docs = fetch_json(url)
    if not isinstance(docs, list):
        docs = [docs]
    docs = [correct_metaP_doc(d) for d in docs]
    # `docs` is already fetched, so this call only validates it.
    payload = fetch_and_validate_json(docs, collection_name, conform_doc=False)
    add_to_db(payload, db, collection_name)

fetching https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json (metagenome_annotation_activity_set)


  0%|          | 0/114 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json (data_object_set)


  0%|          | 0/570 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json (metagenome_annotation_activity_set)


  0%|          | 0/42 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json (data_object_set)


  0%|          | 0/210 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json (read_based_analysis_activity_set)


  0%|          | 0/274 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json (data_object_set)


  0%|          | 0/2740 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json (mags_activity_set)


  0%|          | 0/114 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json (data_object_set)


  0%|          | 0/2443 [00:00<?, ?it/s]

fetching https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json (data_object_set)


  0%|          | 0/209 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json (metaproteomics_analysis_activity_set)


  0%|          | 0/1 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json (data_object_set)


  0%|          | 0/4 [00:00<?, ?it/s]

fetching https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_data_products.json (data_object_set)


  0%|          | 0/788 [00:00<?, ?it/s]

In [51]:
from tqdm.notebook import tqdm

# Load every resource listed in the GC-MS metabolomics manifest into
# `metabolomics_analysis_activity_set`, then show how many URLs failed.
error_urls = fetch_conform_and_persist_from_manifest({
    "url_manifest": ("https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/"
                     "gcms_metabolomics_metadata_products.json"),
    "type": "metabolomics_analysis_activity_set"
}, db)
len(error_urls)

  0%|          | 0/209 [00:00<?, ?it/s]

0

In [59]:
from tqdm.notebook import tqdm

# Load every resource listed in the FT-MS NOM manifest into
# `nom_analysis_activity_set`, then show how many URLs failed.
error_urls = fetch_conform_and_persist_from_manifest({
    "url_manifest": ("https://nmdcdemo.emsl.pnnl.gov/nom/registration/"
                     "ftms_nom_metadata_products.json"),
    "type": "nom_analysis_activity_set"
}, db)
len(error_urls)

  0%|          | 0/788 [00:00<?, ?it/s]

0

In [61]:
# Select the collections in `db` that hold more than one document.
# NOTE(review): `> 1` skips collections with exactly one document —
# `metaproteomics_analysis_activity_set` appears to have received a single
# document above and is absent from the result. Confirm `> 0` was not intended.
target_collection_names = [
    name for name in db.list_collection_names()
    if db[name].count_documents({}) > 1
]

In [62]:
# Display the selected collection names.
target_collection_names


['metabolomics_analysis_activity_set',
 'read_based_analysis_activity_set',
 'data_object_set',
 'metagenome_annotation_activity_set',
 'nom_analysis_activity_set',
 'mags_activity_set']

In [63]:
# Second connection using the admin account (password read from
# NMDC_MONGO_ADMIN_PWD); used below to create collections and modify
# validators in the shared database.
admin_client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="nmdc-admin",
    password=os.getenv("NMDC_MONGO_ADMIN_PWD")
)
admin_dwinston_share = admin_client["dwinston_share"]

In [64]:
def reset_database_schema(db):
    """Install the current validators for ``target_collection_names`` in ``db``.

    A target collection that does not exist yet is created with its
    ``$jsonSchema`` validator and a unique index on ``id``. An existing one
    only has its validator replaced via ``collMod``. Despite the name, no
    collection is dropped and no document is removed.

    Args:
        db: database object in which to create or modify the collections.
    """
    # List the collections once up front instead of once per target name.
    existing = set(db.list_collection_names())
    for coll_name in target_collection_names:
        validator = {"$jsonSchema": collschemas[coll_name]}
        if coll_name in existing:
            print("updating", coll_name)
            db.command("collMod", coll_name, validator=validator)
        else:
            print("creating", coll_name)
            db.create_collection(coll_name, validator=validator)
            db[coll_name].create_index("id", unique=True)
            # Keep the snapshot current so a repeated name gets updated
            # rather than created twice.
            existing.add(coll_name)

In [65]:
# Create or update the target collections in the shared database, using the
# admin connection.
reset_database_schema(admin_dwinston_share)

creating metabolomics_analysis_activity_set
creating read_based_analysis_activity_set
updating data_object_set
updating metagenome_annotation_activity_set
creating nom_analysis_activity_set
updating mags_activity_set


In [67]:
# Copy every target collection from `db` into `dwinston_share` over the
# read/write connection. The `_id` field is dropped from each document
# because add_to_db upserts by the `id` field instead. Each collection is
# read fully into memory before it is written.
db_share = client["dwinston_share"]
for name in target_collection_names:
    docs = [dissoc(d, "_id") for d in db[name].find()]
    print(name)
    add_to_db(docs, db_share, collection_name=name)

metabolomics_analysis_activity_set
read_based_analysis_activity_set
data_object_set
metagenome_annotation_activity_set
nom_analysis_activity_set
mags_activity_set


MetaG annotations (`/global/project/projectdirs/m3408/www/meta/anno2/*_annotations.json`) comprise 155 JSON files totalling ~83GB. To load them into MongoDB, I took these steps:
1. Set up a Globus transfer from the NERSC DTN to a Globus Connect Personal endpoint on my laptop. I could instead have copied the files directly, e.g.
```
$ scp dtn01.nersc.gov:/global/project/projectdirs/m3408/www/meta/anno2/*_annotations.json .
```
but I chose to use Globus, and it works well.
2. I have a bash script that uses GNU sed (`gsed`) to trim each JSON file down to a plain JSON array, as expected by `mongoimport --jsonArray`:

```bash
# trim.sh

task(){
    echo $datafile
    gsed -e '1,2d' -e '$d' -e '3i\[' $datafile > anno2/$(basename $datafile)
}

for datafile in ~/globus-nersc/nmdc/m3408/www/meta/anno2/*_annotations.json; do
    task $datafile &
done
```
I use `ps aux | grep gsed | wc -l` to monitor the progress of the parallel sed tasks. I found that trying to do this head/tail file trimming by `json.load`ing the files in Python and resaving was quite slow because the JSON files are individually quite large.
3. I have a bash script that `mongoimport`s each JSON array file into the database:
```bash
# mongoimport.sh

n=$(ls anno2/*_annotations.json | wc -l | xargs) # `| xargs` to trim whitespace
i=1
for datafile in anno2/*_annotations.json; do
    echo "($i of $n): $datafile"
    mongoimport --uri "mongodb://<user>:<pwd>@<host>/?authSource=admin" \
        --jsonArray -d dwinston_share -c raw.functional_annotation_set \
        -j 8 $datafile
    i=$((i+1))
done
```
specifying multiple (8 in this case) insertion workers per import.

In [None]:
# Report the elapsed wall-clock time since `tic`.
# NOTE(review): `tic` is not assigned anywhere in the visible cells; it must
# be set (e.g. `tic = time()`) before this cell runs, or the print below
# raises NameError.
toc = time()

print(f"{toc - tic} seconds")