In [None]:
import concurrent
import os
from pathlib import Path
from time import time

import jsonschema
import requests
from mongospawn.schema import dbschema_from_file, collschemas_for
from pymongo import MongoClient, ReplaceOne
from toolz import keyfilter
from tqdm.notebook import tqdm

# The NMDC JSON Schema file is expected in a "schema" directory two levels
# above the current working directory. The database-level schema and the
# per-collection schemas used throughout this notebook are derived from it.
nmdc_schema_json_path = str(Path.cwd().parent.parent / "schema" / "nmdc.schema.json")
dbschema = dbschema_from_file(nmdc_schema_json_path)
collschemas = collschemas_for(dbschema)


def reset_database(db):
    """Drop and recreate every collection named in ``collschemas``.

    Each collection is dropped, recreated with a ``$jsonSchema`` validator
    taken from its ``collschemas`` entry, and given a unique index on ``id``.
    Any documents already stored in those collections are lost.
    """
    for name in collschemas:
        validator_doc = {"$jsonSchema": collschemas[name]}
        db.drop_collection(name)
        db.create_collection(name, validator=validator_doc)
        recreated = db[name]
        recreated.create_index("id", unique=True)


def jsonschema_for(collection_name=None):
    """Return the schema definition for the documents of one collection.

    The collection's entry in ``dbschema["properties"]`` points (via
    ``items.$ref``) at a named definition; that definition is looked up in
    ``dbschema["definitions"]`` and returned.

    Raises:
        ValueError: if ``collection_name`` is not a property of ``dbschema``.
    """
    known_collections = set(dbschema["properties"])
    if collection_name not in known_collections:
        raise ValueError(f'collection_name must be one of {known_collections}')
    item_ref = dbschema["properties"][collection_name]["items"]["$ref"]
    # Only the last path segment of the reference names the definition.
    definition_name = item_ref.rpartition("/")[2]
    return dbschema["definitions"][definition_name]


def validator_for(collection):
    """Return the ``$jsonSchema`` document from a collection's validator option."""
    collection_options = collection.options()
    validator = collection_options["validator"]
    return validator["$jsonSchema"]


def pick(whitelist, d):
    """Filter mapping ``d`` down to the entries whose keys are in ``whitelist``."""

    def is_allowed(key):
        return key in whitelist

    return keyfilter(is_allowed, d)


def conform(doc, collection_name=None):
    """Provide limited, conservative conformance on a document.

    - If the collection's schema sets ``additionalProperties`` to ``False``,
      omit any supplied keys the schema does not declare.
    - If a field must be a list of strings, and a lone string is supplied,
      wrap it in a list.

    Args:
        doc: Mapping to conform. It is not modified; a new dict is returned.
        collection_name: Name of the collection whose schema applies.

    Returns:
        A new dict holding the conformed document.

    Raises:
        ValueError: if ``collection_name`` is not a known collection
            (raised by ``jsonschema_for``).
    """
    schema = jsonschema_for(collection_name)
    properties = schema.get("properties", {})

    if schema.get("additionalProperties") is False:
        conformed = pick(list(properties), doc)
    else:
        # Copy so the caller's document is untouched in this branch as well.
        conformed = dict(doc)

    for key, value in list(conformed.items()):
        if not isinstance(value, str):
            continue
        prop_schema = properties.get(key, {})
        if not isinstance(prop_schema, dict) or prop_schema.get("type") != "array":
            continue
        # `items` may be absent or carry no "type" (e.g. a "$ref"); such
        # fields are left as supplied rather than raising KeyError.
        items_schema = prop_schema.get("items")
        if isinstance(items_schema, dict) and items_schema.get("type") == "string":
            conformed[key] = [value]
    return conformed


def validate(doc, collection_name=None, conform_doc=False):
    """Validate one document as a member of ``collection_name``.

    Args:
        doc: The document (a mapping) to check.
        collection_name: Name of the collection the document belongs to; must
            be a property of ``dbschema``.
        conform_doc: When true, pass the document through ``conform`` first
            and validate (and return) the conformed result.

    Returns:
        The validated document (the conformed one when ``conform_doc`` is true).

    Raises:
        ValueError: if ``collection_name`` is not a known collection.
        jsonschema.ValidationError: if the document does not satisfy the schema.
    """
    if collection_name not in set(dbschema["properties"]):
        raise ValueError(
            f'collection_name must be one of {set(dbschema["properties"])}'
        )
    if conform_doc:
        doc = conform(doc, collection_name=collection_name)
    # `dbschema` describes a whole database: its properties are collection
    # names whose values are arrays of documents. Checking a bare document
    # against it never reaches the per-collection item schema, so wrap the
    # document as the sole member of its collection. Validating through
    # `dbschema` (rather than the extracted definition) keeps any "$ref"s
    # into `dbschema["definitions"]` resolvable.
    jsonschema.validate({collection_name: [doc]}, schema=dbschema)
    return doc


def fetch_json(url, timeout=60):
    """GET ``url`` and return its body decoded as JSON.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up; forwarded to
            ``requests.get``. Pass ``None`` to wait indefinitely.

    Returns:
        The decoded JSON value.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors directly instead of as a JSON decoding failure on
    # an error page.
    response.raise_for_status()
    return response.json()


def fetch_and_validate_json(resource, collection_name=None, conform_doc=False):
    """Validate every document in a JSON resource; return them as one flat list.

    ``resource`` is either a URL (a string, fetched with ``fetch_json``) or an
    already-loaded list or dict.

    - A list is treated as documents of ``collection_name``.
    - A dict sharing at least one key with ``dbschema["properties"]`` is
      treated as a mapping of collection name to documents; every key is
      used as the collection name for its documents, and the results are
      flattened into a single list.
    - Any other dict is treated as a single document of ``collection_name``.

    Raises:
        ValueError: if the payload is neither a list nor a dict.
    """
    if isinstance(resource, str):
        payload = fetch_json(resource)
    else:
        payload = resource

    if isinstance(payload, list):
        return [
            validate(doc, collection_name=collection_name, conform_doc=conform_doc)
            for doc in tqdm(payload)
        ]

    if isinstance(payload, dict):
        keyed_by_collection = set(payload) & set(dbschema["properties"])
        if not keyed_by_collection:
            single = validate(
                payload, collection_name=collection_name, conform_doc=conform_doc
            )
            return [single]

        results = []
        for set_name, docs in payload.items():
            for doc in tqdm(docs, desc=set_name):
                checked = validate(
                    doc, collection_name=set_name, conform_doc=conform_doc
                )
                results.append(checked)
        return results

    raise ValueError(f"Fetched JSON must be a JSON array or object")


def _upsert_by_id(collection, docs):
    """Upsert each of ``docs`` into ``collection``, matching on the ``id`` field."""
    operations = [ReplaceOne({"id": doc["id"]}, doc, upsert=True) for doc in docs]
    # bulk_write raises InvalidOperation when handed no operations, so an
    # empty batch is simply skipped.
    if operations:
        collection.bulk_write(operations)


def add_to_db(validated, db, collection_name=None):
    """Upsert validated documents into ``db``, keyed by their ``id`` field.

    Args:
        validated: One of
            - a list of documents for ``collection_name``;
            - a dict sharing at least one key with ``dbschema["properties"]``,
              treated as a mapping of collection name to documents;
            - any other dict, treated as a single document for
              ``collection_name``.
        db: Database object; collections are accessed as ``db[name]``.
        collection_name: Target collection. Must be a known collection even
            when ``validated`` carries its own collection names.

    Empty document lists are skipped without touching the database.

    Raises:
        ValueError: if ``collection_name`` is not a known collection, or if
            ``validated`` is neither a list nor a dict.
    """
    if collection_name not in set(dbschema["properties"]):
        raise ValueError(
            f'collection_name must be one of {set(dbschema["properties"])}'
        )
    if isinstance(validated, list):
        _upsert_by_id(db[collection_name], validated)
    elif isinstance(validated, dict):
        if set(validated) & set(dbschema["properties"]):
            for name, docs in validated.items():
                _upsert_by_id(db[name], docs)
        else:
            _upsert_by_id(db[collection_name], [validated])
    else:
        raise ValueError(f"payload must be a list or dict")


def fetch_conform_and_persist(spec, db):
    """Fetch one JSON resource, conform and validate it, then store it in ``db``.

    ``spec`` is a mapping with a ``"url"`` to fetch and a ``"type"`` naming
    the collection the documents belong to.
    """
    source_url, target_collection = spec["url"], spec["type"]
    print(f"fetching {source_url} ({target_collection})")
    documents = fetch_and_validate_json(
        source_url, target_collection, conform_doc=True
    )
    add_to_db(documents, db, target_collection)


def fetch_conform_and_persist_from_manifest(spec, db):
    """Fetch, conform, validate and store every resource listed in a manifest.

    ``spec["url_manifest"]`` is fetched with ``fetch_json`` and is expected to
    yield a collection of URLs; each URL is fetched and validated on a thread
    pool as documents of ``spec["type"]``. Results are written to ``db`` from
    the calling thread as each fetch completes.

    Returns:
        A list of ``(url, error message)`` pairs for the URLs whose fetch or
        validation raised. Errors raised while writing to the database are
        not collected; they propagate to the caller.
    """
    # `import concurrent` on its own does not load the `futures` submodule,
    # so import what is needed explicitly.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    error_urls = []
    url_manifest = spec["url_manifest"]
    collection_name = spec["type"]
    urls = fetch_json(url_manifest)

    # The context manager closes the progress bar even if an error propagates.
    with tqdm(total=len(urls)) as pbar, ThreadPoolExecutor() as executor:
        future_to_url = {
            executor.submit(
                fetch_and_validate_json, url, collection_name, conform_doc=True
            ): url
            for url in urls
        }
        for future in as_completed(future_to_url):
            pbar.update(1)
            url = future_to_url[future]
            try:
                payload = future.result()
            except Exception as e:
                # Record the failure and carry on with the remaining URLs.
                error_urls.append((url, str(e)))
            else:
                add_to_db(payload, db, collection_name)

    return error_urls


In [None]:
# Record the start time; the last cell subtracts it to report elapsed seconds.
tic = time()

In [None]:
# Connect to MongoDB. The host and password come from the NMDC_MONGO_HOST and
# NMDC_MONGO_RW_PWD environment variables; os.getenv yields None for either
# one that is unset, so export both before running this cell.
client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="dwinston_rw",
    password=os.getenv("NMDC_MONGO_RW_PWD")
)

# Database handle passed to the loading functions in the cells below.
dbname = "dwinston_dev"
db = client[dbname]

In [None]:
# Uncomment the next line to drop and recreate every schema collection.
# This deletes whatever those collections currently hold.
# reset_database(db)
db.list_collection_names()

In [None]:
# Resources to load. Each entry gives the "url" of a JSON file and, under
# "type", the name of the collection its documents belong to.
to_fetch = [{
    "url": "https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json",
    "type": "activity_set",
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json",
    "type": "data_object_set",
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json",
    "type": "activity_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json",
    "type": "data_object_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json",
    "type": "activity_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json",
    "type": "data_object_set"
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json",
    "type": "mags_activity_set",
}, {
    "url": "https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json",
    "type": "data_object_set"
}, {
    "url": "https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json",
    "type": "data_object_set"
}]

In [None]:
# Load the resources one at a time, in the order listed in `to_fetch`.
for spec in to_fetch:
    fetch_conform_and_persist(spec, db)

In [None]:
from tqdm.notebook import tqdm

# The manifest URL returns a list of further URLs; each one is fetched,
# validated and stored as "activity_set" documents. `error_urls` collects
# the (url, message) pairs for the ones that failed.
error_urls = fetch_conform_and_persist_from_manifest({
    "url_manifest": ("https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/"
                     "gcms_metabolomics_metadata_products.json"),
    "type": "activity_set"
}, db)
# Number of manifest entries that could not be fetched or validated.
len(error_urls)

MetaG annotations (`/global/project/projectdirs/m3408/www/meta/anno2/*_annotations.json`) are 155 JSON files totalling ~83GB. To load them into MongoDB, I
1. Set up a Globus transfer from a NERSC DTN to a Globus Connect Personal endpoint on my laptop. I could instead have run, e.g.,
```
$ scp dtn01.nersc.gov:/global/project/projectdirs/m3408/www/meta/anno2/*_annotations.json .
```
but I chose to use Globus, and it works well.
2. I have a bash script that uses GNU sed to transform each JSON file into a simple JSON array, as expected by `mongoimport`:

```bash
# trim.sh

task(){
    echo $datafile
    gsed -e '1,2d' -e '$d' -e '3i\[' $datafile > anno2/$(basename $datafile)
}

for datafile in ~/globus-nersc/nmdc/m3408/www/meta/anno2/*_annotations.json; do
    task $datafile &
done
```
I use `ps aux | grep gsed | wc -l` to monitor the progress of the parallel sed tasks. I found that trying to do this head/tail file trimming by `json.load`ing the files in Python and resaving was quite slow because the JSON files are individually quite large.
3. I have a bash script that `mongoimport`s each JSON array file into the database:
```bash
# mongoimport.sh

n=$(ls anno2/*_annotations.json | wc -l | xargs) # `| xargs` to trim whitespace
i=1
for datafile in anno2/*_annotations.json; do
    echo "($i of $n): $datafile"
    mongoimport --uri "mongodb://<user>:<pwd>@<host>/?authSource=admin" \
        --jsonArray -d dwinston_share -c raw.functional_annotation_set \
        -j 8 $datafile
    i=$((i+1))
done
```
specifying multiple (8 in this case) insertion workers per import.

In [None]:
# Report the wall-clock time elapsed since the `tic` cell ran.
toc = time()

print(f"{toc - tic} seconds")