In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from time import time
import os

# Start a wall-clock timer; the elapsed time is printed in the last cell.
tic = time()

from dotenv import load_dotenv
# Load environment variables from the user's ~/.nmdc_mongo.env file.
# NOTE(review): presumably holds the MongoDB connection settings read by
# nmdc_mongo -- confirm against that package.
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

True

In [36]:
from toolz import assoc_in

from mongospawn.schema import collschemas_for

from nmdc_mongo import (
    add_to_db,
    correct_metaP_doc,
    dbschema,
    fetch_and_validate_json,
    fetch_conform_and_persist_from_manifest,
    fetch_json,
    get_db,
    reset_database,
    snake_case_set_name
)

In [37]:
###########################
# Adjustments for GSP below
###########################
#
# Patch the imported `dbschema` so that (1) every defined object type has an
# array-valued set property referencing it, and (2) two property definitions
# are overridden with plain JSON types.

defined_object_names = set(dbschema["definitions"])

# Map each object (definition) name to the name of the set property whose
# items reference it via "#/definitions/<name>".
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

existing_set_names = set(dbschema["properties"])

# Add a set property for every defined object that has none yet, unless the
# proposed set name is already taken by an existing property.
for object_without_set in (defined_object_names - set(set_for_object_name)):
    proposed_set_name = snake_case_set_name(object_without_set)
    if proposed_set_name not in existing_set_names:
        dbschema["properties"][proposed_set_name] = {
            "description": (f"This property links a database object to the set of"
                            f" {object_without_set} objects within it."),
            "items": {"$ref": f"#/definitions/{object_without_set}"},
            "type": "array",
        }

# Treat ControlledTermValue.term as a plain string instead of a "$ref".
# assoc_in returns an updated copy, hence the rebinding of `dbschema`.
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
# pop() with a default instead of `del`: re-running this cell (when "$ref" has
# already been removed) must not raise KeyError.
dbschema["definitions"]["ControlledTermValue"]["properties"]["term"].pop("$ref", None)
# Declare MetagenomeAssembly.scaf_l_gt50k as a number.
dbschema = assoc_in(dbschema, ["definitions", "MetagenomeAssembly", "properties", "scaf_l_gt50k", "type"], "number")

In [38]:
# Derive the per-collection schemas from the adjusted database schema.
collschemas = collschemas_for(dbschema)

# Rebuild the object-name -> set-name lookup, now that `dbschema` may contain
# additional set properties.
set_for_object_name = dict(
    (spec["items"]["$ref"].rsplit("#/definitions/", 1)[-1], set_name)
    for set_name, spec in dbschema["properties"].items()
)

In [58]:
# Handle to the development database that fetched documents are loaded into.
db = get_db("dwinston_dev")

In [59]:
# WARNING: destructive -- drops every collection currently in `db`.
for name in db.list_collection_names():
    db.drop_collection(name)
# NOTE(review): presumably re-creates the empty, schema-validated collections --
# confirm in nmdc_mongo.
reset_database(db)

In [60]:
# Remote JSON files to load. Each entry pairs a source URL ("url") with the
# name of the set/collection its documents belong to ("type").
to_fetch = [
    {"url": source_url, "type": set_name}
    for source_url, set_name in (
        ("https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_objects.json",
         "metagenome_annotation_activity_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/img_mg_annotation_data_objects.json",
         "data_object_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_objects.json",
         "metagenome_annotation_activity_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/mt_annotation_data_objects.json",
         "data_object_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_activity.json",
         "read_based_analysis_activity_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/ReadbasedAnalysis_data_objects.json",
         "data_object_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/MAGs_activity.json",
         "mags_activity_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/MAGs_data_objects.json",
         "data_object_set"),
        ("https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_data_products.json",
         "data_object_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json",
         "metaproteomics_analysis_activity_set"),
        ("https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json",
         "data_object_set"),
        ("https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_data_products.json",
         "data_object_set"),
    )
]

In [61]:
# Only the metaproteomics activity file is loaded in this run. The entry is
# selected by its set name rather than by list position (previously index 9),
# so the filter keeps working if `to_fetch` is reordered or extended.
# NOTE(review): `correct_metaP_doc` is presumably specific to metaproteomics
# documents, which is why the other entries are skipped -- confirm.
for spec in to_fetch:
    if spec["type"] != "metaproteomics_analysis_activity_set":
        continue
    url = spec["url"]
    collection_name = spec["type"]
    print(f"fetching {url} ({collection_name})")
    docs = fetch_json(url)
    # Normalise a single top-level document to a one-element list.
    if not isinstance(docs, list):
        docs = [docs]
    docs = [correct_metaP_doc(d) for d in docs]
    # The already-fetched (and corrected) documents are passed in directly.
    payload = fetch_and_validate_json(docs, collection_name, conform_doc=False)
    add_to_db(payload, db, collection_name)

fetching https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json (metaproteomics_analysis_activity_set)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
from tqdm.notebook import tqdm

# Process the GC-MS metabolomics manifest into `db` as
# "metabolomics_analysis_activity_set" documents.
# NOTE(review): judging by its name, `error_urls` collects the manifest URLs
# that could not be processed -- confirm in nmdc_mongo.
error_urls = fetch_conform_and_persist_from_manifest(
    dict(
        url_manifest="https://nmdcdemo.emsl.pnnl.gov/metabolomics/registration/gcms_metabolomics_metadata_products.json",
        type="metabolomics_analysis_activity_set",
    ),
    db,
)
len(error_urls)

  0%|          | 0/209 [00:00<?, ?it/s]

0

In [11]:
from tqdm.notebook import tqdm

# Process the FT-MS NOM manifest into `db` as "nom_analysis_activity_set"
# documents.
# NOTE(review): judging by its name, `error_urls` collects the manifest URLs
# that could not be processed -- confirm in nmdc_mongo.
error_urls = fetch_conform_and_persist_from_manifest(
    dict(
        url_manifest="https://nmdcdemo.emsl.pnnl.gov/nom/registration/ftms_nom_metadata_products.json",
        type="nom_analysis_activity_set",
    ),
    db,
)
len(error_urls)

  0%|          | 0/788 [00:00<?, ?it/s]

0

In [62]:
# Sanity check: how many documents the metaproteomics collection now holds.
db["metaproteomics_analysis_activity_set"].count_documents({})

1

In [63]:
from toolz import dissoc

from nmdc_mongo.admin import admin_client, reset_database_schema

In [64]:
# Names of the collections in `db` that contain at least one document.
target_collection_names = list(
    filter(
        lambda coll_name: db[coll_name].count_documents({}) > 0,
        db.list_collection_names(),
    )
)

In [65]:
# Display the non-empty collections that will be copied to the target database.
target_collection_names

['metaproteomics_analysis_activity_set']

In [68]:
# Handle to the target ("dwinston_share") database obtained through the admin client.
targetdb_as_admin = admin_client["dwinston_share"]

# NOTE(review): presumably (re)creates each listed collection in the target
# database with its schema from `collschemas` -- confirm in nmdc_mongo.admin.
reset_database_schema(targetdb_as_admin, target_collection_names, collschemas)

creating metaproteomics_analysis_activity_set


In [69]:
# Copy every non-empty collection from `db` into the "dwinston_share" database
# (reached through the same client), stripping "_id" from each document first.
targetdb = db.client["dwinston_share"]
for name in target_collection_names:
    print(name)
    docs = list(map(lambda doc: dissoc(doc, "_id"), db[name].find()))
    add_to_db(docs, targetdb, collection_name=name)

metaproteomics_analysis_activity_set


MetaG annotations (`/global/project/projectdirs/m3408/www/meta/anno2/*_annotations.json`) are 155 JSON files totalling ~83 GB. To load them into MongoDB, I did the following:
1. I set up a Globus transfer from a NERSC DTN (data transfer node) to a Globus Connect Personal endpoint on my laptop. I could instead have copied the files directly, e.g.
```
$ scp dtn01.nersc.gov:/global/project/projectdirs/m3408/www/meta/anno2/*_annotations.json .
```
but I chose to use Globus, and it worked well.
2. I have a bash script that uses GNU sed (`gsed`) to trim each JSON file down to a plain JSON array, as expected by `mongoimport --jsonArray`:

```bash
# trim.sh
#
# Rewrite each annotation file as a bare JSON array: drop the first two lines
# and the last line, and insert "[" before original line 3.

# task FILE: trim FILE and write the result to anno2/<basename of FILE>.
task(){
    # Use the function's own argument instead of the caller's loop variable.
    local datafile="$1"
    echo "$datafile"
    gsed -e '1,2d' -e '$d' -e '3i\[' "$datafile" > "anno2/$(basename "$datafile")"
}

# Start one background job per input file so the files are trimmed in parallel.
for datafile in ~/globus-nersc/nmdc/m3408/www/meta/anno2/*_annotations.json; do
    task "$datafile" &
done
```
I use `ps aux | grep gsed | wc -l` to monitor the progress of the parallel sed tasks. I found that doing this head/tail trimming in Python, by `json.load`ing each file and re-saving it, was quite slow because the individual JSON files are quite large.
3. I have a bash script that `mongoimport`s each JSON array file into the database:
```bash
# mongoimport.sh
#
# Import every trimmed JSON-array file under anno2/ into the
# raw.functional_annotation_set collection of the dwinston_share database,
# printing a "(i of n)" progress line before each file.

n=$(ls anno2/*_annotations.json | wc -l | xargs) # `| xargs` to trim whitespace
i=1
for datafile in anno2/*_annotations.json; do
    echo "($i of $n): $datafile"
    # <user>, <pwd> and <host> are placeholders for the real connection details.
    mongoimport --uri "mongodb://<user>:<pwd>@<host>/?authSource=admin" \
        --jsonArray -d dwinston_share -c raw.functional_annotation_set \
        -j 8 $datafile
    i=$((i+1))
done
```
The `-j 8` option specifies multiple (8 in this case) insertion workers per import.

In [16]:
# Stop the wall-clock timer started in the first code cell (`tic`).
toc = time()

# Report the total elapsed time in seconds.
print(f"{toc - tic} seconds")

138.39117908477783 seconds
