In [1]:
import os

from dotenv import load_dotenv
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

from nmdc_mongo import get_db

# Database handles obtained from nmdc_mongo.get_db, keyed by database name.
# NOTE(review): presumably get_db reads its connection settings from the
# environment populated by load_dotenv above — confirm.
db_share = get_db("dwinston_share")
db_scratch = get_db("dwinston_scratch")

In [2]:
from toolz import assoc_in

from mongospawn.schema import collschemas_for

from nmdc_mongo import (
    add_to_db,
    dbschema,
    fetch_and_validate_json,
    fetch_conform_and_persist_from_manifest,
    fetch_json,
    get_db,
    reset_database,
    snake_case_set_name
)

In [3]:
# Patch the schema so that ControlledTermValue.term is declared as a plain
# string instead of a "$ref" to another definition.
# assoc_in returns a new structure (the dicts along the given path are copies),
# so the edit below is applied to this notebook's rebound `dbschema` only.
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
# pop(..., None) rather than `del`: `dbschema` is rebound to the patched schema,
# so re-running this cell finds "$ref" already gone and `del` would raise KeyError.
dbschema["definitions"]["ControlledTermValue"]["properties"]["term"].pop("$ref", None)
# Derive the per-collection schemas from the patched database schema.
collschemas = collschemas_for(dbschema)

In [4]:
# Files to ingest: each entry gives the source URL and the name of the
# collection ("type") its documents are loaded into.
to_fetch = [
    # >200MB
    dict(
        url="https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json",
        type="metaproteomics_analysis_activity_set",
    ),
    # ~50KB
    dict(
        url="https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json",
        type="data_object_set",
    ),
]

In [5]:
from toolz import identity, dissoc, assoc_in

# Converter applied to the value of each target peptide-quantification field.
_metaP_converters = {
    "peptide_sequence": identity,
    "peptide_sum_masic_abundance": int,
    "peptide_spectral_count": int,
    "best_protein": identity,
    "min_q_value": float,
}

# Alternative source names and the target field each one is renamed to.
_metaP_renames = {
    "PeptideSequence": "peptide_sequence",
    "sum(MASICAbundance)": "peptide_sum_masic_abundance",
    "SpectralCount": "peptide_spectral_count",
    "BestProtein": "best_protein",
    "min(QValue)": "min_q_value",
}

# source key -> (target key, value converter).
# The renamed source keys come first, followed by the target names mapped to
# themselves so that documents already using the target names still get their
# values converted.
metaP_field_map = {
    **{
        source: (target, _metaP_converters[target])
        for source, target in _metaP_renames.items()
    },
    **{target: (target, convert) for target, convert in _metaP_converters.items()},
}


def map_fields(doc, field_map=None):
    """Rename keys of ``doc`` and convert their values according to ``field_map``.

    Args:
        doc: Mapping to transform. It is never modified.
        field_map: Mapping of ``old_key -> (new_key, converter)``. For every
            ``old_key`` present in the document, the key is removed and
            ``converter(value)`` is stored under ``new_key``. ``None`` or an
            empty mapping means "nothing to rename".

    Returns:
        ``doc`` itself when ``field_map`` is ``None`` or empty; otherwise a new
        ``dict`` with the renames applied.

    Entries are applied in the iteration order of ``field_map`` against the
    progressively updated document, so a key produced by an earlier entry is
    seen by later entries.
    """
    if not field_map:
        # The default (None) used to crash on ``None.items()``; treat it as a no-op.
        return doc
    # Copy once and edit the copy, instead of re-copying the dict per renamed key.
    mapped = dict(doc)
    for k_old, todo in field_map.items():
        if k_old in mapped:
            k_new, fn = todo
            # Convert before removing so a failing converter leaves `mapped` consistent.
            v_new = fn(mapped[k_old])
            del mapped[k_old]
            mapped[k_new] = v_new
    return mapped


def correct_metaP_doc(doc):
    """Apply ``metaP_field_map`` to every peptide quantification of ``doc``.

    A document without a ``has_peptide_quantifications`` key is returned
    unchanged. Otherwise each item of that list is passed through
    ``map_fields`` and the result of ``assoc_in`` (the document with the
    rewritten list) is returned.
    """
    quant_key = "has_peptide_quantifications"
    if quant_key in doc:
        renamed = [map_fields(entry, metaP_field_map) for entry in doc[quant_key]]
        return assoc_in(doc, [quant_key], renamed)
    return doc

In [7]:
# Fetch each listed file, normalize its documents, validate them and store
# them in `db_share` under the collection named by the entry's "type".
for spec in to_fetch:
    url = spec["url"]
    collection_name = spec["type"]
    print(f"fetching {url} ({collection_name})")
    docs = fetch_json(url)
    # A file may hold a single document rather than a list; always work on a list.
    if not isinstance(docs, list):
        docs = [docs]
    # Rename/convert peptide-quantification fields; other documents pass through.
    docs = [correct_metaP_doc(d) for d in docs]
    # NOTE(review): conform_doc=False presumably skips schema conformance and
    # only validates — confirm against nmdc_mongo.
    payload = fetch_and_validate_json(docs, collection_name, conform_doc=False)
    add_to_db(payload, db_share, collection_name)

fetching https://portal.nersc.gov/cfs/m3408/meta/stegen_MetaProteomicAnalysis_activity.json (metaproteomics_analysis_activity_set)


  0%|          | 0/32 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/cfs/m3408/meta/stegen_emsl_analysis_data_objects.json (data_object_set)


  0%|          | 0/128 [00:00<?, ?it/s]

In [8]:
# 33rd analysis: one more activity file and its data-object file.

to_fetch = [
    dict(
        url="https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_MetaProteomicAnalysis_activity.json",
        type="metaproteomics_analysis_activity_set",
    ),
    dict(
        url="https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_emsl_analysis_data_objects.json",
        type="data_object_set",
    ),
]

In [10]:
# Fetch each listed file, normalize its documents, validate them and store
# them in `db_share` under the collection named by the entry's "type".
for spec in to_fetch:
    url = spec["url"]
    collection_name = spec["type"]
    print(f"fetching {url} ({collection_name})")
    docs = fetch_json(url)
    # A file may hold a single document rather than a list; always work on a list.
    if not isinstance(docs, list):
        docs = [docs]
    # Rename/convert peptide-quantification fields; other documents pass through.
    docs = [correct_metaP_doc(d) for d in docs]
    # NOTE(review): conform_doc=False presumably skips schema conformance and
    # only validates — confirm against nmdc_mongo.
    payload = fetch_and_validate_json(docs, collection_name, conform_doc=False)
    add_to_db(payload, db_share, collection_name)

fetching https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_MetaProteomicAnalysis_activity.json (metaproteomics_analysis_activity_set)


  0%|          | 0/1 [00:00<?, ?it/s]

fetching https://portal.nersc.gov/project/m3408/meta/501128_1781_100340_stegen_emsl_analysis_data_objects.json (data_object_set)


  0%|          | 0/4 [00:00<?, ?it/s]