In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import concurrent
import os
from pathlib import Path
from time import time

import jsonschema
import requests
from mongospawn.schema import dbschema_from_file, collschemas_for
from pymongo import MongoClient, ReplaceOne
from toolz import keyfilter
from tqdm.notebook import tqdm

from collections import defaultdict
import csv
from datetime import datetime
from functools import partial, reduce
import json
import os
import re
from pprint import pprint
from zipfile import ZipFile

from dictdiffer import diff
from pymongo import DeleteMany, DeleteOne, InsertOne, MongoClient, ReplaceOne, UpdateOne
from toolz import assoc_in, compose, concat, dissoc, keyfilter, get_in, merge, merge_with
from tqdm.notebook import tqdm


def reset_database(db):
    """Drop and recreate every collection named in ``collschemas``.

    Destructive: each collection is dropped first, then recreated with its
    JSON Schema installed as the collection validator and a unique index
    on the ``id`` field.
    """
    for name in collschemas:
        validator = {"$jsonSchema": collschemas[name]}
        db.drop_collection(name)
        db.create_collection(name, validator=validator)
        db[name].create_index("id", unique=True)


def jsonschema_for(collection_name=None):
    """Return the ``definitions`` entry that a collection's items refer to.

    Raises ``ValueError`` when ``collection_name`` is not a property of the
    module-level ``dbschema``.
    """
    known_collections = set(dbschema["properties"])
    if collection_name not in known_collections:
        raise ValueError(
            f'collection_name must be one of {known_collections}'
        )
    item_ref = dbschema["properties"][collection_name]["items"]["$ref"]
    # The definition name is the last path segment of the "$ref" pointer.
    definition_name = item_ref.split("/")[-1]
    return dbschema["definitions"][definition_name]


def validator_for(collection):
    """Return the ``$jsonSchema`` validator stored in a collection's options."""
    collection_options = collection.options()
    validator = collection_options["validator"]
    return validator["$jsonSchema"]


def pick(whitelist, d):
    """Return a copy of ``d`` keeping only the keys found in ``whitelist``."""

    def is_whitelisted(key):
        return key in whitelist

    return keyfilter(is_whitelisted, d)


def conform(doc, collection_name=None):
    """Provide limited, conservative conformance on a document.

    - If additionalProperties is False, omit any properties not in the schema.
    - If a field must be a list of strings, and a lone string is supplied,
      wrap it in a list.

    The input ``doc`` is never modified; a new dict is returned.

    Args:
        doc: the document (mapping) to conform.
        collection_name: name of the target collection; must be a property
            of the module-level ``dbschema``.

    Returns:
        A new dict holding the conformed document.

    Raises:
        ValueError: if ``collection_name`` is not a known collection.
    """
    if collection_name not in set(dbschema["properties"]):
        raise ValueError(
            f'collection_name must be one of {set(dbschema["properties"])}'
        )
    defn = dbschema["properties"][collection_name]["items"]["$ref"].split("/")[-1]
    schema = dbschema["definitions"][defn]
    properties = schema.get("properties", {})
    if schema.get("additionalProperties") is False:
        doc = pick(list(properties), doc)
    else:
        # Work on a shallow copy so the caller's document is left untouched.
        doc = dict(doc)
    for key, value in list(doc.items()):
        if not isinstance(value, str):
            continue
        spec = properties.get(key, {})
        # Array items may be given as a "$ref" instead of an inline "type",
        # so look the item type up defensively rather than indexing.
        if spec.get("type") == "array" and spec.get("items", {}).get("type") == "string":
            doc[key] = [value]
    return doc


def validate(doc, collection_name=None, conform_doc=False):
    """Validate one document against ``dbschema`` as a member of a collection.

    The document is wrapped as ``{collection_name: [doc]}`` and checked with
    ``jsonschema.validate``. When ``conform_doc`` is true the document is first
    passed through ``conform``. Returns the (possibly conformed) document.

    Raises ``ValueError`` for an unknown ``collection_name``; validation
    failures propagate from ``jsonschema.validate``.
    """
    allowed_collections = set(dbschema["properties"])
    if collection_name not in allowed_collections:
        raise ValueError(
            f'collection_name must be one of {allowed_collections}'
        )
    candidate = conform(doc, collection_name=collection_name) if conform_doc else doc
    wrapped = {collection_name: [candidate]}
    jsonschema.validate(wrapped, schema=dbschema)
    return candidate


def fetch_json(url, timeout=None):
    """Fetch ``url`` with HTTP GET and return the decoded JSON body.

    Args:
        url: the resource to fetch.
        timeout: optional timeout in seconds passed to ``requests.get``;
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            that an error page is never mistaken for a data payload.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def fetch_and_validate_json(resource, collection_name=None, conform_doc=False):
    """Validate documents given as a URL or as an already-fetched list/dict.

    - A string ``resource`` is fetched with ``fetch_json`` first.
    - A list is validated item by item against ``collection_name``.
    - A dict sharing at least one key with ``dbschema["properties"]`` is
      treated as ``{collection_name: [docs]}`` and every entry is validated
      against its own key; any other dict is validated as a single document.

    Returns a flat list of the validated documents.
    """
    payload = resource if not isinstance(resource, str) else fetch_json(resource)

    if isinstance(payload, list):
        return [
            validate(doc, collection_name=collection_name, conform_doc=conform_doc)
            for doc in tqdm(payload)
        ]

    if not isinstance(payload, dict):
        raise ValueError("Fetched JSON must be a JSON array or object")

    if not (set(payload) & set(dbschema["properties"])):
        # A single bare document.
        return [
            validate(payload, collection_name=collection_name, conform_doc=conform_doc)
        ]

    validated = []
    for set_name, set_docs in payload.items():
        validated.extend(
            validate(doc, collection_name=set_name, conform_doc=conform_doc)
            for doc in tqdm(set_docs, desc=set_name)
        )
    return validated


def add_to_db(validated, db, collection_name=None):
    """Upsert validated documents into ``db``, matching on the ``id`` field.

    Args:
        validated: a list of documents for ``collection_name``; or a dict
            keyed by collection names mapping to lists of documents; or a
            single document (dict) for ``collection_name``.
        db: the target database; collections are accessed as ``db[name]``.
        collection_name: must be a property of the module-level ``dbschema``
            (this is checked even when ``validated`` is keyed by collection).

    Raises:
        ValueError: for an unknown ``collection_name`` or when ``validated``
            is neither a list nor a dict.
    """
    if collection_name not in set(dbschema["properties"]):
        raise ValueError(
            f'collection_name must be one of {set(dbschema["properties"])}'
        )
    if isinstance(validated, list):
        batches = {collection_name: validated}
    elif isinstance(validated, dict):
        if set(validated) & set(dbschema["properties"]):
            batches = validated
        else:
            batches = {collection_name: [validated]}
    else:
        raise ValueError("payload must be a list or dict")

    for name, docs in batches.items():
        operations = [ReplaceOne({"id": v["id"]}, v, upsert=True) for v in docs]
        # bulk_write rejects an empty request list, so an empty batch
        # (e.g. a fetched URL that yielded no documents) is simply skipped.
        if operations:
            db[name].bulk_write(operations)


def fetch_conform_and_persist(spec, db):
    """Fetch ``spec["url"]``, conform and validate it, and upsert it into ``db``.

    ``spec["type"]`` names the target collection.
    """
    url, collection_name = spec["url"], spec["type"]
    print(f"fetching {url} ({collection_name})")
    validated_docs = fetch_and_validate_json(url, collection_name, conform_doc=True)
    add_to_db(validated_docs, db, collection_name)


def fetch_conform_and_persist_from_manifest(spec, db):
    """Fetch every URL listed in a manifest concurrently and persist each.

    The manifest at ``spec["url_manifest"]`` is fetched with ``fetch_json``
    and its length and items are used as the list of URLs. Each URL is
    fetched, conformed and validated on a thread pool; successful payloads
    are upserted into the collection named by ``spec["type"]``.

    Returns:
        A list of ``(url, error_message)`` tuples for URLs whose fetch or
        validation raised. Errors raised by ``add_to_db`` are not captured
        and propagate to the caller.
    """
    # `import concurrent` by itself does not load the `futures` submodule,
    # so import the names explicitly instead of relying on another package
    # having imported it as a side effect.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    error_urls = []
    url_manifest = spec["url_manifest"]
    collection_name = spec["type"]
    urls = fetch_json(url_manifest)

    pbar = tqdm(total=len(urls))
    try:
        with ThreadPoolExecutor() as executor:
            future_to_url = {
                executor.submit(
                    fetch_and_validate_json, url, collection_name, conform_doc=True
                ): url
                for url in urls
            }
            for future in as_completed(future_to_url):
                pbar.update(1)
                url = future_to_url[future]
                try:
                    payload = future.result()
                except Exception as e:
                    # Best effort: record the failure and keep going.
                    error_urls.append((url, str(e)))
                else:
                    add_to_db(payload, db, collection_name)
    finally:
        # Close the progress bar even if persisting a payload raises.
        pbar.close()
    return error_urls


In [3]:
from itertools import tee

def pairwise(iterable):
    """Yield overlapping pairs: s -> (s0, s1), (s1, s2), (s2, s3), ..."""
    leading, trailing = tee(iterable, 2)
    # Advance the second copy by one item so the zip is offset by one.
    next(trailing, None)
    return zip(leading, trailing)

def coalesce_acronyms(set_name):
    out = ""
    for this_part, next_part in pairwise(set_name.split("_")):
        if len(this_part) == 1 and len(next_part) == 1:
            out += this_part
        elif next_part == "set":
            out += this_part + "_set"
        else:
            out += this_part + "_"
    return out

def snake_case_set_name(object_name):
    """Derive a snake_case ``*_set`` collection name from a CamelCase name."""
    # Insert "_" before every capital letter except one at the very start.
    underscored = re.sub(r'(?<!^)(?=[A-Z])', '_', object_name)
    candidate = underscored.lower() + "_set"
    return coalesce_acronyms(candidate)

In [99]:
# Load the NMDC JSON Schema from schema/nmdc.schema.json, located two
# directories above the current working directory.
nmdc_schema_json_path = str(
    Path.cwd().parent.parent.joinpath("schema", "nmdc.schema.json")
)
dbschema = dbschema_from_file(nmdc_schema_json_path)

###########################
# Adjustments for GSP below
###########################

defined_object_names = set(dbschema["definitions"])

# Map each object (definition) name to the top-level set property whose
# items "$ref" points at it.
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

existing_set_names = set(dbschema["properties"])

# Give every definition that has no set property of its own a generated
# "<snake_case_name>_set" array property, unless that name is already taken.
for object_without_set in (defined_object_names - set(set_for_object_name.keys())):
    proposed_set_name = snake_case_set_name(object_without_set)
    if proposed_set_name not in existing_set_names:
        dbschema["properties"][proposed_set_name] = {
            "description": (f"This property links a database object to the set of"
                            f" {object_without_set} objects within it."),
            "items": {"$ref": f"#/definitions/{object_without_set}"},
            "type": "array",
        }
        
# Remove the additionalProperties setting from these definitions.
# Each `del` raises KeyError if the key is absent.
del dbschema["definitions"]["OmicsProcessing"]["additionalProperties"]
del dbschema["definitions"]["Biosample"]["additionalProperties"]
del dbschema["definitions"]["ReadQCAnalysisActivity"]["additionalProperties"]
# Type ControlledTermValue.term as a plain string instead of a "$ref".
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
del dbschema["definitions"]["ControlledTermValue"]["properties"]["term"]["$ref"]
# Set the type of MetagenomeAssembly.scaf_l_gt50k to "number".
dbschema = assoc_in(dbschema, ["definitions", "MetagenomeAssembly", "properties", "scaf_l_gt50k", "type"], "number")

In [100]:
# Per-collection schemas derived from the adjusted dbschema; used as the
# "$jsonSchema" validators when collections are (re)created.
collschemas = collschemas_for(dbschema)

# Rebuild the object-name -> set-name map from the current
# dbschema["properties"].
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

(Re-)load existing NMDC DB from file.

In [101]:
# Read the zipped NMDC database dump straight from the archive member.
with ZipFile('../src/data/nmdc_database.json.zip') as myzip, \
        myzip.open('nmdc_database.json') as f:
    nmdc_database = json.load(f)

In [102]:
# Connect with the read-write account; the host and password are read from
# the environment (NMDC_MONGO_HOST, NMDC_MONGO_RW_PWD).
client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="dwinston_rw",
    password=os.getenv("NMDC_MONGO_RW_PWD"))
dbname = "dwinston_scratch"
db = client[dbname]

# Destructive: reset_database drops and recreates every collection named in
# collschemas, installing the schema validators and unique "id" indexes.
reset_database(db)
#for collection in nmdc_database:
#    db[collection].insert_many(nmdc_database[collection])
sorted(db.list_collection_names())

['activity_set',
 'agent_set',
 'attribute_value_set',
 'biosample_processing_set',
 'biosample_set',
 'boolean_value_set',
 'chemical_entity_set',
 'controlled_term_value_set',
 'data_object_set',
 'database_set',
 'environmental_material_term_set',
 'foo',
 'functional_annotation_set',
 'gene_product_set',
 'genome_feature_set',
 'geolocation_value_set',
 'instrument_set',
 'integer_value_set',
 'mag_bin_set',
 'mags_activity_set',
 'metabolite_quantification_set',
 'metabolomics_analysis_activity_set',
 'metagenome_annotation_activity_set',
 'metagenome_assembly_set',
 'metaproteomics_analysis_activity_set',
 'nom_analysis_activity_set',
 'omics_processing_set',
 'ontology_class_set',
 'orthology_group_set',
 'pathway_set',
 'peptide_quantification_set',
 'person_set',
 'person_value_set',
 'protein_quantification_set',
 'quantity_value_set',
 'reaction_participant_set',
 'reaction_set',
 'read_QC_analysis_activity_set',
 'read_based_analysis_activity_set',
 'study_set',
 'text_valu

In [103]:
# Target collection for each object type (the "type" value with its
# five-character "nmdc:" prefix sliced off). Documents without a "type"
# map to the "" key and are routed to data_object_set.
target_collection = {
    "Study": "study_set",
    "OmicsProcessing": "omics_processing_set",
    "Biosample": "biosample_set",
    "DataObject": "data_object_set",
    "MetagenomeAssembly": "metagenome_assembly_set",
    "MetaProteomicAnalysis": "metaproteomics_analysis_activity_set",
    "MetagenomeAnnotation": "metagenome_annotation_activity_set",
    "ReadQCAnalysisActivity": "read_QC_analysis_activity_set",
    "": "data_object_set",
}
# Survey which object types occur in each source collection.
for collection in nmdc_database:
    docs = nmdc_database[collection]
    object_types = {d.get("type", "nmdc:")[5:] for d in docs} - {""}
    # Test the predicate itself; testing the truthiness of the filtered
    # documents would miss an empty document that lacks "type".
    if any("type" not in d for d in docs):
        print("some",collection,"docs have no type")
    print(collection, object_types)

study_set {'Study'}
omics_processing_set {'OmicsProcessing'}
biosample_set {'Biosample'}
some data_object_set docs have no type
data_object_set {'DataObject'}
activity_set {'MetaProteomicAnalysis', 'MetagenomeAssembly', 'ReadQCAnalysisActivity', 'MetagenomeAnnotation'}


In [110]:
from collections import defaultdict

# Route every source document to its target collection (by "type"), clean it
# up so that it passes schema validation, then validate and upsert.
for source_collection in nmdc_database:
    docs = nmdc_database[source_collection]
    docs_per_target = defaultdict(list)
    for d in docs:
        type_ = d.get("type", "nmdc:")[5:]
        d_new = dissoc(d, "type")

        if "lat_lon" in d_new:
            # dissoc returns a copy, so the nested dict inside nmdc_database
            # is not mutated (d_new is only a shallow copy of d).
            d_new["lat_lon"] = dissoc(d_new["lat_lon"], "type")
        # Coerce these statistics to float.
        for k_float in (
            "asm_score", "ctg_logsum", "ctg_powsum", "gap_pct", "gc_avg", "gc_std",
            "scaf_logsum", "scaf_powsum"):
            if k_float in d_new:
                d_new[k_float] = float(d_new[k_float])
        # Collapse {"term": {"id": ...}} to {"term": <id>}. Require "term" to
        # be a dict: on a string, `"id" in term` is a substring test and the
        # subsequent ["id"] lookup would fail.
        keys_with_term_ids = [
            k for k in d_new
            if isinstance(d_new[k], dict)
            and isinstance(d_new[k].get("term"), dict)
            and "id" in d_new[k]["term"]
        ]
        for k in keys_with_term_ids:
            d_new = assoc_in(d_new, [k, "term"], d_new[k]["term"]["id"])

        docs_per_target[target_collection[type_]].append(d_new)

    for collection_name, target_docs in docs_per_target.items():
        print(collection_name)
        payload = fetch_and_validate_json(target_docs, collection_name=collection_name)
        add_to_db(payload, db, collection_name=collection_name)

study_set


  0%|          | 0/13 [00:00<?, ?it/s]

omics_processing_set


  0%|          | 0/7454 [00:00<?, ?it/s]

study_set: `Additional properties are not allowed ('principal_investigator_name', 'type' were unexpected)`

omics_processing_set: `Additional properties are not allowed ('principal_investigator_name', 'mod_date', 'add_date', 'processing_institution', 'ncbi_project_name' were unexpected)`

biosample_set: 

-`Additional properties are not allowed ('location', 'mod_date', 'identifier', 'habitat', 'ncbi_taxonomy_name', 'add_date', 'community', 'sample_collection_site' were unexpected)`

-`Additional properties are not allowed ('type' was unexpected)` for `lat_lon`

data_object_set: OK

activity_set:
- `Additional properties are not allowed ('contigs', 'ctg_N50', 'scaf_max', 'contig_bp', 'scaffolds', 'gc_std', 'gap_pct', 'num_input_reads', 'scaf_N50', 'ctg_max', 'ctg_L90', 'ctg_powsum', 'gc_avg', 'scaf_powsum', 'scaf_L90', 'ctg_L50', 'scaf_l_gt50k', 'scaf_N90', 'ctg_N90', 'scaf_L50', 'asm_score', 'scaf_n_gt50K', 'scaf_pct_gt50K', 'num_aligned_reads', 'scaf_bp', 'ctg_logsum', 'scaf_logsum' were unexpected)` for type 'nmdc:MetagenomeAssembly'

- `Additional properties are not allowed ('input_read_count', 'output_read_count', 'input_read_bases', 'output_read_bases' were unexpected)` for type 'nmdc:ReadQCAnalysisActivity'

metagenome_assembly_set:
`Additional properties are not allowed ('scaf_l_gt50k' was unexpected)`

read_QC_analysis_activity_set:
- `Additional properties are not allowed ('output_read_bases', 'input_read_bases' were unexpected)`

Load the FICUS Brodie spreadsheet and create a map from each IGSN to its GOLD ids.

In [112]:
# Zero-based column positions in the spreadsheet export.
GOLD_ID_IDX = 5
IGSN_IDX = 2

# IGSN -> list of GOLD biosample ids recorded for it.
igsn_golds = defaultdict(list)

gold_id_pattern = re.compile(r"Gb\d+")

# newline="" lets the csv module handle line endings itself, which is
# required for quoted fields containing embedded newlines to parse correctly.
with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv', newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        gold_id = row[GOLD_ID_IDX]
        igsn = row[IGSN_IDX]
        # Rows whose GOLD id cell is not exactly "Gb<digits>" are ignored.
        if gold_id_pattern.fullmatch(gold_id):
            igsn_golds[igsn].append(gold_id)

Prepare helper function to compare timestamps given in e.g. "15-MAY-20 08.30.01.000000000 am" format.

In [113]:
# "[AP]M" rather than "[A|P]M": inside a character class "|" is a literal.
# Matching is case-insensitive so lower-case "am"/"pm" are accepted too.
dt_pattern = re.compile(
    r"\d{2}-(?P<month>\w+)-\d{2} \d{2}\.\d{2}\.\d{2}\.(?P<ns>\d+) [AP]M",
    re.IGNORECASE,
)
dt_format = "%d-%b-%y %I.%M.%S.%f %p"

def order_timestamps(timestamps):
    """Return the given timestamps sorted chronologically.

    Input strings look like "15-MAY-20 08.30.01.000000000 AM". The month is
    normalized to an initial capital and the fractional seconds are truncated
    to microseconds so that ``datetime.strptime`` can parse them.

    Args:
        timestamps: iterable of timestamp strings.

    Returns:
        A list of strings re-rendered with ``dt_format`` (e.g.
        "15-May-20 08.30.01.000000 AM"), earliest first. Note that this is
        not byte-identical to the input format.

    Raises:
        TypeError: if any item is not a string.
        ValueError: if a string does not look like a supported timestamp.
    """
    timestamps = list(timestamps)  # allow one-shot iterables
    if not all(isinstance(ts, str) for ts in timestamps):
        raise TypeError(f"{timestamps} not strings")
    as_datetimes = []
    for ts in timestamps:
        match = dt_pattern.search(ts)
        if match is None:
            raise ValueError(f"{ts!r} is not in a supported timestamp format")
        month = match.group("month")
        month_start, month_end = match.span("month")
        ns_start, ns_end = match.span("ns")
        # Rebuild by position rather than str.replace, so that only the
        # matched spans are touched.
        ts_new = (
            ts[:month_start]
            + month[0] + month[1:].lower()
            + ts[month_end:ns_start]
            + match.group("ns")[:6]  # truncate to microseconds
            + ts[ns_end:]
        )
        as_datetimes.append(datetime.strptime(ts_new, dt_format))
    sorted_dts = sorted(as_datetimes)
    return [dt.strftime(dt_format) for dt in sorted_dts]

Prepare helper-function pipeline to unify biosample_set documents that should be considered equivalent.

In [114]:
# "[DR]NA" rather than "[D|R]NA": inside a character class "|" is a literal.
er_xna_pattern = re.compile(r"ER_[DR]NA_\d+$")

def rstrip_name_ER_ID(d):
    """Return ``d`` with a trailing "ER_DNA_<n>" / "ER_RNA_<n>" cut from its name.

    Documents whose ``name`` is missing or not a string are returned unchanged.
    """
    s = get_in(["name"], d)
    if not isinstance(s, str):
        return d
    # Splitting on the end-anchored pattern keeps everything before the
    # suffix; when nothing matches, the whole string comes back.
    s_new = er_xna_pattern.split(s)[0]
    return assoc_in(d, ["name"], s_new)

def capitalize_location_raw_value(d):
    """Return ``d`` with the first letter of location.has_raw_value upper-cased.

    Documents whose value is missing, empty, or not a string are returned
    unchanged instead of raising.
    """
    s = get_in(["location", "has_raw_value"], d)
    if not isinstance(s, str) or not s:
        return d
    s_new = s[0].upper() + s[1:]
    return assoc_in(d, ["location", "has_raw_value"], s_new)

def _drop_volatile_fields(d):
    """Return ``d`` without its id, date, and identifier fields."""
    return dissoc(d, "_id", "id", "add_date", "mod_date", "identifier")


# compose applies right to left: drop the fields first, then strip the
# ER suffix from the name, then capitalize the location's raw value.
pipeline = compose(
    capitalize_location_raw_value,
    rstrip_name_ER_ID,
    _drop_volatile_fields,
)

Produce new biosample objects with IGSN ids.

In [115]:
# Build one biosample document per IGSN out of the GOLD-identified documents
# currently stored in db.biosample_set.
merged_biosample_docs = []

for igsn, golds in igsn_golds.items():
    igsn_curie = "igsn:"+igsn
    to_change = list(db.biosample_set.find({"id": {"$in": [f"gold:{g}" for g in golds]}}))

    if not to_change:
        continue

    if len(to_change) == 1:
        # No merge needed, just change of id.
        merged = assoc_in(to_change[0], ["id"], igsn_curie)
    else:
        # Ensure that the unification pipeline is adequate to resolve the
        # differences between ALL documents sharing this IGSN, not just the
        # first two.
        distilled = list(map(pipeline, to_change))
        for other in distilled[1:]:
            result = list(diff(distilled[0], other))
            assert result == [], f"unresolved differences for {igsn_curie}: {result}"

        # Produce a merged document carrying the earliest add_date and the
        # latest mod_date. Indexing (rather than two-tuple unpacking) works
        # for any number of source documents.
        add_dates = order_timestamps([get_in(["add_date", "has_raw_value"], d) for d in to_change])
        mod_dates = order_timestamps([get_in(["mod_date", "has_raw_value"], d) for d in to_change])
        merged = assoc_in(distilled[0], ["add_date", "has_raw_value"], add_dates[0])
        merged = assoc_in(merged, ["mod_date", "has_raw_value"], mod_dates[-1])
        merged = assoc_in(merged, ["id"], igsn_curie)

    merged = assoc_in(merged, ["identifier", "has_raw_value"], igsn_curie)
    merged_biosample_docs.append(merged)

# Sanity check: every IGSN in the spreadsheet produced exactly one document.
assert len(merged_biosample_docs) == len(igsn_golds)

Delete old biosample objects and insert new ones in one bulk-write operation.

In [117]:
# Named `write_requests` rather than `requests`, so this cell does not rebind
# the imported `requests` module that fetch_json relies on.
# Old GOLD-identified biosamples are deleted first; the merged
# IGSN-identified documents are then inserted in the same bulk write.
write_requests = [DeleteMany({"id": {"$in": ["gold:"+g for g in concat(igsn_golds.values())]}})]
write_requests.extend([InsertOne(d) for d in merged_biosample_docs])
result = db.biosample_set.bulk_write(write_requests)
result.deleted_count, result.inserted_count

(93, 48)

Update omics_processing_set references to biosample_set ids.

In [118]:
# Invert igsn_golds: GOLD id -> IGSN.
goldid_igsn = {}
for igsn, gids in igsn_golds.items():
    goldid_igsn.update(dict.fromkeys(gids, igsn))

In [119]:
# NOTE(review): `requests` here is a plain list and rebinds the name of the
# imported `requests` module used by fetch_json; the name is kept because the
# following cell reads it.
requests = []
# CURIE of each GOLD biosample id -> CURIE of the IGSN replacing it.
to_replace = {"gold:"+k: "igsn:"+v for k, v in goldid_igsn.items()}

matching_docs = db.omics_processing_set.find({"has_input": {"$in": list(to_replace)}})
for doc in matching_docs:
    # Swap replaced ids, leaving every other input untouched.
    remapped_inputs = [to_replace.get(i, i) for i in doc["has_input"]]
    requests.append({
        "filter": {"_id": doc["_id"]},
        "update": {"$set": {"has_input": remapped_inputs}},
    })

In [120]:
# Apply the queued has_input updates in a single bulk write.
# NOTE(review): unlike the later `if requests:` guarded cell, this runs even
# when `requests` is empty; presumably bulk_write rejects an empty list -
# TODO confirm.
rv = db.omics_processing_set.bulk_write([UpdateOne(**r) for r in requests])

In [121]:
# Display the modified_count reported by the bulk write result.
rv.modified_count

93

Update omics_processing_set references from EMSL ids to IGSNs.

In [122]:
# Zero-based column positions in the spreadsheet export.
EMSL_IDS_IDX = 7
IGSN_IDX = 2

# IGSN -> list of EMSL ids (digit strings) found in the EMSL ids cell.
igsn_emsls = {}

emsl_ids_pattern = re.compile(r"\d+")

# newline="" lets the csv module handle line endings itself, which is
# required for quoted fields containing embedded newlines to parse correctly.
with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv', newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        emsl_ids = row[EMSL_IDS_IDX]
        igsn = row[IGSN_IDX]
        ids = emsl_ids_pattern.findall(emsl_ids)
        # XXX some rows have emsl ids but no IGSN, so igsn.strip() check here
        if igsn.strip() and ids:
            igsn_emsls[igsn] = ids

In [123]:
# Invert igsn_emsls: EMSL id -> IGSN.
emslid_igsn = {}
for igsn, eids in igsn_emsls.items():
    emslid_igsn.update(dict.fromkeys(eids, igsn))

In [124]:
# Count omics_processing_set documents whose own id is one of the EMSL ids.
emsl_curies = ["emsl:"+i for i in emslid_igsn]
n_with_emsl_id = db.omics_processing_set.count_documents({"id": {"$in": emsl_curies}})

In [125]:
# NOTE(review): `requests` is a plain list here and rebinds the name of the
# imported `requests` module; the name is kept because later cells read it.
requests = []
# Both "emsl:<id>" and "emsl:output_<id>" references map to the IGSN CURIE.
to_replace = {}
for prefix in ("emsl:", "emsl:output_"):
    to_replace.update({prefix+k: "igsn:"+v for k, v in emslid_igsn.items()})

def omit(blacklist, d):
    """Return a copy of ``d`` without the keys found in ``blacklist``."""

    def is_kept(key):
        return key not in blacklist

    return keyfilter(is_kept, d)

def sans_mongo_id(d):
    """Return a copy of ``d`` without the ``_id`` key."""
    keys_to_drop = ["_id"]
    return omit(keys_to_drop, d)


# Queue one has_input rewrite for every document that references a replaced id.
matching_docs = db.omics_processing_set.find({"has_input": {"$in": list(to_replace)}})
for doc in matching_docs:
    # Swap replaced ids, leaving every other input untouched.
    remapped_inputs = [to_replace.get(i, i) for i in doc["has_input"]]
    requests.append({
        "filter": {"_id": doc["_id"]},
        "update": {"$set": {"has_input": remapped_inputs}},
    })

In [126]:
# Only write when at least one update was queued.
if requests:
    updates = [UpdateOne(**r) for r in requests]
    rv = db.omics_processing_set.bulk_write(updates)
    print(rv.modified_count)

In [137]:
# Display the JSON Schema validator installed on the shared biosample_set.
# NOTE(review): `db_share` is only assigned in a cell further down this file,
# so in file order this cell runs before the name exists - confirm the
# intended execution order.
validator_for(db_share.biosample_set)

{'additionalProperties': False,
 'description': 'A material sample. It may be environmental (encompassing many organisms) or isolate or tissue.   An environmental sample containing genetic material from multiple individuals is commonly referred to as a biosample.',
 'properties': {'agrochem_addition': {'additionalProperties': False,
   'description': 'A simple quantity, e.g. 2cm',
   'properties': {'has_numeric_value': {'description': 'The number part of the quantity',
     'type': 'number'},
    'has_raw_value': {'description': 'Unnormalized atomic string representation, should in syntax {number} {unit}',
     'type': 'string'},
    'has_unit': {'description': 'The unit of the quantity', 'type': 'string'},
    'was_generated_by': {'type': 'string'}},
   'title': 'QuantityValue',
   'type': 'object'},
  'al_sat': {'additionalProperties': False,
   'description': 'A simple quantity, e.g. 2cm',
   'properties': {'has_numeric_value': {'description': 'The number part of the quantity',
    

In [140]:
# Connect with the admin account; the host and password are read from the
# environment (NMDC_MONGO_HOST, NMDC_MONGO_ADMIN_PWD).
admin_client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="nmdc-admin",
    password=os.getenv("NMDC_MONGO_ADMIN_PWD")
)
# Admin handle on the "dwinston_share" database.
admin_dwinston_share = admin_client["dwinston_share"]

In [144]:
# Distinct target collection names, sorted.
target_collection_names = sorted(set(target_collection.values()))

In [152]:
def reset_database_schema(db):
    """Install the current JSON Schema validators on the target collections.

    For every name in ``target_collection_names``: a missing collection is
    created with its validator and a unique index on ``id``; an existing
    collection keeps its documents, has its name printed, and gets its
    validator replaced via the ``collMod`` command.
    """
    # List the existing collections once instead of on every iteration.
    existing = set(db.list_collection_names())
    for coll_name in target_collection_names:
        validator = {"$jsonSchema": collschemas[coll_name]}
        if coll_name in existing:
            print(coll_name)
            db.command("collMod", coll_name, validator=validator)
        else:
            db.create_collection(coll_name, validator=validator)
            db[coll_name].create_index("id", unique=True)
            # Keep the snapshot accurate should a name occur twice.
            existing.add(coll_name)

In [154]:
# Apply the validators to the shared database through the admin connection;
# the names of collections that already existed are printed.
reset_database_schema(admin_dwinston_share)

biosample_set
data_object_set
metagenome_annotation_activity_set
metagenome_assembly_set
metaproteomics_analysis_activity_set
omics_processing_set
read_QC_analysis_activity_set
study_set


In [155]:
# Copy every target collection from the scratch database (`db`) into
# "dwinston_share", accessed through the read-write client.
db_share = client["dwinston_share"]
target_collection_names = sorted(set(target_collection.values()))
for name in target_collection_names:
    # Strip Mongo's "_id" from each document; add_to_db upserts on the "id" field.
    docs = [sans_mongo_id(d) for d in db[name].find()]
    print(name)
    add_to_db(docs, db_share, collection_name=name)

biosample_set
data_object_set
metagenome_annotation_activity_set
metagenome_assembly_set
metaproteomics_analysis_activity_set
omics_processing_set
read_QC_analysis_activity_set
study_set
