In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import os

from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import OperationFailure

# Load connection settings (host, credentials, database name) from the local env file.
load_dotenv("../../.env.localhost")

# Client authenticated with the admin credentials; used below only for the "-scratch" target database.
admin_client = MongoClient(
    host=os.getenv("MONGO_HOST"),
    username=os.getenv("MONGO_USERNAME"),
    password=os.getenv("MONGO_PASSWORD"),
    directConnection=True, # connect to host as a standalone, rather than to entire replicaset
)
# Client authenticated with the read-only credentials; used below for the source database.
reader_client = MongoClient(
    host=os.getenv("MONGO_HOST"),
    username=os.getenv("MONGO_READONLY_USERNAME"),
    password=os.getenv("MONGO_READONLY_PASSWORD"),
    directConnection=True, # connect to host as a standalone, rather than to entire replicaset
)

# Source database: accessed through the read-only client.
# NOTE: `os.getenv` returns None when MONGO_DBNAME is unset, in which case the
# string concatenation on the `mdb_tgt` line raises TypeError.
mdb_src = reader_client[os.getenv("MONGO_DBNAME")]
# Target database: same name with a "-scratch" suffix, accessed through the admin client.
mdb_tgt = admin_client[os.getenv("MONGO_DBNAME") + "-scratch"]


# Safety check: confirm that `mdb_src` really is read-only by attempting a write.
# Only the probe insert is guarded, so that an OperationFailure raised later
# (e.g. by the cleanup delete) is not mistaken for "the write was refused".
try:
    mdb_src.study_set.insert_one({"_id": "foobar"})
except OperationFailure:
    # Expected path: the server refused the write.
    # NOTE(review): a duplicate-key error on a leftover "foobar" document is
    # presumably also reported as an OperationFailure subclass -- confirm.
    pass
else:
    # The insert succeeded, so the source database is writable: undo and fail loudly.
    print("NOT OK")
    mdb_src.study_set.delete_one({"_id": "foobar"})
    print("state restored")
    raise Exception("can write to `mdb_src`!")

No output from above? Good! That means the write attempt was refused, so `mdb_src` is read-only.

In [67]:
"""Utils"""

from copy import deepcopy
from pprint import pprint

import fastjsonschema
from toolz import dissoc

from nmdc_runtime.util import get_nmdc_jsonschema_dict

def without_id_patterns(nmdc_jsonschema):
    """Return a deep copy of the schema with `pattern` removed from every `id` property.

    Only entries under `$defs` that have a `properties` mapping containing an
    `id` key are touched. The input schema is left unmodified.
    """
    schema_copy = deepcopy(nmdc_jsonschema)
    for class_spec in schema_copy["$defs"].values():
        if "properties" not in class_spec:
            continue
        class_props = class_spec["properties"]
        if "id" in class_props:
            # `pop` with a default: classes whose `id` has no pattern are fine.
            class_props["id"].pop("pattern", None)
    return schema_copy

def strip_oid(doc):
    """Return `doc` without its `_id` key (via `toolz.dissoc`, so `doc` itself is not modified)."""
    oid_key = "_id"
    return dissoc(doc, oid_key)

def get_all(mdb=None, collection_name=None) -> dict[str, list[dict]]:
    """Fetch every document of a collection, with `_id` stripped.

    Args:
        mdb: database to read from; falls back to the global `mdb_src` when None.
        collection_name: name of the collection to read.

    Returns:
        A single-entry dict `{collection_name: [doc, ...]}`.
    """
    # Explicit `is None` test: pymongo `Database` objects do not support truth-value testing.
    db = mdb_src if mdb is None else mdb
    return {collection_name: [strip_oid(d) for d in db[collection_name].find()]}

def get_some(mdb=None, collection_name=None, limit=5) -> dict:
    """Fetch up to `limit` documents of a collection, with `_id` stripped.

    Args:
        mdb: database to read from; falls back to the global `mdb_src` when None.
        collection_name: name of the collection to read.
        limit: passed to `find(limit=...)`.

    Returns:
        A single-entry dict `{collection_name: [doc, ...]}`.
    """
    # Explicit `is None` test: pymongo `Database` objects do not support truth-value testing.
    db = mdb_src if mdb is None else mdb
    return {collection_name: [strip_oid(d) for d in db[collection_name].find(limit=limit)]}

def get_one(mdb=None, collection_name=None) -> dict:
    """Fetch one document of a collection (as returned by `find_one()`), with `_id` stripped.

    Args:
        mdb: database to read from; falls back to the global `mdb_src` when None.
        collection_name: name of the collection to read.
    """
    # Explicit `is None` test: pymongo `Database` objects do not support truth-value testing.
    db = mdb_src if mdb is None else mdb
    return strip_oid(db[collection_name].find_one())

# Compiled validator for the NMDC JSON schema with the `id` patterns removed
# (see `without_id_patterns`). Built once here and reused for every document.
nmdc_jsonschema_validate = fastjsonschema.compile(without_id_patterns(get_nmdc_jsonschema_dict()))

def nmdc_schema_collection_names() -> set:
    """Return the names of `Database` properties whose `items` carry a `$ref`."""
    database_props = get_nmdc_jsonschema_dict()["$defs"]["Database"]["properties"]
    names = set()
    for prop_name, prop_spec in database_props.items():
        # Keep only properties whose `items` entry has a truthy `$ref`.
        if prop_spec.get("items",{}).get("$ref"):
            names.add(prop_name)
    return names

def present_src_collections(mdb) -> list:
    """List the schema-defined collections that exist in `mdb` and are non-empty.

    Args:
        mdb: database to inspect; falls back to the global `mdb_src` when None.

    Returns:
        Sorted collection names that are both returned by
        `nmdc_schema_collection_names()` and present in the database with a
        non-zero `estimated_document_count()`.
    """
    # Explicit `is None` test: pymongo `Database` objects do not support truth-value testing.
    db = mdb_src if mdb is None else mdb
    existing_names = set(db.list_collection_names())
    return sorted(
        n for n in (nmdc_schema_collection_names() & existing_names)
        if db[n].estimated_document_count()
    )

Reference for anticipated migration logic/config: https://github.com/microbiomedata/nmdc-schema/blob/v7.4.10/nmdc_schema/migrate_3_2_to_7.py

In [74]:
from inspect import signature
import re

from toolz import assoc, assoc_in, dissoc


# Matches a "PO:" or "ENVO:" prefix followed by digits. Not anchored, so when
# used with `re.search` it matches anywhere inside a longer string.
id_pattern = re.compile(r"(PO|ENVO):\d+")
# Matches "<prefix>:<local part>", where the prefix is a letter followed by one
# or more word characters and the local part is one or more word characters.
curie_pattern = re.compile(r"[A-Za-z]\w+:\w+")


def ensure_term_id(v):
    """Ensure `v` has a `term` entry, deriving it from `has_raw_value` when available.

    Args:
        v: mapping with a `term` and/or `has_raw_value` key.

    Returns:
        `v` unchanged when only `term` is present; otherwise a copy of `v`
        with `term` set to `{"id": v["has_raw_value"]}`.

    Raises:
        Exception: if neither key is present, or if `has_raw_value` contains
            no match for `id_pattern`.
    """
    if not ("term" in v or "has_raw_value" in v):
        raise Exception("needs `term` or `has_raw_value`")
    if "has_raw_value" not in v:
        # Only `term` is present: there is nothing to derive it from, keep as-is
        # (previously this path raised KeyError on the lookup below).
        return v

    raw_value = v["has_raw_value"]
    # NOTE(review): `search` accepts an ID anywhere in the raw value, yet the
    # whole raw value is used as the term id -- confirm that is intended.
    if not re.search(id_pattern, raw_value):
        raise Exception(f'{raw_value} does not match a known ID pattern')

    return assoc(v, "term", {"id": raw_value})


def uppercase_curie_prefixes(v):
    """Return a new list in which the prefix (text before the first ":") of each element is upper-cased.

    Raises:
        Exception: if `v` is not a list, or if any element lacks a ":".
    """
    if not isinstance(v, list):
        raise Exception("list expected")
    if any(":" not in curie for curie in v):
        raise Exception("CURIEs expected")

    def _with_upper_prefix(curie):
        # Split on the first ":" only; the local part may itself contain colons.
        head, _, tail = curie.partition(":")
        return head.upper() + ":" + tail

    return [_with_upper_prefix(curie) for curie in v]

def lowercase_curie_prefixes(v):
    """Return a new list in which the prefix (text before the first ":") of each element is lower-cased.

    Raises:
        Exception: if `v` is not a list, or if any element lacks a ":".
    """
    if not isinstance(v, list):
        raise Exception("list expected")
    if any(":" not in curie for curie in v):
        raise Exception("CURIEs expected")

    def _with_lower_prefix(curie):
        # Split on the first ":" only; the local part may itself contain colons.
        head, _, tail = curie.partition(":")
        return head.lower() + ":" + tail

    return [_with_lower_prefix(curie) for curie in v]

def ensure_curies(v):
    """Extract the first `curie_pattern` match from each element of `v`.

    >>> ensure_curies(['HTTPS://identifiers.org/gold:Gs0144557'])
    ['gold:Gs0144557']

    Raises:
        Exception: if `v` is not a list, if any element lacks a ":", or if an
            element contains no match for `curie_pattern`.
    """
    if not isinstance(v, list):
        raise Exception("list expected")
    if not all(":" in elt for elt in v):
        raise Exception("CURIE-ables expected")

    curies = []
    for elt in v:
        found = re.search(curie_pattern, elt)
        if found is None:
            # A ":" alone does not guarantee a match (e.g. "https://x"); report
            # the offending element instead of failing on `None.group`.
            raise Exception(f"no CURIE found in {elt!r}")
        curies.append(found.group(0))
    return curies

def ensure_depth_via_depth2(v, d):
    """Return field updates that keep `d["depth"]` as-is and map `depth2` to None.

    `v` (the current `depth2` value) is not inspected.

    Raises:
        Exception: if `d` has no `depth` key.
    """
    if "depth" in d:
        return {"depth": d["depth"], "depth2": None}
    raise Exception("no `depth` field")
#     depth = d["depth"]
#     if v['has_unit'] != depth['has_unit']:
#         raise Exception("disagreement wrt units")
#     if 'has_numeric_value' not in v:
#         return {"depth
    
    
#     if 'has_maximum_numeric_value' in v:
#         if v['has_maximum_numeric_value'] != depth2['has_numeric_value']:
#             raise Exception("disagreement wrt maximum value")
#     else:
#         v = assoc(v, "has_maximum_numeric_value", depth2["has_numeric_value"])
#         if 'has_minimum_numeric_value' not in v:
#             # sets missing min value as given numeric value
#             v = assoc(v, "has_minimum_numeric_value", v["has_numeric_value"])
#         else:
#             if v["has_numeric_value"] != v["has_minimum_numeric_value"]:
#                 # XXX Is this really a problem?
#                 raise Exception("min value not the same as numeric value")


def remove_type_field(v):
    """Return `v` without its `type` key (via `toolz.dissoc`, so `v` itself is not modified)."""
    dropped_key = "type"
    return dissoc(v, dropped_key)


def ensure_number_in_meters(v):
    """Reduce a value to a bare number.

    A dict must have `has_unit` equal to "meter" or "metre"; its
    `has_numeric_value` is returned. Any other input must already be an int
    or float and is returned unchanged. Violations raise AssertionError.
    """
    if not isinstance(v, dict):
        assert isinstance(v, (int, float)), "not a number"
        return v

    assert "has_unit" in v, "no units"
    assert v["has_unit"] in ("meter", "metre"), "not meters"
    return v["has_numeric_value"]

def ensure_gold_sequencing_project_identifiers(v, d):
    """Replace "gold:gb..." identifiers in `v` with the document's own "gold:gp..." id.

    Args:
        v: list of identifier strings.
        d: the enclosing document; its `id` is used as the substitute.

    Returns:
        A mapping of field updates, as `replace_fields` expects from
        two-argument transforms: `{"gold_sequencing_project_identifiers": [...]}`,
        or an empty mapping (no updates) when `d` lacks that field.

    Raises:
        Exception: if `v` is not a list, or if a "gold:gb..." element is found
            and `d["id"]` does not start with "gold:gp" (case-insensitive).
    """
    if not isinstance(v, list):
        raise Exception("list expected")
    if "gold_sequencing_project_identifiers" not in d:
        # Always return a mapping: `replace_fields` calls `.items()` on the
        # result of two-argument transforms (returning the list `v` would break it).
        return {}

    rv = []
    for elt in v:
        if elt.lower().startswith("gold:gb"):
            if d["id"].lower().startswith("gold:gp"):
                rv.append(d["id"])
            else:
                raise Exception("no avaialable substitute")
        else:
            rv.append(elt)
    return {"gold_sequencing_project_identifiers": rv}


def rename_num_tRNA(v):
    """Rename the `num_tRNA` key to `num_t_rna` in each element of `v`.

    Elements that have no `num_tRNA` key are kept unchanged (they used to be
    dropped from the result by the comprehension filter).

    Args:
        v: list of mappings.

    Returns:
        A new list with the same length and order as `v`.
    """
    return [
        change_fieldname(elt, "num_tRNA", "num_t_rna") if "num_tRNA" in elt else elt
        for elt in v
    ]


# Per-collection migration rules consumed by `replace_fields`.
#
# For each collection, maps a field name to either:
#   * a string  -> the field is renamed to that string, or
#   * a list of functions -> applied in order to the field. One-parameter
#     functions receive the field value and their result replaces it;
#     two-parameter functions receive (value, document) and return a mapping
#     of field updates, where a None value removes that field.
#
# Rules are applied in insertion order, so a rename must come before any
# transform keyed on the new name.
fieldname_replacements = {
    "biosample_set": {
        "INSDC_biosample_identifiers": "insdc_biosample_identifiers",
        # # "part_of": "sample_link",
        "identifier": "samp_name",
        "GOLD_sample_identifiers": "gold_biosample_identifiers",
        "env_broad_scale": [ensure_term_id, remove_type_field],
        "env_local_scale": [ensure_term_id, remove_type_field],
        "env_medium": [ensure_term_id, remove_type_field],
        # the transforms below run after the renames above; this depends on
        # `dict`s preserving insertion order (guaranteed since python 3.7)
        "gold_biosample_identifiers": [uppercase_curie_prefixes],
        "depth2": [ensure_depth_via_depth2],
        "lat_lon": [remove_type_field],
        "elev": [ensure_number_in_meters],
        "insdc_biosample_identifiers": [lowercase_curie_prefixes],
    },
    "study_set": {
        "GOLD_study_identifiers": "gold_study_identifiers",
        "gold_study_identifiers": [ensure_curies, uppercase_curie_prefixes],
    },
    "omics_processing_set": {
        "GOLD_sequencing_project_identifiers": "gold_sequencing_project_identifiers",
        "gold_sequencing_project_identifiers": [
            ensure_gold_sequencing_project_identifiers,
            uppercase_curie_prefixes,
        ],
    },
    "mags_activity_set": {
        "mags_list": [rename_num_tRNA],
        "lowDepth_contig_num": "low_depth_contig_num",
    },
    # Plain renames: these keys differ from their replacements only in letter case.
    "metagenome_assembly_set": {
        "ctg_L50": "ctg_l50",
        "ctg_L90": "ctg_l90",
        "ctg_N50": "ctg_n50",
        "ctg_N90": "ctg_n90",
        "scaf_L50": "scaf_l50",
        "scaf_L90": "scaf_l90",
        "scaf_N50": "scaf_n50",
        "scaf_N90": "scaf_n90",
        "scaf_l_gt50K": "scaf_l_gt50k",
        "scaf_n_gt50K": "scaf_n_gt50k",
        "scaf_pct_gt50K": "scaf_pct_gt50k",
    },
    "read_qc_analysis_activity_set": {
        "output_read_bases": "output_base_count",
        "input_read_bases": "input_base_count",
    },
}

# Maps an identifier prefix to a biosample field name.
# NOTE(review): presumably meant for routing biosample identifiers to the
# matching `*_identifiers` field by CURIE prefix; it is not referenced by any
# of the code visible in this part of the notebook -- confirm intended use.
biosample_id_routing = {
    "emsl": "emsl_biosample_identifiers",
    "gold": "gold_biosample_identifiers",
    "igsn": "igsn_biosample_identifiers",
    "img.taxon": "img_identifiers"
}


def change_fieldname(d, fieldname, replacement):
    """Return `d` with the value of `fieldname` stored under `replacement` and `fieldname` removed.

    Built from `toolz.assoc` followed by `toolz.dissoc`; raises KeyError if
    `fieldname` is not in `d`.
    """
    with_new_key = assoc(d, replacement, d[fieldname])
    return dissoc(with_new_key, fieldname)


def replace_fields(d, context):
    """Apply the `fieldname_replacements` rules for `context["collection_name"]` to `d`.

    For each rule whose field is present in `d`:
      * a non-list rule renames the field via `change_fieldname`;
      * a list rule applies each function in order. One-parameter functions
        receive the field value and their result replaces it; two-parameter
        functions receive (value, document) and return a mapping of updates,
        where a None value removes that key.

    Returns the resulting document.
    """
    assert "collection_name" in context
    rules = fieldname_replacements[context["collection_name"]]
    for fieldname, replacement in rules.items():
        if fieldname not in d:
            continue

        if not isinstance(replacement, list):
            # Plain rename.
            d = change_fieldname(d, fieldname, replacement)
            continue

        assert all(callable(r) for r in replacement), "replacement-list must be all functions"
        for transform in replacement:
            # Dispatch on arity: 1 -> value transform, 2 -> document-aware updates.
            arity = len(signature(transform).parameters)
            if arity == 1:
                d = assoc(d, fieldname, transform(d[fieldname]))
            elif arity == 2:
                updates = transform(d[fieldname], d)
                for key, new_value in updates.items():
                    if new_value is None:
                        d = dissoc(d, key)
                    else:
                        d = assoc(d, key, new_value)
            else:
                raise Exception("replaced fn takes too many parameters")
    return d


def iter_validate(mdb=None, collection_name=None, limit=0, xform=None):
    """Validate the documents of one collection, stopping at the first failure.

    Each document has `_id` stripped, is optionally transformed, and is then
    validated as `{collection_name: [doc]}` with `nmdc_jsonschema_validate`.
    On the first validation error the document's `id` and the error are
    printed and the function returns.

    Args:
        mdb: database to read from; falls back to the global `mdb_src` when None.
        collection_name: name of the collection to validate.
        limit: passed to `find(limit=...)`.
        xform: optional callable `(doc, context) -> doc`, applied before
            validation with `context = {"collection_name": collection_name}`.
    """
    # Explicit `is None` test: pymongo `Database` objects do not support truth-value testing.
    db = mdb_src if mdb is None else mdb
    print(f"validating {collection_name}...")
    for d in db[collection_name].find(limit=limit):
        d = strip_oid(d)
        if xform:
            d = xform(d, {"collection_name": collection_name})
        try:
            nmdc_jsonschema_validate({collection_name: [d]})
        except fastjsonschema.JsonSchemaException as e:
            # `.get` so that a document without `id` still reports the
            # validation error instead of raising KeyError here.
            print(d.get("id"))
            print(e)
            return

# replacements

In [75]:
# Validate every present source collection, applying the migration rules
# only where `fieldname_replacements` defines some for that collection.
for coll_name in present_src_collections(mdb_src):
    iter_validate(
        mdb_src,
        coll_name,
        xform=replace_fields if coll_name in fieldname_replacements else None,
    )

validating biosample_set...
validating data_object_set...
validating mags_activity_set...
validating metabolomics_analysis_activity_set...
nmdc:8969f454c3944f1eac9da499fb950a18
data.metabolomics_analysis_activity_set[{data__metabolomicsanalysisactivityset_x}] must not contain {'has_calibration', 'has_metabolite_quantifications'} properties
validating metagenome_annotation_activity_set...
validating metagenome_assembly_set...
validating metagenome_sequencing_activity_set...
nmdc:107ade35423143e39dc30b12832ac759
data.metagenome_sequencing_activity_set[{data__metagenomesequencingactivityset_x}].git_url must be string
validating metaproteomics_analysis_activity_set...
nmdc:ec0db6c4103faf47d3393ec27d541639
data.metaproteomics_analysis_activity_set[{data__metaproteomicsanalysisactivityset_x}] must not contain {'has_peptide_quantifications'} properties
validating metatranscriptome_activity_set...
validating nom_analysis_activity_set...
nmdc:a1453a2ae5bf4bcca2165b90b96d1082
data.nom_analysis_a