In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv

# Load environment variables from the localhost env file (the path is relative
# to the notebook's working directory). Presumably this is where the MONGO_*
# settings read via os.getenv further down come from — TODO confirm.
load_dotenv("../../.env.localhost")

True

In [3]:
import os

from pymongo import MongoClient

# Both clients talk to the same host and connect to it directly as a
# standalone, rather than to the entire replica set.
_shared_client_options = {
    "host": os.getenv("MONGO_HOST"),
    "directConnection": True,
}

# Client authenticated with MONGO_USERNAME / MONGO_PASSWORD.
admin_client = MongoClient(
    username=os.getenv("MONGO_USERNAME"),
    password=os.getenv("MONGO_PASSWORD"),
    **_shared_client_options,
)
# Client authenticated with MONGO_READONLY_USERNAME / MONGO_READONLY_PASSWORD.
reader_client = MongoClient(
    username=os.getenv("MONGO_READONLY_USERNAME"),
    password=os.getenv("MONGO_READONLY_PASSWORD"),
    **_shared_client_options,
)

# Source database is reached through the read-only client; the target is a
# sibling "<dbname>-scratch" database reached through the admin client.
_db_name = os.getenv("MONGO_DBNAME")
mdb_src = reader_client[_db_name]
mdb_tgt = admin_client[_db_name + "-scratch"]

In [4]:
"""No output => good."""

from pymongo.errors import OperationFailure

# Sanity check that `mdb_src` is read-only: the insert below is expected to be
# rejected with an OperationFailure, in which case the cell prints nothing.
# NOTE(review): OperationFailure is also the base class of DuplicateKeyError, so
# a leftover {"_id": "foobar"} document would make this check pass silently too.
try:
    mdb_src.study_set.insert_one({"_id": "foobar"})
except OperationFailure:
    # Expected path: the write was refused, so there is nothing to clean up.
    pass
else:
    # The insert succeeded, i.e. `mdb_src` is writable. Clean up and fail loudly.
    # This runs outside the `try` so that an OperationFailure raised by the
    # cleanup delete is not mistaken for the expected "write refused" outcome.
    print("NOT OK")
    mdb_src.study_set.delete_one({"_id": "foobar"})
    print("state restored")
    raise Exception("can write to `mdb_src`!")

In [38]:
"""Utils"""

from copy import deepcopy
from pprint import pprint

import fastjsonschema
from toolz import dissoc

from nmdc_runtime.util import get_nmdc_jsonschema_dict

def without_id_patterns(nmdc_jsonschema):
    """Return a deep copy of the schema with `pattern` removed from every `id` property.

    Only the entries of the top-level "$defs" mapping are visited. The schema
    passed in is left untouched; all edits happen on the copy.
    """
    schema_copy = deepcopy(nmdc_jsonschema)
    for definition in schema_copy["$defs"].values():
        if "properties" not in definition:
            continue
        properties = definition["properties"]
        if "id" not in properties:
            continue
        # Remove the constraint when present; a missing "pattern" is fine.
        properties["id"].pop("pattern", None)
    return schema_copy

def strip_oid(doc):
    """Return a copy of `doc` without its "_id" key (via `toolz.dissoc`)."""
    without_oid = dissoc(doc, "_id")
    return without_oid

def get_all(mdb=None, collection_name=None) -> dict:
    """Fetch every document of a collection, keyed by the collection name.

    Args:
        mdb: Database to read from. Falls back to the notebook-level `mdb_src`
            when omitted.
        collection_name: Name of the collection to read.

    Returns:
        A single-entry dict `{collection_name: [doc, ...]}` in which every
        document has had its "_id" key removed.
    """
    # Compare against None explicitly: pymongo Database objects do not support
    # truth-value testing.
    db = mdb_src if mdb is None else mdb
    return {collection_name: [strip_oid(d) for d in db[collection_name].find()]}

def get_some(mdb=None, collection_name=None, limit=5) -> dict:
    """Fetch up to `limit` documents of a collection, keyed by the collection name.

    Args:
        mdb: Database to read from. Falls back to the notebook-level `mdb_src`
            when omitted.
        collection_name: Name of the collection to read.
        limit: Maximum number of documents to fetch (passed to `find`).

    Returns:
        A single-entry dict `{collection_name: [doc, ...]}` in which every
        document has had its "_id" key removed.
    """
    # Compare against None explicitly: pymongo Database objects do not support
    # truth-value testing.
    db = mdb_src if mdb is None else mdb
    return {collection_name: [strip_oid(d) for d in db[collection_name].find(limit=limit)]}

def get_one(mdb=None, collection_name=None) -> "dict | None":
    """Fetch a single document of a collection with its "_id" key removed.

    Args:
        mdb: Database to read from. Falls back to the notebook-level `mdb_src`
            when omitted.
        collection_name: Name of the collection to read.

    Returns:
        The document returned by `find_one()` without "_id", or None when
        `find_one()` finds nothing (e.g. the collection is empty).
    """
    # Compare against None explicitly: pymongo Database objects do not support
    # truth-value testing.
    db = mdb_src if mdb is None else mdb
    doc = db[collection_name].find_one()
    # `find_one` yields None when there is no document; pass that through
    # instead of failing inside `strip_oid`.
    return None if doc is None else strip_oid(doc)

# Compile the validator once, at cell execution time, from the NMDC JSON schema
# with the `pattern` constraint stripped from every `id` property.
nmdc_jsonschema_validate = fastjsonschema.compile(without_id_patterns(get_nmdc_jsonschema_dict()))

def nmdc_schema_collection_names() -> set:
    """Return the names of the schema's `Database` properties whose `items` carry a `$ref`."""
    database_properties = get_nmdc_jsonschema_dict()["$defs"]["Database"]["properties"]
    names = set()
    for prop_name, prop_spec in database_properties.items():
        # Properties without "items", or whose items lack a (truthy) "$ref",
        # are skipped.
        item_spec = prop_spec.get("items", {})
        if item_spec.get("$ref"):
            names.add(prop_name)
    return names

def present_src_collections(mdb) -> list:
    """List the schema-defined collections that exist in `mdb` and are non-empty.

    Args:
        mdb: Database to inspect. Passing None falls back to the notebook-level
            `mdb_src`.

    Returns:
        Sorted list of names that are both returned by
        `nmdc_schema_collection_names()` and present in the database with a
        non-zero estimated document count.
    """
    # Compare against None explicitly: pymongo Database objects do not support
    # truth-value testing.
    db = mdb_src if mdb is None else mdb
    candidates = nmdc_schema_collection_names() & set(db.list_collection_names())
    return sorted(n for n in candidates if db[n].estimated_document_count())

def iter_validate(mdb=None, collection_name=None, limit=0, xform=None):
    """Validate documents of one collection against the compiled schema, printing failures.

    Each document is stripped of "_id", optionally transformed, wrapped as
    `{collection_name: [doc]}` and passed to `nmdc_jsonschema_validate`. For
    every document that fails, its `id` (or None if it has none), the
    validation error and the full document are printed. Nothing is returned.

    Args:
        mdb: Database to read from. Falls back to the notebook-level `mdb_src`
            when omitted.
        collection_name: Name of the collection to validate.
        limit: Passed to `find`; the default 0 means no limit in pymongo.
        xform: Optional callable `xform(doc, context)` applied before
            validation, where `context` is `{"collection_name": collection_name}`.
    """
    # Compare against None explicitly: pymongo Database objects do not support
    # truth-value testing.
    db = mdb_src if mdb is None else mdb
    for d in db[collection_name].find(limit=limit):
        d = strip_oid(d)
        if xform:
            d = xform(d, {"collection_name": collection_name})
        try:
            nmdc_jsonschema_validate({collection_name: [d]})
        except fastjsonschema.JsonSchemaException as e:
            # Use .get(): the document may be invalid precisely because it has
            # no "id", and the report itself must not raise KeyError.
            print(d.get("id"))
            print(e)
            pprint(d)

In [39]:
# Schema-defined collections that are present and non-empty in the source
# database; printed so the list is visible in the notebook.
collection_names = present_src_collections(mdb_src)

pprint(collection_names)

['biosample_set',
 'data_object_set',
 'mags_activity_set',
 'metabolomics_analysis_activity_set',
 'metagenome_annotation_activity_set',
 'metagenome_assembly_set',
 'metagenome_sequencing_activity_set',
 'metaproteomics_analysis_activity_set',
 'metatranscriptome_activity_set',
 'nom_analysis_activity_set',
 'omics_processing_set',
 'read_qc_analysis_activity_set',
 'study_set']


In [48]:
"""Declarative config for migration"""

# Per-collection rename table: {collection_name: {old_field_name: new_field_name}}.
# Consumed by `replace_fields`, which looks up the table by collection name.
fieldname_replacements = {
    "biosample_set": {
        "INSDC_biosample_identifiers": "insdc_biosample_identifiers",
        # # "part_of": "sample_link",
        "identifier": "samp_name",
        "GOLD_sample_identifiers": "gold_biosample_identifiers",
    },
    "study_set": {
        "GOLD_study_identifiers": "gold_study_identifiers",
    },
    "omics_processing_set": {
        "GOLD_sequencing_project_identifiers": "gold_sequencing_project_identifiers",
    },
    "mags_activity_set": {
        # Inside of mags_list of MagBin instances.
        # NOTE(review): `replace_fields` only renames top-level keys, so this
        # entry will not reach nested MagBin documents — confirm how nested
        # renames are meant to be applied.
        "num_tRNA": "num_t_rna",
        "lowDepth_contig_num": "low_depth_contig_num",
    },
    "metagenome_assembly_set": {
        "ctg_L50": "ctg_l50",
        "ctg_L90": "ctg_l90",
        "ctg_N50": "ctg_n50",
        "ctg_N90": "ctg_n90",
        "scaf_L50": "scaf_l50",
        "scaf_L90": "scaf_l90",
        "scaf_N50": "scaf_n50",
        "scaf_N90": "scaf_n90",
        "scaf_l_gt50K": "scaf_l_gt50k",
        "scaf_n_gt50K": "scaf_n_gt50k",
        "scaf_pct_gt50K": "scaf_pct_gt50k",
    },
    "read_qc_analysis_activity_set": {
        "output_read_bases": "output_base_count",
        "input_read_bases": "input_base_count",
    },
}

# Maps a short key to a biosample identifier field name.
# NOTE(review): the keys look like identifier (CURIE-style) prefixes used to
# route an identifier to the matching field — confirm against the code that
# consumes this mapping, which is not visible in this section.
biosample_id_routing = {
    "emsl": "emsl_biosample_identifiers",
    "gold": "gold_biosample_identifiers",
    "igsn": "igsn_biosample_identifiers",
    "img.taxon": "img_identifiers"
}


In [46]:
from toolz import assoc, dissoc

def replace_fields(d, context):
    """Rename top-level fields of `d` according to `fieldname_replacements`.

    Args:
        d: Document (dict) to transform. It is never mutated; renames are
            applied to copies produced by `assoc` / `dissoc`.
        context: Dict that must contain "collection_name", which selects the
            rename table to apply.

    Returns:
        A dict in which each configured old field name present in `d` has been
        replaced by its new name. For a collection with no entry in
        `fieldname_replacements`, `d` is returned unchanged.

    Raises:
        AssertionError: If `context` has no "collection_name" key.
    """
    assert "collection_name" in context
    # Not every collection has renames configured; fall back to an empty table
    # instead of raising KeyError.
    replacements = fieldname_replacements.get(context["collection_name"], {})
    for old, new in replacements.items():
        if old in d:
            d = dissoc(assoc(d, new, d[old]), old)
    return d

# drops

In [50]:
# Validate every collection that has field renames configured, applying the
# renames before validation.
# NOTE(review): this loop used to iterate `drop.items()`, but no `drop` is
# defined in the visible notebook, so it failed with NameError on a fresh
# kernel. Iterating the rename table is a best guess at the intent — confirm.
for coll_name in fieldname_replacements:
    iter_validate(mdb_src, coll_name, xform=replace_fields)