# imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

from linkml_runtime.utils.schemaview import SchemaView
from toolz import dissoc, assoc
from tqdm.notebook import tqdm

from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names
from nmdc_runtime.util import collection_name_to_class_names, nmdc_schema_view, nmdc_database_collection_instance_class_names
from nmdc_schema.nmdc_schema_accepting_legacy_ids import Database as NMDCDatabase
from nmdc_schema.get_nmdc_view import ViewGetter

mdb = get_mongo_db()

# "pre-cleaning"

Only consider collections that contain at least one document with an `id` field.

In [3]:
# Schema collection names, in sorted order, restricted to collections in which
# at least one document carries an `id` field.
collection_names = [
    coll_name
    for coll_name in sorted(nmdc_schema_collection_names(mdb))
    if mdb[coll_name].find_one({"id": {"$exists": True}}) is not None
]

Remove optional properties whose value is explicitly `null`.

In [4]:
# Optional properties that are unset wherever they are stored with an explicit null value.
props = ["used", "git_url", "was_associated_with", "was_generated_by", "compression_type",]

# The loop below ticks once per (property, collection) pair, so the total must
# account for both dimensions; otherwise the bar overruns its total.
pbar = tqdm(total=len(props) * len(collection_names))
for p in props:
    # BSON type 10 is `null`: this matches fields that are present with a null
    # value, not fields that are absent.
    null_valued = {p: {"$type": 10}}
    for coll_name in collection_names:
        pbar.set_description(f"checking {coll_name}...")
        n_broken = mdb[coll_name].count_documents(null_valued)
        if n_broken:
            print(f"removing {n_broken} null-valued {p} values for {coll_name}...")
            # Target the offending documents with the same filter rather than
            # round-tripping their ids: this also covers documents that lack an
            # `id` and avoids building an arbitrarily large `$in` list.
            # The value given to `$unset` is ignored by MongoDB.
            mdb[coll_name].update_many(null_valued, {"$unset": {p: None}})
        pbar.update(1)
pbar.close()

  0%|          | 0/18 [00:00<?, ?it/s]

# materialize single-collection db view

Check assumption that every populated collection currently has documents of one type only.

In [5]:
# Each collection under consideration must map to exactly one class name.
for coll_name in collection_names:
    class_names = collection_name_to_class_names[coll_name]
    assert len(class_names) == 1

Define helper function.

In [6]:
def class_hierarchy_as_list(obj):
    """Return the class names in the inheritance hierarchy of `obj`.

    The walk starts at the object's own class and visits base classes
    depth-first, in declaration order. A class named "YAMLRoot" is neither
    listed nor descended into. A class reachable through several paths is
    listed once per path.
    """
    names = []
    # Explicit stack instead of recursion; bases are pushed in reverse so that
    # the first declared base is popped (and therefore visited) first.
    pending = [obj.__class__]
    while pending:
        cls = pending.pop()
        if cls.__name__ == "YAMLRoot":
            continue
        names.append(cls.__name__)
        pending.extend(reversed(cls.__bases__))
    return names

Materialize the `alldocs` collection, recording each document's class and all of its inherited classes in the `type` field.

In [10]:
# Exploratory look at the `extraction_set` collection.
# Only the value of the last expression is displayed by the notebook, so the
# count computed on the first line is not shown.
mdb.extraction_set.estimated_document_count()
mdb.extraction_set.find_one()

{'_id': ObjectId('64b59413fe178b5f0339ca41'),
 'end_date': '2018-05-08',
 'has_input': ['nmdc:procsm-11-dha8mw20'],
 'has_output': ['nmdc:procsm-11-xb11xa62'],
 'id': 'nmdc:extrp-11-k5fecy41',
 'processing_institution': 'Battelle',
 'quality_control_report': {'status': 'pass'},
 'start_date': '2017-06-07T20:26Z',
 'extraction_target': 'DNA',
 'input_mass': {'has_numeric_value': 0.25, 'has_unit': 'g'}}

In [9]:
# Rebuild `alldocs` from scratch so that re-running this cell does not append
# to a previous materialization.
mdb.alldocs.drop()

n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)
pbar = tqdm(total=n_docs_total)

# Upper bound on the number of documents held in memory and sent per
# `insert_many` call; also the granularity at which the progress bar advances.
INSERT_BATCH_SIZE = 10_000

for coll_name in collection_names:
    pbar.set_description(f"processing {coll_name}...")
    # Load one sample document through the schema's Database class to obtain a
    # typed instance, from which the class hierarchy for the whole collection
    # is derived (each collection is assumed to hold a single class).
    # NOTE: this raises ValueError when the sample document has a field the
    # schema class does not accept (e.g. "Unknown argument: quality_control_report").
    nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})
    exemplar = getattr(nmdcdb, coll_name)[0]
    newdoc_type = class_hierarchy_as_list(exemplar)

    # Copy the collection in bounded batches, replacing any existing `type`
    # with the class hierarchy and dropping `_id` so `alldocs` assigns its own.
    batch = []
    for doc in mdb[coll_name].find():
        batch.append(assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type))
        if len(batch) >= INSERT_BATCH_SIZE:
            mdb.alldocs.insert_many(batch)
            pbar.update(len(batch))
            batch = []
    if batch:
        mdb.alldocs.insert_many(batch)
        pbar.update(len(batch))

pbar.close()
# The index cannot be created with unique=True: nmdc:0078a0f981ad3f92693c2bc3b6470791
# prevents it (presumably that id occurs more than once -- TODO confirm).
mdb.alldocs.create_index("id")
print("refreshed `alldocs` collection")

  0%|          | 0/171332 [00:00<?, ?it/s]

ValueError:  Unknown argument: quality_control_report = {'status': 'pass'}

# Validation

Collect "top level" classes (the ranges of nmdc:Database slots), together with all of their ancestors.

In [8]:
nmdc_view = nmdc_schema_view()
# Union of the ancestor sets reported by the schema view for every class name
# that can be instantiated in a Database collection.
toplevel_classes = {
    ancestor
    for class_name in nmdc_database_collection_instance_class_names()
    for ancestor in nmdc_view.class_ancestors(class_name)
}

In [9]:
# Display the collected class names for inspection.
toplevel_classes

{'Activity',
 'Biosample',
 'BiosampleProcessing',
 'CollectingBiosamplesFromSite',
 'DataObject',
 'Extraction',
 'FieldResearchSite',
 'FunctionalAnnotation',
 'FunctionalAnnotationAggMember',
 'GenomeFeature',
 'LibraryPreparation',
 'MagsAnalysisActivity',
 'MaterialEntity',
 'MetabolomicsAnalysisActivity',
 'MetagenomeAnnotationActivity',
 'MetagenomeAssembly',
 'MetagenomeSequencingActivity',
 'MetaproteomicsAnalysisActivity',
 'MetatranscriptomeActivity',
 'NamedThing',
 'NomAnalysisActivity',
 'OmicsProcessing',
 'PlannedProcess',
 'Pooling',
 'ProcessedSample',
 'ReadBasedTaxonomyAnalysisActivity',
 'ReadQcAnalysisActivity',
 'Site',
 'Study',
 'WorkflowExecutionActivity'}

Referential integrity checking:
- "naive" errors collected in `not_found` list
- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list

In [None]:
# Messages are accumulated per error kind:
#   "not_found":    the referenced id is absent from `alldocs`
#   "invalid_type": the referenced id exists, but no matching document lists the
#                   slot's range among its `type` values
errors = {"not_found": [], "invalid_type": []}

n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)
pbar = tqdm(total=n_docs_total)

for name in sorted(collection_names):
    # Each collection is assumed to hold documents of a single class.
    cls_name = collection_name_to_class_names[name][0]
    slot_map = {
        slot.name: slot
        for slot in nmdc_view.class_induced_slots(cls_name)
    }
    pbar.set_description(f"processing {name}...")
    for doc in mdb[name].find():
        doc = dissoc(doc, "_id")
        for field, value in doc.items():
            assert field in slot_map, f"{name} doc {doc['id']}: field {field} not a valid slot"
            raw_range = slot_map[field].range
            # Check the range *before* stringifying it: str(None) is the truthy
            # string "None", which would make this assertion unable to fail.
            assert raw_range, f"{name} doc {doc.get('id')}: slot {field} has no range"
            slot_range = str(raw_range)
            # Only slots whose range is a top-level class are treated as
            # references to other documents.
            if slot_range not in toplevel_classes:
                continue
            # Normalize single-valued slots so both cases share one loop.
            if not isinstance(value, list):
                value = [value]
            for v in value:
                if mdb.alldocs.find_one({"id": v}, ["_id"]) is None:
                    errors["not_found"].append(f"{name} doc {doc['id']}: field {field} referenced doc {v} not found")
                elif mdb.alldocs.find_one({"id": v, "type": slot_range}, ["_id"]) is None:
                    errors["invalid_type"].append(f"{name} doc {doc['id']}: field {field} referenced doc {v} not of type {slot_range}")
        pbar.update(1)
pbar.close()

In [None]:
# Number of collected messages of each error kind: (not found, invalid type).
len(errors["not_found"]), len(errors["invalid_type"])

In [None]:
# Show the first five "not found" messages.
errors["not_found"][:5]

In [None]:
# Spot check: True means no document with this id exists in `alldocs`.
mdb.alldocs.find_one({"id": "nmdc:mga0vx38"}) is None

In [None]:
# Show the first five "invalid type" messages.
errors["invalid_type"][:5]

In [None]:
# OmicsProcessing is not subclass of Activity (!)
# Fetch the `alldocs` entry for this id to inspect its `type` values.
mdb.alldocs.find_one({"id": "emsl:570856"})