# imports

Before running this notebook, make sure you have done the following:
- `make up-dev` has been run and mongo is mapped to `localhost:27018`
- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)
- `.env` has updated `MONGO_HOST` to `mongodb://localhost:27018`
- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`


In [1]:
# Automatically reload imported modules before each cell executes, so edits to
# project source files take effect without restarting the kernel and losing state.
%load_ext autoreload
%autoreload 2

In [2]:
import os

from linkml_runtime.utils.schemaview import SchemaView
from toolz import dissoc, assoc
from tqdm.notebook import tqdm

from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names
from nmdc_runtime.util import collection_name_to_class_names, nmdc_schema_view, nmdc_database_collection_instance_class_names
from nmdc_schema.nmdc_schema_accepting_legacy_ids import Database as NMDCDatabase
from nmdc_schema.get_nmdc_view import ViewGetter

# Database handle shared by every cell below.
# NOTE(review): presumably configured via the MONGO_* environment variables
# described in the prerequisites above — confirm against `get_mongo_db`.
mdb = get_mongo_db()

# Pre-cleaning

Only consider collections that contain at least one document with an `id` field.

In [3]:
# Schema collection names, sorted, restricted to collections in which at least
# one document carries an `id` field.
collection_names = [
    coll_name
    for coll_name in sorted(nmdc_schema_collection_names(mdb))
    if mdb[coll_name].find_one({"id": {"$exists": True}})
]

Remove null-valued optional properties

In [1]:
# Check these slots for null values in every doc of every collection in `collection_names`.
props = [
    "used",
    "git_url",
    "was_associated_with",
    "was_generated_by",
    "compression_type",
    "metagenome_annotation_id",
    "metaproteomic_analysis_id",
]

# The bar is advanced once per (property, collection) pair, so its total must be
# the product of both lengths; otherwise it overruns 100% after the first property.
pbar = tqdm(total=len(props) * len(collection_names))
for p in props:
    # The {$type: 10} query matches BSON type Null, i.e. the field is present and null
    # (a plain `{p: None}` query would also match docs where the field is absent).
    null_filter = {p: {"$type": 10}}
    for coll_name in collection_names:
        pbar.set_description(f"checking {coll_name}...")
        n_broken = mdb[coll_name].count_documents(null_filter)
        if n_broken:
            print(f"removing {n_broken} null-valued {p} values for {coll_name}...")
            # Filter on the null predicate itself rather than on a list of collected ids:
            # this avoids loading every broken doc, does not require an `id` field, and
            # cannot touch a non-null value on another doc that happens to share an id.
            # The value given to `$unset` is ignored by MongoDB.
            mdb[coll_name].update_many(null_filter, {"$unset": {p: ""}})
        pbar.update(1)
pbar.close()

NameError: name 'tqdm' is not defined

# materialize single-collection db view

Check assumption that every populated collection currently has documents of one type only.

In [5]:
# Later cells take element [0] of this mapping as *the* class of a collection,
# so verify that each populated collection maps to exactly one class name.
for name in collection_names:
    class_names = collection_name_to_class_names[name]
    assert len(class_names) == 1

Define a helper function that takes a document and returns its class and all parent classes as a list

In [6]:
def class_hierarchy_as_list(obj):
    """Return the class name of `obj` followed by the names of its base classes.

    Classes are visited depth-first in pre-order (a class, then each of its
    `__bases__` from left to right). A class named "YAMLRoot" is neither
    recorded nor descended into, so it acts as the stopping point of a branch.

    Args:
        obj: Any object; only `obj.__class__` is inspected.

    Returns:
        A list of class-name strings, starting with the class of `obj`.
    """
    names = []
    # Explicit stack instead of recursion; bases are pushed in reverse so that
    # the left-most base is popped (and therefore visited) first.
    pending = [obj.__class__]
    while pending:
        cls = pending.pop()
        if cls.__name__ == "YAMLRoot":
            continue
        names.append(cls.__name__)
        pending.extend(reversed(cls.__bases__))
    return names

Materialize `alldocs` collection, associating all inherited classes with document via `type` field.

In [12]:
# Start from a clean slate: remove any `alldocs` collection left by an earlier run.
mdb.alldocs.drop()

# Size the progress bar by the (estimated) document count over all source collections.
n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)
pbar = tqdm(total=n_docs_total)

for coll_name in collection_names:
    pbar.set_description(f"processing {coll_name}...")
    source_coll = mdb[coll_name]

    # Wrap one document of the collection (minus the mongo-generated '_id')
    # in an NMDCDatabase so that it is instantiated as its schema class.
    try:
        nmdcdb = NMDCDatabase(**{coll_name: [dissoc(source_coll.find_one(), '_id')]})
    except ValueError as e:
        print(f"no {coll_name}!")
        raise e

    # The class hierarchy is computed once per collection from that exemplar and
    # reused for every document (each collection is assumed to hold a single class).
    exemplar = getattr(nmdcdb, coll_name)[0]
    newdoc_type = class_hierarchy_as_list(exemplar)

    # Copy every document into `alldocs`, dropping '_id' and replacing any existing
    # string-valued 'type' with the class-hierarchy list.
    #
    # NOTE: `type` is currently a string, does not exist for all classes, and can have typos.
    # These issues are fixed in the berkeley schema, but it is risky to use at this time.
    retyped_docs = [
        assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type)
        for doc in source_coll.find()
    ]
    mdb.alldocs.insert_many(retyped_docs)
    pbar.update(source_coll.estimated_document_count())

pbar.close()

# Prior to re-ID-ing, some IDs are not unique across Mongo collections
# (e.g. nmdc:0078a0f981ad3f92693c2bc3b6470791), so this is a plain (non-unique) index.
mdb.alldocs.create_index("id")
print("refreshed `alldocs` collection")

  0%|          | 0/224995 [00:00<?, ?it/s]

refreshed `alldocs` collection


# Validation

Collect "top level" (nmdc:Database slot range) classes.

In [14]:
nmdc_view = nmdc_schema_view()

# Union of `class_ancestors(...)` over every class that can be instantiated
# in a Database collection; used below to decide which slot ranges are references.
toplevel_classes = set()
for name in nmdc_database_collection_instance_class_names():
    toplevel_classes.update(nmdc_view.class_ancestors(name))

toplevel_classes

{'Activity',
 'Biosample',
 'BiosampleProcessing',
 'CollectingBiosamplesFromSite',
 'DataObject',
 'Extraction',
 'FieldResearchSite',
 'FunctionalAnnotation',
 'FunctionalAnnotationAggMember',
 'GenomeFeature',
 'LibraryPreparation',
 'MagsAnalysisActivity',
 'MaterialEntity',
 'MetabolomicsAnalysisActivity',
 'MetagenomeAnnotationActivity',
 'MetagenomeAssembly',
 'MetagenomeSequencingActivity',
 'MetaproteomicsAnalysisActivity',
 'MetatranscriptomeActivity',
 'NamedThing',
 'NomAnalysisActivity',
 'OmicsProcessing',
 'PlannedProcess',
 'Pooling',
 'ProcessedSample',
 'ReadBasedTaxonomyAnalysisActivity',
 'ReadQcAnalysisActivity',
 'Site',
 'Study',
 'WorkflowExecutionActivity'}

## Referential integrity checking:
- "naive" errors collected in `not_found` list
- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list

In [None]:
# Error messages, grouped by kind:
#   not_found    - the referenced id matches no document in `alldocs`
#   invalid_type - a document with that id exists, but none whose `type` list
#                  contains the slot's range class
errors = {"not_found": [], "invalid_type": []}

n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)
pbar = tqdm(total=n_docs_total)

for name in sorted(collection_names):
    cls_name = collection_name_to_class_names[name][0]
    # Slot definitions for this collection's class, keyed by slot name.
    slot_map = {slot.name: slot for slot in nmdc_view.class_induced_slots(cls_name)}
    pbar.set_description(f"processing {name}...")
    for doc in mdb[name].find():
        doc = dissoc(doc, "_id")
        for field, value in doc.items():
            assert field in slot_map, f"{name} doc {doc['id']}: field {field} not a valid slot"
            slot_range = str(slot_map[field].range)
            # NOTE(review): `str(...)` of anything is a non-empty string, so this
            # assertion cannot fail even if the slot's range is None — confirm intent.
            assert slot_range, type(slot_range)
            # Only slots whose range is a top-level class hold references to other docs.
            if slot_range in toplevel_classes:
                # Treat single-valued and multivalued slots uniformly.
                referenced_ids = value if isinstance(value, list) else [value]
                for v in referenced_ids:
                    if mdb.alldocs.find_one({"id": v}, ["_id"]) is None:
                        errors["not_found"].append(f"{name} doc {doc['id']}: field {field} referenced doc {v} not found")
                    elif mdb.alldocs.find_one({"id": v, "type": slot_range}, ["_id"]) is None:
                        errors["invalid_type"].append(f"{name} doc {doc['id']}: field {field} referenced doc {v} not of type {slot_range}")
        pbar.update(1)
pbar.close()

  0%|          | 0/224995 [00:00<?, ?it/s]

## Results

In [15]:
# Number of collected error messages per category: (not_found, invalid_type).
len(errors["not_found"]), len(errors["invalid_type"])

(4857, 23503)

In [16]:
# First five "not found" messages, as a sample.
errors["not_found"][:5]

['mags_activity_set doc nmdc:fdefb3fa15098906cf788f5cadf17bb3: field part_of referenced doc nmdc:mga0vx38 not found',
 'mags_activity_set doc nmdc:78f8bf24916f01d053378b1bd464cd8a: field has_input referenced doc nmdc:9003278a200d1e7921e978d4c59233c3 not found',
 'mags_activity_set doc nmdc:a57ecfc4dee4e6938a5517ad0961dcd8: field part_of referenced doc nmdc:mga08x19 not found',
 'mags_activity_set doc nmdc:3e0d8aae3b16d5bba2b3faec04391929: field part_of referenced doc nmdc:mga06z11 not found',
 'mags_activity_set doc nmdc:4417090e8ce0e96ff2867b85823d4b26: field part_of referenced doc nmdc:mga07m45 not found']

In [17]:
# Spot-check the first reported missing reference: no doc in `alldocs` has this id.
mdb.alldocs.find_one({"id": "nmdc:mga0vx38"}) is None

True

In [18]:
# First five "invalid type" messages, as a sample.
errors["invalid_type"][:5]

['data_object_set doc emsl:output_570856: field was_generated_by referenced doc emsl:570856 not of type Activity',
 'data_object_set doc emsl:output_570991: field was_generated_by referenced doc emsl:570991 not of type Activity',
 'data_object_set doc emsl:output_570998: field was_generated_by referenced doc emsl:570998 not of type Activity',
 'data_object_set doc emsl:output_570855: field was_generated_by referenced doc emsl:570855 not of type Activity',
 'data_object_set doc emsl:output_570823: field was_generated_by referenced doc emsl:570823 not of type Activity']

In [19]:
# OmicsProcessing is not a subclass of Activity (!): the referenced doc exists,
# but its materialized `type` list (see output) does not include 'Activity'.
mdb.alldocs.find_one({"id": "emsl:570856"})

{'_id': ObjectId('663fbef9ba64633177320f59'),
 'id': 'emsl:570856',
 'name': 'Rachael_21T_04-15A_M_14Mar17_leopard_Infuse',
 'instrument_name': '21T Agilent',
 'has_input': ['emsl:2f71038a-5dd1-11ec-bf63-0242ac130002'],
 'has_output': ['emsl:output_570856'],
 'omics_type': {'has_raw_value': 'Organic Matter Characterization'},
 'part_of': ['gold:Gs0110138'],
 'description': 'High resolution MS spectra only',
 'processing_institution': 'EMSL',
 'gold_sequencing_project_identifiers': [],
 'type': ['OmicsProcessing', 'PlannedProcess', 'NamedThing']}