In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before running each cell so edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os

from dotenv import load_dotenv
# Load environment variables from a per-user dotenv file in the home directory.
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

# NOTE(review): this import deliberately follows load_dotenv(); presumably
# nmdc_mongo reads its connection settings from the environment -- confirm
# before moving it up with the other imports.
from nmdc_mongo import get_db

# Database handle used by the cells below.
db = get_db("dwinston_share")

In [3]:
# Maps a file-name pattern to a human-readable description of that file.
# Each key is later stored as a note's "sh:pattern" and each value as its
# "skos:note". Insertion order is kept as originally written.
fmap = {
    # Reads QC outputs
    "filterStats.txt": "Reads QC summary statistics",
    ".filtered.fastq.gz": "Reads QC result fastq (clean data)",
    # Assembly outputs
    "mapping_stats.txt": "Assembled contigs coverage information",
    "assembly_contigs.fna": "Final assembly contigs fasta",
    "assembly_scaffolds.fna": "Final assembly scaffolds fasta",
    "assembly.agp": "An AGP format file describes the assembly",
    "pairedMapped_sorted.bam": (
        "Sorted bam file of reads mapping back "
        "to the final assembly"
    ),
    # Annotation outputs
    "KO TSV": "Tab delimited file for KO annotation.",
    "EC TSV": "Tab delimited file for EC annotation.",
    "Protein FAA": "FASTA amino acid file for annotated proteins.",
    # Metaproteomics outputs
    "MSGFjobs_MASIC_resultant.tsv": (
        "Tab delimited file of unfiltered metaproteomics results, "
        "both identifications and abundances"
    ),
    "_Peptide_Report.tsv": (
        "Tab delimited file of peptide results filtered to ~5% FDR, "
        "including protein and abundance information"
    ),
    "_Protein_Report.tsv": (
        "Tab delimited file of protein results derived from ~5% FDR filtered peptide data, "
        "including aggregated abundance information"
    ),
    "_QC_metrics.tsv": (
        "Tab delimited file of aggregate statistics "
        "derived from workflow results"
    ),
}

In [4]:
# Namespace prefixes (SKOS and SHACL) merged into every note document so the
# "skos:" / "sh:" prefixed keys below can be resolved.
context = {
    "@context": {
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "sh": "http://www.w3.org/ns/shacl#",
    }
}

# One note record per fmap entry: the file-name pattern and its description.
mappings = [
    {"sh:pattern": pattern, "skos:note": comment}
    for pattern, comment in fmap.items()
]

In [5]:
from toolz import merge

# Upsert one note document per pattern into the `notes` collection.
#
# The filter matches on "sh:pattern" only. Filtering on the whole mapping
# (pattern + comment) would insert a second document for the same pattern
# whenever a comment is edited and this cell is re-run, leaving a stale
# duplicate that a later lookup by pattern could pick up.
for m in mappings:
    db.notes.replace_one(
        {"sh:pattern": m["sh:pattern"]},
        merge(m, context),  # stored document = mapping fields + "@context"
        upsert=True,
    )

In [6]:
from nmdc_mongo import dbschema, collschemas_for
from nmdc_mongo.admin import admin_client, reset_database_schema

# JSON-schema fragment describing a reference from a data object to a document
# in another collection, modelled after MongoDB DBRefs (see the URL below).
# XXX $jsonSchema incompatible with $ref and $id convention, hence the plain
# "ref" / "id" property names.
dbref = dict(
    type="object",
    description="https://docs.mongodb.com/manual/reference/database-references/#dbrefs",
    required=["ref", "id"],
    properties=dict(
        ref=dict(type="string"),
        id=dict(bsonType="objectId"),
    ),
)

# Allow DataObject documents to carry a `_note` reference.
dbschema["definitions"]["DataObject"]["properties"]["_note"] = dbref

# Re-apply the collection schema for data_object_set so the new property is accepted.
reset_database_schema(
    admin_client["dwinston_share"],
    ["data_object_set"],
    collschemas_for(dbschema),
)

updating data_object_set


In [7]:
# Attach to every data object whose name matches a pattern a `_note` reference
# ({"ref": "notes", "id": <note _id>}) pointing at the corresponding note.
#
# NOTE(review): the pattern is applied as an unanchored, case-insensitive
# regular expression, so "." in a pattern matches any character.
for m in mappings:
    pattern = m["sh:pattern"]
    note = db.notes.find_one({"sh:pattern": pattern})
    if note is None:
        # Fail with a clear message instead of a TypeError on note["_id"].
        raise LookupError(f"no note stored for pattern {pattern!r}")
    # Let the server select and update the matching documents in one call
    # rather than fetching every _id and sending them back in an $in list.
    db.data_object_set.update_many(
        {"name": {"$regex": pattern, "$options": "i"}},
        {"$set": {"_note": {"ref": "notes", "id": note["_id"]}}},
    )