In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from time import time
import os

# Timestamp taken at the start of the run.
# NOTE(review): no later use of `tic` is visible in this section — presumably
# it feeds an elapsed-time report elsewhere; confirm or remove.
tic = time()

from dotenv import load_dotenv
# Load variables from ~/.nmdc_mongo.env into the process environment; the
# NMDC_MONGO_* values are read with os.getenv when the admin client is built.
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

True

In [3]:
import json
import re
from toolz import assoc_in, dissoc
from zipfile import ZipFile

from mongospawn.schema import collschemas_for

from nmdc_mongo import (
    add_to_db,
    correct_metaP_doc,
    dbschema,
    fetch_and_validate_json,
    fetch_conform_and_persist_from_manifest,
    fetch_json,
    get_db,
    reset_database,
    snake_case_set_name
)

In [4]:
###########################
# Adjustments for GSP below
###########################

# Object type names declared under the schema's "definitions".
defined_object_names = set(dbschema["definitions"])

# Object type name -> name of the top-level set property whose items reference
# it (the part of the "$ref" after "#/definitions/").
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

existing_set_names = set(dbschema["properties"])

# Give every defined object type that has no set property a set of its own.
for object_without_set in (defined_object_names - set(set_for_object_name.keys())):
    proposed_set_name = snake_case_set_name(object_without_set)
    if proposed_set_name not in existing_set_names:
        dbschema["properties"][proposed_set_name] = {
            "description": (f"This property links a database object to the set of"
                            f" {object_without_set} objects within it."),
            "items": {"$ref": f"#/definitions/{object_without_set}"},
            "type": "array",
        }
        # Track the addition so a second object type that maps to the same set
        # name cannot silently overwrite the entry just created.
        existing_set_names.add(proposed_set_name)

# Represent ControlledTermValue.term as a plain string rather than a reference.
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
# pop() instead of del: once this cell has run, "$ref" is already gone from the
# rebound `dbschema`, and a plain del would raise KeyError on a re-run.
dbschema["definitions"]["ControlledTermValue"]["properties"]["term"].pop("$ref", None)

# 'k' not capitalized upstream perhaps. should conform!
#dbschema = assoc_in(dbschema, ["definitions", "MetagenomeAssembly", "properties", "scaf_l_gt50k", "type"], "number")

In [5]:
# Per-collection schemas derived from the adjusted database schema.
collschemas = collschemas_for(dbschema)

# Rebuild the object-name -> set-name lookup so it reflects the set properties
# present in the schema now.
set_for_object_name = dict(
    (spec["items"]["$ref"].rpartition("#/definitions/")[2], set_name)
    for set_name, spec in dbschema["properties"].items()
)

(Re-)load existing NMDC DB from file.

In [10]:
with ZipFile('../src/data/nmdc_database.json.zip') as myzip:
    # may be e.g. 'metadata-translation/src/bin/output/nmdc_database.json' rather than 'nmdc_database.json'
    name = next((n for n in myzip.namelist() if n.endswith("nmdc_database.json")), None)
    if name is None:
        # Fail with a readable message instead of a bare StopIteration.
        raise FileNotFoundError(
            "no member ending in 'nmdc_database.json' in ../src/data/nmdc_database.json.zip"
        )
    with myzip.open(name) as f:
        # Parsed JSON; later cells iterate it as {collection name: list of documents}.
        nmdc_database = json.load(f)

In [11]:
# Working database handle used by the rest of the notebook.
db = get_db("dwinston_dev")
# NOTE(review): reset_database presumably wipes/re-initialises the database's
# collections — confirm in nmdc_mongo before re-running against shared data.
reset_database(db)

In [12]:
#add_to_db(nmdc_database["study_set"], get_db("dwinston_share"), "study_set")

In [13]:
from nmdc_mongo import validator_for

#validator_for(db.study_set)

In [14]:
# Destination collection for each object type, keyed by the document's "type"
# value with its first five characters (the "nmdc:" prefix) removed.
target_collection = {
    "Study": "study_set",
    "OmicsProcessing": "omics_processing_set",
    "Biosample": "biosample_set",
    "DataObject": "data_object_set",
    "MetagenomeAssembly": "metagenome_assembly_set",
    "MetaProteomicAnalysis": "metaproteomics_analysis_activity_set",
    "MetagenomeAnnotation": "metagenome_annotation_activity_set",
    "ReadQCAnalysisActivity": "read_QC_analysis_activity_set",
}

# Report which object types each source collection actually contains.
for collection in nmdc_database:
    docs = nmdc_database[collection]
    # Untyped documents fall back to "nmdc:", which slices to "" and is dropped.
    object_types = {d.get("type", "nmdc:")[5:] for d in docs} - {""}
    # Test for the missing key itself; the earlier any(d for d in ...) form
    # tested document truthiness and so overlooked empty documents.
    if any("type" not in d for d in docs):
        print("some",collection,"docs have no type")
    print(collection, object_types)

study_set {'Study'}
omics_processing_set {'OmicsProcessing'}
biosample_set {'Biosample'}
data_object_set {'DataObject'}
metagenome_assembly_set {'MetagenomeAssembly'}
read_QC_analysis_activity_set {'ReadQCAnalysisActivity'}


In [15]:
from collections import defaultdict

# Normalise every document, route it to its target collection, then validate
# and persist each target batch.
for source_collection in nmdc_database:
    docs = nmdc_database[source_collection]
    docs_per_target = defaultdict(list)
    for d in docs:
        # Drop the first five characters ("nmdc:"); untyped documents yield "".
        type_ = d.get("type", "nmdc:")[5:]
        d_new = dissoc(d, "type")
        # Remove the nested lat_lon "type" on a copy: d_new is only a shallow
        # copy, so popping in place would also alter the document held in
        # nmdc_database.
        if isinstance(d_new.get("lat_lon"), dict):
            d_new["lat_lon"] = dissoc(d_new["lat_lon"], "type")
        # Coerce these statistics to float when present.
        for k_float in (
            "asm_score", "ctg_logsum", "ctg_powsum", "gap_pct", "gc_avg", "gc_std",
            "scaf_logsum", "scaf_powsum"):
            if k_float in d_new:
                d_new[k_float] = float(d_new[k_float])
        # Collapse {"term": {"id": ...}} to {"term": <id>}. The term must be a
        # dict: on a string, `"id" in term` is a substring test and the
        # subsequent ["id"] lookup would raise TypeError.
        keys_with_term_ids = [
            k for k in d_new
            if isinstance(d_new[k], dict)
            and isinstance(d_new[k].get("term"), dict)
            and "id" in d_new[k]["term"]
        ]
        for k in keys_with_term_ids:
            d_new = assoc_in(d_new, [k, "term"], d_new[k]["term"]["id"])

        # Typed documents go to the collection mapped for their type (KeyError
        # for an unmapped type); untyped ones stay in their source collection.
        key = target_collection[type_] if type_ else source_collection
        docs_per_target[key].append(d_new)

    for collection_name, docs in docs_per_target.items():
        print(collection_name)
        payload = fetch_and_validate_json(docs, collection_name=collection_name)
        add_to_db(payload, db, collection_name=collection_name)

study_set


  0%|          | 0/12 [00:00<?, ?it/s]

omics_processing_set


  0%|          | 0/7184 [00:00<?, ?it/s]

biosample_set


  0%|          | 0/32230 [00:00<?, ?it/s]

data_object_set


  0%|          | 0/9771 [00:00<?, ?it/s]

metagenome_assembly_set


  0%|          | 0/403 [00:00<?, ?it/s]

read_QC_analysis_activity_set


  0%|          | 0/405 [00:00<?, ?it/s]

study_set: `Additional properties are not allowed ('principal_investigator_name', 'type' were unexpected)`

omics_processing_set: `Additional properties are not allowed ('principal_investigator_name', 'mod_date', 'add_date', 'processing_institution', 'ncbi_project_name' were unexpected)`

biosample_set: 

-`Additional properties are not allowed ('location', 'mod_date', 'identifier', 'habitat', 'ncbi_taxonomy_name', 'add_date', 'community', 'sample_collection_site' were unexpected)`

-`Additional properties are not allowed ('type' was unexpected)` for `lat_lon`

data_object_set: OK

activity_set:
- `Additional properties are not allowed ('contigs', 'ctg_N50', 'scaf_max', 'contig_bp', 'scaffolds', 'gc_std', 'gap_pct', 'num_input_reads', 'scaf_N50', 'ctg_max', 'ctg_L90', 'ctg_powsum', 'gc_avg', 'scaf_powsum', 'scaf_L90', 'ctg_L50', 'scaf_l_gt50k', 'scaf_N90', 'ctg_N90', 'scaf_L50', 'asm_score', 'scaf_n_gt50K', 'scaf_pct_gt50K', 'num_aligned_reads', 'scaf_bp', 'ctg_logsum', 'scaf_logsum' were unexpected)` for type 'nmdc:MetagenomeAssembly'

- `Additional properties are not allowed ('input_read_count', 'output_read_count', 'input_read_bases', 'output_read_bases' were unexpected)` for type 'nmdc:ReadQCAnalysisActivity'

metagenome_assembly_set:
`Additional properties are not allowed ('scaf_l_gt50k' was unexpected)`

read_QC_analysis_activity_set:
- `Additional properties are not allowed ('output_read_bases', 'input_read_bases' were unexpected)`

Load FICUS Brodie spreadsheet and create gold-id-to-igsn map.

In [16]:
import csv
import re

# Zero-based column positions read from each spreadsheet row.
GOLD_ID_IDX = 5
IGSN_IDX = 2

# IGSN -> list of GOLD ids found on rows carrying that IGSN (a list, because
# more than one row can carry the same IGSN).
igsn_golds = defaultdict(list)

# A cell counts as a GOLD id only if it is exactly "Gb" followed by digits.
gold_id_pattern = re.compile(r"Gb\d+")

# newline="" is what the csv module requires of file objects, so that newlines
# embedded in quoted fields are parsed correctly.
with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv', newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        # Blank or truncated rows lack the id columns; skip them rather than
        # fail with IndexError.
        if len(row) <= max(GOLD_ID_IDX, IGSN_IDX):
            continue
        gold_id = row[GOLD_ID_IDX]
        igsn = row[IGSN_IDX]
        if gold_id_pattern.fullmatch(gold_id):
            igsn_golds[igsn].append(gold_id)

Prepare helper function to compare timestamps given in e.g. "15-MAY-20 08.30.01.000000000 am" format.

In [17]:
from datetime import datetime

# Matches timestamps such as "15-MAY-20 08.30.01.000000000 am". The meridiem is
# accepted in either case; the earlier "[A|P]M" class also admitted a literal
# "|" and rejected lowercase input.
dt_pattern = re.compile(
    r"\d{2}-(?P<month>\w+)-\d{2} \d{2}\.\d{2}\.\d{2}\.(?P<ns>\d+) [AaPp][Mm]"
)
dt_format = "%d-%b-%y %I.%M.%S.%f %p"

def order_timestamps(timestamps):
    """Return the given timestamps sorted from earliest to latest.

    Args:
        timestamps: iterable of strings containing a timestamp shaped like
            "15-MAY-20 08.30.01.000000000 am" (any month/meridiem casing).

    Returns:
        A list of strings, ascending in time, each re-rendered with
        ``dt_format`` (microsecond precision).

    Raises:
        TypeError: if any element is not a string.
        ValueError: if an element does not contain a recognisable timestamp.
    """
    timestamps = list(timestamps)  # allow one-shot iterables; we iterate twice
    if not all(isinstance(ts, str) for ts in timestamps):
        raise TypeError(f"{timestamps} not strings")
    as_datetimes = []
    for ts in timestamps:
        match = dt_pattern.search(ts)
        if match is None:
            raise ValueError(f"{ts!r} is not in the expected timestamp format")
        # Rebuild the matched text from its spans so that only the month and
        # the fractional seconds are altered, wherever else those characters
        # may occur in the string.
        ts_new = (
            ts[match.start():match.start("month")]
            + match.group("month").capitalize()
            + ts[match.end("month"):match.start("ns")]
            + match.group("ns")[:6]  # %f accepts at most six digits
            + ts[match.end("ns"):match.end()]
        )
        as_datetimes.append(datetime.strptime(ts_new, dt_format))
    sorted_dts = sorted(as_datetimes)
    return [dt.strftime(dt_format) for dt in sorted_dts]

Prepare helper-function pipeline to unify biosample_set documents that should be considered equivalent.

In [18]:
from pprint import pprint
from toolz import compose

# Trailing suffix such as "ER_DNA_123" or "ER_RNA_45". The class is [DR]; the
# earlier "[D|R]" also matched a literal "|".
er_xna_pattern = re.compile(r"ER_[DR]NA_\d+$")

def rstrip_name_ER_ID(d):
    """Return a copy of ``d`` whose "name" is cut off before a trailing ER_DNA_/ER_RNA_ id.

    Plain dict access is used so the helper does not depend on imports made in
    later cells. ``d`` itself is never modified; a document with no string
    "name" is returned unchanged.
    """
    s = d.get("name")
    if not isinstance(s, str):
        return d
    match = er_xna_pattern.search(s)
    s_new = s[:match.start()] if match else s
    return {**d, "name": s_new}

def capitalize_location(d):
    """Return a copy of ``d`` with the first character of "location" upper-cased.

    Only the first character changes; the rest of the string is kept as is.
    A document whose "location" is missing, None or empty is returned
    unchanged (an empty string used to raise IndexError). ``d`` itself is
    never modified. Plain dict access is used so the helper does not depend on
    imports made in later cells.
    """
    s = d.get("location")
    if not s:
        return d
    return {**d, "location": s[:1].upper() + s[1:]}

def pipeline(d):
    """Reduce a biosample document to the form compared between equivalent records.

    Applied in order: drop the "_id", "id", "add_date", "mod_date" and
    "identifier" keys, strip the trailing ER id from "name", then capitalize
    "location".
    """
    without_record_keys = dissoc(d, "_id", "id", "add_date", "mod_date", "identifier")
    return capitalize_location(rstrip_name_ER_ID(without_record_keys))

Produce new biosample objects with IGSN ids.

In [19]:
from dictdiffer import diff
from toolz import get_in

# One replacement biosample document per IGSN, keyed by its igsn: CURIE.
merged_biosample_docs = []

for igsn, golds in igsn_golds.items():
    igsn_curie = "igsn:"+igsn
    to_change = list(db.biosample_set.find({"id": {"$in": [f"gold:{g}" for g in golds]}}))

    # No merge needed, just change of id.
    if len(to_change) == 1:
        # The fetched document is copied with only "id" replaced, so it keeps
        # its existing "_id".
        merged = assoc_in(to_change[0], ["id"], igsn_curie)
        #merged = assoc_in(merged, ["identifier"], igsn_curie)
        merged_biosample_docs.append(merged)
        continue
    elif len(to_change) == 0:
        # Nothing stored for this IGSN; the final count assertion will flag it.
        continue

    # Ensure that unification pipeline is adequate to resolve differences.
    # Every document is compared with the first, so groups of three or more
    # are checked in full rather than only their first pair.
    distilled = list(map(pipeline, to_change))
    for other in distilled[1:]:
        result = list(diff(distilled[0], other))
        assert result == [], f"unresolved differences for {igsn_curie}: {result}"

    # Produce a merged document: earliest add_date, latest mod_date.
    # Indexing (rather than two-name unpacking) works for any group size.
    earlier_ts = order_timestamps([get_in(["add_date"], d) for d in to_change])[0]
    merged = assoc_in(distilled[0], ["add_date"], earlier_ts)
    later_ts = order_timestamps([get_in(["mod_date"], d) for d in to_change])[-1]
    merged = assoc_in(merged, ["mod_date"], later_ts)
    merged = assoc_in(merged, ["id"], igsn_curie)
    merged = assoc_in(merged, ["identifier"], igsn_curie)

    merged_biosample_docs.append(merged)
    merged = None # defense against accidental reuse during next iteration.

# Every IGSN from the spreadsheet must have produced exactly one document.
assert len(merged_biosample_docs) == len(igsn_golds)

Delete old biosample objects and insert new ones in one bulk-write operation.

In [20]:
from pymongo import DeleteMany, InsertOne
from toolz import concat

# First request: delete every biosample whose id is a gold: CURIE built from
# the spreadsheet's GOLD ids.
requests = [DeleteMany({"id": {"$in": ["gold:"+g for g in concat(igsn_golds.values())]}})]
# Remaining requests: insert the IGSN-keyed replacement documents.
requests.extend([InsertOne(d) for d in merged_biosample_docs])
# NOTE(review): bulk_write is called without `ordered=`; this relies on
# pymongo's default (ordered execution, per its docs) so the delete runs before
# the inserts — un-merged replacements keep their original "_id". Confirm.
result = db.biosample_set.bulk_write(requests)
# Displayed as the cell output: (documents deleted, documents inserted).
result.deleted_count, result.inserted_count

(93, 48)

Update omics_processing_set references to biosample_set ids.

In [21]:
# Invert igsn_golds: each GOLD id points at the IGSN it was listed under.
goldid_igsn = {}
for igsn, gids in igsn_golds.items():
    goldid_igsn.update(dict.fromkeys(gids, igsn))

In [22]:
# gold: CURIE -> igsn: CURIE for every GOLD biosample id that was re-keyed.
to_replace = {
    f"gold:{gold_id}": f"igsn:{igsn_id}"
    for gold_id, igsn_id in goldid_igsn.items()
}

# One {"filter", "update"} kwargs dict per omics_processing document whose
# has_input still lists one of the old gold: ids; other inputs are kept as is.
requests = []
for doc in db.omics_processing_set.find({"has_input": {"$in": [*to_replace]}}):
    operations = {
        "$set": {"has_input": [to_replace.get(i, i) for i in doc["has_input"]]},
    }
    requests.append(dict(filter={"_id": doc["_id"]}, update=operations))

In [23]:
from pymongo import UpdateOne

# Apply the prepared has_input rewrites in one bulk operation.
# NOTE(review): unlike the EMSL update later in the notebook, this call is not
# guarded by `if requests:` — pymongo presumably rejects an empty request list;
# confirm before running against data with no gold: references.
rv = db.omics_processing_set.bulk_write([UpdateOne(**r) for r in requests])

In [24]:
# Number of omics_processing_set documents the bulk update modified.
rv.modified_count

93

Update omics_processing_set references from EMSL ids to IGSNs.

In [25]:
# Zero-based column positions read from each spreadsheet row.
EMSL_IDS_IDX = 7
IGSN_IDX = 2

# IGSN -> list of EMSL id strings found on rows carrying that IGSN.
igsn_emsls = {}

# Every run of digits in the EMSL cell is treated as one EMSL id.
emsl_ids_pattern = re.compile(r"\d+")

# newline="" is what the csv module requires of file objects, so that newlines
# embedded in quoted fields are parsed correctly.
with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv', newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        # Blank or truncated rows lack the id columns; skip them rather than
        # fail with IndexError.
        if len(row) <= max(EMSL_IDS_IDX, IGSN_IDX):
            continue
        emsl_ids = row[EMSL_IDS_IDX]
        igsn = row[IGSN_IDX]
        ids = emsl_ids_pattern.findall(emsl_ids)
        # XXX some rows have emsl ids but no IGSN, so igsn.strip() check here
        if igsn.strip() and ids:
            # Accumulate across rows: the same IGSN can appear on several rows,
            # and plain assignment would keep only the last row's ids.
            known_ids = igsn_emsls.setdefault(igsn, [])
            for emsl_id in ids:
                if emsl_id not in known_ids:
                    known_ids.append(emsl_id)

In [26]:
# Invert igsn_emsls: each EMSL id points at the IGSN it was listed under.
emslid_igsn = {}
for igsn, eids in igsn_emsls.items():
    emslid_igsn.update(dict.fromkeys(eids, igsn))

In [27]:
# How many omics_processing_set documents have one of the spreadsheet's EMSL
# ids (as an emsl: CURIE) as their own "id".
n_with_emsl_id = db.omics_processing_set.count_documents(
    {"id": {"$in": [f"emsl:{emsl_id}" for emsl_id in emslid_igsn]}}
)

In [28]:
# Both spellings of an EMSL input reference ("emsl:<id>" and
# "emsl:output_<id>") are redirected to the IGSN of the matching sample.
to_replace = {
    prefix + emsl_id: "igsn:" + igsn_id
    for prefix in ("emsl:", "emsl:output_")
    for emsl_id, igsn_id in emslid_igsn.items()
}
requests = []

def omit(blacklist, d):
    """Return a new dict holding the items of ``d`` whose key is not in ``blacklist``.

    Written as a plain comprehension so the helper works as soon as it is
    defined, without relying on an import made in a later cell. ``d`` is not
    modified.
    """
    return {k: v for k, v in d.items() if k not in blacklist}

def sans_mongo_id(d):
    """Return a copy of document ``d`` without its "_id" key."""
    return omit(["_id"], d)


# Queue one {"filter", "update"} kwargs dict per omics_processing document
# whose has_input lists an id that is to be replaced; other inputs are kept.
for doc in db.omics_processing_set.find({"has_input": {"$in": [*to_replace]}}):
    operations = {
        "$set": {"has_input": [to_replace.get(i, i) for i in doc["has_input"]]},
    }
    requests.append(dict(filter={"_id": doc["_id"]}, update=operations))

In [29]:
# The bulk write is attempted only when there is at least one update queued.
if len(requests) > 0:
    rv = db.omics_processing_set.bulk_write(
        [UpdateOne(**request_kwargs) for request_kwargs in requests]
    )
    print(rv.modified_count)

In [30]:
from nmdc_mongo import validator_for
# Handle to the "dwinston_share" database, the destination the finished
# collections are copied into at the end of the notebook.
db_share = get_db("dwinston_share")
#validator_for(db_share.biosample_set)

In [31]:
from pymongo import MongoClient

# Separate client authenticated as the "nmdc-admin" user; host and password
# are read from the environment.
# NOTE(review): os.getenv returns None for an unset variable — confirm that
# NMDC_MONGO_HOST and NMDC_MONGO_ADMIN_PWD are defined in the loaded env file,
# otherwise the client may be built with unintended defaults.
admin_client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="nmdc-admin",
    password=os.getenv("NMDC_MONGO_ADMIN_PWD")
)
# Admin-authenticated handle to the "dwinston_share" database.
admin_dwinston_share = admin_client["dwinston_share"]

In [32]:
# Distinct destination collection names, in sorted order.
target_collection_names = sorted({*target_collection.values()})

In [33]:
# Display the collection names that will be updated and copied.
target_collection_names

['biosample_set',
 'data_object_set',
 'metagenome_annotation_activity_set',
 'metagenome_assembly_set',
 'metaproteomics_analysis_activity_set',
 'omics_processing_set',
 'read_QC_analysis_activity_set',
 'study_set']

In [34]:
from nmdc_mongo.admin import reset_database_schema

# Apply the derived collection schemas to the listed collections of the
# "dwinston_share" database through the admin client (the recorded output
# shows one "updating <name>" line per collection).
# NOTE(review): this looks up admin_client["dwinston_share"] again instead of
# reusing the admin_dwinston_share handle created earlier.
reset_database_schema(admin_client["dwinston_share"], target_collection_names, collschemas)

updating biosample_set
updating data_object_set
updating metagenome_annotation_activity_set
updating metagenome_assembly_set
updating metaproteomics_analysis_activity_set
updating omics_processing_set
updating read_QC_analysis_activity_set
updating study_set


In [35]:
from toolz import keyfilter

# Copy every target collection from the working database into the share
# database, leaving the Mongo "_id" behind.
db_share = get_db("dwinston_share")
target_collection_names = sorted({*target_collection.values()})
for name in target_collection_names:
    docs = list(map(sans_mongo_id, db[name].find()))
    if not docs:
        print("none to add for", name)
        continue
    print("adding", name)
    add_to_db(docs, db_share, collection_name=name)

adding biosample_set
adding data_object_set
none to add for metagenome_annotation_activity_set
adding metagenome_assembly_set
none to add for metaproteomics_analysis_activity_set
adding omics_processing_set
adding read_QC_analysis_activity_set
adding study_set
