In [1]:
from collections import defaultdict
import csv
from datetime import datetime
from functools import partial, reduce
import json
import os
import re
from pprint import pprint
from zipfile import ZipFile

from dictdiffer import diff
from pymongo import DeleteMany, InsertOne, UpdateOne, MongoClient
from toolz import assoc_in, compose, concat, dissoc, get_in, merge, merge_with

(Re-)load existing NMDC DB from file.

In [2]:
# Parse the JSON dump directly from the zip archive; nothing is extracted to disk.
with ZipFile('../src/data/nmdc_database.json.zip') as archive:
    nmdc_database = json.loads(archive.read('nmdc_database.json'))

In [3]:
# Connect to MongoDB. Host and password are read from the environment so that
# no secret is stored in the notebook.
client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="dwinston_rw",
    password=os.getenv("NMDC_MONGO_RW_PWD"))
dbname = "dwinston_share"
db = client[dbname]

# Reset every collection named in the dump to exactly the dump's contents.
for collection, docs in nmdc_database.items():
    db[collection].delete_many({})
    # insert_many() raises on an empty document list, so a collection that is
    # empty in the dump is simply left empty after the delete.
    if docs:
        db[collection].insert_many(docs)
print(sorted(db.list_collection_names()))

['activity_set', 'biosample_set', 'data_object_set', 'foo', 'omics_processing_set', 'study_set']


Load the FICUS Brodie spreadsheet and build a map from each IGSN to the GOLD biosample ids that share it.

In [4]:
GOLD_ID_IDX = 5  # zero-based CSV column read as the GOLD id
IGSN_IDX = 2  # zero-based CSV column read as the IGSN

# Maps each IGSN to the list of GOLD ids found on rows carrying that IGSN.
igsn_golds = defaultdict(list)

gold_id_pattern = re.compile(r"Gb\d+")

# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # Blank or truncated lines do not have the columns indexed below.
        if len(row) <= max(GOLD_ID_IDX, IGSN_IDX):
            continue
        gold_id = row[GOLD_ID_IDX]
        igsn = row[IGSN_IDX]
        # Skip rows whose GOLD-id cell is not exactly "Gb<digits>", and rows
        # with a blank IGSN (they would otherwise be re-identified as "igsn:").
        if igsn.strip() and gold_id_pattern.fullmatch(gold_id):
            igsn_golds[igsn].append(gold_id)

Prepare a helper function to order timestamps given in e.g. "15-MAY-20 08.30.01.000000000 AM" format.

In [5]:
# Matches timestamps such as "15-MAY-20 08.30.01.000000000 AM" (AM/PM in either case).
dt_pattern = re.compile(
    r"\d{2}-(?P<month>\w+)-\d{2} \d{2}\.\d{2}\.\d{2}\.(?P<ns>\d+) [AP]M",
    re.IGNORECASE)
dt_format = "%d-%b-%y %I.%M.%S.%f %p"


def _timestamp_sort_key(ts):
    """Return a ``(datetime, nanosecond remainder)`` key that orders ``ts`` chronologically.

    Raises:
        ValueError: if ``ts`` does not match ``dt_pattern`` or cannot be parsed
            with ``dt_format``.
    """
    match = dt_pattern.search(ts)
    if match is None:
        raise ValueError(f"{ts!r} does not look like '15-MAY-20 08.30.01.000000000 AM'")
    ns = match.group("ns")
    # %f accepts at most six fractional digits, so only the first six are handed
    # to strptime. Splicing by match position (rather than str.replace) leaves
    # the other fields alone even if they happen to contain the same digits.
    parseable = ts[:match.start("ns")] + ns[:6] + ts[match.end("ns"):]
    # The next three digits are kept as a tie-breaker so that truncation to
    # microseconds does not lose ordering information.
    remainder = int(ns[6:9].ljust(3, "0"))
    return datetime.strptime(parseable, dt_format), remainder


def order_timestamps(timestamps):
    """Return the given timestamp strings sorted from earliest to latest.

    The returned items are the input strings themselves, unmodified, so the
    stored format is the same before and after ordering.

    Args:
        timestamps: iterable of strings such as "15-MAY-20 08.30.01.000000000 AM".

    Raises:
        Exception: if any item is not a string.
        ValueError: if any string is not in the expected format.
    """
    timestamps = list(timestamps)
    if not all(isinstance(ts, str) for ts in timestamps):
        raise Exception(f"{timestamps} not strings")
    return sorted(timestamps, key=_timestamp_sort_key)

Prepare helper-function pipeline to unify biosample_set documents that should be considered equivalent.

In [6]:
# Trailing suffix of the form "ER_DNA_<digits>" or "ER_RNA_<digits>".
er_xna_pattern = re.compile(r"ER_[DR]NA_\d+$")

def rstrip_name_ER_ID(d):
    """Return ``d`` with any trailing ER_DNA_<n> / ER_RNA_<n> suffix cut off its "name".

    A document whose "name" is missing or not a string is returned unchanged.
    """
    s = get_in(["name"], d)
    if not isinstance(s, str):
        return d
    match = er_xna_pattern.search(s)
    # Keep only the text before the suffix; leave the name alone if there is none.
    s_new = s[:match.start()] if match else s
    return assoc_in(d, ["name"], s_new)

def capitalize_location_raw_value(d):
    """Return ``d`` with the first character of location.has_raw_value upper-cased.

    The rest of the string is left as-is. A document whose raw value is
    missing, empty, or not a string is returned unchanged.
    """
    s = get_in(["location", "has_raw_value"], d)
    if not isinstance(s, str) or not s:
        return d
    s_new = s[0].upper() + s[1:]
    return assoc_in(d, ["location", "has_raw_value"], s_new)

def _drop_per_document_fields(d):
    """Return ``d`` without the id, date and identifier fields."""
    return dissoc(d, "_id", "id", "add_date", "mod_date", "identifier")

# compose() applies its arguments right-to-left: the fields are dropped first,
# then the name suffix is stripped, then the location is capitalized.
pipeline = compose(
    capitalize_location_raw_value,
    rstrip_name_ER_ID,
    _drop_per_document_fields,
)

Produce new biosample objects with IGSN ids.

In [7]:
merged_biosample_docs = []

for igsn, golds in igsn_golds.items():
    igsn_curie = "igsn:"+igsn
    to_change = list(db.biosample_set.find({"id": {"$in": [f"gold:{g}" for g in golds]}}))

    # Fail with a readable message instead of an IndexError further down.
    if not to_change:
        raise LookupError(f"no biosample_set documents found for {igsn_curie} (GOLD ids: {golds})")

    # No merge needed, just change of id.
    if len(to_change) == 1:
        merged = assoc_in(to_change[0], ["id"], igsn_curie)
        merged = assoc_in(merged, ["identifier", "has_raw_value"], igsn_curie)
        merged_biosample_docs.append(merged)
        continue

    # Ensure that unification pipeline is adequate to resolve differences.
    # Every document is compared against the first, so groups of more than
    # two documents are checked in full.
    distilled = list(map(pipeline, to_change))
    for other in distilled[1:]:
        result = list(diff(distilled[0], other))
        assert result == [], f"{igsn_curie}: unresolved differences {result}"

    # Produce a merged document: earliest add_date, latest mod_date, IGSN id.
    add_dates = order_timestamps([get_in(["add_date", "has_raw_value"], d) for d in to_change])
    merged = assoc_in(distilled[0], ["add_date", "has_raw_value"], add_dates[0])
    mod_dates = order_timestamps([get_in(["mod_date", "has_raw_value"], d) for d in to_change])
    merged = assoc_in(merged, ["mod_date", "has_raw_value"], mod_dates[-1])
    merged = assoc_in(merged, ["id"], igsn_curie)
    merged = assoc_in(merged, ["identifier", "has_raw_value"], igsn_curie)

    merged_biosample_docs.append(merged)
    merged = None # defense against accidental reuse during next iteration.

assert len(merged_biosample_docs) == len(igsn_golds)

Delete old biosample objects and insert new ones in one bulk-write operation.

In [8]:
# A single bulk write: the delete of the GOLD-id'ed documents is listed first,
# followed by one insert per merged document.
stale_ids = ["gold:"+g for g in concat(igsn_golds.values())]
requests = [
    DeleteMany({"id": {"$in": stale_ids}}),
    *map(InsertOne, merged_biosample_docs),
]
result = db.biosample_set.bulk_write(requests)
result.deleted_count, result.inserted_count

(93, 48)

Update omics_processing_set references to biosample_set ids.

In [9]:
# Invert igsn_golds: each GOLD id maps to the IGSN it was listed under.
goldid_igsn = {
    gid: igsn
    for igsn, gids in igsn_golds.items()
    for gid in gids
}

In [10]:
requests = []
# Old "gold:"-prefixed id -> new "igsn:"-prefixed id.
to_replace = {"gold:"+k: "igsn:"+v for k, v in goldid_igsn.items()}
old_ids = list(to_replace)

# Select every document that mentions one of the old ids in any of these fields.
for doc in db.omics_processing_set.find({"$or": [
    {"id": {"$in": old_ids}},
    {"has_input": {"$in": old_ids}},
    {"has_output": {"$in": old_ids}},
    {"part_of": {"$in": old_ids}}
]}):
    operations = {"$set": {"id": to_replace.get(doc["id"], doc["id"])}}
    # A document matched on one field need not carry the other list fields,
    # so only rewrite the ones that are present.
    for field in ["has_input", "has_output", "part_of"]:
        if field in doc:
            operations["$set"][field] = [to_replace.get(i, i) for i in doc[field]]
    requests.append({"filter": {"_id": doc["_id"]}, "update": operations})

In [11]:
# Turn each {"filter": ..., "update": ...} spec into an UpdateOne and send them in one batch.
update_ops = [UpdateOne(**spec) for spec in requests]
rv = db.omics_processing_set.bulk_write(update_ops)

In [12]:
# Number of documents the preceding bulk write reports as modified.
rv.modified_count

93

Update omics_processing_set references from EMSL ids to IGSNs.

In [13]:
EMSL_IDS_IDX = 7  # zero-based CSV column read for EMSL ids
IGSN_IDX = 2  # zero-based CSV column read as the IGSN

# Maps each IGSN to the list of EMSL id strings found on its rows.
igsn_emsls = {}

emsl_ids_pattern = re.compile(r"\d+")

# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # Blank or truncated lines do not have the columns indexed below.
        if len(row) <= max(EMSL_IDS_IDX, IGSN_IDX):
            continue
        emsl_ids = row[EMSL_IDS_IDX]
        igsn = row[IGSN_IDX]
        ids = emsl_ids_pattern.findall(emsl_ids)
        # XXX some rows have emsl ids but no IGSN, so igsn.strip() check here
        if igsn.strip() and ids:
            # Several rows can share an IGSN; accumulate their ids instead of
            # letting a later row overwrite an earlier one.
            known = igsn_emsls.setdefault(igsn, [])
            for emsl_id in ids:
                if emsl_id not in known:
                    known.append(emsl_id)

In [14]:
# Invert igsn_emsls: each EMSL id maps to the IGSN it was listed under.
emslid_igsn = {
    eid: igsn
    for igsn, eids in igsn_emsls.items()
    for eid in eids
}

In [15]:
# Count the documents whose id is one of the "emsl:"-prefixed ids, before any update.
emsl_prefixed_ids = ["emsl:"+i for i in emslid_igsn]
n_with_emsl_id = db.omics_processing_set.count_documents({"id": {"$in": emsl_prefixed_ids}})

In [16]:
# Old id -> new "igsn:" id, for both the "emsl:" and the "emsl:output_" spelling.
to_replace = {
    prefix + eid: "igsn:" + igsn_value
    for prefix in ("emsl:", "emsl:output_")
    for eid, igsn_value in emslid_igsn.items()
}
old_ids = list(to_replace)
list_fields = ["has_input", "has_output", "part_of"]

# Match any document that mentions an old id, either as its own id or inside a list field.
query = {"$or": [{name: {"$in": old_ids}} for name in ["id", *list_fields]]}

requests = []
for doc in db.omics_processing_set.find(query):
    new_values = {"id": to_replace.get(doc["id"], doc["id"])}
    # Only the list fields actually present on the document are rewritten.
    new_values.update(
        (field, [to_replace.get(ref, ref) for ref in doc[field]])
        for field in list_fields
        if field in doc
    )
    requests.append({"filter": {"_id": doc["_id"]}, "update": {"$set": new_values}})

In [17]:
# Apply the EMSL -> IGSN updates in one batch and check the modified count
# against the count taken before the update.
emsl_update_ops = [UpdateOne(**spec) for spec in requests]
rv = db.omics_processing_set.bulk_write(emsl_update_ops)
assert rv.modified_count == n_with_emsl_id

Note that some newly IGSN id'ed docs identify as part_of biosamples without IGSNs

In [18]:
# Show one document whose id starts with "igsn" while part_of still holds an id starting with "gold".
db.omics_processing_set.find_one({
    "id": {"$regex": "^igsn"},
    "part_of": {"$regex": "^gold"},
})

{'_id': ObjectId('6008a50432e1b154f91ab624'),
 'id': 'igsn:IEWFS000A',
 'name': 'Brodie_134A_CHCl3_15Oct18_IAT_p1_1_01_35922',
 'description': 'High resolution MS spectra only',
 'part_of': ['gold:Gs0135149'],
 'has_output': ['igsn:IEWFS000A'],
 'omics_type': {'has_raw_value': 'Organic Matter Characterization'},
 'type': 'nmdc:OmicsProcessing',
 'instrument_name': {'has_raw_value': '12T_FTICR_B'},
 'processing_institution': {'has_raw_value': 'Environmental Molecular Sciences Lab'}}

In [19]:
# goldid_igsn is keyed by bare GOLD ids (no "gold:" prefix), so the lookup must
# use the bare id; a "gold:"-prefixed probe would be False for every id.
"Gs0135149" in goldid_igsn

False