In [1]:
from collections import defaultdict
import csv
from datetime import datetime
import json
import os
import re
from pprint import pprint
from zipfile import ZipFile

from dictdiffer import diff
from pymongo import MongoClient
from toolz import assoc_in, compose, concat, dissoc, get_in

In [2]:
# Read the NMDC database dump directly out of its zip archive, without
# extracting it to disk first.
with ZipFile('../src/data/nmdc_database.json.zip') as archive:
    with archive.open('nmdc_database.json') as json_member:
        raw_bytes = json_member.read()
nmdc_database = json.loads(raw_bytes)

In [3]:
# Connect to MongoDB. Host and password are read from the environment so
# that no secret is stored in the notebook itself.
client = MongoClient(
    host=os.getenv("NMDC_MONGO_HOST"),
    username="dwinston_rw",
    password=os.getenv("NMDC_MONGO_RW_PWD"))
dbname = "dwinston_share"
db = client[dbname]

# Reset every collection named in the dump to exactly the dump's contents.
# NOTE: destructive -- all existing documents in those collections are removed.
for collection, docs in nmdc_database.items():
    db[collection].delete_many({})
    # insert_many() refuses an empty document list, so only insert when the
    # dump actually holds documents for this collection.
    if docs:
        db[collection].insert_many(docs)
print(sorted(db.list_collection_names()))

['activity_set', 'biosample_set', 'data_object_set', 'omics_processing_set', 'study_set']


In [4]:
# Zero-based column positions in the metadata CSV.
GOLD_ID_IDX = 5
IGSN_IDX = 2

# Maps each IGSN value to the list of GOLD IDs that appear on rows with it.
igsn_golds = defaultdict(list)

# Only cells that are entirely "Gb<digits>" count as GOLD IDs; this also
# filters out header rows and any other non-data rows.
gold_id_pattern = re.compile(r"Gb\d+")

# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
with open('../src/data/FICUS_Soil_Gs0135149_Brodie-12-23-2020_PS.xlsx - Brodie_Gs0135149_Soil_Metadata.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # Blank lines come back as [] and ragged rows may be short; neither
        # can carry both columns, so skip them instead of raising IndexError.
        if len(row) <= max(GOLD_ID_IDX, IGSN_IDX):
            continue
        gold_id = row[GOLD_ID_IDX]
        igsn = row[IGSN_IDX]
        if gold_id_pattern.fullmatch(gold_id):
            igsn_golds[igsn].append(gold_id)

In [5]:
# Timestamps look like "23-DEC-20 02.30.15.123456789 PM": two-digit day, month
# abbreviation, two-digit year, 12-hour clock time with a fractional-second
# part, then AM or PM.
dt_pattern = re.compile(r"\d{2}-(?P<month>\w+)-\d{2} \d{2}\.\d{2}\.\d{2}\.(?P<ns>\d+) [AP]M")
dt_format = "%d-%b-%y %I.%M.%S.%f %p"


def order_timestamps(timestamps):
    """Return the given timestamp strings sorted chronologically.

    Each input is parsed with ``dt_format`` after the month abbreviation is
    lower-cased past its first letter and the fractional seconds are cut down
    to at most six digits (``%f`` accepts no more than microseconds).

    The returned strings are re-rendered with ``dt_format``, so they carry
    microsecond precision and the month/AM-PM spelling produced by
    ``strftime`` rather than the exact input text.

    Raises:
        TypeError: if any element of ``timestamps`` is not a ``str``.
        ValueError: if a string does not match ``dt_pattern`` or cannot be
            parsed with ``dt_format``.
    """
    if not all(isinstance(ts, str) for ts in timestamps):
        raise TypeError(f"{timestamps} not strings")
    as_datetimes = []
    for ts in timestamps:
        match = dt_pattern.search(ts)
        if match is None:
            raise ValueError(f"{ts!r} does not match the expected timestamp format")
        month = match.group("month")
        month_start, month_end = match.span("month")
        ns_start, ns_end = match.span("ns")
        # Rebuild the string from the matched spans so that only the month and
        # the fractional-second fields are touched, even if the same characters
        # happen to occur elsewhere in the timestamp.
        ts_new = (
            ts[:month_start]
            + month[0] + month[1:].lower()
            + ts[month_end:ns_start]
            + match.group("ns")[:6]  # keep at most microsecond precision
            + ts[ns_end:]
        )
        as_datetimes.append(datetime.strptime(ts_new, dt_format))
    sorted_dts = sorted(as_datetimes)
    return [dt.strftime(dt_format) for dt in sorted_dts]

In [6]:
# A trailing "ER_DNA_<digits>" or "ER_RNA_<digits>" suffix at the end of a name.
er_xna_pattern = re.compile(r"ER_[DR]NA_\d+$")


def rstrip_name_ER_ID(d):
    """Return ``d`` with a trailing ER_DNA/ER_RNA identifier cut off its ``name``.

    Everything from the start of the matched suffix onward is dropped; text
    before it (including any separator characters) is kept as-is. A document
    whose ``name`` is missing or not a string is returned unchanged.
    """
    s = get_in(["name"], d)
    if not isinstance(s, str):
        return d
    match = er_xna_pattern.search(s)
    s_new = s[:match.start()] if match else s
    return assoc_in(d, ["name"], s_new)

def capitalize_location_raw_value(d):
    """Return ``d`` with the first character of ``location.has_raw_value`` upper-cased.

    Only the first character is changed; the rest of the string keeps its
    original casing. A document with a missing, empty, or non-string value is
    returned unchanged.
    """
    s = get_in(["location", "has_raw_value"], d)
    if not isinstance(s, str) or not s:
        return d
    s_new = s[0].upper() + s[1:]
    return assoc_in(d, ["location", "has_raw_value"], s_new)

def _drop_per_record_fields(d):
    """Return a copy of ``d`` without its ids, add/mod dates, and identifier."""
    return dissoc(d, "_id", "id", "add_date", "mod_date", "identifier")


# compose() applies its arguments right-to-left: the per-record fields are
# dropped first, then the name suffix is trimmed, then the location is
# capitalized.
pipeline = compose(
    capitalize_location_raw_value,
    rstrip_name_ER_ID,
    _drop_per_record_fields,
)

# One merged biosample document per IGSN, keyed by the IGSN CURIE.
merged_biosample_docs = []

for igsn, golds in igsn_golds.items():
    igsn_curie = "igsn:" + igsn
    to_change = list(db.biosample_set.find({"id": {"$in": [f"gold:{g}" for g in golds]}}))
    if not to_change:
        raise ValueError(f"no biosample documents found for {igsn_curie} (GOLD IDs: {golds})")
    distilled = list(map(pipeline, to_change))

    # After the pipeline has removed ids/dates and normalized name and
    # location, all source documents for one IGSN must be identical;
    # otherwise merging them would silently lose information.
    for other in distilled[1:]:
        differences = list(diff(distilled[0], other))
        assert differences == [], f"{igsn_curie}: source documents differ: {differences}"

    # The merged document keeps the earliest add date and the latest mod date
    # of its sources. A single source document takes the same path, so it
    # keeps its own dates (re-rendered by order_timestamps).
    # NOTE(review): assumes every source document has add_date/mod_date raw
    # values as strings -- order_timestamps raises otherwise.
    add_dates = order_timestamps([get_in(["add_date", "has_raw_value"], d) for d in to_change])
    mod_dates = order_timestamps([get_in(["mod_date", "has_raw_value"], d) for d in to_change])
    merged = assoc_in(distilled[0], ["add_date", "has_raw_value"], add_dates[0])
    merged = assoc_in(merged, ["mod_date", "has_raw_value"], mod_dates[-1])
    merged = assoc_in(merged, ["id"], igsn_curie)
    merged = assoc_in(merged, ["identifier", "has_raw_value"], igsn_curie)
    merged_biosample_docs.append(merged)

assert len(merged_biosample_docs) == len(igsn_golds)

In [7]:
from pymongo import DeleteMany, InsertOne

# Swap the per-GOLD biosample documents for their IGSN-keyed merged versions
# in one bulk call; the delete request is listed ahead of the inserts.
superseded_ids = ["gold:" + g for golds in igsn_golds.values() for g in golds]
requests = [
    DeleteMany({"id": {"$in": superseded_ids}}),
    *(InsertOne(doc) for doc in merged_biosample_docs),
]
result = db.biosample_set.bulk_write(requests)
result.deleted_count, result.inserted_count

(93, 48)