In [236]:
# autoreload modules; useful for testing
# %load_ext autoreload
# %autoreload 2

In [134]:
import json
import copy
import pandas as pd
from toolz.dicttoolz import assoc_in, merge, dissoc
from dotenv import dotenv_values
from pymongo import MongoClient
from pymongo.database import Database as MongoDatabase
from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd

### load mongodb via env info

In [135]:
# Read settings from the localhost env file into `config`.
# The bare expression on the last line displays the configured Mongo host as
# the cell output.
config = dotenv_values("../../.env.localhost")
config["MONGO_HOST"]

'mongodb://localhost:27018'

In [198]:
# Connect using the host and credentials read from `config`, then select the
# "nmdc_etl_staging" database; `mongodb` is the handle later passed to
# load_changesheet and process_changesheet.
client = MongoClient(host=config["MONGO_HOST"], username=config["MONGO_USERNAME"], password=config["MONGO_PASSWORD"])
mongodb = client["nmdc_etl_staging"]

### create temp database and temp_set collection for testing

In [237]:
# Drop any "temp_db" left over from an earlier run so this cell can be re-run
# safely, then take a handle to it. `temp_db` is the database the change sheet
# updates are applied to (see process_changesheet).
if "temp_db" in client.list_database_names():
    client.drop_database("temp_db")
temp_db = client["temp_db"]

### helper functions

In [232]:
def process_changesheet(changeDf, mdb: MongoDatabase, temp_db: MongoDatabase):
    """Process a change sheet in one call and return the update results.

    Combines ``mongo_update_command_for`` and ``update_mongo_db``: the update
    command is built from ``changeDf``, handed to ``copy_docs_in_update_cmd``
    together with ``mdb`` and ``temp_db``, and finally applied to ``temp_db``.

    Args:
        changeDf: change sheet dataframe to build the update command from.
        mdb: database handle passed to ``copy_docs_in_update_cmd`` as the
            first database argument (presumably the copy source -- confirm
            against nmdc_runtime).
        temp_db: database handle the update command is applied to.

    Returns:
        Whatever ``update_mongo_db`` returns for ``temp_db`` and the command.
    """
    command = mongo_update_command_for(changeDf)
    copy_docs_in_update_cmd(command, mdb, temp_db)
    results = update_mongo_db(temp_db, command)
    return results

In [201]:
def print_results(results):
    """Print a before/after/errors report for each change sheet result.

    Args:
        results: iterable of dicts, each providing the keys ``"id"``,
            ``"doc_before"``, ``"doc_after"`` and ``"validation_errors"``
            (an iterable of strings, joined one per line).

    Returns:
        None. The report is written to stdout.
    """
    for result in results:
        print(f"\n============== {result['id']} ==============")
        print("------------------ BEFORE ------------------")
        # default=str: values json cannot encode natively (e.g. datetimes) are
        # rendered with str() instead of raising TypeError and aborting the
        # report part-way through.
        print(json.dumps(result["doc_before"], indent=2, default=str))
        print("------------------ AFTER ------------------")
        print(json.dumps(result["doc_after"], indent=2, default=str))
        print("------------------ ERRORS ------------------")
        print("\n".join(result["validation_errors"]))

### set dataframe display options

In [202]:
# Show every column when a dataframe is displayed (no column truncation) and
# allow up to 1000 characters of display width before wrapping.
pd.set_option("display.max_columns", None)
pd.set_option('display.width', 1000)

### process change sheet

In [239]:
# Preview the raw change sheet file: tab-separated, every column read as a
# string, and missing cells replaced with '' for display.
pd.read_csv("data/changesheet-without-separator3.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,NEW STUDY NAME
1,,update,ecosystem,NEW ECOSYSTEM
2,,update,ecosystem_type,NEW ECOSYSTEM_TYPE
3,,update,ecosystem_subtype,NEW ECOSYSTEM_SUBTYPE
4,,update,doi,v1
5,v1,update,has_raw_value,NEW DOI
6,,update,principal_investigator,v2
7,v2,,name,NEW PI NAME
8,v2,,has_raw_value,NEW RAW NAME
9,,update,description,NEW DESCRIPTION


In [225]:
# Load the same change sheet file through nmdc_runtime's load_changesheet,
# passing the `mongodb` handle. NOTE(review): how the loader uses the database
# is not visible in this notebook -- confirm against nmdc_runtime.
sheetDf = load_changesheet('data/changesheet-without-separator3.tsv', mongodb)
# sheetDf  (uncomment to display the loaded change sheet)

### update mongodb using change sheet

In [238]:
# Run the change sheet: process_changesheet applies the update command to
# temp_db, and print_results prints the before/after documents and validation
# errors for each result.
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0114663",
  "name": "Riverbed sediment microbial communities from the Columbia River, Washington, USA",
  "description": "A metagenomic study to couple microbial communities to carbon and contaminant biogeochemistry in the groundwater-surface water interaction zone (hyporheic zone).",
  "ecosystem": "Environmental",
  "ecosystem_category": "Aquatic",
  "ecosystem_type": "Freshwater",
  "ecosystem_subtype": "River",
  "specific_ecosystem": "Sediment",
  "principal_investigator": {
    "has_raw_value": "James Stegen"
  },
  "doi": {
    "has_raw_value": "10.25585/1487765"
  },
  "type": "nmdc:Study"
}
------------------ AFTER ------------------
{
  "id": "gold:Gs0114663",
  "name": "NEW STUDY NAME",
  "description": "NEW DESCRIPTION",
  "ecosystem": "NEW ECOSYSTEM",
  "ecosystem_category": "Aquatic",
  "ecosystem_type": "NEW ECOSYSTEM_TYPE",
  "ecosystem_subtype": "NEW ECOSYSTEM_SUBTYPE",
  "specific_ecosystem": "Sediment",
