In [1]:
# autoreload modules; useful for testing
# %load_ext autoreload
# %autoreload 2

In [2]:
import json
import copy
import pandas as pd
from toolz.dicttoolz import assoc_in, merge, dissoc
from dotenv import dotenv_values
from pymongo import MongoClient
from pymongo.database import Database as MongoDatabase
from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd

### load mongodb via env info

In [3]:
# Read the connection settings from the localhost dotenv file
# (the path is resolved relative to the current working directory).
config = dotenv_values(dotenv_path="../../.env.localhost")
# Display the configured host so it is clear which MongoDB instance is targeted.
config["MONGO_HOST"]

'mongodb://localhost:27018'

In [4]:
# Connect with the host and credentials read from the dotenv config.
client = MongoClient(
    host=config["MONGO_HOST"],
    username=config["MONGO_USERNAME"],
    password=config["MONGO_PASSWORD"],
)
# Handle to the database that the change sheets are loaded against.
mongodb = client["nmdc_etl_staging"]

### create temp database and temp_set collection for testing

In [5]:
# Take the handle first: MongoDB only creates a database on first write,
# so this does not create anything on the server.
temp_db = client["temp_db"]
# Drop any leftover copy from a previous run so every run starts clean.
if temp_db.name in client.list_database_names():
    client.drop_database(temp_db.name)

### helper functions

In [6]:
def process_changesheet(changeDf, mdb: MongoDatabase, temp_db: MongoDatabase):
    """Run a loaded change sheet end to end and return the update results.

    Chains three helpers from ``nmdc_runtime.api.core.metadata``: builds the
    update command from ``changeDf``, hands that command and both database
    handles to ``copy_docs_in_update_cmd``, then applies the command to
    ``temp_db`` with ``update_mongo_db``.

    Args:
        changeDf: change sheet as returned by ``load_changesheet``.
        mdb: database handle passed, together with ``temp_db``, to
            ``copy_docs_in_update_cmd``.
        temp_db: database handle the update command is applied to.

    Returns:
        Whatever ``update_mongo_db`` returns for the command.
    """
    command = mongo_update_command_for(changeDf)
    # NOTE(review): presumably copies the documents named in the command from
    # `mdb` into `temp_db` so the update runs against copies -- confirm.
    copy_docs_in_update_cmd(command, mdb, temp_db)
    results = update_mongo_db(temp_db, command)
    return results

In [7]:
def print_results(results):
    """Print a before/after report for every change sheet result.

    For each entry this prints a header containing its ``id``, the
    ``doc_before`` and ``doc_after`` documents as indented JSON, and the
    ``validation_errors`` strings one per line. Everything goes to stdout;
    nothing is returned.

    Args:
        results: iterable of mappings with the keys ``id``, ``doc_before``,
            ``doc_after`` (both JSON-serializable) and ``validation_errors``
            (an iterable of strings).
    """
    # The position of a result is never printed, so iterate directly
    # instead of going through enumerate().
    for result in results:
        print(f"\n============== {result['id']} ==============")
        print("------------------ BEFORE ------------------")
        print(json.dumps(result["doc_before"], indent=2))
        print("------------------ AFTER ------------------")
        print(json.dumps(result["doc_after"], indent=2))
        print("------------------ ERRORS ------------------")
        print("\n".join(result["validation_errors"]))

### set dataframe display options

In [8]:
# Show every column (None removes the column limit) and allow wide rows
# so the change sheet previews are not truncated or wrapped.
pd.options.display.max_columns = None
pd.options.display.width = 1000

### process change sheet 1
Simple test with two studies. The doi is changed using the "." path separator (`doi.has_raw_value`).

In [9]:
# Preview the raw change sheet as stored in the TSV file:
# every column is read as a string and missing cells are shown as ''.
(
    pd.read_csv("data/changesheet-with-separator1.tsv", sep="\t", dtype="string")
    .fillna("")
)

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,UPDATED NAME 1
1,,update,ecosystem,SOIL
2,gold:Gs0103573,update,doi.has_raw_value,10.9999/8888
3,,update,name,UPDATED NAME 2


In [10]:
# Parse change sheet 1 with load_changesheet, which is also handed the
# `mongodb` database handle; the result feeds process_changesheet below.
sheetDf = load_changesheet("data/changesheet-with-separator1.tsv", mongodb)
# Uncomment to inspect the parsed sheet:
# sheetDf

In [11]:
# Apply change sheet 1 against temp_db and print a before/after report per result.
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "Defining the functional diversity of the Populus root microbiome",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study"
}
------------------ AFTER ------------------
{
  "id": "gold:Gs0103573",
  "name": "UPDATED NAME 2",
  "description": "Defining the functional diversity of the Populus root microbiome",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study",
  "doi": {


### process change sheet 2
Test with two studies, but the second study uses grouping variables/symbols to update object values.

In [12]:
# Preview the raw change sheet as stored in the TSV file:
# every column is read as a string and missing cells are shown as ''.
(
    pd.read_csv("data/changesheet-without-separator1.tsv", sep="\t", dtype="string")
    .fillna("")
)

Unnamed: 0,id,action,attribute,value
0,gold:Gs0103573,update,name,NEW NAME 1
1,,,ecosystem,SOIL
2,gold:Gs0114663,update,doi,v1
3,v1,,has_raw_value,10.9999/8888
4,,update,name,NEW NAME 2
5,,update,principal_investigator,v2
6,v2,,has_raw_value,NEW RAW VALUE 2


In [13]:
# Parse change sheet 2 with load_changesheet, which is also handed the
# `mongodb` database handle; the result feeds process_changesheet below.
sheetDf = load_changesheet("data/changesheet-without-separator1.tsv", mongodb)
# Uncomment to inspect the parsed sheet:
# sheetDf

In [14]:
# Apply change sheet 2 against temp_db and print a before/after report per result.
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "Defining the functional diversity of the Populus root microbiome",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study"
}
------------------ AFTER ------------------
{
  "id": "gold:Gs0103573",
  "name": "NEW NAME 1",
  "description": "Defining the functional diversity of the Populus root microbiome",
  "ecosystem": "SOIL",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study"
}
------------------ ERRORS

### process change sheet 3
Test with two studies. Both studies are updated using grouping variables/symbols.  
Note the reuse of variables/symbols in each id group. A variable/symbol is local to the id group it appears in.

In [15]:
# Preview the raw change sheet as stored in the TSV file:
# every column is read as a string and missing cells are shown as ''.
(
    pd.read_csv("data/changesheet-without-separator3.tsv", sep="\t", dtype="string")
    .fillna("")
)

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,NEW STUDY NAME 1
1,,update,ecosystem,NEW ECOSYSTEM 1
2,,update,ecosystem_type,NEW ECOSYSTEM_TYPE 1
3,,update,ecosystem_subtype,NEW ECOSYSTEM_SUBTYPE 1
4,,update,doi,v1
5,v1,update,has_raw_value,NEW DOI 1
6,,update,principal_investigator,v2
7,v2,,name,NEW PI NAME 1
8,v2,,has_raw_value,NEW RAW NAME 1
9,,update,description,NEW DESCRIPTION 1


In [16]:
# Parse change sheet 3 with load_changesheet, which is also handed the
# `mongodb` database handle; the result feeds process_changesheet below.
sheetDf = load_changesheet("data/changesheet-without-separator3.tsv", mongodb)
# Uncomment to inspect the parsed sheet:
# sheetDf

In [17]:
# Apply change sheet 3 against temp_db and print a before/after report per result.
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "Defining the functional diversity of the Populus root microbiome",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study"
}
------------------ AFTER ------------------
{
  "id": "gold:Gs0103573",
  "name": "NEW STUDY NAME 2",
  "description": "NEW DESCRIPTION 2",
  "ecosystem": "NEW ECOSYSTEM 2",
  "ecosystem_category": "Plants",
  "ecosystem_type": "NEW ECOSYSTEM_TYPE 2",
  "ecosystem_subtype": "NEW ECOSYSTEM_SUBTYPE 2",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "NEW RAW NAME 2",
    "name": "NEW PI NAME 2"
  },
  "type": "nmdc:Study",
  "websi

### set multiple values for credit associations

In [18]:
# Preview the raw change sheet as stored in the TSV file:
# every column is read as a string and missing cells are shown as ''.
(
    pd.read_csv("data/changesheet-array-item-nested-attributes.tsv", sep="\t", dtype="string")
    .fillna("")
)

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114675,update,has_credit_associations,ca1
1,ca1,update,applied_role,Conceptualization
2,ca1,update,applies_to_person.name,Kelly Wrighton
3,ca1,update,applies_to_person.email,Kelly.Wrighton@colostate.edu
4,ca1,update,applies_to_person.orcid,orcid:0000-0003-0434-4217


In [19]:
# Parse the credit-associations change sheet with load_changesheet, which is
# also handed the `mongodb` database handle; the result feeds
# process_changesheet below.
sheetDf = load_changesheet("data/changesheet-array-item-nested-attributes.tsv", mongodb)
# Uncomment to inspect the parsed sheet:
# sheetDf

In [20]:
# Apply the credit-associations change sheet against temp_db and print a
# before/after report per result.
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0114675",
  "name": "Deep subsurface shale carbon reservoir microbial communities from Ohio and West Virginia, USA",
  "description": "Despite the importance of carbon-rich deep shale systems, little is known about microbe-catalyzed processes that may occur naturally within these formations, and how these change as a result of engineered activities. This has important implications on biodiversity change within the deep terrestrial biosphere, the efficiency of energy recovery, and the sustainability of our energy infrastructure. Although limited studies have identified diverse active microorganisms in deep shale formations, more research is needed to understand microbial mechanisms for growth and survival, and how such activities impact carbon and other biogeochemical cycles.",
  "ecosystem": "Environmental",
  "ecosystem_category": "Terrestrial",
  "ecosystem_type": "Deep subsurface",
  "ecosystem_subtype": "Unclassified",