In [1]:
# autoreload modules; useful for testing
# %load_ext autoreload
# %autoreload 2

In [2]:
import json
import copy
import pandas as pd
from toolz.dicttoolz import assoc_in, merge, dissoc
from dotenv import dotenv_values
from pymongo import MongoClient
from pymongo.database import Database as MongoDatabase
from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd

### load mongodb via env info

In [3]:
# Read the connection settings from the local env file (path is relative to the notebook's working directory).
config = dotenv_values("../../.env.localhost")
# Echo only the Mongo host as a sanity check; the username/password entries are not displayed.
config["MONGO_HOST"]

'mongodb://localhost:27018'

In [4]:
# Connect using the host and credentials read from the env file.
client = MongoClient(
    host=config["MONGO_HOST"],
    username=config["MONGO_USERNAME"],
    password=config["MONGO_PASSWORD"],
)
# Handle to the "nmdc" database; this is the database passed to
# load_changesheet and process_changesheet in the cells below.
mongodb = client["nmdc"]

### create temp database and temp_set collection for testing

In [5]:
# Start from a clean slate: drop any "temp_db" database left over from a previous run.
if "temp_db" in client.list_database_names():
    client.drop_database("temp_db")
# Handle to the scratch database that process_changesheet is given as its temp_db argument.
temp_db = client["temp_db"]

### helper functions

In [37]:
def process_changesheet(changeDf, mdb: MongoDatabase, temp_db: MongoDatabase):
    """Process a loaded change sheet in one call.

    Wraps three nmdc_runtime helpers: ``mongo_update_command_for`` builds the
    update command from ``changeDf``, ``copy_docs_in_update_cmd`` is called
    with that command plus ``mdb`` and ``temp_db``, and ``update_mongo_db`` is
    then run against ``temp_db`` only.

    Returns:
        Whatever ``update_mongo_db`` returns; the notebook passes it straight
        to ``print_results``.
    """
    command = mongo_update_command_for(changeDf)
    # presumably copies the documents named in the command from mdb into temp_db -- verify in nmdc_runtime
    copy_docs_in_update_cmd(command, mdb, temp_db)
    outcome = update_mongo_db(temp_db, command)
    return outcome

In [26]:
def print_results(results, print_before=True, print_after=True, print_errors=True):
    """Print each change-sheet result as labelled, pretty-printed sections.

    Args:
        results: iterable of mappings with the keys ``id``, ``doc_before``,
            ``doc_after`` and ``validation_errors``.
        print_before: when true, print ``doc_before`` as indented JSON.
        print_after: when true, print ``doc_after`` as indented JSON.
        print_errors: when true, print the ``validation_errors`` entries,
            one per line.

    Returns:
        None; all output goes to stdout.
    """
    for result in results:
        print(f"\n============== {result['id']} ==============")
        if print_before:
            print("------------------ BEFORE ------------------")
            # default=str: fall back to str() for values json cannot encode
            # natively (e.g. dates) instead of raising TypeError mid-report.
            print(json.dumps(result["doc_before"], indent=2, default=str))
        if print_after:
            print("------------------ AFTER ------------------")
            print(json.dumps(result["doc_after"], indent=2, default=str))
        if print_errors:
            print("------------------ ERRORS ------------------")
            # Coerce each entry so non-string error objects do not break join().
            print("\n".join(str(error) for error in result["validation_errors"]))

### set dataframe display options

In [8]:
# Show every column and allow wide rows so the change sheets are displayed without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)

### process change sheet 1
Simple test with two studies. The doi is changed using the "." path separator.

In [55]:
# Preview the raw TSV: every column is read as a string and missing cells are shown as ''.
pd.read_csv("data/changesheet-with-separator1.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,UPDATED NAME 1
1,,update,ecosystem,SOIL
2,gold:Gs0103573,update,doi.has_raw_value,10.9999/8888
3,,update,name,UPDATED NAME 2


In [54]:
# Load the change sheet with the nmdc_runtime loader, passing it the "nmdc" database handle.
sheetDf = load_changesheet("data/changesheet-with-separator1.tsv", mongodb)
# sheetDf  # uncomment to display the loaded change sheet

In [56]:
# Apply the sheet via the scratch database and print the before/after/errors sections for each result.
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study",
  "websites": [
    "https://pmiweb.ornl.gov/pmi-project-aims/"
  ],


### process change sheet 2
Test with two studies, but the second study uses grouping variables/symbols to update object values.

In [57]:
# Preview the raw TSV: every column is read as a string and missing cells are shown as ''.
pd.read_csv("data/changesheet-without-separator1.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0103573,update,name,NEW NAME 1
1,,,ecosystem,SOIL
2,gold:Gs0114663,update,doi,v1
3,v1,,has_raw_value,10.9999/8888
4,,update,name,NEW NAME 2
5,,update,principal_investigator,v2
6,v2,,has_raw_value,NEW RAW VALUE 2


In [58]:
# Load the change sheet with the nmdc_runtime loader, passing it the "nmdc" database handle.
# Note: this rebinds sheetDf, replacing the sheet loaded in the previous section.
sheetDf = load_changesheet("data/changesheet-without-separator1.tsv", mongodb)
# sheetDf  # uncomment to display the loaded change sheet

In [59]:
# Apply the sheet via the scratch database and print the before/after/errors sections for each result.
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study",
  "websites": [
    "https://pmiweb.ornl.gov/pmi-project-aims/"
  ],


### process change sheet 3
Test with two studies. Both studies are updated using grouping variables/symbols.  
Note the reuse of variables/symbols in each id group. A variable/symbol is local to the id group it appears in.

In [60]:
# Preview the raw TSV: every column is read as a string and missing cells are shown as ''.
pd.read_csv("data/changesheet-without-separator3.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,NEW STUDY NAME 1
1,,update,ecosystem,NEW ECOSYSTEM 1
2,,update,ecosystem_type,NEW ECOSYSTEM_TYPE 1
3,,update,ecosystem_subtype,NEW ECOSYSTEM_SUBTYPE 1
4,,update,doi,v1
5,v1,update,has_raw_value,NEW DOI 1
6,,update,principal_investigator,v2
7,v2,,name,NEW PI NAME 1
8,v2,,has_raw_value,NEW RAW NAME 1
9,,update,description,NEW DESCRIPTION 1


In [61]:
# Load the change sheet with the nmdc_runtime loader, passing it the "nmdc" database handle.
# Note: this rebinds sheetDf, replacing the sheet loaded in the previous section.
sheetDf = load_changesheet("data/changesheet-without-separator3.tsv", mongodb)
# sheetDf  # uncomment to display the loaded change sheet

In [62]:
# Apply the sheet via the scratch database and print the before/after/errors
# sections for each result (all print_* flags of print_results default to True).
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study",
  "websites": [
    "https://pmiweb.ornl.gov/pmi-project-aims/"
  ],


### set multiple values for credit associations

In [18]:
# pd.read_csv("data/changesheet-array-item-nested-attributes.tsv", sep="\t", dtype="string").fillna('')

In [19]:
# sheetDf = load_changesheet("data/changesheet-array-item-nested-attributes.tsv", mongodb)
# sheetDf

In [20]:
# print_results(process_changesheet(sheetDf, mongodb, temp_db))

### set website for principal investigator

In [63]:
# Preview the raw TSV: every column is read as a string and missing cells are shown as ''.
pd.read_csv("data/changesheet-update-pi-websites.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0103573,update,principal_investigator.has_raw_value,NEW PI NAME
1,,update,principal_investigator.name,NEW PI NAME
2,,update,principal_investigator.profile_image_url,https://portal.nersc.gov/NEW-PI-NAME.jpg
3,,update,principal_investigator.orcid,orcid:0000-0000-0000-0000
4,,update,principal_investigator.websites,https://www.ornl.gov/staff-profile/NEW-PI-NAME


In [64]:
# Load the change sheet with the nmdc_runtime loader, passing it the "nmdc" database handle.
# Note: this rebinds sheetDf, replacing the sheet loaded in the previous section.
sheetDf = load_changesheet("data/changesheet-update-pi-websites.tsv", mongodb)
# sheetDf  # uncomment to display the loaded change sheet

In [65]:
# Apply the sheet via the scratch database and print the before/after/errors
# sections for each result (all print_* flags of print_results default to True).
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study",
  "websites": [
    "https://pmiweb.ornl.gov/pmi-project-aims/"
  ],
