In [1]:
# autoreload modules; useful for testing
%load_ext autoreload
%autoreload 2

In [2]:
import json
import copy
import pandas as pd
from toolz.dicttoolz import assoc_in, merge, dissoc
from dotenv import dotenv_values
from pymongo import MongoClient
from pymongo.database import Database as MongoDatabase
from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd

### load mongodb via env info

In [3]:
# Read the connection settings from the localhost dotenv file; the path is
# relative, so it resolves against the directory the notebook is run from.
config = dotenv_values("../../.env.localhost")
# Last expression of the cell: displays the host so the reader can confirm
# which MongoDB instance the rest of the notebook talks to.
config["MONGO_HOST"]

'mongodb://localhost:27018'

In [4]:
# Connect with the host and credentials taken from the dotenv config.
client = MongoClient(host=config["MONGO_HOST"], username=config["MONGO_USERNAME"], password=config["MONGO_PASSWORD"])
# Handle to the "nmdc" database; later cells pass it to load_changesheet and
# process_changesheet as the source database.
mongodb = client["nmdc"]

### create temp database and temp_set collection for testing

In [5]:
# Start every run from an empty scratch database so results from a previous
# run cannot leak into this one.
# drop_database is issued unconditionally: dropping a database that does not
# exist is a no-op on the server, so no list_database_names() check (which
# needs an extra round trip and the listDatabases privilege) is required.
TEMP_DB_NAME = "temp_db"
client.drop_database(TEMP_DB_NAME)
temp_db = client[TEMP_DB_NAME]

### helper functions

In [6]:
def process_changesheet(changeDf, mdb: MongoDatabase, temp_db: MongoDatabase):
    """Run a loaded change sheet against the scratch database in one call.

    Chains the three runtime helpers: build the update command from the
    change sheet, hand it to ``copy_docs_in_update_cmd`` together with the
    source and scratch databases, then apply it to the scratch database.

    Args:
        changeDf: change sheet dataframe (as produced by ``load_changesheet``
            in this notebook).
        mdb: source database handle.
        temp_db: scratch database handle; the only database passed to
            ``update_mongo_db``.

    Returns:
        Whatever ``update_mongo_db`` returns; this notebook passes it
        straight to ``print_results``.
    """
    cmd = mongo_update_command_for(changeDf)
    # NOTE(review): presumably copies the documents named in the command from
    # mdb into temp_db so the update has something to act on - confirm.
    copy_docs_in_update_cmd(cmd, mdb, temp_db)
    results = update_mongo_db(temp_db, cmd)
    return results

In [7]:
def print_results(results, print_before=True, print_after=True, print_errors=True):
    """Print a plain-text report for each change sheet result.

    Args:
        results: iterable of dicts, each with the keys ``id``,
            ``doc_before``, ``doc_after`` and ``validation_errors``
            (only the keys for enabled sections are read, plus ``id``).
        print_before: when true, print ``doc_before`` as indented JSON.
        print_after: when true, print ``doc_after`` as indented JSON.
        print_errors: when true, print one validation error per line.

    Returns:
        None; the report is written to standard output.
    """
    for result in results:
        print(f"\n============== {result['id']} ==============")
        if print_before:
            print("------------------ BEFORE ------------------")
            # default=str: fall back to str() for values json cannot encode,
            # so one odd field does not abort the whole report.
            print(json.dumps(result["doc_before"], indent=2, default=str))
        if print_after:
            print("------------------ AFTER ------------------")
            print(json.dumps(result["doc_after"], indent=2, default=str))
        if print_errors:
            print("------------------ ERRORS ------------------")
            # str() each entry so non-string error objects can be joined too.
            print("\n".join(str(error) for error in result["validation_errors"]))

### set dataframe display options

In [8]:
# Show every column and use a wide console so change sheets are not
# truncated or wrapped when displayed.
for option_name, option_value in {"display.max_columns": None, "display.width": 1000}.items():
    pd.set_option(option_name, option_value)

### process change sheet 1
Simple test with two studies. The doi is changed using the "." path separator.

In [17]:
# Preview the raw change sheet: every column is read as a pandas string and
# missing cells are shown as empty strings.
pd.read_csv("data/changesheet-with-separator1.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,UPDATED NAME 1
1,,update,ecosystem,SOIL
2,gold:Gs0103573,update,doi.has_raw_value,10.9999/8888
3,,update,name,UPDATED NAME 2


In [61]:
# Load the same file through the runtime loader, passing the nmdc database
# handle. The saved output shows the loader adding columns such as group_id,
# path, collection_name and class_name to the raw sheet.
sheetDf = load_changesheet("data/changesheet-with-separator1.tsv", mongodb)
# sheetDf  (uncomment to display the loaded change sheet)

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,class_name,base_range,base_item_type,prop_range,item_type
0,gold:Gs0114663,update,name,UPDATED NAME 1,gold:Gs0114663,,name,study_set,Study,,,string,
1,,update,ecosystem,SOIL,gold:Gs0114663,,ecosystem,study_set,Study,,,string,
2,gold:Gs0103573,update,doi.has_raw_value,10.9999/8888,gold:Gs0103573,,doi.has_raw_value,study_set,Study,object:AttributeValue,,string,
3,,update,name,UPDATED NAME 2,gold:Gs0103573,,name,study_set,Study,,,string,


In [None]:
# Apply the change sheet to temp_db and print, for each id, the document
# before and after the update plus any validation errors.
print_results(process_changesheet(sheetDf, mongodb, temp_db))

### process change sheet 2
Test with two studies, but the second study uses grouping variables/symbols to update object values.

In [None]:
# Preview the raw change sheet: every column is read as a pandas string and
# missing cells are shown as empty strings.
pd.read_csv("data/changesheet-without-separator1.tsv", sep="\t", dtype="string").fillna('')

In [None]:
# Load the change sheet through the runtime loader, passing the nmdc database
# handle. Rebinds sheetDf, so the next cell processes this sheet.
sheetDf = load_changesheet("data/changesheet-without-separator1.tsv", mongodb)
# sheetDf  (uncomment to display the loaded change sheet)

In [None]:
# Apply the change sheet to temp_db and print, for each id, the document
# before and after the update plus any validation errors.
print_results(process_changesheet(sheetDf, mongodb, temp_db))

### process change sheet 3
Test with two studies. Both studies are updated using grouping variables/symbols.  
Note the reuse of variables/symbols in each id group. A variable/symbol is local to the id group it appears in.

In [None]:
# Preview the raw change sheet: every column is read as a pandas string and
# missing cells are shown as empty strings.
pd.read_csv("data/changesheet-without-separator3.tsv", sep="\t", dtype="string").fillna('')

In [None]:
# Load the change sheet through the runtime loader, passing the nmdc database
# handle. Rebinds sheetDf, so the next cell processes this sheet.
sheetDf = load_changesheet("data/changesheet-without-separator3.tsv", mongodb)
# sheetDf  (uncomment to display the loaded change sheet)

In [None]:
# Apply the change sheet to temp_db and print the per-id report.
# print_before=True is the same as print_results' default value.
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True)

### set multiple values for credit associations

In [None]:
# pd.read_csv("data/changesheet-array-item-nested-attributes.tsv", sep="\t", dtype="string").fillna('')

In [None]:
# sheetDf = load_changesheet("data/changesheet-array-item-nested-attributes.tsv", mongodb)
# sheetDf

In [None]:
# print_results(process_changesheet(sheetDf, mongodb, temp_db))

### set website for principal investigator

In [None]:
# Preview the raw change sheet: every column is read as a pandas string and
# missing cells are shown as empty strings.
pd.read_csv("data/changesheet-update-pi-websites.tsv", sep="\t", dtype="string").fillna('')

In [None]:
# Load the change sheet through the runtime loader, passing the nmdc database
# handle. Rebinds sheetDf, so the next cell processes this sheet.
sheetDf = load_changesheet("data/changesheet-update-pi-websites.tsv", mongodb)
# sheetDf  (uncomment to display the loaded change sheet)

In [None]:
# Apply the change sheet to temp_db and print the per-id report.
# print_before=True is the same as print_results' default value.
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True)