In [1]:
# autoreload modules; useful for testing
%load_ext autoreload
%autoreload 2

In [2]:
import json
import copy
import pandas as pd
from toolz.dicttoolz import assoc_in, merge, merge_with, get_in, dissoc
from dotenv import dotenv_values
from pymongo import MongoClient
from pymongo.database import Database as MongoDatabase
from nmdc_runtime.api.core.metadata import load_changesheet, update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd

### load mongodb via env info

In [3]:
# Read the connection settings from the local env file. The path is relative,
# so it resolves against the directory the notebook kernel was started in.
config = dotenv_values("../../.env.localhost")
# Display the host as a quick check of which server the notebook will target.
# NOTE(review): the displayed value is kept in the saved cell output.
config["MONGO_HOST"]

'mongodb://localhost:27018'

In [4]:
# Connect using the host and credentials held in `config`.
client = MongoClient(host=config["MONGO_HOST"], username=config["MONGO_USERNAME"], password=config["MONGO_PASSWORD"])
# Handle to the "nmdc" database; passed to load_changesheet and
# process_changesheet in the cells below.
mongodb = client["nmdc"]

### create temp database and temp_set collection for testing

In [5]:
# Drop any "temp_db" left over from a previous run so this run starts from a
# clean scratch database, then take a handle to it.
if "temp_db" in client.list_database_names():
    client.drop_database("temp_db")
temp_db = client["temp_db"]

### helper functions

In [6]:
# single entry point for processing a change sheet: chains
# mongo_update_command_for, copy_docs_in_update_cmd and update_mongo_db
def process_changesheet(changeDf, mdb: MongoDatabase, temp_db: MongoDatabase):
    """Run a change sheet against a scratch database and return the outcome.

    Steps, in order:
      1. ``mongo_update_command_for(changeDf)`` builds the update command(s).
      2. ``copy_docs_in_update_cmd(commands, mdb, temp_db)`` is called with the
         source and scratch databases (presumably copying the documents the
         commands reference from ``mdb`` into ``temp_db`` -- confirm against
         the helper).
      3. ``update_mongo_db(temp_db, commands)`` runs the commands with
         ``temp_db`` as the target database.

    Args:
        changeDf: change sheet dataframe; in this notebook it is the value
            returned by ``load_changesheet``.
        mdb: source database handle.
        temp_db: scratch database handle passed to the copy and update steps.

    Returns:
        The value returned by ``update_mongo_db``. In this notebook it is
        passed straight to ``print_results``.
    """
    commands = mongo_update_command_for(changeDf)
    copy_docs_in_update_cmd(commands, mdb, temp_db)
    outcome = update_mongo_db(temp_db, commands)
    return outcome
    
#     for id_, cmd in update_cmd.items():
#         print('id:', id_)
#         print(cmd)
#         print('\n')

In [7]:
# prints the change sheet results, one section per document
def print_results(results, print_before=True, print_after=True, print_errors=True):
    """Print a plain-text report for each change sheet result.

    Args:
        results: iterable of dicts, each with the keys ``"id"``,
            ``"doc_before"``, ``"doc_after"`` and ``"validation_errors"``
            (an iterable of strings).
        print_before: when true, print the ``"doc_before"`` document as JSON.
        print_after: when true, print the ``"doc_after"`` document as JSON.
        print_errors: when true, print the validation errors, one per line.

    Returns:
        None; the report is written to standard output.
    """
    for result in results:
        print(f"\n============== {result['id']} ==============")
        if print_before:
            print("------------------ BEFORE ------------------")
            # default=str: values json cannot encode natively (e.g. dates) are
            # shown via str() instead of aborting the report with a TypeError.
            print(json.dumps(result["doc_before"], indent=2, default=str))
        if print_after:
            print("------------------ AFTER ------------------")
            print(json.dumps(result["doc_after"], indent=2, default=str))
        if print_errors:
            print("------------------ ERRORS ------------------")
            print("\n".join(result["validation_errors"]))

### set dataframe display options

In [8]:
# Widen pandas' display for the change sheet tables shown below:
# allow up to 1000 characters per output line and put no cap on the
# number of columns displayed.
pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", None)

### process change sheet 1
Simple test with two studies. The doi is changed using the "." path separator.

In [9]:
pd.read_csv("data/changesheet-with-separator1.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,UPDATED NAME 1
1,,update,ecosystem,SOIL
2,gold:Gs0103573,update,doi.has_raw_value,10.9999/8888
3,,update,name,UPDATED NAME 2


In [10]:
sheetDf = load_changesheet("data/changesheet-with-separator1.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0114663,update,name,UPDATED NAME 1,gold:Gs0114663,,name,study_set,study,name,string,False
1,,update,ecosystem,SOIL,gold:Gs0114663,,ecosystem,study_set,study,ecosystem,string,False
2,gold:Gs0103573,update,doi.has_raw_value,10.9999/8888,gold:Gs0103573,,doi.has_raw_value,study_set,study,doi|has raw value,attribute value|string,False|False
3,,update,name,UPDATED NAME 2,gold:Gs0103573,,name,study_set,study,name,string,False


In [11]:
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Mitchel J. Doktycz",
    "name": "Mitchel J. Doktycz",
    "profile_image_url": "https://portal.nersc.gov/project/

### process change sheet 2
Test with two studies, but the second study uses grouping variables/symbols to update object values.

In [12]:
pd.read_csv("data/changesheet-without-separator1.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0103573,update,name,NEW NAME 1
1,,,ecosystem,SOIL
2,gold:Gs0114663,update,doi,v1
3,v1,,has_raw_value,10.9999/8888
4,,update,name,NEW NAME 2
5,,update,principal_investigator,v2
6,v2,,has_raw_value,NEW RAW VALUE 2


In [13]:
sheetDf = load_changesheet("data/changesheet-without-separator1.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0103573,update,name,NEW NAME 1,gold:Gs0103573,,name,study_set,study,name,string,False
1,,update,ecosystem,SOIL,gold:Gs0103573,,ecosystem,study_set,study,ecosystem,string,False
2,gold:Gs0114663,update,doi,v1,gold:Gs0114663,v1,,study_set,study,doi,attribute value,False
3,v1,update,has_raw_value,10.9999/8888,gold:Gs0114663,v1,doi.has_raw_value,study_set,study,doi|has raw value,attribute value|string,False|False
4,,update,name,NEW NAME 2,gold:Gs0114663,,name,study_set,study,name,string,False
5,,update,principal_investigator,v2,gold:Gs0114663,v2,,study_set,study,principal investigator,person value,False
6,v2,update,has_raw_value,NEW RAW VALUE 2,gold:Gs0114663,v2,principal_investigator.has_raw_value,study_set,study,principal investigator|has raw value,person value|string,False|False


In [14]:
print_results(process_changesheet(sheetDf, mongodb, temp_db))


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Mitchel J. Doktycz",
    "name": "Mitchel J. Doktycz",
    "profile_image_url": "https://portal.nersc.gov/project/

### process change sheet 3
Test with two studies. Both studies are updated using grouping variables/symbols.  
Note the reuse of variables/symbols in each id group. A variable/symbol is local to the id group it appears in.

In [15]:
pd.read_csv("data/changesheet-without-separator3.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114663,update,name,NEW STUDY NAME 1
1,,update,ecosystem,NEW ECOSYSTEM 1
2,,update,ecosystem_type,NEW ECOSYSTEM_TYPE 1
3,,update,ecosystem_subtype,NEW ECOSYSTEM_SUBTYPE 1
4,,update,doi,v1
5,v1,update,has_raw_value,NEW DOI 1
6,,update,principal_investigator,v2
7,v2,,name,NEW PI NAME 1
8,v2,,has_raw_value,NEW RAW NAME 1
9,,update,description,NEW DESCRIPTION 1


In [16]:
sheetDf = load_changesheet("data/changesheet-without-separator3.tsv", mongodb)
sheetDf 

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0114663,update,name,NEW STUDY NAME 1,gold:Gs0114663,,name,study_set,study,name,string,False
1,,update,ecosystem,NEW ECOSYSTEM 1,gold:Gs0114663,,ecosystem,study_set,study,ecosystem,string,False
2,,update,ecosystem_type,NEW ECOSYSTEM_TYPE 1,gold:Gs0114663,,ecosystem_type,study_set,study,ecosystem_type,string,False
3,,update,ecosystem_subtype,NEW ECOSYSTEM_SUBTYPE 1,gold:Gs0114663,,ecosystem_subtype,study_set,study,ecosystem_subtype,string,False
4,,update,doi,v1,gold:Gs0114663,v1,,study_set,study,doi,attribute value,False
5,v1,update,has_raw_value,NEW DOI 1,gold:Gs0114663,v1,doi.has_raw_value,study_set,study,doi|has raw value,attribute value|string,False|False
6,,update,principal_investigator,v2,gold:Gs0114663,v2,,study_set,study,principal investigator,person value,False
7,v2,update,name,NEW PI NAME 1,gold:Gs0114663,v2,principal_investigator.name,study_set,study,principal investigator|name,person value|string,False|False
8,v2,update,has_raw_value,NEW RAW NAME 1,gold:Gs0114663,v2,principal_investigator.has_raw_value,study_set,study,principal investigator|has raw value,person value|string,False|False
9,,update,description,NEW DESCRIPTION 1,gold:Gs0114663,,description,study_set,study,description,string,False


In [17]:
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True)


------------------ BEFORE ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Mitchel J. Doktycz",
    "name": "Mitchel J. Doktycz",
    "profile_image_url": "https://portal.nersc.gov/project/

### set multiple values for credit associations

In [18]:
pd.read_csv("data/changesheet-array-item-nested-attributes.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0114675,update,has_credit_associations,ca1
1,ca1,update,applied_role,Conceptualization
2,ca1,update,applies_to_person.name,CREDIT NAME 1
3,ca1,update,applies_to_person.email,CREDIT_NAME_1@foo.edu
4,ca1,update,applies_to_person.orcid,orcid:0000-0000-0000-0001


In [19]:
sheetDf = load_changesheet("data/changesheet-array-item-nested-attributes.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0114675,update,has_credit_associations,ca1,gold:Gs0114675,ca1,,study_set,study,has credit associations,credit association,True
1,ca1,update,applied_role,Conceptualization,gold:Gs0114675,ca1,has_credit_associations.applied_role,study_set,study,has credit associations|applied role,credit association|credit enum,True|False
2,ca1,update,applies_to_person.name,CREDIT NAME 1,gold:Gs0114675,ca1,has_credit_associations.applies_to_person.name,study_set,study,has credit associations|applies to person|name,credit association|person value|string,True|False|False
3,ca1,update,applies_to_person.email,CREDIT_NAME_1@foo.edu,gold:Gs0114675,ca1,has_credit_associations.applies_to_person.email,study_set,study,has credit associations|applies to person|email,credit association|person value|string,True|False|False
4,ca1,update,applies_to_person.orcid,orcid:0000-0000-0000-0001,gold:Gs0114675,ca1,has_credit_associations.applies_to_person.orcid,study_set,study,has credit associations|applies to person|orcid,credit association|person value|string,True|False|False


In [20]:
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=False)


------------------ AFTER ------------------
{
  "id": "gold:Gs0114675",
  "name": "Deep subsurface shale carbon reservoir microbial communities from Ohio and West Virginia, USA",
  "description": "This project aims to improve the understanding of microbial diversity and metabolism in deep shale, with implications for novel enzyme discovery and energy development. This project was conducted along two Appalachian basin shales, the Marcellus and Utica/Point Pleasant formations in Pennsylvania and Ohio, respectively. Samples were collected from input and produced fluids up to a year after hydraulic fracturing at varying depths and locations (4 wells, 2 basin shales).\n",
  "ecosystem": "Environmental",
  "ecosystem_category": "Terrestrial",
  "ecosystem_type": "Deep subsurface",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Kelly Wrighton",
    "profile_image_url": "https://portal.nersc.gov/project/m3408/

### set website for principal investigator

In [21]:
pd.read_csv("data/changesheet-update-pi-websites.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0103573,update,principal_investigator.has_raw_value,NEW PI NAME
1,,update,principal_investigator.name,NEW PI NAME
2,,update,principal_investigator.profile_image_url,https://portal.nersc.gov/NEW-PI-NAME.jpg
3,,update,principal_investigator.orcid,orcid:0000-0000-0000-0000
4,,update,principal_investigator.websites,https://www.ornl.gov/staff-profile/NEW-PI-NAME


In [22]:
sheetDf = load_changesheet("data/changesheet-update-pi-websites.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0103573,update,principal_investigator.has_raw_value,NEW PI NAME,gold:Gs0103573,,principal_investigator.has_raw_value,study_set,study,principal investigator|has raw value,person value|string,False|False
1,,update,principal_investigator.name,NEW PI NAME,gold:Gs0103573,,principal_investigator.name,study_set,study,principal investigator|name,person value|string,False|False
2,,update,principal_investigator.profile_image_url,https://portal.nersc.gov/NEW-PI-NAME.jpg,gold:Gs0103573,,principal_investigator.profile_image_url,study_set,study,principal investigator|profile image url,person value|string,False|False
3,,update,principal_investigator.orcid,orcid:0000-0000-0000-0000,gold:Gs0103573,,principal_investigator.orcid,study_set,study,principal investigator|orcid,person value|string,False|False
4,,update,principal_investigator.websites,https://www.ornl.gov/staff-profile/NEW-PI-NAME,gold:Gs0103573,,principal_investigator.websites,study_set,study,principal investigator|websites,person value|string,False|True


In [23]:
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=False)


------------------ AFTER ------------------
{
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "NEW PI NAME",
    "name": "NEW PI NAME",
    "profile_image_url": "https://portal.nersc.gov/NEW-PI-NAME.jpg",
    "

### remove items (e.g., a funding source) from a list/array in the document

In [24]:
pd.read_csv("data/changesheet-remove-item.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0135149,remove items,funding_sources,A portion of this research was performed under...


In [25]:
sheetDf = load_changesheet("data/changesheet-remove-item.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0135149,remove items,funding_sources,A portion of this research was performed under...,gold:Gs0135149,,funding_sources,study_set,study,funding sources,string,True


In [27]:
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True)


------------------ BEFORE ------------------
{
  "id": "gold:Gs0135149",
  "name": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States",
  "description": "This research project aimed to understand how snow accumulation and snowmelt influences the mobilization of nitrogen through the soil microbiome in a mountainous catchment at the East River Watershed in Colorado. This project sought to identify bacteria, archaea, and fungi that were associated with the microbial biomass bloom that occurs during winter and the biomass crash following snowmelt. This project also sought to understand whether the traits that govern microbial community assembly during and after snowmelt were phylogenetically conserved. Samples were collected during winter, the snowmelt period, and after snowmelt in spring, from an area that transitioned from an upland hillslope to a riparian floodplain.\n\nThis project is part of the Watershed Function Science Focus 

### remove a property (i.e., unset it) from the document

In [28]:
pd.read_csv("data/changesheet-remove-property.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0135149,remove,ess_dive_datasets,doi:10.21952/WTR/1573029


In [29]:
sheetDf = load_changesheet("data/changesheet-remove-property.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0135149,remove,ess_dive_datasets,doi:10.21952/WTR/1573029,gold:Gs0135149,,ess_dive_datasets,study_set,study,ess dive datasets,string,True


In [30]:
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True)


------------------ BEFORE ------------------
{
  "id": "gold:Gs0135149",
  "name": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States",
  "description": "This research project aimed to understand how snow accumulation and snowmelt influences the mobilization of nitrogen through the soil microbiome in a mountainous catchment at the East River Watershed in Colorado. This project sought to identify bacteria, archaea, and fungi that were associated with the microbial biomass bloom that occurs during winter and the biomass crash following snowmelt. This project also sought to understand whether the traits that govern microbial community assembly during and after snowmelt were phylogenetically conserved. Samples were collected during winter, the snowmelt period, and after snowmelt in spring, from an area that transitioned from an upland hillslope to a riparian floodplain.\n\nThis project is part of the Watershed Function Science Focus 

### replace value with new value in document
Note the use of the "|" for multiple values.  
The **update** action performs the same replacement.

In [31]:
pd.read_csv("data/changesheet-replace.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0135149,replace,ess_dive_datasets,doi:ess_dive1|doi:ess_dive2


In [32]:
sheetDf = load_changesheet("data/changesheet-replace.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0135149,replace,ess_dive_datasets,doi:ess_dive1|doi:ess_dive2,gold:Gs0135149,,ess_dive_datasets,study_set,study,ess dive datasets,string,True


In [33]:
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True)


------------------ BEFORE ------------------
{
  "id": "gold:Gs0135149",
  "name": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States",
  "description": "This research project aimed to understand how snow accumulation and snowmelt influences the mobilization of nitrogen through the soil microbiome in a mountainous catchment at the East River Watershed in Colorado. This project sought to identify bacteria, archaea, and fungi that were associated with the microbial biomass bloom that occurs during winter and the biomass crash following snowmelt. This project also sought to understand whether the traits that govern microbial community assembly during and after snowmelt were phylogenetically conserved. Samples were collected during winter, the snowmelt period, and after snowmelt in spring, from an area that transitioned from an upland hillslope to a riparian floodplain.\n\nThis project is part of the Watershed Function Science Focus 

### insert items into a list/array in the document
Note use of "|" for multiple values. If the property does not exist, it will be created.

In [34]:
pd.read_csv("data/changesheet-insert.tsv", sep="\t", dtype="string").fillna('')

Unnamed: 0,id,action,attribute,value
0,gold:Gs0135149,insert items,ess_dive_datasets,doi:ESS_DIVE_1|doi:ESS_DIVE_2
1,gold:Gs0135149,insert items,websites,http://WEBSITE_1|http://WEBSITE_2
2,gold:Gs0135149,insert items,publications,PUBLICATION 1


In [35]:
sheetDf = load_changesheet("data/changesheet-insert.tsv", mongodb)
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,group_var,path,collection_name,linkml_class,linkml_slots,ranges,multivalues
0,gold:Gs0135149,insert items,ess_dive_datasets,doi:ESS_DIVE_1|doi:ESS_DIVE_2,gold:Gs0135149,,ess_dive_datasets,study_set,study,ess dive datasets,string,True
1,gold:Gs0135149,insert items,websites,http://WEBSITE_1|http://WEBSITE_2,gold:Gs0135149,,websites,study_set,study,websites,string,True
2,gold:Gs0135149,insert items,publications,PUBLICATION 1,gold:Gs0135149,,publications,study_set,study,publications,string,True


In [36]:
print_results(process_changesheet(sheetDf, mongodb, temp_db), print_before=True)


------------------ BEFORE ------------------
{
  "id": "gold:Gs0135149",
  "name": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States",
  "description": "This research project aimed to understand how snow accumulation and snowmelt influences the mobilization of nitrogen through the soil microbiome in a mountainous catchment at the East River Watershed in Colorado. This project sought to identify bacteria, archaea, and fungi that were associated with the microbial biomass bloom that occurs during winter and the biomass crash following snowmelt. This project also sought to understand whether the traits that govern microbial community assembly during and after snowmelt were phylogenetically conserved. Samples were collected during winter, the snowmelt period, and after snowmelt in spring, from an area that transitioned from an upland hillslope to a riparian floodplain.\n\nThis project is part of the Watershed Function Science Focus 