In [1]:
# append path info to access api
import os, sys
from git_root import git_root
sys.path.append(os.path.abspath(git_root("nmdc_runtime/api/core")))

In [2]:
import json
import copy
import pandas as pds
from toolz import dicttoolz as dtlz
from pprint import pprint
from changesheets import load_changesheet, update_data
from linkml_runtime.utils.schemaview import SchemaView

### load test study data

In [3]:
with open('data/study-data1.json') as f:
    study1 = json.load(f)
with open('data/study-data2.json') as f:
    study2 = json.load(f)
with open('data/study-data3.json') as f:
    study3 = json.load(f)

### function to simulate data retrieval

In [4]:
def get_study_data(_id:str):
    global study1, study2, study3
    for s in (study1, study2, study3):
        if s['id'] == _id.strip():
            return s
    return None # if not found

In [5]:
# pprint(get_study_data('gold:Gs0110132')) # testing
# study3

### load change sheet

In [6]:
sheetDf = load_changesheet('data/changesheet-with-separator1.tsv')
sheetDf

Unnamed: 0,id,action,attribute,value,group_id,path
0,gold:Gs0110115,update,name,soil study,gold:Gs0110115,name
1,,update,ecosystem,soil,gold:Gs0110115,ecosystem
2,gold:Gs0112340,update,doi/has_raw_value,10.9999/8888,gold:Gs0112340,doi/has_raw_value
3,,update,name,data for study 2,gold:Gs0112340,name


### split changesheet into groups
Groups are returned as tuples in which:
- The first element is the grouping id.
- The second element is the **dataframe** with the changesheet info.

In [7]:
grouped = list(sheetDf.groupby("group_id"))

### update data associated with first id

In [8]:
grouped[0]

('gold:Gs0110115',
                id  action  attribute       value        group_id       path
 0  gold:Gs0110115  update       name  soil study  gold:Gs0110115       name
 1                  update  ecosystem        soil  gold:Gs0110115  ecosystem)

In [9]:
_id = grouped[0][0] # id is first element
data1 = get_study_data(_id) # get data for id
data1

{'id': 'gold:Gs0110115',
 'name': 'Avena fatua rhizosphere microbial communities from ...',
 'ecosystem': 'Host-associated',
 'ecosystem_category': 'Plants',
 'ecosystem_type': 'Rhizosphere',
 'ecosystem_subtype': 'Soil',
 'specific_ecosystem': 'Unclassified',
 'principal_investigator': {'has_raw_value': 'Mary Firestone'},
 'doi': {'has_raw_value': '10.25585/1487760'},
 'type': 'nmdc:Study'}

In [10]:
# the change dataframe is the second element
changeDf = grouped[0][1]
changeDf

Unnamed: 0,id,action,attribute,value,group_id,path
0,gold:Gs0110115,update,name,soil study,gold:Gs0110115,name
1,,update,ecosystem,soil,gold:Gs0110115,ecosystem


In [11]:
update_data(data1, changeDf, print_data=True)
# old data is on top
# updated data is on bottom

{'id': 'gold:Gs0110115', 'name': 'Avena fatua rhizosphere microbial communities from ...', 'ecosystem': 'Host-associated', 'ecosystem_category': 'Plants', 'ecosystem_type': 'Rhizosphere', 'ecosystem_subtype': 'Soil', 'specific_ecosystem': 'Unclassified', 'principal_investigator': {'has_raw_value': 'Mary Firestone'}, 'doi': {'has_raw_value': '10.25585/1487760'}, 'type': 'nmdc:Study'}


{'id': 'gold:Gs0110115',
 'name': 'soil study',
 'ecosystem': 'soil',
 'ecosystem_category': 'Plants',
 'ecosystem_type': 'Rhizosphere',
 'ecosystem_subtype': 'Soil',
 'specific_ecosystem': 'Unclassified',
 'principal_investigator': {'has_raw_value': 'Mary Firestone'},
 'doi': {'has_raw_value': '10.25585/1487760'},
 'type': 'nmdc:Study'}

### update data associated with second id

In [12]:
grouped[1]

('gold:Gs0112340',
                id  action          attribute             value  \
 2  gold:Gs0112340  update  doi/has_raw_value      10.9999/8888   
 3                  update               name  data for study 2   
 
          group_id               path  
 2  gold:Gs0112340  doi/has_raw_value  
 3  gold:Gs0112340               name  )

In [14]:
_id = grouped[1][0] # id is first element
data2 = get_study_data(_id) # get data for id
data2

{'id': 'gold:Gs0112340',
 'name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations',
 'description': 'A fundamental challenge of microbial environmental science is ...',
 'ecosystem': 'Environmental',
 'ecosystem_category': 'Terrestrial',
 'ecosystem_type': 'Soil',
 'ecosystem_subtype': 'Wetlands',
 'specific_ecosystem': 'Permafrost',
 'principal_investigator': {'has_raw_value': 'Virginia Rich'},
 'doi': {'has_raw_value': '10.25585/1487764'},
 'type': 'nmdc:Study'}

In [15]:
# the change dataframe is the second element
changeDf = grouped[1][1]
changeDf

Unnamed: 0,id,action,attribute,value,group_id,path
2,gold:Gs0112340,update,doi/has_raw_value,10.9999/8888,gold:Gs0112340,doi/has_raw_value
3,,update,name,data for study 2,gold:Gs0112340,name


In [16]:
update_data(data2, changeDf, print_data=True)
# old data is on top
# updated data is on bottom

{'id': 'gold:Gs0112340', 'name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations', 'description': 'A fundamental challenge of microbial environmental science is ...', 'ecosystem': 'Environmental', 'ecosystem_category': 'Terrestrial', 'ecosystem_type': 'Soil', 'ecosystem_subtype': 'Wetlands', 'specific_ecosystem': 'Permafrost', 'principal_investigator': {'has_raw_value': 'Virginia Rich'}, 'doi': {'has_raw_value': '10.25585/1487764'}, 'type': 'nmdc:Study'}


{'id': 'gold:Gs0112340',
 'name': 'data for study 2',
 'description': 'A fundamental challenge of microbial environmental science is ...',
 'ecosystem': 'Environmental',
 'ecosystem_category': 'Terrestrial',
 'ecosystem_type': 'Soil',
 'ecosystem_subtype': 'Wetlands',
 'specific_ecosystem': 'Permafrost',
 'principal_investigator': {'has_raw_value': 'Virginia Rich'},
 'doi': {'has_raw_value': '10.9999/8888'},
 'type': 'nmdc:Study'}

### putting it all together in a loop

In [17]:
grouped = list(sheetDf.groupby("group_id"))

In [18]:
for g in grouped:
    _id = g[0] # id is the first element
    changeDf = g[1] # dataframe is the second element
    data = get_study_data(_id) # get data for id
    print(changeDf)
    print(json.dumps(update_data(data2, changeDf), indent=2))
    

               id  action  attribute       value        group_id       path
0  gold:Gs0110115  update       name  soil study  gold:Gs0110115       name
1                  update  ecosystem        soil  gold:Gs0110115  ecosystem
{
  "id": "gold:Gs0112340",
  "name": "soil study",
  "description": "A fundamental challenge of microbial environmental science is ...",
  "ecosystem": "soil",
  "ecosystem_category": "Terrestrial",
  "ecosystem_type": "Soil",
  "ecosystem_subtype": "Wetlands",
  "specific_ecosystem": "Permafrost",
  "principal_investigator": {
    "has_raw_value": "Virginia Rich"
  },
  "doi": {
    "has_raw_value": "10.25585/1487764"
  },
  "type": "nmdc:Study"
}
               id  action          attribute             value  \
2  gold:Gs0112340  update  doi/has_raw_value      10.9999/8888   
3                  update               name  data for study 2   

         group_id               path  
2  gold:Gs0112340  doi/has_raw_value  
3  gold:Gs0112340               name  
{
