In [105]:
import json
import copy
import pandas as pds
from toolz import dicttoolz as dtlz
from pprint import pprint
from linkml_runtime.utils.schemaview import SchemaView

### load test study data

In [158]:
with open('data/study-data1.json') as f:
    study1 = json.load(f)
with open('data/study-data2.json') as f:
    study2 = json.load(f)
with open('data/study-data3.json') as f:
    study3 = json.load(f)

### function to simulate data retrieval

In [21]:
def get_study_data(id:str):
    global study1, study2, study3
    for s in (study1, study2, study3):
        if s['id'] == id.strip():
            return s
    return None # if not found

In [26]:
# pprint(get_study_data('gold:Gs0110132')) # testing
# study3

### function to load the changesheet into dataframe

In [44]:
def load_changesheet(filename):
    # load dataframe replacing NaN with ''
    df = pds.read_csv(filename, sep='\t').fillna('')
    
    # add a group id column but copy only iris (has ":" in it)
    df['group_id'] = df['id'].map(lambda x: x if ':' in x else '')
    
    # fill in blank group ids
    for i in range(len(df)):
        if len(str(df.loc[i, "group_id"]).strip()) < 1:
            df.loc[i, "group_id"] = df.loc[i-1, "group_id"]
            
    return df

### funciton to update data

In [153]:
def update_data(data:dict, 
                changeDf:pds.DataFrame,
                separator='/',
                print_data=False,
                print_update=False) -> dict:
    # local functon to get attribute list
    def get_attributes(attribute:str):
        # split on the separator
        # results in a list of lists
        attributes = attribute.split(separator)
            
        # flatten list
        rv = []
        for a in attributes:
            if type([]) == type(a):
                rv.extend(a)
            else:
                rv.append(a)
        return rv
    
    # make a copy of the data for testing purposes
    new_data = copy.deepcopy(data)
    
    if True == print_data:
        print(new_data)
    
    # the grouped dataframes may have indexes that don't
    # line with the row number, so reset the index
    changeDf = changeDf.reset_index(drop=True)

    for i in range(len(changeDf)):
        attributes = get_attributes(changeDf.loc[i, "attribute"])
        new_val = changeDf.loc[i, "value"]
        new_data = dtlz.assoc_in(new_data, attributes, new_val)
    
    if True == print_update:
        print(new_data)
        
    return new_data
        

### load change sheet

In [60]:
sheetDf = load_changesheet('data/changesheet-with-separator1.tsv')
sheetDf

Unnamed: 0,id,action,attribute,value,group_id
0,gold:Gs0110115,update,name,soil study,gold:Gs0110115
1,,,ecosystem,soil,gold:Gs0110115
2,gold:Gs0112340,update,doi/has_raw_value,10.9999/8888,gold:Gs0112340
3,,update,name,data for study 2,gold:Gs0112340


### split changesheet into groups

In [61]:
grouped = sheetDf.groupby("group_id")

### update data associated with first id

In [65]:
list(grouped)[0]

('gold:Gs0110115',
                id  action  attribute       value        group_id
 0  gold:Gs0110115  update       name  soil study  gold:Gs0110115
 1                          ecosystem        soil  gold:Gs0110115)

In [69]:
id = list(grouped)[0][0] # id is first element
data1 = get_study_data(id) # get data for id
data1

{'id': 'gold:Gs0110115',
 'name': 'Avena fatua rhizosphere microbial communities from ...',
 'ecosystem': 'Host-associated',
 'ecosystem_category': 'Plants',
 'ecosystem_type': 'Rhizosphere',
 'ecosystem_subtype': 'Soil',
 'specific_ecosystem': 'Unclassified',
 'principal_investigator': {'has_raw_value': 'Mary Firestone'},
 'doi': {'has_raw_value': '10.25585/1487760'},
 'type': 'nmdc:Study'}

In [68]:
# the change dataframe is the second element
changeDf = list(grouped)[0][1]
changeDf

Unnamed: 0,id,action,attribute,value,group_id
0,gold:Gs0110115,update,name,soil study,gold:Gs0110115
1,,,ecosystem,soil,gold:Gs0110115


In [119]:
update_data(data1, changeDf, print_data=True)
# old data is on top
# updated data is on bottom

{'id': 'gold:Gs0110115', 'name': 'Avena fatua rhizosphere microbial communities from ...', 'ecosystem': 'Host-associated', 'ecosystem_category': 'Plants', 'ecosystem_type': 'Rhizosphere', 'ecosystem_subtype': 'Soil', 'specific_ecosystem': 'Unclassified', 'principal_investigator': {'has_raw_value': 'Mary Firestone'}, 'doi': {'has_raw_value': '10.25585/1487760'}, 'type': 'nmdc:Study'}


{'id': 'gold:Gs0110115',
 'name': 'soil study',
 'ecosystem': 'soil',
 'ecosystem_category': 'Plants',
 'ecosystem_type': 'Rhizosphere',
 'ecosystem_subtype': 'Soil',
 'specific_ecosystem': 'Unclassified',
 'principal_investigator': {'has_raw_value': 'Mary Firestone'},
 'doi': {'has_raw_value': '10.25585/1487760'},
 'type': 'nmdc:Study'}

### update data associated with second id

In [154]:
list(grouped)[1]

('gold:Gs0112340',
                id  action          attribute             value        group_id
 2  gold:Gs0112340  update  doi/has_raw_value      10.9999/8888  gold:Gs0112340
 3                  update               name  data for study 2  gold:Gs0112340)

In [155]:
id = list(grouped)[1][0] # id is first element
data2 = get_study_data(id) # get data for id
data2

{'id': 'gold:Gs0112340',
 'name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations',
 'description': 'A fundamental challenge of microbial environmental science is ...',
 'ecosystem': 'Environmental',
 'ecosystem_category': 'Terrestrial',
 'ecosystem_type': 'Soil',
 'ecosystem_subtype': 'Wetlands',
 'specific_ecosystem': 'Permafrost',
 'principal_investigator': {'has_raw_value': 'Virginia Rich'},
 'doi': {'has_raw_value': '10.25585/1487764'},
 'type': 'nmdc:Study'}

In [156]:
# the change dataframe is the second element
changeDf = list(grouped)[1][1]
changeDf

Unnamed: 0,id,action,attribute,value,group_id
2,gold:Gs0112340,update,doi/has_raw_value,10.9999/8888,gold:Gs0112340
3,,update,name,data for study 2,gold:Gs0112340


In [157]:
update_data(data2, changeDf, print_data=True)
# old data is on top
# updated data is on bottom

{'id': 'gold:Gs0112340', 'name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations', 'description': 'A fundamental challenge of microbial environmental science is ...', 'ecosystem': 'Environmental', 'ecosystem_category': 'Terrestrial', 'ecosystem_type': 'Soil', 'ecosystem_subtype': 'Wetlands', 'specific_ecosystem': 'Permafrost', 'principal_investigator': {'has_raw_value': 'Virginia Rich'}, 'doi': {'has_raw_value': '10.25585/1487764'}, 'type': 'nmdc:Study'}


{'id': 'gold:Gs0112340',
 'name': 'data for study 2',
 'description': 'A fundamental challenge of microbial environmental science is ...',
 'ecosystem': 'Environmental',
 'ecosystem_category': 'Terrestrial',
 'ecosystem_type': 'Soil',
 'ecosystem_subtype': 'Wetlands',
 'specific_ecosystem': 'Permafrost',
 'principal_investigator': {'has_raw_value': 'Virginia Rich'},
 'doi': {'has_raw_value': '10.9999/8888'},
 'type': 'nmdc:Study'}

### putting it all together in a loop

In [159]:
grouped = sheetDf.groupby("group_id")

In [168]:
for g in grouped:
    id = g[0] # id is the first element
    changeDf = g[1] # dataframe is the second element
    data = get_study_data(id) # get data for id
    print(changeDf)
    print(json.dumps(update_data(data2, changeDf), indent=2))
    

               id  action  attribute       value        group_id
0  gold:Gs0110115  update       name  soil study  gold:Gs0110115
1                          ecosystem        soil  gold:Gs0110115
{
  "id": "gold:Gs0112340",
  "name": "soil study",
  "description": "A fundamental challenge of microbial environmental science is ...",
  "ecosystem": "soil",
  "ecosystem_category": "Terrestrial",
  "ecosystem_type": "Soil",
  "ecosystem_subtype": "Wetlands",
  "specific_ecosystem": "Permafrost",
  "principal_investigator": {
    "has_raw_value": "Virginia Rich"
  },
  "doi": {
    "has_raw_value": "10.25585/1487764"
  },
  "type": "nmdc:Study"
}
               id  action          attribute             value        group_id
2  gold:Gs0112340  update  doi/has_raw_value      10.9999/8888  gold:Gs0112340
3                  update               name  data for study 2  gold:Gs0112340
{
  "id": "gold:Gs0112340",
  "name": "data for study 2",
  "description": "A fundamental challenge of microbial e