In [1]:
import requests
import pandas as pd

In [88]:
# Define a function to get first page of results (we will use again for other requests, which is why we are defining a function)
def get_first_page_results(collection: str, filter: str, max_page_size: int, fields: str):
    og_url = f'https://api.microbiomedata.org/nmdcschema/{collection}?&filter={filter}&max_page_size={max_page_size}&projection={fields}'
    resp = requests.get(og_url)
    data = resp.json()
    
    return data

In [3]:
# Define another function to get the rest of the results using the next_page_token by calling the first funtion to get
# the initial results and then using the next_page_token to get the other results. This function returns a list of the results

def get_next_results(collection: str, filter: str, max_page_size: int, fields: str):

    # Get initial results (before next_page_token is given in results)
    initial_data = get_first_page_results(collection, filter, max_page_size, fields)
    results = initial_data["resources"]
    next_page_token = initial_data["next_page_token"]
    result_list = []

    # append first page of results to an empty list
    for result in results:
        result_list.append(result)
    
    while True:
        url = f'https://api.microbiomedata.org/nmdcschema/{collection}?&filter={filter}&max_page_size={max_page_size}&page_token={next_page_token}&projection={fields}'
        response = requests.get(url)
        data_next = response.json()
        
        results = data_next.get("resources", [])
        result_list.extend(results)
        next_page_token = data_next.get("next_page_token")
    
        if not next_page_token:
            break

    return result_list


In [4]:
# Get biosamples using functions
biosamples = get_next_results("biosample_set", '{"soil_horizon":{"$exists": true}}', 100, "id,soil_horizon")

# clarify names
for biosample in biosamples:
    biosample["biosample_id"] = biosample.pop("id")

print(len(biosamples))
print(biosamples[:4])

4650
[{'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-002vgm56'}, {'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-00dkyf35'}, {'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-00hrxp98'}, {'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-00m15h97'}]


In [5]:
# Define a function to split ids into chunks
def split_list(input_list, chunk_size=100):
    result = []
    
    for i in range(0, len(input_list), chunk_size):
        result.append(input_list[i:i + chunk_size])
        
    return result

In [6]:
# Adjust filter list for double quote string - important for mongo queries
def string_mongo_list(a_list: list):
    
    string_with_double_quotes = str(a_list).replace("'", '"')

    return string_with_double_quotes


In [7]:
# Get list of ids (to eventually feed into query)
def get_id_list(result_list, id_name):
    id_list = []
    for item in result_list:
        if type(item[id_name]) == str:
            id_list.append(item[id_name])
        elif type(item[id_name]) == list:
            for another_item in item[id_name]:
                id_list.append(another_item)

    return id_list

In [None]:
# Define function to request NMDC metadata based on list of identifiers
def get_id_results(newest_results: list, id_field: str, query_collection: str, match_id_field: str, query_fields: str):

    # split old results into list
    result_ids = get_id_list(newest_results, id_field)

    # # make sure match_id_field has double quotes (important for mongo query)
    match_id_field.replace("'", "\"")

    # chunk up the results into sets of 100 using the split_list function and call the get_first_page_results function and append
    # results to list
    chunked_list = split_list(result_ids)
    next_results = []
    for chunk in chunked_list:
        filter_string = string_mongo_list(chunk)
        # quotes around match_id_field need to look a lot different for the final data object query
        if "data_object_type" in match_id_field:
            data = get_first_page_results(query_collection, f'{{{match_id_field}: {{"$in": {filter_string}}}}}', 100, query_fields)
        else: 
            data = get_first_page_results(query_collection, f'{{"{match_id_field}": {{"$in": {filter_string}}}}}', 100, query_fields)
        next_results.extend(data["resources"])

    return next_results

In [13]:
# Get pooling results where biosample identifiers are "has_input" in the pooling_set collection
pooling = get_id_results(biosamples, "biosample_id", "pooling_set", "has_input", "has_input,id,has_output")

# clarify names/keys/identifiers
for pool in pooling:
    pool["pooling_has_input"] = pool.pop("has_input")
    pool["pooling_has_output"] = pool.pop("has_output")
    pool["pooling_id"] = pool.pop("id")

print(len(pooling))
print(pooling[:2])

4289
[{'pooling_has_input': ['nmdc:bsm-11-zw0jr671', 'nmdc:bsm-11-ftr88019', 'nmdc:bsm-11-06qrej20'], 'pooling_has_output': ['nmdc:procsm-11-m6cgda89'], 'pooling_id': 'nmdc:poolp-11-myygnt07'}, {'pooling_has_input': ['nmdc:bsm-11-pgpaf592', 'nmdc:bsm-11-07qq9z23', 'nmdc:bsm-11-tgfagd06'], 'pooling_has_output': ['nmdc:procsm-11-4x35gd93'], 'pooling_id': 'nmdc:poolp-11-fg19qm11'}]


In [14]:
# Function to merge new results with old results (mapping keys together that match)
def merge_items(one_list, two_list, lml_key1, nl_key2):
    
    merged_list = [
        {**item, **object}
        for item in one_list
        for object in two_list
        if item[lml_key1] in object[nl_key2]
    ]

    return merged_list


In [15]:
# Merge the initial biosample results with the pooling results
merged_list = merge_items(biosamples, pooling, "biosample_id", "pooling_has_input")
print(merged_list[:2])

[{'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-002vgm56', 'pooling_has_input': ['nmdc:bsm-11-002vgm56', 'nmdc:bsm-11-pn5gjv86', 'nmdc:bsm-11-190hex25'], 'pooling_has_output': ['nmdc:procsm-11-tdf7e971'], 'pooling_id': 'nmdc:poolp-11-0gyq5c98'}, {'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-002vgm56', 'pooling_has_input': ['nmdc:bsm-11-002vgm56', 'nmdc:bsm-11-pn5gjv86', 'nmdc:bsm-11-190hex25'], 'pooling_has_output': ['nmdc:procsm-11-tdf7e971'], 'pooling_id': 'nmdc:poolp-11-0gyq5c98'}]


In [16]:
# Get processed sample results from pooling_has_ouput identifiers
# We use the pooling results and the identifier "pooling_has_output" to query the "processed_sample_set" matching the "id" field, and
# we only need the "id" field returned
process_set1 = get_id_results(pooling, "pooling_has_output", "processed_sample_set", "id", "id")

# clarify names
for processed_sample in process_set1:
    processed_sample["processed_sample1"] = processed_sample.pop("id")
print(process_set1[:4])

[{'processed_sample1': 'nmdc:procsm-11-07kg2w70'}, {'processed_sample1': 'nmdc:procsm-11-09rv7c30'}, {'processed_sample1': 'nmdc:procsm-11-0rn9p334'}, {'processed_sample1': 'nmdc:procsm-11-22s7xc89'}]


In [17]:
# Merge the process_set1 with the merged_list based on "id" in process_set1 with "pooling_has_output" from the merged_list
# merged_list = merge_items(process_set1, merged_list, "processed_sample1", "pooling_has_output")
merged_list = merge_items(process_set1, merged_list, "processed_sample1", "pooling_has_output")
print(merged_list[:2])

[{'processed_sample1': 'nmdc:procsm-11-07kg2w70', 'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-0fg7rn04', 'pooling_has_input': ['nmdc:bsm-11-0fg7rn04', 'nmdc:bsm-11-x70qj124', 'nmdc:bsm-11-bvnp5w79'], 'pooling_has_output': ['nmdc:procsm-11-07kg2w70'], 'pooling_id': 'nmdc:poolp-11-ksfepg24'}, {'processed_sample1': 'nmdc:procsm-11-07kg2w70', 'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-0fg7rn04', 'pooling_has_input': ['nmdc:bsm-11-0fg7rn04', 'nmdc:bsm-11-x70qj124', 'nmdc:bsm-11-bvnp5w79'], 'pooling_has_output': ['nmdc:procsm-11-07kg2w70'], 'pooling_id': 'nmdc:poolp-11-ksfepg24'}]


In [18]:
# Get extraction results from "processed_sample1" identifiers
# We use the process_set1 results and the identifier "processed_sample1" to query the "extraction_set" collection matching the 
# "has_input" field. We need three fields returned from the extraction set: "id", "has_input", and "has_output"
extraction_set = get_id_results(process_set1, "processed_sample1", "extraction_set", "has_input", "id,has_input,has_output")

# clarify names
for extraction in extraction_set:
    extraction["extract_has_input"] = extraction.pop("has_input")
    extraction["extract_has_output"] = extraction.pop("has_output")
    extraction["extract_id"] = extraction.pop("id")

print(extraction_set[:4])

[{'extract_has_input': ['nmdc:procsm-11-m6cgda89'], 'extract_has_output': ['nmdc:procsm-11-e3m9am88'], 'extract_id': 'nmdc:extrp-11-2hbzth07'}, {'extract_has_input': ['nmdc:procsm-11-zxr4eq64'], 'extract_has_output': ['nmdc:procsm-11-69w7d751'], 'extract_id': 'nmdc:extrp-11-8aaf5q49'}, {'extract_has_input': ['nmdc:procsm-11-50046m08'], 'extract_has_output': ['nmdc:procsm-11-wfr9gq72'], 'extract_id': 'nmdc:extrp-11-7v16d010'}, {'extract_has_input': ['nmdc:procsm-11-4x35gd93'], 'extract_has_output': ['nmdc:procsm-11-r8c2df02'], 'extract_id': 'nmdc:extrp-11-7mhth480'}]


In [19]:
# Merge the extraction_set with the merged_list based on the extract_has_input of extraction_set with processed_sample1
# from the merged list
merged_list = merge_items(merged_list, extraction_set, "processed_sample1", "extract_has_input") 
print(merged_list[:2])

[{'processed_sample1': 'nmdc:procsm-11-07kg2w70', 'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-0fg7rn04', 'pooling_has_input': ['nmdc:bsm-11-0fg7rn04', 'nmdc:bsm-11-x70qj124', 'nmdc:bsm-11-bvnp5w79'], 'pooling_has_output': ['nmdc:procsm-11-07kg2w70'], 'pooling_id': 'nmdc:poolp-11-ksfepg24', 'extract_has_input': ['nmdc:procsm-11-07kg2w70'], 'extract_has_output': ['nmdc:procsm-11-tfxm8j10'], 'extract_id': 'nmdc:extrp-11-1b4fe422'}, {'processed_sample1': 'nmdc:procsm-11-07kg2w70', 'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-0fg7rn04', 'pooling_has_input': ['nmdc:bsm-11-0fg7rn04', 'nmdc:bsm-11-x70qj124', 'nmdc:bsm-11-bvnp5w79'], 'pooling_has_output': ['nmdc:procsm-11-07kg2w70'], 'pooling_id': 'nmdc:poolp-11-ksfepg24', 'extract_has_input': ['nmdc:procsm-11-07kg2w70'], 'extract_has_output': ['nmdc:procsm-11-tfxm8j10'], 'extract_id': 'nmdc:extrp-11-1b4fe422'}]


In [20]:
# Get processed_sample_set results again from "extract_has_output"
# We use the extraction_set results and identifier "extract_has_output" to query the "processed_sample_set" collection matching the
# "id". We return only the id of the processed_sample_set
process_set2 = get_id_results(extraction_set, "extract_has_output", "processed_sample_set", "id", "id")

# clarify names
for samp in process_set2:
    samp["processed_sample2"] = samp.pop("id")

print(process_set2[:4])

[{'processed_sample2': 'nmdc:procsm-11-06bnpy24'}, {'processed_sample2': 'nmdc:procsm-11-0q8aqj17'}, {'processed_sample2': 'nmdc:procsm-11-0sva2t89'}, {'processed_sample2': 'nmdc:procsm-11-18j5nz50'}]


In [21]:
# Merge the process_set2 with the merged_list based on the proccessed_sample2 of process_set2 with extract_has_output
#from merged list
merged_list = list(merge_items(process_set2, merged_list, "processed_sample2", "extract_has_output"))
print(merged_list[:2])

[{'processed_sample2': 'nmdc:procsm-11-06bnpy24', 'processed_sample1': 'nmdc:procsm-11-zyvwg950', 'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-02n85875', 'pooling_has_input': ['nmdc:bsm-11-9hb6s366', 'nmdc:bsm-11-02n85875', 'nmdc:bsm-11-wnzbjx02'], 'pooling_has_output': ['nmdc:procsm-11-zyvwg950'], 'pooling_id': 'nmdc:poolp-11-xbcbbs40', 'extract_has_input': ['nmdc:procsm-11-zyvwg950'], 'extract_has_output': ['nmdc:procsm-11-06bnpy24'], 'extract_id': 'nmdc:extrp-11-9v7j4t61'}, {'processed_sample2': 'nmdc:procsm-11-06bnpy24', 'processed_sample1': 'nmdc:procsm-11-zyvwg950', 'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-02n85875', 'pooling_has_input': ['nmdc:bsm-11-9hb6s366', 'nmdc:bsm-11-02n85875', 'nmdc:bsm-11-wnzbjx02'], 'pooling_has_output': ['nmdc:procsm-11-zyvwg950'], 'pooling_id': 'nmdc:poolp-11-xbcbbs40', 'extract_has_input': ['nmdc:procsm-11-zyvwg950'], 'extract_has_output': ['nmdc:procsm-11-06bnpy24'], 'extract_id': 'nmdc:extrp-11-9v7j4t61'}]


In [22]:
# Get library_preparation_set results from "processed_sample2" 
# We use the process_set2 results and identifier "processed_sample2" to query the library_preparation_set collection matching the 
# "has_input" field. We return the "has_input", "has_output", and "id" fields.
library_prep_set = get_id_results(process_set2, "processed_sample2", "library_preparation_set", "has_input", "id,has_input,has_output")

# clarify names
for prep in library_prep_set:
    prep["lp_id"] = prep.pop("id")
    prep["lp_has_input"] = prep.pop("has_input")
    prep["lp_has_output"] = prep.pop("has_output")

print(library_prep_set[:4])

[{'lp_id': 'nmdc:libprp-11-acbfh839', 'lp_has_input': ['nmdc:procsm-11-e3m9am88'], 'lp_has_output': ['nmdc:procsm-11-s71h1s64']}, {'lp_id': 'nmdc:libprp-11-e39ky379', 'lp_has_input': ['nmdc:procsm-11-69w7d751'], 'lp_has_output': ['nmdc:procsm-11-pvq3cw40']}, {'lp_id': 'nmdc:libprp-11-7v2sqk43', 'lp_has_input': ['nmdc:procsm-11-wfr9gq72'], 'lp_has_output': ['nmdc:procsm-11-g7btv939']}, {'lp_id': 'nmdc:libprp-11-7y2d1222', 'lp_has_input': ['nmdc:procsm-11-r8c2df02'], 'lp_has_output': ['nmdc:procsm-11-vvhhwt22']}]


In [23]:
# Merge library preparation with merged_list on "processed_sample2" and "lp_has_input"
merged_list = merge_items(merged_list, library_prep_set, "processed_sample2", "lp_has_input")
print(merged_list[:1])

[{'processed_sample2': 'nmdc:procsm-11-06bnpy24', 'processed_sample1': 'nmdc:procsm-11-zyvwg950', 'soil_horizon': 'M horizon', 'biosample_id': 'nmdc:bsm-11-02n85875', 'pooling_has_input': ['nmdc:bsm-11-9hb6s366', 'nmdc:bsm-11-02n85875', 'nmdc:bsm-11-wnzbjx02'], 'pooling_has_output': ['nmdc:procsm-11-zyvwg950'], 'pooling_id': 'nmdc:poolp-11-xbcbbs40', 'extract_has_input': ['nmdc:procsm-11-zyvwg950'], 'extract_has_output': ['nmdc:procsm-11-06bnpy24'], 'extract_id': 'nmdc:extrp-11-9v7j4t61', 'lp_id': 'nmdc:libprp-11-z6t9c910', 'lp_has_input': ['nmdc:procsm-11-06bnpy24'], 'lp_has_output': ['nmdc:procsm-11-vyg4vn94']}]


In [24]:
# Get process_set3 results from "lp_has_output"
# Use the library_prep_set results and identifier "lp_has_output" to query the processed_sample_set collection matching the
# "id" field. We return only the "id" field
process_set3 = get_id_results(library_prep_set, "lp_has_output", "processed_sample_set", "id", "id")

# clarify keys
for samp in process_set3:
    samp["processed_sample3"] = samp.pop("id")

print(process_set3[:4])

[{'processed_sample3': 'nmdc:procsm-11-062rbk44'}, {'processed_sample3': 'nmdc:procsm-11-0tkbt064'}, {'processed_sample3': 'nmdc:procsm-11-1cmwcb97'}, {'processed_sample3': 'nmdc:procsm-11-211dc865'}]


In [25]:
# Merge proces_set3 results with merged_list on "lp_has_output" with "processed_sample3" Takes 7+ minutes??
merged_list = merge_items(process_set3, merged_list, "processed_sample3", "lp_has_output")
print(merged_list[:1])

[{'processed_sample3': 'nmdc:procsm-11-062rbk44', 'processed_sample2': 'nmdc:procsm-11-6p7xet10', 'processed_sample1': 'nmdc:procsm-11-z19p2488', 'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-01g9wf51', 'pooling_has_input': ['nmdc:bsm-11-01g9wf51', 'nmdc:bsm-11-d8mzds05', 'nmdc:bsm-11-qf98ze18'], 'pooling_has_output': ['nmdc:procsm-11-z19p2488'], 'pooling_id': 'nmdc:poolp-11-phdxxg80', 'extract_has_input': ['nmdc:procsm-11-z19p2488'], 'extract_has_output': ['nmdc:procsm-11-6p7xet10'], 'extract_id': 'nmdc:extrp-11-1w7rz874', 'lp_id': 'nmdc:libprp-11-2cy94060', 'lp_has_input': ['nmdc:procsm-11-6p7xet10'], 'lp_has_output': ['nmdc:procsm-11-062rbk44']}]


In [26]:
# Get omics_processing results from "processed_sample3"
# Use the process_set3 results identifier "processed_sample3" to query the omics_processing_set collection matching the
# "has_input" field. Return "has_input" and "id"
omics_process_set = get_id_results(process_set3, "processed_sample3", "omics_processing_set", "has_input", "has_input,id")

# clarify keys
for op in omics_process_set:
    op["op_has_input"] = op.pop("has_input")
    op["op_id"] = op.pop("id")

print(omics_process_set[:4])

[{'op_has_input': ['nmdc:procsm-11-062rbk44'], 'op_id': 'nmdc:omprc-11-bn309345'}, {'op_has_input': ['nmdc:procsm-11-0tkbt064'], 'op_id': 'nmdc:omprc-11-db9g5v27'}, {'op_has_input': ['nmdc:procsm-11-1cmwcb97'], 'op_id': 'nmdc:omprc-11-83e9ph40'}, {'op_has_input': ['nmdc:procsm-11-211dc865'], 'op_id': 'nmdc:omprc-11-kfxafd58'}]


In [27]:
# Merge omics_process_set with merged_list on processed "processed_sample3"
merged_list = merge_items(merged_list, omics_process_set, "processed_sample3", "op_has_input")
print(merged_list[:2])

[{'processed_sample3': 'nmdc:procsm-11-062rbk44', 'processed_sample2': 'nmdc:procsm-11-6p7xet10', 'processed_sample1': 'nmdc:procsm-11-z19p2488', 'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-01g9wf51', 'pooling_has_input': ['nmdc:bsm-11-01g9wf51', 'nmdc:bsm-11-d8mzds05', 'nmdc:bsm-11-qf98ze18'], 'pooling_has_output': ['nmdc:procsm-11-z19p2488'], 'pooling_id': 'nmdc:poolp-11-phdxxg80', 'extract_has_input': ['nmdc:procsm-11-z19p2488'], 'extract_has_output': ['nmdc:procsm-11-6p7xet10'], 'extract_id': 'nmdc:extrp-11-1w7rz874', 'lp_id': 'nmdc:libprp-11-2cy94060', 'lp_has_input': ['nmdc:procsm-11-6p7xet10'], 'lp_has_output': ['nmdc:procsm-11-062rbk44'], 'op_has_input': ['nmdc:procsm-11-062rbk44'], 'op_id': 'nmdc:omprc-11-bn309345'}, {'processed_sample3': 'nmdc:procsm-11-062rbk44', 'processed_sample2': 'nmdc:procsm-11-6p7xet10', 'processed_sample1': 'nmdc:procsm-11-z19p2488', 'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-01g9wf51', 'pooling_has_input': ['nmdc:bsm-11-0

In [57]:
# Get metagenome_annotation_activity_set from "op_id"
# Use the omics_process_set results identifier "op_id" to query the metagenome_annotation_activity_set collection matching
# the "was_informed_by" field. Return "was_informed_by" and "has_output"
meta_act_ann_set = get_id_results(omics_process_set, "op_id", "metagenome_annotation_activity_set", "was_informed_by", "has_output,was_informed_by,id")

# clarify names
for mga in meta_act_ann_set:
    mga["mga_id"] = mga.pop("id")
    mga["mga_was_informed_by"] = mga.pop("was_informed_by")
    mga["mga_has_output"] = mga.pop("has_output")

print(meta_act_ann_set[:2])

[{'mga_id': 'nmdc:wfmgan-11-h05qba23.1', 'mga_was_informed_by': 'nmdc:omprc-11-5qs1cd02', 'mga_has_output': ['nmdc:dobj-11-808cyq50', 'nmdc:dobj-11-13d79a82', 'nmdc:dobj-11-4tq40b21', 'nmdc:dobj-11-489g6c73', 'nmdc:dobj-11-vym3q434', 'nmdc:dobj-11-q65rb692', 'nmdc:dobj-11-pk4m4d30', 'nmdc:dobj-11-q63qm173', 'nmdc:dobj-11-wsxxdm32', 'nmdc:dobj-11-p8zj8b80', 'nmdc:dobj-11-8t1q9z93', 'nmdc:dobj-11-vc24a775', 'nmdc:dobj-11-hqwthj56', 'nmdc:dobj-11-yj5dtn21', 'nmdc:dobj-11-a111qk95', 'nmdc:dobj-11-sb7j8333', 'nmdc:dobj-11-zvd8x106', 'nmdc:dobj-11-yd3jx520', 'nmdc:dobj-11-9mdcj023', 'nmdc:dobj-11-21c88519', 'nmdc:dobj-11-f5n4f987', 'nmdc:dobj-11-zbvkx435', 'nmdc:dobj-11-cet4za93']}, {'mga_id': 'nmdc:wfmgan-11-4h48ff64.1', 'mga_was_informed_by': 'nmdc:omprc-11-1zehaw93', 'mga_has_output': ['nmdc:dobj-11-m70ftn50', 'nmdc:dobj-11-kwbeyv20', 'nmdc:dobj-11-gzt8qq86', 'nmdc:dobj-11-v18qy761', 'nmdc:dobj-11-030k6n13', 'nmdc:dobj-11-mwz10b29', 'nmdc:dobj-11-6x081742', 'nmdc:dobj-11-2hqp2e61', 'nmdc:

In [59]:
# Merge metagenome activity set with merged_list on mga_has_input
merged_list = merge_items(merged_list, meta_act_ann_set,  "op_id", "mga_was_informed_by")
print(merged_list[:2])

[{'processed_sample3': 'nmdc:procsm-11-062rbk44', 'processed_sample2': 'nmdc:procsm-11-6p7xet10', 'processed_sample1': 'nmdc:procsm-11-z19p2488', 'soil_horizon': 'O horizon', 'biosample_id': 'nmdc:bsm-11-01g9wf51', 'pooling_has_input': ['nmdc:bsm-11-01g9wf51', 'nmdc:bsm-11-d8mzds05', 'nmdc:bsm-11-qf98ze18'], 'pooling_has_output': ['nmdc:procsm-11-z19p2488'], 'pooling_id': 'nmdc:poolp-11-phdxxg80', 'extract_has_input': ['nmdc:procsm-11-z19p2488'], 'extract_has_output': ['nmdc:procsm-11-6p7xet10'], 'extract_id': 'nmdc:extrp-11-1w7rz874', 'lp_id': 'nmdc:libprp-11-2cy94060', 'lp_has_input': ['nmdc:procsm-11-6p7xet10'], 'lp_has_output': ['nmdc:procsm-11-062rbk44'], 'op_has_input': ['nmdc:procsm-11-062rbk44'], 'op_id': 'nmdc:omprc-11-bn309345', 'mga_id': 'nmdc:wfmgan-11-f8cseh97.1', 'mga_was_informed_by': 'nmdc:omprc-11-bn309345', 'mga_has_output': ['nmdc:dobj-11-f1trcg75', 'nmdc:dobj-11-5pqvfm47', 'nmdc:dobj-11-55n2dk28', 'nmdc:dobj-11-5x2ebc40', 'nmdc:dobj-11-18y0s926', 'nmdc:dobj-11-2tmqv

In [89]:
# Get data_object_set from "mga_has_output"
# Use the met_act_ann_set results identifier "mga_has_output" to query the data_object_set collection matching
# the "id" field. Return "id", "data_object_type": "Scaffold Lineage tsv", "data_object_url"
data_ob_set = get_id_results(meta_act_ann_set, "mga_has_output", "data_object_set", '"data_object_type": "Scaffold Lineage tsv", "id"', 
                             "id,data_object_type,url")

print(data_ob_set[:2])

[{'id': 'nmdc:dobj-11-1txpq765', 'data_object_type': 'Scaffold Lineage tsv', 'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-897qak81/nmdc:wfmgan-11-0w0rnd53.1/nmdc_wfmgan-11-0w0rnd53.1_scaffold_lineage.tsv'}, {'id': 'nmdc:dobj-11-2tmqv903', 'data_object_type': 'Scaffold Lineage tsv', 'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-bn309345/nmdc:wfmgan-11-f8cseh97.1/nmdc_wfmgan-11-f8cseh97.1_scaffold_lineage.tsv'}]


In [90]:
# clarify fields
for ob in data_ob_set:
    ob["data_ob_id"] = ob.pop("id")
print(data_ob_set[:2])

[{'data_object_type': 'Scaffold Lineage tsv', 'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-897qak81/nmdc:wfmgan-11-0w0rnd53.1/nmdc_wfmgan-11-0w0rnd53.1_scaffold_lineage.tsv', 'data_ob_id': 'nmdc:dobj-11-1txpq765'}, {'data_object_type': 'Scaffold Lineage tsv', 'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-bn309345/nmdc:wfmgan-11-f8cseh97.1/nmdc_wfmgan-11-f8cseh97.1_scaffold_lineage.tsv', 'data_ob_id': 'nmdc:dobj-11-2tmqv903'}]


In [None]:
# Merge data_object_set with merged_list on id
merged_list = merge_items(data_ob_set, merged_list, "data_ob_id", "mga_has_output")
print(merged_list[:2])