In [43]:
# How does the taxonomic distribution of contigs differ by soil layer (mineral vs organic)?

In [58]:
import requests
import pandas as pd
from io import StringIO

In [2]:
# Define a function to get first page of results (we will use again for other requests, which is why we are defining a function)
def get_first_page_results(collection: str, filter: str, max_page_size: int, fields: str):
    og_url = f'https://api.microbiomedata.org/nmdcschema/{collection}?&filter={filter}&max_page_size={max_page_size}&projection={fields}'
    resp = requests.get(og_url)
    data = resp.json()
    
    return data

In [3]:
# Define another function to get the rest of the results using the next_page_token by calling the first funtion to get
# the initial results and then using the next_page_token to get the other results. This function returns a list of the results

def get_next_results(collection: str, filter: str, max_page_size: int, fields: str):

    # Get initial results (before next_page_token is given in results)
    initial_data = get_first_page_results(collection, filter, max_page_size, fields)
    results = initial_data["resources"]
    next_page_token = initial_data["next_page_token"]
    result_list = []

    # append first page of results to an empty list
    for result in results:
        result_list.append(result)
    
    while True:
        url = f'https://api.microbiomedata.org/nmdcschema/{collection}?&filter={filter}&max_page_size={max_page_size}&page_token={next_page_token}&projection={fields}'
        response = requests.get(url)
        data_next = response.json()
        
        results = data_next.get("resources", [])
        result_list.extend(results)
        next_page_token = data_next.get("next_page_token")
    
        if not next_page_token:
            break

    return result_list


In [4]:
def convert_df(results_list: list):

    df = pd.DataFrame(results_list)

    return df

In [5]:
# Get biosamples using functions
biosamples = get_next_results("biosample_set", '{"soil_horizon":{"$exists": true}}', 100, "id,soil_horizon")

# clarify names
for biosample in biosamples:
    biosample["biosample_id"] = biosample.pop("id")

# convert to df
biosample_df = convert_df(biosamples)

biosample_df

Unnamed: 0,soil_horizon,biosample_id
0,O horizon,nmdc:bsm-11-002vgm56
1,M horizon,nmdc:bsm-11-00dkyf35
2,O horizon,nmdc:bsm-11-00hrxp98
3,M horizon,nmdc:bsm-11-00m15h97
4,M horizon,nmdc:bsm-11-00yhef97
...,...,...
4645,M horizon,nmdc:bsm-11-zyhk8g66
4646,B horizon,nmdc:bsm-11-zz9hq391
4647,A horizon,nmdc:bsm-11-zzahyk07
4648,M horizon,nmdc:bsm-11-zzdpcm17


In [6]:
# Define a function to split ids into chunks
def split_list(input_list, chunk_size=100):
    result = []
    
    for i in range(0, len(input_list), chunk_size):
        result.append(input_list[i:i + chunk_size])
        
    return result

In [7]:
# Adjust filter list for double quote string - important for mongo queries
def string_mongo_list(a_list: list):
    
    string_with_double_quotes = str(a_list).replace("'", '"')

    return string_with_double_quotes


In [8]:
# Get list of ids (to eventually feed into query)
def get_id_list(result_list, id_name):
    id_list = []
    for item in result_list:
        if type(item[id_name]) == str:
            id_list.append(item[id_name])
        elif type(item[id_name]) == list:
            for another_item in item[id_name]:
                id_list.append(another_item)

    return id_list

In [9]:
# Define function to request NMDC metadata based on list of identifiers
def get_id_results(newest_results: list, id_field: str, query_collection: str, match_id_field: str, query_fields: str):

    # split old results into list
    result_ids = get_id_list(newest_results, id_field)

    # # make sure match_id_field has double quotes (important for mongo query)
    match_id_field.replace("'", "\"")

    # chunk up the results into sets of 100 using the split_list function and call the get_first_page_results function and append
    # results to list
    chunked_list = split_list(result_ids)
    next_results = []
    for chunk in chunked_list:
        filter_string = string_mongo_list(chunk)
        # quotes around match_id_field need to look a lot different for the final data object query
        if "data_object_type" in match_id_field:
            data = get_first_page_results(query_collection, f'{{{match_id_field}: {{"$in": {filter_string}}}}}', 100, query_fields)
        else: 
            data = get_first_page_results(query_collection, f'{{"{match_id_field}": {{"$in": {filter_string}}}}}', 100, query_fields)
        next_results.extend(data["resources"])

    return next_results

In [10]:
# Get pooling results where biosample identifiers are "has_input" in the pooling_set collection
pooling = get_id_results(biosamples, "biosample_id", "pooling_set", "has_input", "id,has_input,has_output")

# clarify names/keys/identifiers
for pool in pooling:
    pool["pooling_has_input"] = pool.pop("has_input")
    pool["pooling_has_output"] = pool.pop("has_output")
    pool["pooling_id"] = pool.pop("id")

# convert to data frame
pooling_df = convert_df(pooling)

pooling_df

Unnamed: 0,pooling_has_input,pooling_has_output,pooling_id
0,"[nmdc:bsm-11-zw0jr671, nmdc:bsm-11-ftr88019, n...",[nmdc:procsm-11-m6cgda89],nmdc:poolp-11-myygnt07
1,"[nmdc:bsm-11-pgpaf592, nmdc:bsm-11-07qq9z23, n...",[nmdc:procsm-11-4x35gd93],nmdc:poolp-11-fg19qm11
2,"[nmdc:bsm-11-01g9wf51, nmdc:bsm-11-d8mzds05, n...",[nmdc:procsm-11-z19p2488],nmdc:poolp-11-phdxxg80
3,"[nmdc:bsm-11-t21kbh64, nmdc:bsm-11-0d4a3v31, n...",[nmdc:procsm-11-f6fy8y22],nmdc:poolp-11-d51jda20
4,"[nmdc:bsm-11-32fzkb25, nmdc:bsm-11-0earmn44, n...",[nmdc:procsm-11-qkhd3y74],nmdc:poolp-11-5q9ye786
...,...,...,...
4284,"[nmdc:bsm-11-d8254g62, nmdc:bsm-11-zr06ct75, n...",[nmdc:procsm-11-wy01d353],nmdc:poolp-11-zqwvqd96
4285,"[nmdc:bsm-11-zx7j9919, nmdc:bsm-11-ywmc3x87, n...",[nmdc:procsm-11-1dyvaw49],nmdc:poolp-11-h8hehp15
4286,"[nmdc:bsm-11-1kzrnd52, nmdc:bsm-11-zx0m7409]",[nmdc:procsm-11-qs5tn036],nmdc:poolp-11-59743d48
4287,"[nmdc:bsm-11-r0nqvg54, nmdc:bsm-11-3w0vf491, n...",[nmdc:procsm-11-33n43s88],nmdc:poolp-11-dzhbts63


In [12]:
# Function to merge new results with old results (mapping keys together that match). df1 is the data frame with the 
# list where key1 is a list. If key_explode == True, then it means to explode the matching list key (in df1). next_key_explode == True
# then explode the next key given from the new results that will be used in the next merge.
def merge_items(df1, df2, key1: str, key2: str, key_explode: bool, next_key_explode: bool, next_key = None):

    if key_explode == True:
        if next_key_explode == True:
            
            # explode the data frame with the matching key that is a list. Df1 is dataframe with key1 that is a list
            df1_exploded = df1.explode(key1)
            merged_df = pd.merge(df1_exploded, df2, left_on=key1, right_on=key2)
        
            # explode next key:
            merged_df_exploded = merged_df.explode(next_key)

            return merged_df_exploded

        elif next_key_explode == False:
            # explode the data frame with the matching key that is a list. Df1 is dataframe with key1 that is a list
            df1_exploded = df1.explode(key1)
            merged_df = pd.merge(df1_exploded, df2, left_on=key1, right_on=key2)

            return merged_df

    elif key_explode == False:
        if next_key:
            
            # Merge the dfs and return the next key expldoed
            merged_df = pd.merge(df1, df2, left_on=key1, right_on=key2)
            merged_df_exploded = merged_df.explode(next_key)

            return merged_df_exploded

        elif not next_key:

            merged_df = pd.merge(df1, df2, left_on=key1, right_on=key2)

            return merged_df

In [13]:
# Merge the initial biosample results with the pooling results
merged_df1 = merge_items(pooling_df, biosample_df, "pooling_has_input", "biosample_id", True, True, "pooling_has_output")
merged_df1

Unnamed: 0,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id
0,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671
1,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671
2,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671
3,nmdc:bsm-11-ftr88019,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-ftr88019
4,nmdc:bsm-11-ftr88019,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-ftr88019
...,...,...,...,...,...
12672,nmdc:bsm-11-zr7b5t24,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-zr7b5t24
12673,nmdc:bsm-11-zr7b5t24,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-zr7b5t24
12674,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39
12675,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39


In [14]:
# Get processed sample results from pooling_has_ouput identifiers
# We use the pooling results and the identifier "pooling_has_output" to query the "processed_sample_set" matching the "id" field, and
# we only need the "id" field returned
process_set1 = get_id_results(pooling, "pooling_has_output", "processed_sample_set", "id", "id")

# clarify names
for processed_sample in process_set1:
    processed_sample["processed_sample1"] = processed_sample.pop("id")

# convert to data frame
ps1_df = convert_df(process_set1)

ps1_df

Unnamed: 0,processed_sample1
0,nmdc:procsm-11-07kg2w70
1,nmdc:procsm-11-09rv7c30
2,nmdc:procsm-11-0rn9p334
3,nmdc:procsm-11-22s7xc89
4,nmdc:procsm-11-258vbz70
...,...
4262,nmdc:procsm-11-vqe1dh14
4263,nmdc:procsm-11-wm0mqq15
4264,nmdc:procsm-11-wy01d353
4265,nmdc:procsm-11-x0gpxh72


In [15]:
# Merge the process_set1 with the merged_list based on "id" in process_set1 with "pooling_has_output" from the merged_list
# merged_list = merge_items(process_set1, merged_list, "processed_sample1", "pooling_has_output")
merged_df2 = merge_items(merged_df1, ps1_df, "pooling_has_output", "processed_sample1", False, False)
merged_df2

Unnamed: 0,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1
0,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
1,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
2,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
3,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
4,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
...,...,...,...,...,...,...
36943,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54
36944,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54
36945,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54
36946,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54


In [16]:
# Get extraction results from "processed_sample1" identifiers
# We use the process_set1 results and the identifier "processed_sample1" to query the "extraction_set" collection matching the 
# "has_input" field. We need three fields returned from the extraction set: "id", "has_input", and "has_output"
extraction_set = get_id_results(process_set1, "processed_sample1", "extraction_set", "has_input", "id,has_input,has_output")

# clarify names
for extraction in extraction_set:
    extraction["extract_has_input"] = extraction.pop("has_input")
    extraction["extract_has_output"] = extraction.pop("has_output")
    extraction["extract_id"] = extraction.pop("id")

# convert to data frame
extract_df = convert_df(extraction_set)
extract_df

Unnamed: 0,extract_has_input,extract_has_output,extract_id
0,[nmdc:procsm-11-m6cgda89],[nmdc:procsm-11-e3m9am88],nmdc:extrp-11-2hbzth07
1,[nmdc:procsm-11-zxr4eq64],[nmdc:procsm-11-69w7d751],nmdc:extrp-11-8aaf5q49
2,[nmdc:procsm-11-50046m08],[nmdc:procsm-11-wfr9gq72],nmdc:extrp-11-7v16d010
3,[nmdc:procsm-11-4x35gd93],[nmdc:procsm-11-r8c2df02],nmdc:extrp-11-7mhth480
4,[nmdc:procsm-11-z19p2488],[nmdc:procsm-11-6p7xet10],nmdc:extrp-11-1w7rz874
...,...,...,...
4245,[nmdc:procsm-11-g4shfj37],[nmdc:procsm-11-tbzk2q40],nmdc:extrp-11-c07fy379
4246,[nmdc:procsm-11-f4ec8e58],[nmdc:procsm-11-pce7pn80],nmdc:extrp-11-v53rv933
4247,[nmdc:procsm-11-en24rw77],[nmdc:procsm-11-6ca4kh69],nmdc:extrp-11-b89p0n80
4248,[nmdc:procsm-11-j5t8v154],[nmdc:procsm-11-e88p7q45],nmdc:extrp-11-9exgqh10


In [17]:
# Merge the extraction_set with the merged_list based on the extract_has_input of extraction_set with processed_sample1
# from the merged list
merged_df3 = merge_items(extract_df, merged_df2, "extract_has_input", "processed_sample1", True, True, "extract_has_output") 
merged_df3

Unnamed: 0,extract_has_input,extract_has_output,extract_id,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1
0,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
1,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
2,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
3,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
4,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89
...,...,...,...,...,...,...,...,...,...
108246,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54
108247,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54
108248,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54
108249,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54


In [18]:
# Get processed_sample_set results again from "extract_has_output"
# We use the extraction_set results and identifier "extract_has_output" to query the "processed_sample_set" collection matching the
# "id". We return only the id of the processed_sample_set
process_set2 = get_id_results(extraction_set, "extract_has_output", "processed_sample_set", "id", "id")

# clarify names
for samp in process_set2:
    samp["processed_sample2"] = samp.pop("id")

# convert to data frame
ps2_df = convert_df(process_set2)
ps2_df

Unnamed: 0,processed_sample2
0,nmdc:procsm-11-06bnpy24
1,nmdc:procsm-11-0q8aqj17
2,nmdc:procsm-11-0sva2t89
3,nmdc:procsm-11-18j5nz50
4,nmdc:procsm-11-1rzdbe82
...,...
4241,nmdc:procsm-11-xf9ka912
4242,nmdc:procsm-11-xwhjh309
4243,nmdc:procsm-11-xwpjt103
4244,nmdc:procsm-11-yrmwcm28


In [19]:
# Merge the process_set2 with the merged_list based on the proccessed_sample2 of process_set2 with extract_has_output
#from merged list
merged_df4 = merge_items(merged_df3, ps2_df, "extract_has_output", "processed_sample2", False, False)
merged_df4

Unnamed: 0,extract_has_input,extract_has_output,extract_id,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1,processed_sample2
0,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
1,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
2,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
3,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
4,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
...,...,...,...,...,...,...,...,...,...,...
320080,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88
320081,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88
320082,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88
320083,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88


In [20]:
# Get library_preparation_set results from "processed_sample2" 
# We use the process_set2 results and identifier "processed_sample2" to query the library_preparation_set collection matching the 
# "has_input" field. We return the "has_input", "has_output", and "id" fields.
library_prep_set = get_id_results(process_set2, "processed_sample2", "library_preparation_set", "has_input", "id,has_input,has_output")

# clarify names
for prep in library_prep_set:
    prep["lp_has_input"] = prep.pop("has_input")
    prep["lp_has_output"] = prep.pop("has_output")
    prep["lp_id"] = prep.pop("id")

# convert to data frame
lp_df = convert_df(library_prep_set)
lp_df

Unnamed: 0,lp_has_input,lp_has_output,lp_id
0,[nmdc:procsm-11-e3m9am88],[nmdc:procsm-11-s71h1s64],nmdc:libprp-11-acbfh839
1,[nmdc:procsm-11-69w7d751],[nmdc:procsm-11-pvq3cw40],nmdc:libprp-11-e39ky379
2,[nmdc:procsm-11-wfr9gq72],[nmdc:procsm-11-g7btv939],nmdc:libprp-11-7v2sqk43
3,[nmdc:procsm-11-r8c2df02],[nmdc:procsm-11-vvhhwt22],nmdc:libprp-11-7y2d1222
4,[nmdc:procsm-11-6p7xet10],[nmdc:procsm-11-062rbk44],nmdc:libprp-11-2cy94060
...,...,...,...
4240,[nmdc:procsm-11-tbzk2q40],[nmdc:procsm-11-xpfxk275],nmdc:libprp-11-m33s2x71
4241,[nmdc:procsm-11-pce7pn80],[nmdc:procsm-11-z1jy1841],nmdc:libprp-11-vwd3tn32
4242,[nmdc:procsm-11-6ca4kh69],[nmdc:procsm-11-241nns31],nmdc:libprp-11-ftccjg43
4243,[nmdc:procsm-11-e88p7q45],[nmdc:procsm-11-en1yqa35],nmdc:libprp-11-46m5k206


In [21]:
# Merge library preparation with merged_list on "processed_sample2" and "lp_has_input"
merged_df5 = merge_items(lp_df, merged_df4, "lp_has_input", "processed_sample2", True, True, "lp_has_output")
merged_df5

Unnamed: 0,lp_has_input,lp_has_output,lp_id,extract_has_input,extract_has_output,extract_id,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1,processed_sample2
0,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
1,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
2,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
3,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
4,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88
...,...,...,...,...,...,...,...,...,...,...,...,...,...
950943,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88
950944,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88
950945,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88
950946,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88


In [22]:
# Get process_set3 results from "lp_has_output"
# Use the library_prep_set results and identifier "lp_has_output" to query the processed_sample_set collection matching the
# "id" field. We return only the "id" field
process_set3 = get_id_results(library_prep_set, "lp_has_output", "processed_sample_set", "id", "id")

# clarify keys
for samp in process_set3:
    samp["processed_sample3"] = samp.pop("id")

# convert to data frame
ps3_df = convert_df(process_set3)
ps3_df

Unnamed: 0,processed_sample3
0,nmdc:procsm-11-062rbk44
1,nmdc:procsm-11-0tkbt064
2,nmdc:procsm-11-1cmwcb97
3,nmdc:procsm-11-211dc865
4,nmdc:procsm-11-21q8jj02
...,...
4239,nmdc:procsm-11-xpfxk275
4240,nmdc:procsm-11-yzae4f85
4241,nmdc:procsm-11-z1jy1841
4242,nmdc:procsm-11-z3476h43


In [23]:
# Merge proces_set3 results with merged_list on "lp_has_output" with "processed_sample3" Takes 7+ minutes??
merged_df6 = merge_items(merged_df5, ps3_df, "lp_has_output", "processed_sample3", False, False)
merged_df6

Unnamed: 0,lp_has_input,lp_has_output,lp_id,extract_has_input,extract_has_output,extract_id,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1,processed_sample2,processed_sample3
0,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64
1,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64
2,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64
3,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64
4,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64,nmdc:libprp-11-acbfh839,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:extrp-11-2hbzth07,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:poolp-11-myygnt07,M horizon,nmdc:bsm-11-zw0jr671,nmdc:procsm-11-m6cgda89,nmdc:procsm-11-e3m9am88,nmdc:procsm-11-s71h1s64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2826736,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29
2826737,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29
2826738,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29
2826739,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29


In [24]:
# Get omics_processing results from "processed_sample3"
# Use the process_set3 results identifier "processed_sample3" to query the omics_processing_set collection matching the
# "has_input" field. Return "has_input" and "id"
omics_process_set = get_id_results(process_set3, "processed_sample3", "omics_processing_set", "has_input", "has_input,id")

# clarify keys
for op in omics_process_set:
    op["op_has_input"] = op.pop("has_input")
    op["op_id"] = op.pop("id")

# convert to data frame
op_df = convert_df(omics_process_set)
op_df

Unnamed: 0,op_has_input,op_id
0,[nmdc:procsm-11-062rbk44],nmdc:omprc-11-bn309345
1,[nmdc:procsm-11-0tkbt064],nmdc:omprc-11-db9g5v27
2,[nmdc:procsm-11-1cmwcb97],nmdc:omprc-11-83e9ph40
3,[nmdc:procsm-11-211dc865],nmdc:omprc-11-kfxafd58
4,[nmdc:procsm-11-21q8jj02],nmdc:omprc-11-pwveft17
...,...,...
4205,[nmdc:procsm-11-xpfxk275],nmdc:omprc-11-f9x2k996
4206,[nmdc:procsm-11-yzae4f85],nmdc:omprc-11-9n5fc730
4207,[nmdc:procsm-11-z1jy1841],nmdc:omprc-11-cv9ymw96
4208,[nmdc:procsm-11-z3476h43],nmdc:omprc-11-4wq97r16


In [25]:
# Merge omics_process_set with merged_list on processed "processed_sample3"
merged_df7 = merge_items(op_df, merged_df6, "op_has_input", "processed_sample3", True, False, "op_id")
merged_df7

Unnamed: 0,op_has_input,op_id,lp_has_input,lp_has_output,lp_id,extract_has_input,extract_has_output,extract_id,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1,processed_sample2,processed_sample3
0,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44
1,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44
2,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44
3,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44
4,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8362274,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29
8362275,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29
8362276,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29
8362277,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29


In [26]:
# Get metagenome_annotation_activity_set from "op_id"
# Use the omics_process_set results identifier "op_id" to query the metagenome_annotation_activity_set collection matching
# the "was_informed_by" field. Return "was_informed_by" and "has_output"
meta_act_ann_set = get_id_results(omics_process_set, "op_id", "metagenome_annotation_activity_set", "was_informed_by", "has_output,was_informed_by,id")

# clarify names
for mga in meta_act_ann_set:
    mga["mga_id"] = mga.pop("id")
    mga["mga_was_informed_by"] = mga.pop("was_informed_by")
    mga["mga_has_output"] = mga.pop("has_output")

# convert to data frame
mga_df = convert_df(meta_act_ann_set)
mga_df

Unnamed: 0,mga_id,mga_was_informed_by,mga_has_output
0,nmdc:wfmgan-11-h05qba23.1,nmdc:omprc-11-5qs1cd02,"[nmdc:dobj-11-808cyq50, nmdc:dobj-11-13d79a82,..."
1,nmdc:wfmgan-11-4h48ff64.1,nmdc:omprc-11-1zehaw93,"[nmdc:dobj-11-m70ftn50, nmdc:dobj-11-kwbeyv20,..."
2,nmdc:wfmgan-11-f8cseh97.1,nmdc:omprc-11-bn309345,"[nmdc:dobj-11-f1trcg75, nmdc:dobj-11-5pqvfm47,..."
3,nmdc:wfmgan-11-bajw6h61.1,nmdc:omprc-11-dsv3f325,"[nmdc:dobj-11-qzsjp850, nmdc:dobj-11-tmp0t041,..."
4,nmdc:wfmgan-11-0w0rnd53.1,nmdc:omprc-11-897qak81,"[nmdc:dobj-11-0dfd5b92, nmdc:dobj-11-04jsg262,..."
...,...,...,...
3748,nmdc:wfmgan-11-r9xyg197.1,nmdc:omprc-11-qg2e3t41,"[nmdc:dobj-11-aav9vk45, nmdc:dobj-11-eats2y81,..."
3749,nmdc:wfmgan-11-dxqxre60.1,nmdc:omprc-11-cpqbew37,"[nmdc:dobj-11-7f8e4a45, nmdc:dobj-11-4rmq5t75,..."
3750,nmdc:wfmgan-11-14gcar54.1,nmdc:omprc-11-px5df021,"[nmdc:dobj-11-dy2jsc18, nmdc:dobj-11-3amwd664,..."
3751,nmdc:wfmgan-11-z6d94p37.1,nmdc:omprc-11-cv9ymw96,"[nmdc:dobj-11-f251gr10, nmdc:dobj-11-207dgx57,..."


In [28]:
# Merge metagenome activity set with merged_list on mga_has_input
merged_df8 = merge_items(merged_df7, mga_df,  "op_id", "mga_was_informed_by", False, True, "mga_has_output")
merged_df8

Unnamed: 0,op_has_input,op_id,lp_has_input,lp_has_output,lp_id,extract_has_input,extract_has_output,extract_id,pooling_has_input,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1,processed_sample2,processed_sample3,mga_id,mga_was_informed_by,mga_has_output
0,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:wfmgan-11-f8cseh97.1,nmdc:omprc-11-bn309345,nmdc:dobj-11-f1trcg75
0,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:wfmgan-11-f8cseh97.1,nmdc:omprc-11-bn309345,nmdc:dobj-11-5pqvfm47
0,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:wfmgan-11-f8cseh97.1,nmdc:omprc-11-bn309345,nmdc:dobj-11-55n2dk28
0,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:wfmgan-11-f8cseh97.1,nmdc:omprc-11-bn309345,nmdc:dobj-11-5x2ebc40
0,nmdc:procsm-11-062rbk44,nmdc:omprc-11-bn309345,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:libprp-11-2cy94060,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:extrp-11-1w7rz874,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:poolp-11-phdxxg80,O horizon,nmdc:bsm-11-01g9wf51,nmdc:procsm-11-z19p2488,nmdc:procsm-11-6p7xet10,nmdc:procsm-11-062rbk44,nmdc:wfmgan-11-f8cseh97.1,nmdc:omprc-11-bn309345,nmdc:dobj-11-18y0s926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22250971,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-csq6r402
22250971,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-wv9ha238
22250971,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-eq8ytw35
22250971,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:extrp-11-qw3ca676,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-f0eg5v22


In [39]:
# Get data_object_set from "mga_has_output"
# Use the met_act_ann_set results identifier "mga_has_output" to query the data_object_set collection matching
# the "id" field. Return "id", "data_object_type": "Scaffold Lineage tsv", "data_object_url"
data_ob_set = get_id_results(meta_act_ann_set, "mga_has_output", "data_object_set", '"data_object_type": "Scaffold Lineage tsv", "id"', 
                             "id,data_object_type,url")

# clarify fields
for ob in data_ob_set:
    ob["data_ob_id"] = ob.pop("id")

# convert to data frame
do_df = convert_df(data_ob_set)
data_ob_set

[{'data_object_type': 'Scaffold Lineage tsv',
  'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-897qak81/nmdc:wfmgan-11-0w0rnd53.1/nmdc_wfmgan-11-0w0rnd53.1_scaffold_lineage.tsv',
  'data_ob_id': 'nmdc:dobj-11-1txpq765'},
 {'data_object_type': 'Scaffold Lineage tsv',
  'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-bn309345/nmdc:wfmgan-11-f8cseh97.1/nmdc_wfmgan-11-f8cseh97.1_scaffold_lineage.tsv',
  'data_ob_id': 'nmdc:dobj-11-2tmqv903'},
 {'data_object_type': 'Scaffold Lineage tsv',
  'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-dsv3f325/nmdc:wfmgan-11-bajw6h61.1/nmdc_wfmgan-11-bajw6h61.1_scaffold_lineage.tsv',
  'data_ob_id': 'nmdc:dobj-11-5c9hjp21'},
 {'data_object_type': 'Scaffold Lineage tsv',
  'url': 'https://data.microbiomedata.org/data/nmdc:omprc-11-1zehaw93/nmdc:wfmgan-11-4h48ff64.1/nmdc_wfmgan-11-4h48ff64.1_scaffold_lineage.tsv',
  'data_ob_id': 'nmdc:dobj-11-mwz10b29'},
 {'data_object_type': 'Scaffold Lineage tsv',
  'url': 'https://data

In [49]:
# Merge data_object_set with merged_list on id
merged_df9 = merge_items(do_df, merged_df8, "data_ob_id", "mga_has_output", False, False)
merged_df9

Unnamed: 0,data_object_type,url,data_ob_id,op_has_input,op_id,lp_has_input,lp_has_output,lp_id,extract_has_input,extract_has_output,...,pooling_has_output,pooling_id,soil_horizon,biosample_id,processed_sample1,processed_sample2,processed_sample3,mga_id,mga_was_informed_by,mga_has_output
0,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-1txpq765,nmdc:procsm-11-je3jg029,nmdc:omprc-11-897qak81,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:libprp-11-7ffq2g12,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,...,nmdc:procsm-11-xdbhma02,nmdc:poolp-11-5p6tjm70,M horizon,nmdc:bsm-11-158pa966,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:wfmgan-11-0w0rnd53.1,nmdc:omprc-11-897qak81,nmdc:dobj-11-1txpq765
1,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-1txpq765,nmdc:procsm-11-je3jg029,nmdc:omprc-11-897qak81,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:libprp-11-7ffq2g12,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,...,nmdc:procsm-11-xdbhma02,nmdc:poolp-11-5p6tjm70,M horizon,nmdc:bsm-11-158pa966,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:wfmgan-11-0w0rnd53.1,nmdc:omprc-11-897qak81,nmdc:dobj-11-1txpq765
2,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-1txpq765,nmdc:procsm-11-je3jg029,nmdc:omprc-11-897qak81,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:libprp-11-7ffq2g12,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,...,nmdc:procsm-11-xdbhma02,nmdc:poolp-11-5p6tjm70,M horizon,nmdc:bsm-11-158pa966,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:wfmgan-11-0w0rnd53.1,nmdc:omprc-11-897qak81,nmdc:dobj-11-1txpq765
3,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-1txpq765,nmdc:procsm-11-je3jg029,nmdc:omprc-11-897qak81,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:libprp-11-7ffq2g12,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,...,nmdc:procsm-11-xdbhma02,nmdc:poolp-11-5p6tjm70,M horizon,nmdc:bsm-11-158pa966,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:wfmgan-11-0w0rnd53.1,nmdc:omprc-11-897qak81,nmdc:dobj-11-1txpq765
4,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-1txpq765,nmdc:procsm-11-je3jg029,nmdc:omprc-11-897qak81,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:libprp-11-7ffq2g12,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,...,nmdc:procsm-11-xdbhma02,nmdc:poolp-11-5p6tjm70,M horizon,nmdc:bsm-11-158pa966,nmdc:procsm-11-xdbhma02,nmdc:procsm-11-4508yv43,nmdc:procsm-11-je3jg029,nmdc:wfmgan-11-0w0rnd53.1,nmdc:omprc-11-897qak81,nmdc:dobj-11-1txpq765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66467911,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-5wyv2j19,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,...,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-5wyv2j19
66467912,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-5wyv2j19,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,...,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-5wyv2j19
66467913,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-5wyv2j19,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,...,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-5wyv2j19
66467914,Scaffold Lineage tsv,https://data.microbiomedata.org/data/nmdc:ompr...,nmdc:dobj-11-5wyv2j19,nmdc:procsm-11-h6y3cf29,nmdc:omprc-11-bj2afq20,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:libprp-11-86c4n641,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,...,nmdc:procsm-11-qpqz4w54,nmdc:poolp-11-evtggs98,O horizon,nmdc:bsm-11-xhzk6v39,nmdc:procsm-11-qpqz4w54,nmdc:procsm-11-3rfvyd88,nmdc:procsm-11-h6y3cf29,nmdc:wfmgan-11-wcy28p27.1,nmdc:omprc-11-bj2afq20,nmdc:dobj-11-5wyv2j19


In [51]:
# clean up data frame - remove all middle "joining columns" because the merge is getting slow\
column_list = merged_df9.columns.tolist()
columns_to_keep = ["soil_horizon", "biosample_id", "url"]
columns_to_remove = list(set(column_list).difference(columns_to_keep))
# Drop unnecessary rows
df9_cleaned = merged_df9.drop(columns=columns_to_remove)
df9_cleaned

Unnamed: 0,url,soil_horizon,biosample_id
0,https://data.microbiomedata.org/data/nmdc:ompr...,M horizon,nmdc:bsm-11-158pa966
1,https://data.microbiomedata.org/data/nmdc:ompr...,M horizon,nmdc:bsm-11-158pa966
2,https://data.microbiomedata.org/data/nmdc:ompr...,M horizon,nmdc:bsm-11-158pa966
3,https://data.microbiomedata.org/data/nmdc:ompr...,M horizon,nmdc:bsm-11-158pa966
4,https://data.microbiomedata.org/data/nmdc:ompr...,M horizon,nmdc:bsm-11-158pa966
...,...,...,...
66467911,https://data.microbiomedata.org/data/nmdc:ompr...,O horizon,nmdc:bsm-11-xhzk6v39
66467912,https://data.microbiomedata.org/data/nmdc:ompr...,O horizon,nmdc:bsm-11-xhzk6v39
66467913,https://data.microbiomedata.org/data/nmdc:ompr...,O horizon,nmdc:bsm-11-xhzk6v39
66467914,https://data.microbiomedata.org/data/nmdc:ompr...,O horizon,nmdc:bsm-11-xhzk6v39


In [83]:
# check rows when we reagrregate/implode, 
# imploded_df = df9_cleaned.groupby(["soil_horizon", "biosample_id"]).agg({"url": list}).reset_index()
imploded_df = df9_cleaned.groupby(["soil_horizon", "biosample_id"]).agg({"url": str}).reset_index()
imploded_df

Unnamed: 0,soil_horizon,biosample_id,url
0,M horizon,nmdc:bsm-11-00dkyf35,2604556 https://data.microbiomedata.org/dat...
1,M horizon,nmdc:bsm-11-02kcw433,2073358 https://data.microbiomedata.org/dat...
2,M horizon,nmdc:bsm-11-02n85875,4034224 https://data.microbiomedata.org/dat...
3,M horizon,nmdc:bsm-11-034x5t48,590976 https://data.microbiomedata.org/data...
4,M horizon,nmdc:bsm-11-043hgn06,2368603 https://data.microbiomedata.org/dat...
...,...,...,...
3858,O horizon,nmdc:bsm-11-zv1dn095,45987595 https://data.microbiomedata.org/da...
3859,O horizon,nmdc:bsm-11-zvepbm45,65831534 https://data.microbiomedata.org/da...
3860,O horizon,nmdc:bsm-11-zw5rb344,1183527 https://data.microbiomedata.org/dat...
3861,O horizon,nmdc:bsm-11-zx7j9919,61723476 https://data.microbiomedata.org/da...


In [88]:
# Test df
df_test = df9_cleaned[:1]
df_test
# url_value = df_test.iloc[0]["url"]
# print(url_value)



Unnamed: 0,url,soil_horizon,biosample_id
0,https://data.microbiomedata.org/data/nmdc:ompr...,M horizon,nmdc:bsm-11-158pa966


In [96]:
for index, row in df_test.iterrows():
    url = row["url"]

    response = requests.get(url)
    if response.status_code == 200:
        tsv_data = StringIO(response.text)

        tsv_df = pd.read_csv(tsv_data, delimiter="\t")

        # Give columns names
        tsv_df.columns = ["id", "taxa", "percent"]

        # split taxa column into a list where a semicolon (;) is the delimeter
        tsv_df["taxa"] = tsv_df["taxa"].str.split(";")
        

        # explode to find taxa distribution by creating a series object (taxa_percent)
        tsv_df_exploded = tsv_df["taxa"].explode().reset_index(drop=True)
        taxa_percent = tsv_df_exploded.value_counts(normalize=True)*100

        # taxa_percent.columns = ["taxa", "percent"]
        # taxa_dict = dict(zip(taxa_percent["taxa"], taxa_percent["percent"]))
        # print(taxa_dict)
        # Add series object back to original data frame 
        # df_test.at[index, "taxa_dist"] = taxa_percent
    

    else:
        print("error")

# df_test


KeyError: 'taxa'