In [1]:
import requests
import pandas as pd

In [7]:
# Look up the NMDC study whose title matches "Bio-Scales" and keep its id in
# `study` so later cells can query the biosamples that belong to it.
base_url = "https://api.microbiomedata.org"
filt = "title.search:Bio-Scales"
url = f"{base_url}/studies"
# Pass the filter through `params` so requests URL-encodes it, instead of
# splicing raw text into the query string.
resp = requests.get(url, params={"filter": filt}, timeout=60)
# Surface HTTP errors here rather than as a confusing KeyError on "results".
resp.raise_for_status()
studies = resp.json()["results"]
study_ids = [study_rec["id"] for study_rec in studies]
# The rest of the notebook assumes the search matches exactly one study.
# Joining several ids into one string (or using an empty string) would build
# a meaningless `part_of:` filter, so fail loudly instead.
if len(study_ids) != 1:
    raise ValueError(
        f"Expected exactly one study for filter {filt!r}, got {len(study_ids)}: {study_ids}"
    )
study = study_ids[0]
print(study)

nmdc:sty-11-r2h77870


In [58]:
# Fetch every biosample that is part of `study`, requesting only the fields
# needed for the analysis, and collect the records in `all_results`.
per_page = 2000
fields = "ph,calcium,magnesium,potassium,tot_nitro,manganese,zinc,ammonium_nitrogen,nitrate_nitrogen,nitrite_nitrogen,ecosystem_subtype,habitat"
filt = f"part_of:{study}"
cursor = "*"  # "*" asks the API for the first page
all_results = []

# Use cursor pagination to get results
url = f"{base_url}/biosamples"
while True:
    # Let requests build and encode the query string: the cursor is an opaque
    # server-issued token, so it must not be interpolated into the URL raw.
    params = {
        "filter": filt,
        "per_page": per_page,
        "cursor": cursor,
        "fields": fields,
    }
    resp = requests.get(url, params=params, timeout=60)
    # Stop on HTTP errors instead of failing later with a KeyError.
    resp.raise_for_status()
    data = resp.json()
    results = data["results"]
    all_results.extend(results)
    # A missing or empty next_cursor means this was the last page.
    cursor = data["meta"].get("next_cursor")
    if not cursor:
        break

print(f"Total number of biosamples: {len(all_results)}")
print(all_results[:5])

Total number of biosamples: 416
[{'id': 'nmdc:bsm-11-011z7z70', 'ecosystem_subtype': 'Rhizosphere', 'habitat': 'Rhizosphere soil'}, {'id': 'nmdc:bsm-11-01teww33', 'ecosystem_subtype': 'Botanical garden', 'habitat': 'Soil', 'ph': 6.81, 'calcium': {'has_raw_value': '2320.83 mg/kg', 'has_numeric_value': 2320.83, 'has_unit': 'mg/kg'}, 'magnesium': {'has_raw_value': '458.94 mg/kg', 'has_numeric_value': 458.94, 'has_unit': 'mg/kg'}, 'potassium': {'has_raw_value': '489.767 mg/kg', 'has_numeric_value': 489.767, 'has_unit': 'mg/kg'}, 'tot_nitro': {'has_raw_value': '0.326 Percent', 'has_numeric_value': 0.326, 'has_unit': 'Percent'}, 'manganese': {'has_raw_value': '20.1285 mg/kg', 'has_numeric_value': 20.1285, 'has_unit': 'mg/kg'}, 'zinc': {'has_raw_value': '2.638 mg/kg', 'has_numeric_value': 2.638, 'has_unit': 'mg/kg'}, 'ammonium_nitrogen': {'has_raw_value': '2.329 mg/kg', 'has_numeric_value': 2.329, 'has_unit': 'mg/kg'}, 'nitrate_nitrogen': {'has_raw_value': '2.804 mg/kg', 'has_numeric_value': 

In [94]:
# How many biosamples include each of the requested fields?
# In the recorded run, 103 of the 416 returned biosamples carry the
# measurement fields needed for analysis.

# Split the comma-separated request string into individual field names.
fields_list = fields.split(',')
print (fields_list)

# For every field, count the biosample records in which that key is present.
field_counts = {}
for field_name in fields_list:
    field_counts[field_name] = sum(1 for record in all_results if field_name in record)

print(field_counts)

['ph', 'calcium', 'magnesium', 'potassium', 'tot_nitro', 'manganese', 'zinc', 'ammonium_nitrogen', 'nitrate_nitrogen', 'nitrite_nitrogen', 'ecosystem_subtype', 'habitat']
{'ph': 103, 'calcium': 103, 'magnesium': 103, 'potassium': 103, 'tot_nitro': 103, 'manganese': 103, 'zinc': 103, 'ammonium_nitrogen': 103, 'nitrate_nitrogen': 103, 'nitrite_nitrogen': 103, 'ecosystem_subtype': 416, 'habitat': 416}


In [65]:
# Keep only the biosamples that contain every requested field; records that
# are missing any of them are dropped.
filtered_results = [
    biosamp
    for biosamp in all_results
    if all(field in biosamp for field in fields_list)
]

# Show the size and a short sample rather than dumping every record into the
# notebook output.
print(f"Biosamples with all requested fields: {len(filtered_results)}")
print(filtered_results[:2])

[{'id': 'nmdc:bsm-11-01teww33', 'ecosystem_subtype': 'Botanical garden', 'habitat': 'Soil', 'ph': 6.81, 'calcium': {'has_raw_value': '2320.83 mg/kg', 'has_numeric_value': 2320.83, 'has_unit': 'mg/kg'}, 'magnesium': {'has_raw_value': '458.94 mg/kg', 'has_numeric_value': 458.94, 'has_unit': 'mg/kg'}, 'potassium': {'has_raw_value': '489.767 mg/kg', 'has_numeric_value': 489.767, 'has_unit': 'mg/kg'}, 'tot_nitro': {'has_raw_value': '0.326 Percent', 'has_numeric_value': 0.326, 'has_unit': 'Percent'}, 'manganese': {'has_raw_value': '20.1285 mg/kg', 'has_numeric_value': 20.1285, 'has_unit': 'mg/kg'}, 'zinc': {'has_raw_value': '2.638 mg/kg', 'has_numeric_value': 2.638, 'has_unit': 'mg/kg'}, 'ammonium_nitrogen': {'has_raw_value': '2.329 mg/kg', 'has_numeric_value': 2.329, 'has_unit': 'mg/kg'}, 'nitrate_nitrogen': {'has_raw_value': '2.804 mg/kg', 'has_numeric_value': 2.804, 'has_unit': 'mg/kg'}, 'nitrite_nitrogen': {'has_raw_value': '0 mg/kg', 'has_numeric_value': 0.0, 'has_unit': 'mg/kg'}}, {'id

In [95]:
# Convert results to desired types and transform to data frame
# collect units for applicable fields
#
# NOTE(review): the conversion draft below is still commented out, so at the
# moment this cell only drops "ecosystem_subtype" from fields_list.
# Before re-enabling the draft: the sets are declared as `tot_nitro_units` /
# `manganese_units` but the loop refers to `tot_nitro` / `manganese`, and the
# zinc set is named `zinc` rather than `zinc_units` like the others.

# df_inp = [] 

# calcium_units = set()
# magnesium_units = set()
# potassium_units = set()
# tot_nitro_units = set()
# manganese_units = set()
# zinc = set()
# for biosamp in filtered_results:
#     rec = {"id": biosamp["id"],
#           "ecosystem_subtype": biosamp["ecosystem_subtype"],
#           "habitat": biosamp["habitat"],
#           "ph": float(biosamp["ph"]),
#           "calcium": float(biosamp["calcium"]["has_numeric_value"]),
#           "magnesium": float(biosamp["magnesium"]["has_numeric_value"]),
#           "potassium": float(biosamp["potassium"]["has_numeric_value"]),
#           "tot_nitro": float(biosamp["tot_nitro"]["has_numeric_value"]),
#           "manganese": float(biosamp["manganese"]["has_numeric_value"]),
#            "zinc": float(biosamp["zinc"]["has_numeric_value"]),
#            "ammonium_nitrogen": float(biosamp["ammonium_nitrogen"]["has_numeric_value"]),
#            "nitrate_nitrogen": float(biosamp["nitrate_nitrogen"]["has_numeric_value"]),
#            "nitrite_nitrogen": float(biosamp["nitrite_nitrogen"]["has_numeric_value"])
#            }
#     calcium_units.add(biosamp["calcium"]["has_unit"])
#     magnesium_units.add(biosamp["magnesium"]["has_unit"])
#     potassium_units.add(biosamp["potassium"]["has_unit"])
#     # tot_nitro.add(biosamp["tot_nitro"]["has_unit"])
#     # manganese.add(biosamp["manganese"]["has_unit"])
#     zinc.add(biosamp["zinc"]["has_unit"])

#     df_inp.append(rec)
# print(df_inp)

# Drop "ecosystem_subtype" by building a new list instead of calling
# list.remove(): .remove() raises ValueError the second time the cell is run,
# whereas this form gives the same result on every run.
fields_list = [field for field in fields_list if field != "ecosystem_subtype"]
print(fields_list)

    

['ph', 'calcium', 'magnesium', 'potassium', 'tot_nitro', 'manganese', 'zinc', 'ammonium_nitrogen', 'nitrate_nitrogen', 'nitrite_nitrogen', 'habitat']
