In [1]:
import requests
import json
from datetime import datetime
import pprint

from pymongo import MongoClient

from collections import Counter
from collections import defaultdict

import pandas as pd

from dotenv import dotenv_values

In [2]:
# Relative path to the .env file that holds the NMDC credentials read in the cells below
env_path = "../../local/.env"

In [3]:
# Parse the .env file into a plain dictionary of variable names to values
env_vars = dotenv_values(dotenv_path=env_path)

In [4]:
# Refresh token used to obtain a short-lived access token from the NMDC data portal.
# Fail here, with a readable message, when the variable is absent or has no value,
# rather than sending an empty token to the API later on.
REFRESH_TOKEN = env_vars.get('NMDC_DATA_SUBMISSION_REFRESH_TOKEN')
if not REFRESH_TOKEN:
    raise KeyError(
        "NMDC_DATA_SUBMISSION_REFRESH_TOKEN is missing or empty in the loaded .env file"
    )

In [5]:
# Open a client against the local MongoDB instance on the default port
client = MongoClient('mongodb://localhost:27017/')
# Database name kept aligned with mongo-ncbi-loadbalancer
db = client.misc_metadata
# Raw submissions fetched from the API are stored here
collection = db.nmdc_submissions

In [6]:
# API endpoint that exchanges the refresh token for an access token
url = 'https://data.microbiomedata.org/auth/refresh'

In [7]:
# Request body for the refresh call: just the refresh token
payload = {"refresh_token": REFRESH_TOKEN}

In [8]:
# The refresh endpoint is sent a JSON body
headers = {'Content-Type': 'application/json'}

In [9]:
# Make the POST request to refresh the token.
# `json=` lets requests serialize the payload itself, and the explicit timeout
# (in seconds) keeps the cell from hanging forever on a stalled connection.
response = requests.post(url, json=payload, headers=headers, timeout=30)

In [10]:
# Extract the access token from the refresh response.
# A failed refresh raises immediately so later cells never run without `access_token`.
if response.status_code != 200:
    print(f"Failed to get access token: {response.status_code}")
    print(response.text)
    raise RuntimeError(f"Failed to get access token: {response.status_code}")

data = response.json()
access_token = data['access_token']
# Never echo the token itself: cell outputs are saved with the notebook and get shared.
print("Access token obtained.")


Access Token: [REDACTED]


In [11]:
# API endpoint for listing metadata submissions.
# Note: this rebinds `url`, which previously held the token-refresh endpoint.
url = 'https://data.microbiomedata.org/api/metadata_submission'


In [12]:
# Headers for authenticated API calls: JSON content plus the bearer access token
headers = {'Content-Type': 'application/json'}
headers['Authorization'] = f'Bearer {access_token}'


In [13]:
# Query parameters for the first page of submissions:
# newest first (sorted by 'created', descending), starting at record 0, 25 records per page
params = dict(
    column_sort='created',
    sort_order='desc',
    offset=0,
    limit=25,
)


In [14]:
# Start with an empty list intended to hold all the records
all_records = list()

In [15]:
# Page through the metadata submissions endpoint and store every record in MongoDB.
REQUEST_TIMEOUT_SECONDS = 60  # requests waits indefinitely unless a timeout is given

# Page on a copy so that re-running this cell restarts from the configured offset
# instead of continuing from wherever a previous run left `params`.
page_params = dict(params)

# Records received during THIS run. The stop condition must not depend on how many
# documents MongoDB already holds, otherwise a re-run stops after the first page.
fetched_count = 0

while True:
    # Print timestamp for each request
    print(f"Request sent at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Make the GET request to fetch one page of metadata submissions
    response = requests.get(
        url, headers=headers, params=page_params, timeout=REQUEST_TIMEOUT_SECONDS
    )

    if response.status_code != 200:
        print(f"Failed to fetch submissions: {response.status_code}")
        print(response.text)
        break

    data = response.json()
    results = data.get('results')

    # An empty page means there is nothing left to fetch
    if not results:
        break

    # Upsert on the submission 'id' so a re-run refreshes existing documents
    # rather than inserting duplicates; records without an 'id' are inserted as-is.
    for submission in results:
        submission_id = submission.get('id')
        if submission_id is None:
            collection.insert_one(submission)
        else:
            collection.replace_one({'id': submission_id}, submission, upsert=True)

    # Stop once the server-reported total has been received
    fetched_count += len(results)
    total_count = data.get('count')
    if total_count is not None and fetched_count >= total_count:
        break

    # Advance to the next page
    page_params['offset'] += page_params['limit']

Request sent at: 2025-03-14 08:57:22
Request sent at: 2025-03-14 08:57:24
Request sent at: 2025-03-14 08:57:24
Request sent at: 2025-03-14 08:57:25
Request sent at: 2025-03-14 08:57:27
Request sent at: 2025-03-14 08:57:28
Request sent at: 2025-03-14 08:57:29
Request sent at: 2025-03-14 08:57:30
Request sent at: 2025-03-14 08:57:30
Request sent at: 2025-03-14 08:57:31
Request sent at: 2025-03-14 08:57:32
Request sent at: 2025-03-14 08:57:33
Request sent at: 2025-03-14 08:57:34


In [16]:
# Counter of how many submissions contain each sampleData key (filled in by the loop below)
key_counter = Counter()

In [17]:
# Maps each sampleData key to the list of submission ids that contain it (filled in by the loop below)
key_dict = defaultdict(list)

In [18]:
# Tally, for every sampleData key, how many submissions use it and which ones.
# Both accumulators are rebuilt here so that re-running this cell does not double-count.
key_counter = Counter()
key_dict = defaultdict(list)

for record in collection.find():
    record_id = record.get('id', None)  # None when the submission has no 'id'

    metadata_submission = record.get('metadata_submission', {})
    sampleData = metadata_submission.get('sampleData', {})

    # Submissions with empty sampleData contribute nothing
    if sampleData:
        for key in sampleData.keys():
            key_counter[key] += 1
            key_dict[key].append(record_id)



In [19]:
# Display the number of submissions per sampleData key
key_counter

Counter({'soil_data': 139,
         'jgi_mg_data': 50,
         'plant_associated_data': 36,
         'water_data': 25,
         'emsl_data': 24,
         'host_associated_data': 20,
         'misc_envs_data': 12,
         'jgi_mt_data': 10,
         'sediment_data': 10,
         'built_env_data': 6,
         'air_data': 5,
         'biofilm_data': 3,
         'jgi_mg_lr_data': 3,
         'hcr_fluids_swabs_data': 1})

In [20]:
# Display the submission ids grouped by sampleData key
key_dict

defaultdict(list,
            {'water_data': ['ec4108bd-61c1-4c11-872c-b154c3f794cb',
              'cd8c3d67-78c7-4375-aec5-163f0c70bbd1',
              '532f0047-1a7d-4226-a6f6-b0278e969c13',
              '719d9a5b-e83b-414c-a462-9c4dc7fa32ff',
              '6128ea79-f122-4d14-8588-30f06ce3f1f6',
              '202b557e-bdb3-4fbb-a63d-88b3d17b025d',
              'a494232a-24d7-4740-869e-c96cda6ad047',
              '78923b6e-0db7-4f98-b032-a0a4693c5b65',
              'c6d82b0c-3f6d-43a8-99d0-7f85ebcf7d70',
              '2b42010e-c223-42ba-8bd2-c808265f0110',
              '41937d47-8542-484b-829f-5ecb42569ec8',
              'ed07acd8-c5d7-4d3f-b71b-fa8fb415f541',
              '520ee2ca-3d3b-4aef-91bb-440bc171975f',
              'd31b7eec-1dab-46a8-8450-18b0bf799a6a',
              'a5a5d928-a69c-4378-8fbb-96f99f1ea871',
              '0890d98f-1a89-457e-ac29-816c1c385074',
              '54d154bf-6fd3-4b43-8aba-dd874a222894',
              '42d0f524-ddf1-42af-b339-339727c5dee

In [21]:
# Flatten every sampleData row of every submission into its own document of the form
# {"submission_id": ..., "key": <sampleData key>, "row_data": [{"field": ..., "value": ...}, ...]}
transformed_docs = []

for submission in collection.find():
    # 'N/A' stands in for a missing submission id
    sid = submission.get('id', 'N/A')
    # sampleData is assumed to map each key to a list of row dictionaries
    tables = submission.get('metadata_submission', {}).get('sampleData', {})

    for table_name, table_rows in tables.items():
        # One output document per row, with the row's columns as field/value pairs
        transformed_docs.extend(
            {
                "submission_id": sid,
                "key": table_name,
                "row_data": [
                    {"field": column, "value": cell} for column, cell in table_row.items()
                ],
            }
            for table_row in table_rows
        )


In [22]:
biosample_row_collection = db["submission_biosample_rows"]  # Holds one document per sampleData row

In [23]:
# Rebuild the per-row collection from scratch.
# It is derived entirely from `transformed_docs`, so clearing it first keeps re-runs from
# piling up duplicate rows, and from hitting duplicate-key errors: insert_many writes an
# `_id` into each dict, which a second insert of the same list would otherwise collide with.
biosample_row_collection.delete_many({})

# insert_many rejects an empty list, so only insert when there is something to store
if transformed_docs:
    result = biosample_row_collection.insert_many(transformed_docs)
else:
    result = None

Where, in any column from any row in any template in any submission, does the value "YSISB-Stream Sediment" appear?

In [24]:
# Match any row document in which at least one field/value pair has this exact value
query = {"row_data": {"$elemMatch": {"value": "YSISB-Stream Sediment"}}}

In [25]:
# Run the query against the per-row collection
results = biosample_row_collection.find(filter=query)

In [26]:
# Pretty-print each matching row document
for matching_doc in results:
    pprint.pprint(matching_doc)

{'_id': ObjectId('67d427c1fbe27ada99415c2d'),
 'key': 'sediment_data',
 'row_data': [{'field': 'elev', 'value': 21},
              {'field': 'depth', 'value': '0-0.1'},
              {'field': 'lat_lon', 'value': '42.706758 -70.915147'},
              {'field': 'ecosystem', 'value': 'Environmental'},
              {'field': 'samp_name', 'value': 'YSISB-Stream Sediment'},
              {'field': 'env_medium', 'value': 'sediment [ENVO:00002007]'},
              {'field': 'geo_loc_name', 'value': 'USA: Massachusetts, Ipswich'},
              {'field': 'analysis_type',
               'value': ['metabolomics', 'natural organic matter']},
              {'field': 'oxy_stat_samp', 'value': 'aerobic'},
              {'field': 'ecosystem_type', 'value': 'Freshwater'},
              {'field': 'collection_date', 'value': '2024-08-13'},
              {'field': 'env_broad_scale',
               'value': 'freshwater biome [ENVO:00000873]'},
              {'field': 'env_local_scale', 'value': 'stream 

Now retrieve the submission whose id matches the submission_id from that result.

In [27]:
# Select the single submission document by its id
query = dict(id="6128ea79-f122-4d14-8588-30f06ce3f1f6")

In [28]:
# Start with an empty list that will collect every extracted row
all_rows = list()

In [29]:
# Name of the sampleData template whose rows are extracted below
template = "sediment_data"

In [30]:
# Collect the rows of the chosen template from every submission matching `query`,
# tagging each row with its origin. The list is rebuilt here so that re-running
# this cell does not append the same rows a second time.
all_rows = []

for document in collection.find(query):
    # Rows stored under metadata_submission.sampleData.<template>; empty list when absent
    template_data = document.get('metadata_submission', {}).get('sampleData', {}).get(template, [])

    for row in template_data:
        # Build a new dict rather than mutating the fetched row
        all_rows.append({
            **row,
            'submission_id': document.get('id', None),  # id of the parent submission
            'template': template,  # name of the template the row came from
        })

In [31]:
# One DataFrame row per collected sample row; columns come from the row dictionaries' keys
submissions_samples_frame = pd.DataFrame(data=all_rows)

In [32]:
# Display the extracted rows
submissions_samples_frame

Unnamed: 0,elev,depth,lat_lon,ecosystem,samp_name,env_medium,geo_loc_name,analysis_type,oxy_stat_samp,ecosystem_type,collection_date,env_broad_scale,env_local_scale,samp_store_temp,ecosystem_subtype,ecosystem_category,specific_ecosystem,submission_id,template
0,21,0-0.1,42.706758 -70.915147,Environmental,YSISB-Stream Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream [ENVO:00000023],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
1,21,0-0.1,42.706758 -70.915147,Environmental,YSISB-Riparian Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream bank [ENVO:00000142],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
2,39,0-0.1,42.516602 -71.189903,Environmental,SBFRE-Stream Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream [ENVO:00000023],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
3,39,0-0.1,42.516602 -71.189903,Environmental,SBFRE-Riparian Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream bank [ENVO:00000142],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
4,35,0-0.1,42.533218 -71.19883,Environmental,IS103-Stream Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream [ENVO:00000023],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
5,35,0-0.1,42.533218 -71.19883,Environmental,IS103-Riparian Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream bank [ENVO:00000142],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
6,26,0-0.1,42.581268 -71.158653,Environmental,IS111-Stream Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream [ENVO:00000023],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
7,26,0-0.1,42.581268 -71.158653,Environmental,IS111-Riparian Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-13,freshwater biome [ENVO:00000873],stream bank [ENVO:00000142],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
8,15,0-0.1,42.58812 -70.988663,Environmental,IS122-Stream Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-14,freshwater biome [ENVO:00000873],stream [ENVO:00000023],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
9,15,0-0.1,42.58812 -70.988663,Environmental,IS122-Riparian Sediment,sediment [ENVO:00002007],"USA: Massachusetts, Ipswich","[metabolomics, natural organic matter]",aerobic,Freshwater,2024-08-14,freshwater biome [ENVO:00000873],stream bank [ENVO:00000142],-20 Celsius,River,Aquatic,Sediment,6128ea79-f122-4d14-8588-30f06ce3f1f6,sediment_data
