In [1]:
from pymongo import MongoClient
import csv

In [2]:
# Output path of the TSV file that the write cell further down creates.
# This cell only defines the path; nothing is written to disk here.
tsv_file_path = "myrold_attributes.tsv"

In [3]:
# Open a client against the MongoDB server on this machine
# (default port, no credentials in the URI)
mongo_uri = "mongodb://localhost:27017/"
client = MongoClient(mongo_uri)

In [4]:
# Pick the database and the collection that will be queried
database_name = "ncbi_metadata"  # Replace with your database name
collection_name = "biosamples"  # Replace with your collection name

db = client[database_name]
collection = db[collection_name]

In [5]:
# Filter: keep documents where a single element of Attributes.Attribute
# satisfies both conditions below (that is what $elemMatch expresses).
pi_criteria = {
    "attribute_name": "emp500_principal_investigator",
    "content": "Myrold",
}
query = {"Attributes.Attribute": {"$elemMatch": pi_criteria}}

In [6]:
# Execute the query and materialise the matches into a list.
# find() hands back a cursor that can only be iterated once; keeping a list
# instead means the cells below can be re-run without silently getting an
# empty result from an already-exhausted cursor.
results = list(collection.find(query))

In [7]:
# Flatten every matching document into one flat dict (one future TSV row):
# the accession plus one key per attribute_name, holding that attribute's content.
documents_list = []

for document in results:
    doc_dict = {}

    # Add accession as a top-level key
    accession = document.get('accession')
    doc_dict['accession'] = accession

    # Guard against `Attributes` / `Attribute` being missing or null, and
    # against `Attribute` being a single dict rather than a list of dicts
    # (iterating a dict would yield its keys, not attribute records).
    attributes = (document.get('Attributes') or {}).get('Attribute') or []
    if isinstance(attributes, dict):
        attributes = [attributes]

    # Add each attribute_name as a key with content as its value
    for attribute in attributes:
        attribute_name = attribute.get('attribute_name')
        if attribute_name is None:
            # A None key cannot be sorted alongside the string keys when the
            # header is built later, so unnamed attributes are skipped.
            continue
        content = attribute.get('content')
        # If a name occurs more than once, the last occurrence wins
        doc_dict[attribute_name] = content

    documents_list.append(doc_dict)

In [8]:
# Gather the union of keys seen across all flattened records;
# these become the TSV columns (order is not meaningful at this point).
header_pool = set()
for doc in documents_list:
    header_pool.update(doc)  # iterating a dict yields its keys
all_keys = list(header_pool)

In [9]:
# Column order: 'accession' leads, every other key follows in alphabetical order
remaining_keys = [k for k in all_keys if k != 'accession']
remaining_keys.sort()
all_keys = ['accession', *remaining_keys]

In [10]:
# Dump every flattened record to disk as tab-separated values
with open(tsv_file_path, "w", newline='', encoding="utf-8") as tsv_handle:
    writer = csv.DictWriter(tsv_handle, delimiter='\t', fieldnames=all_keys)
    # Header row first, then one line per record
    writer.writeheader()
    for row in documents_list:
        writer.writerow(row)

print(f"TSV file saved to {tsv_file_path}")

TSV file saved to myrold_attributes.tsv


In [11]:
# Close the connection.
# The flattened records in `documents_list` are plain in-memory dicts, so they
# remain available after the client is closed.
client.close()


In [12]:
# Peek at the first flattened record as a sanity check.
# NOTE(review): raises IndexError if the query matched no documents.
documents_list[0]

{'accession': 'SAMEA7724195',
 'ENA first public': '2020-12-17',
 'ENA last update': '2020-12-16',
 'ENA-CHECKLIST': 'ERC000011',
 'External Id': 'SAMEA7724195',
 'INSDC center alias': 'UCSDMI',
 'INSDC center name': 'University of California San Diego Microbiome Initiative',
 'INSDC first public': '2020-12-17T04:08:06Z',
 'INSDC last update': '2020-12-16T01:23:59Z',
 'INSDC status': 'public',
 'Submitter Id': 'qiita_sid_13114:13114.myrold.5.s001',
 'alpha_shotgun_woltka_min10k_richness': '719.0',
 'alpha_shotgun_woltka_rar10k_richness': '504.0',
 'alpha_shotgun_woltka_rar3450_richness': '255.0',
 'collection_timestamp': '12/15/15 0:00',
 'cur_land_use': 'forest',
 'cur_vegetation': 'Douglas-fir',
 'depth_sample': '0.15',
 'elevation': '286',
 'emp500_principal_investigator': 'Myrold',
 'emp500_study_id': '5',
 'emp500_title': 'Tree-associated soils',
 'empo_2': 'Non-saline',
 'empo_3': 'Soil (non-saline)',
 'env biome': 'temperate coniferous forest biome',
 'env feature': 'temperate c

In [13]:
# Close the connection.
# NOTE(review): an earlier cell already calls client.close(); this repeat looks
# redundant — presumably harmless, but confirm and consider removing one of them.
client.close()