In [1]:
from pymongo import MongoClient
import csv

In [2]:
# Notebook parameters: edit these to target a different server, database, or study.
connection_string = "mongodb://localhost:27017/"  # MongoDB URI handed to MongoClient below
db_name = "gold_metadata"  # name of the database opened on that server
selected_study = "Gs0153999"  # studyGoldId of the study whose biosamples are exported
output_tsv = f"{selected_study}_environment_ecosystem.tsv"  # output file name, relative to the working directory

In [3]:
# Create the MongoDB client from connection_string (configured above as a
# localhost server on port 27017).
client = MongoClient(connection_string)

In [4]:
# Get a handle to the database named by db_name ("gold_metadata" as configured above).
db = client[db_name]

In [5]:
# Get the list of collection names in the database; it is only used for the
# summary printed in the next cell.
collections = db.list_collection_names()


In [6]:
# Print the collection names as a quick check; the cells below read the
# 'studies' and 'biosamples' collections.
print(f"Collections in '{db_name}' database: {collections}")

Collections in 'gold_metadata' database: ['studies', 'biosamples', 'projects']


In [7]:
# Look up the selected study in the 'studies' collection.
studies_collection = db['studies']
study_record = studies_collection.find_one({"studyGoldId": selected_study})

# The study's 'biosamples' field should be a list of biosampleGoldId strings.
# Fall back to an empty list when the study is not found, the field is
# missing, or the field is null, so that iterating over `biosamples` later
# never fails on None.
biosamples = (study_record or {}).get('biosamples') or []

# Report the empty cases instead of silently producing an empty export.
if study_record is None:
    print(f"Warning: no study found with studyGoldId '{selected_study}'.")
elif not biosamples:
    print(f"Warning: study '{selected_study}' lists no biosamples.")

In [8]:
# Access the 'biosamples' collection
biosamples_collection = db['biosamples']

# Fields copied from each biosample document into an output row, in the order
# they were originally listed. A field absent from a document becomes None.
biosample_fields = (
    "biosampleGoldId", "ecosystemPathId", "ecosystem", "ecosystemCategory",
    "ecosystemType", "ecosystemSubtype", "specificEcosystem", "envoBroadScale",
    "envoLocalScale", "envoMedium", "biosampleName", "ncbiTaxId", "ncbiTaxName",
    "sampleCollectionSite", "geographicLocation", "latitude", "longitude",
    "dateCollected", "description", "elevationInMeters", "geoLocation",
    "habitat", "isoCountry", "modDate", "addDate",
)

# Ask the server for only the fields we keep (and drop the implicit _id).
biosample_projection = {field: 1 for field in biosample_fields}
biosample_projection["_id"] = 0

# Fetch every referenced biosample with a single $in query instead of one
# find_one() round trip per ID.
records_by_id = {}
for biosample_record in biosamples_collection.find(
    {"biosampleGoldId": {"$in": list(biosamples)}},
    biosample_projection,
):
    # Keep a single document per ID, as the per-ID find_one() lookup did.
    records_by_id.setdefault(biosample_record.get("biosampleGoldId"), biosample_record)

# Build one row per requested ID, preserving the order of `biosamples` and
# skipping IDs for which no document was found.
biosample_data = [
    {field: records_by_id[biosample_id].get(field) for field in biosample_fields}
    for biosample_id in biosamples
    if biosample_id in records_by_id
]



In [9]:
# Column order of the TSV. These names must cover every key of the dicts in
# biosample_data: csv.DictWriter raises ValueError for keys not listed here.
fieldnames = [
    "biosampleGoldId", "ecosystemPathId", "ecosystem", "ecosystemCategory",
    "ecosystemType", "ecosystemSubtype", "specificEcosystem", "envoBroadScale",
    "envoLocalScale", "envoMedium", "biosampleName", "ncbiTaxId", "ncbiTaxName",
    "sampleCollectionSite", "geographicLocation", "latitude", "longitude",
    "dateCollected", "description", "elevationInMeters", "geoLocation",
    "habitat", "isoCountry", "modDate", "addDate",
]

# Open the TSV file for writing.
# - encoding is pinned to UTF-8 so free-text fields (names, descriptions,
#   locations) containing non-ASCII characters are written the same way on
#   every platform instead of depending on the locale's default encoding.
# - newline='' lets the csv module control line endings itself.
with open(output_tsv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter='\t')

    # Write the header row followed by one row per biosample.
    writer.writeheader()
    writer.writerows(biosample_data)

print(f"TSV file saved as '{output_tsv}'.")

TSV file saved as 'Gs0153999_environment_ecosystem.tsv'.
