In [1]:
from urllib.parse import quote_plus

import pandas as pd
from pymongo import MongoClient

In [2]:
# Connection settings for the BBOP/NMDC MongoDB containing NCBI metadata.

# Credentials: authentication is only attempted when BOTH values are set;
# leave them as None to connect to an unauthenticated server.
MONGO_USERNAME = None
MONGO_PASSWORD = None
MONGO_HOST = "localhost"
MONGO_PORT = 27017
# Name of the database that holds the collections listed below.
MONGO_DATABASE = "ncbi_metadata"
# NOTE(review): BIOPROJECTS_COLLECTION is not referenced by the visible cells.
BIOPROJECTS_COLLECTION = "bioprojects"
# Full biosample records; queried by their "accession" field.
BIOSAMPLES_COLLECTION = "biosamples"
# Link documents pairing a "biosample_accession" with a "bioproject_accession".
BIOSAMPLES_BIOPROJECTS_COLLECTION = "sra_biosamples_bioprojects"


In [3]:
# "Potential_import_SRA_Jan2024"
# Google Sheets document ID that is substituted into the CSV export URL
# (the quoted name above is presumably the spreadsheet's title - TODO confirm).
SHEET_ID = "1432d4WGdO5aSU2SvHBSvlqWmJ1NiEOTJ4UGQq1zR_ho"

In [4]:
# Google Visualization ("gviz") endpoint for the sheet; `tqx=out:csv` asks for
# the response as CSV. No tab is specified, so presumably the first/default
# tab is returned - TODO confirm.
CSV_URL = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv"

In [5]:
# Fetch the sheet's CSV export over HTTPS and parse it into a pandas DataFrame.
# NOTE(review): this needs network access, and presumably the sheet must be
# readable without signing in - confirm the sharing settings.
df = pd.read_csv(CSV_URL)


In [6]:
# Display the sheet contents (pandas abbreviates long frames to their first
# and last rows, so this is a preview rather than every row).
df

Unnamed: 0,"## Bioproject selected based on: ## At least 50 metagenomes (10M reads +), not currently in IMG (as far as I know), environmental / terrestrial / soil / plant ## For IMG imports, looked at cases where MAGs are not already available in NCBI, and potential for interesting new diversity ## For NMDC imports, looked more specifically at soil and related, and nice metadata BioProject Id",# metaG 10M+,Project name,Environment(s),Publication,MAGs in NCBI,Import to IMG ?,Import to NMDC ?,Notes
0,PRJEB52368,827,Metagenomic sequencing of Tara Pacific coral s...,Marine/coral,,,No,No,"Tara, unpublished, probably stay away"
1,PRJNA352737,721,HOT ALOHA metagenomic time and depth series,Marine,10.1038/s41564-017-0008-3,Yes (197),No,No,"Already has MAGs in NCBI, and oceanic so not h..."
2,PRJNA385736,688,Marine amplicons/metagenomes from Australian ...,Marine,,,Yes,Maybe ?,"Some paired metaG and metaT, but marine and no..."
3,PRJNA656268,572,Bio-GO-SHIP: Global marine 'omics studies of r...,Marine,No but NSF grant,Maybe (see TPA),No,No,Has a TPA and already in MGnify. 10.1093/nar/g...
4,PRJNA385854,470,Marine metagenomes from the bioGEOTRACES project,Marine,10.1038/sdata.2018.176,,Yes,Maybe ?,"Rich metadata, could be good if no MAGs (Data ..."
...,...,...,...,...,...,...,...,...,...
108,PRJEB35627,50,Seasonal and diel patterns of bacteriophages i...,Marine,,,No,No,
109,PRJNA476799,50,Greenhouse Vegetable Surfaces Raw sequence reads,Plant,,,No,No,"Food related, not high priority"
110,PRJNA691683,50,Topsoil viromes from five types of land uses,Soil,,,No,No,More for IMG/VR ?
111,PRJNA798446,50,Shotgun metagenome of microbial community in m...,Soil,,,No,No,no metadata


In [7]:
# The BioProject accessions live in the leftmost column of the sheet. Select it
# by position: its header text is fused with the sheet's "##" notes, so the
# column name is not a convenient key.
bioproj_accession_column = df.iloc[:, 0]

# Normalize before de-duplicating:
#   * blank cells are read as NaN, and a NaN mixed with strings makes sorting
#     raise TypeError (float vs. str);
#   * padded values ("PRJNA123 ") would never match the exact-string `$in`
#     query that consumes this list.
bioproj_accession_clean = bioproj_accession_column.dropna().astype(str).str.strip()
bioproj_accession_clean = bioproj_accession_clean[bioproj_accession_clean != ""]

# Sorted list of the distinct accession strings.
bioproj_accession_values = sorted(bioproj_accession_clean.unique())

In [8]:
# Display the sorted, de-duplicated BioProject accessions taken from the sheet
bioproj_accession_values

['PRJEB18675',
 'PRJEB27870',
 'PRJEB31530',
 'PRJEB34634',
 'PRJEB34883',
 'PRJEB35627',
 'PRJEB35640',
 'PRJEB35770',
 'PRJEB38290',
 'PRJEB38681',
 'PRJEB40760',
 'PRJEB41174',
 'PRJEB41834',
 'PRJEB4352',
 'PRJEB44163',
 'PRJEB44309',
 'PRJEB44414',
 'PRJEB45634',
 'PRJEB52368',
 'PRJEB52406',
 'PRJEB52452',
 'PRJEB52753',
 'PRJEB55522',
 'PRJEB62460',
 'PRJEB66294',
 'PRJEB9691',
 'PRJEB9742',
 'PRJNA1000042',
 'PRJNA1000596',
 'PRJNA1001993',
 'PRJNA1002763',
 'PRJNA1014704',
 'PRJNA1028263',
 'PRJNA1035420',
 'PRJNA1035643',
 'PRJNA308326',
 'PRJNA329908',
 'PRJNA352737',
 'PRJNA358725',
 'PRJNA379303',
 'PRJNA385736',
 'PRJNA385854',
 'PRJNA385855',
 'PRJNA386568',
 'PRJNA389803',
 'PRJNA395437',
 'PRJNA429141',
 'PRJNA448773',
 'PRJNA449266',
 'PRJNA450295',
 'PRJNA450643',
 'PRJNA473136',
 'PRJNA476799',
 'PRJNA528368',
 'PRJNA530708',
 'PRJNA545144',
 'PRJNA558772',
 'PRJNA577476',
 'PRJNA588686',
 'PRJNA608274',
 'PRJNA628860',
 'PRJNA629394',
 'PRJNA632564',
 'PRJNA640378'

In [9]:
# Number of distinct BioProject accessions that will be looked up in MongoDB
len(bioproj_accession_values)

113

In [10]:
# Build the MongoDB connection URI and open the client.
#
# Credentials are used only when BOTH MONGO_USERNAME and MONGO_PASSWORD are
# set; if either one is None the connection is made without authentication.
host = MONGO_HOST
port = MONGO_PORT

if MONGO_USERNAME is not None and MONGO_PASSWORD is not None:
    username = MONGO_USERNAME
    password = MONGO_PASSWORD

    # Percent-escape the credentials before embedding them in the URI:
    # characters such as "@", ":", "/" or "%" in a username or password would
    # otherwise be parsed as URI delimiters and break the connection string.
    connection_string = (
        f"mongodb://{quote_plus(str(username))}:{quote_plus(str(password))}"
        f"@{host}:{port}"
    )
else:
    # Default connection to unauthenticated MongoDB.
    connection_string = f"mongodb://{host}:{port}"

# Create the client connection.
client = MongoClient(connection_string)

In [11]:
# --------------------------
# Select Database
# --------------------------

# Look the database up by the name configured in MONGO_DATABASE.
db = client[MONGO_DATABASE]  # Dynamically select database


In [12]:
# Handle to the collection whose documents pair a biosample accession with a
# bioproject accession.
biosamples_bioprojects_collection = db[BIOSAMPLES_BIOPROJECTS_COLLECTION]

In [13]:
# Match every link document whose bioproject accession is one of the
# accessions taken from the sheet (`$in` = "equals any value in this list").
query = {"bioproject_accession": {"$in": bioproj_accession_values}}

# Run the query and load all matching documents into a list so they can be
# counted and iterated more than once.
wishlist_biosamples_bioprojects = list(biosamples_bioprojects_collection.find(query))


In [14]:
# Number of biosample/bioproject link documents matched by the query
len(wishlist_biosamples_bioprojects)

30161

In [15]:
# Print the results
# Show only the first nine link documents as a quick sanity check of their shape
for doc in wishlist_biosamples_bioprojects[0:9]:
    print(doc)

{'_id': ObjectId('679b82323b7bd066cbcbb9f2'), 'biosample_accession': 'SAMEA30188668', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f3'), 'biosample_accession': 'SAMEA30189418', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f4'), 'biosample_accession': 'SAMEA30190168', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f5'), 'biosample_accession': 'SAMEA30190918', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f6'), 'biosample_accession': 'SAMEA30191668', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f7'), 'biosample_accession': 'SAMEA30192418', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f8'), 'biosample_accession': 'SAMEA30193168', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f9'), 'biosample_accession': 'SAMEA30193918', 'bioproject_accession': 'PRJEB18675'}
{'_id': 

In [16]:
# Handle to the collection holding the full biosample records
biosamples_collection = db[BIOSAMPLES_COLLECTION]

In [17]:
# Gather the accession of every biosample linked to a wish-listed bioproject.
biosample_accession_list = [
    link["biosample_accession"]
    for link in wishlist_biosamples_bioprojects
]

# Select the biosample records whose accession is in that list and pull all
# of them into memory.
query = {
    "accession": {"$in": biosample_accession_list},
}
wishlist_biosamples = list(biosamples_collection.find(query))

In [18]:
# Number of biosample documents retrieved; compare with the link count above
# to spot linked accessions that have no biosample record.
len(wishlist_biosamples)

30161

In [19]:
# Inspect one full biosample document to see how its fields are nested
wishlist_biosamples[0]

{'_id': ObjectId('677f66eb0a6241ac79325d55'),
 'access': 'public',
 'publication_date': '2022-06-06T00:00:00.000',
 'last_update': '2024-06-26T10:36:46.000',
 'submission_date': '2022-09-23T14:20:12.056',
 'id': '30986217',
 'accession': 'SAMEA110022181',
 'Ids': {'Id': [{'content': 'SAMEA110022181',
    'db': 'BioSample',
    'is_primary': '1'},
   {'content': 'ERS11966019', 'db': 'SRA'}]},
 'Description': {'Title': {'content': '2'},
  'Organism': {'taxonomy_id': '256318',
   'taxonomy_name': 'metagenome',
   'OrganismName': {'content': 'metagenome'}}},
 'Owner': {'Name': {'content': 'EBI'}},
 'Models': {'Model': {'content': 'Generic'}},
 'Package': {'content': 'Generic.1.0', 'display_name': 'Generic'},
 'Attributes': {'Attribute': [{'content': 'GR',
    'attribute_name': 'Country',
    'harmonized_name': 'geo_loc_name',
    'display_name': 'geographic location'},
   {'content': 'ERC000022', 'attribute_name': 'ENA-CHECKLIST'},
   {'content': 'CEH', 'attribute_name': 'INSDC center name

In [20]:
# Index the link documents by biosample accession so that each biosample's
# bioproject can be found with a single constant-time dictionary lookup.
# If an accession occurs in several link documents, the last one wins.
bioproject_dict = dict(
    (link["biosample_accession"], link["bioproject_accession"])
    for link in wishlist_biosamples_bioprojects
)


In [21]:
# Harmonized attribute names to extract, in the order they appear as columns.
ENV_TRIAD_COLUMNS = ["env_broad_scale", "env_local_scale", "env_medium"]
target_harmonized_names = set(ENV_TRIAD_COLUMNS)


def extract_env_triad(doc):
    """Return the env triad values found in one biosample document.

    Args:
        doc: A biosample document (dict). Attributes are read from
            ``doc["Attributes"]["Attribute"]``, which may be a single dict or
            a list of dicts.

    Returns:
        A dict with one key per name in ``ENV_TRIAD_COLUMNS``. The value is
        the ``content`` of the attribute whose ``harmonized_name`` matches, or
        ``None`` when the document has no such attribute. If a name occurs
        more than once, the last occurrence wins.
    """
    # "Attributes" can be missing, null, or not a dict; treat all of these as
    # "no attributes" instead of failing on `.get`.
    attributes_field = doc.get("Attributes")
    attributes = (
        attributes_field.get("Attribute")
        if isinstance(attributes_field, dict)
        else None
    )

    # A lone attribute is stored as a dict rather than a one-element list.
    if isinstance(attributes, dict):
        attributes = [attributes]
    elif not isinstance(attributes, list):
        attributes = []

    triad = dict.fromkeys(ENV_TRIAD_COLUMNS)  # every value starts as None
    for attr in attributes:
        if not isinstance(attr, dict):
            continue  # skip malformed entries (e.g. bare strings)
        harmonized_name = attr.get("harmonized_name")
        if harmonized_name in target_harmonized_names:
            triad[harmonized_name] = attr.get("content")
    return triad


# Build one row per biosample: its accession, the bioproject it was linked to,
# and its three env triad values.
data = []
for doc in wishlist_biosamples:
    biosample_accession = doc.get("accession")
    data.append({
        "bioproject_accession": bioproject_dict.get(biosample_accession),
        "biosample_accession": biosample_accession,
        **extract_env_triad(doc),
    })

# Convert to a pandas DataFrame. The columns are passed explicitly so the
# table has the same layout even when no biosamples were found.
# Note: this rebinds `df`, replacing the sheet DataFrame loaded earlier.
df = pd.DataFrame(
    data,
    columns=["bioproject_accession", "biosample_accession", *ENV_TRIAD_COLUMNS],
)

In [22]:
# Preview the env triad table (pandas abbreviates long frames to their first
# and last rows).
df

Unnamed: 0,bioproject_accession,biosample_accession,env_broad_scale,env_local_scale,env_medium
0,PRJEB52753,SAMEA110022181,terrestrial biome,,soil
1,PRJEB52753,SAMEA110027444,terrestrial biome,,soil
2,PRJEB52452,SAMEA110646458,Trades Biome,,"particulate matter (ENVO:01000060), including ..."
3,PRJEB52452,SAMEA110646459,Trades Biome,,"particulate matter (ENVO:01000060), including ..."
4,PRJEB52452,SAMEA110646460,Trades Biome,,"particulate matter (ENVO:01000060), including ..."
...,...,...,...,...,...
30156,PRJNA1035643,SAMN38096308,xeric shrubland biome,terrestrial environmental zone,surface soil
30157,PRJNA1035643,SAMN38096309,xeric shrubland biome,terrestrial environmental zone,surface soil
30158,PRJNA1035643,SAMN38096310,xeric shrubland biome,terrestrial environmental zone,surface soil
30159,PRJNA1035643,SAMN38096311,xeric shrubland biome,terrestrial environmental zone,surface soil


In [23]:
# Write the table as a tab-separated file, without the index column. The
# relative path means it is created in the current working directory.
df.to_csv("simons_wishlist_env_triads.tsv", index=False, sep="\t")