In [1]:
import duckdb
import pandas as pd
import yaml
from google.cloud import bigquery
from oaklib import get_adapter
from pymongo import MongoClient

In [2]:
# For the BBOP/NMDC MongoDB containing NCBI metadata
# Connection settings for the MongoDB instance read by this notebook.

# Authentication is only attempted when BOTH username and password are not None.
# Do not commit real credentials here.
MONGO_USERNAME = None
MONGO_PASSWORD = None
MONGO_HOST = "localhost"
MONGO_PORT = 27017
MONGO_DATABASE = "ncbi_metadata"
# NOTE(review): BIOPROJECTS_COLLECTION is not referenced by any cell visible in this notebook.
BIOPROJECTS_COLLECTION = "bioprojects"
# Biosample documents, looked up by their "accession" field.
BIOSAMPLES_COLLECTION = "biosamples"
# Link documents pairing "biosample_accession" with "bioproject_accession".
BIOSAMPLES_BIOPROJECTS_COLLECTION = "sra_biosamples_bioprojects"


In [3]:
# Google Sheets document ID of the wishlist sheet ("Potential_import_SRA_Jan2024");
# it is interpolated into CSV_URL.
SHEET_ID = "1432d4WGdO5aSU2SvHBSvlqWmJ1NiEOTJ4UGQq1zR_ho"

In [4]:
# Path of the local DuckDB file opened with duckdb.connect() and queried for
# asserted/NER-derived CURIEs. The path is relative, so it depends on the kernel's working directory.
NCBI_BIOSAMPLES_DUCKDB_FILE = "../../../local/ncbi_biosamples.duckdb"

In [5]:
# URL handed to pd.read_csv(); the "tqx=out:csv" parameter requests the sheet as CSV.
# NOTE(review): in the loaded frame the first column's header contains the sheet's leading
# "##" comment text, so the export appears to fold those rows into the header — confirm
# before relying on column names (the notebook selects that column by position instead).
CSV_URL = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv"

In [6]:
# Selector string passed to oaklib's get_adapter(); the resulting adapter is used to look up
# labels for the mined CURIEs.
envo_adapter_string = 'sqlite:obo:envo'

In [7]:
# Output path for merged_df: one row per biosample with its submitted ("_asserted") and
# CURIE-labelled ("_mined") environmental triad columns, written tab-separated.
output_tsv = "simons_wishlist_env_triads.tsv"

In [8]:
# Output path for a local tab-separated copy of the Google Sheet as loaded into gsheet_df.
gsheet_tsv = "Potential_import_SRA_Jan2024.tsv"

In [9]:
# Output path for the SRA metadata table with the "attributes" and "jattr" columns removed,
# written tab-separated.
sra_metadata_frame_tsv = "simons_wishlist_sra_metadata.tsv"

In [10]:
# Define output file name
value_counts_yaml = "value_counts_report.yaml"

In [11]:
# Google Cloud project ID passed to bigquery.Client(project=...).
# NOTE(review): presumably the project that the BigQuery jobs run (and are billed) under — confirm.
nmdc_gcp_project = "nmdc-377118"

In [12]:
# Number of leading items shown when a list is previewed with [0:show_limit].
show_limit = 10

In [13]:
# List of relevant harmonized names
target_harmonized_names = {"env_broad_scale", "env_local_scale", "env_medium"}

In [14]:
bigquery_client = bigquery.Client(project=nmdc_gcp_project)



In [15]:
# Read the CSV into a pandas DataFrame
gsheet_df = pd.read_csv(CSV_URL)


In [16]:
# Display the sheet as loaded. This shows the whole frame, not just the first rows;
# pandas abbreviates the rendering of long frames with "...".
gsheet_df

Unnamed: 0,"## Bioproject selected based on: ## At least 50 metagenomes (10M reads +), not currently in IMG (as far as I know), environmental / terrestrial / soil / plant ## For IMG imports, looked at cases where MAGs are not already available in NCBI, and potential for interesting new diversity ## For NMDC imports, looked more specifically at soil and related, and nice metadata BioProject Id",# metaG 10M+,Project name,Environment(s),Publication,MAGs in NCBI,Import to IMG ?,Import to NMDC ?,Notes
0,PRJEB52368,827,Metagenomic sequencing of Tara Pacific coral s...,Marine/coral,,,No,No,"Tara, unpublished, probably stay away"
1,PRJNA352737,721,HOT ALOHA metagenomic time and depth series,Marine,10.1038/s41564-017-0008-3,Yes (197),No,No,"Already has MAGs in NCBI, and oceanic so not h..."
2,PRJNA385736,688,Marine amplicons/metagenomes from Australian ...,Marine,,,Yes,Maybe ?,"Some paired metaG and metaT, but marine and no..."
3,PRJNA656268,572,Bio-GO-SHIP: Global marine 'omics studies of r...,Marine,No but NSF grant,Maybe (see TPA),No,No,Has a TPA and already in MGnify. 10.1093/nar/g...
4,PRJNA385854,470,Marine metagenomes from the bioGEOTRACES project,Marine,10.1038/sdata.2018.176,,Yes,Maybe ?,"Rich metadata, could be good if no MAGs (Data ..."
...,...,...,...,...,...,...,...,...,...
108,PRJEB35627,50,Seasonal and diel patterns of bacteriophages i...,Marine,,,No,No,
109,PRJNA476799,50,Greenhouse Vegetable Surfaces Raw sequence reads,Plant,,,No,No,"Food related, not high priority"
110,PRJNA691683,50,Topsoil viromes from five types of land uses,Soil,,,No,No,More for IMG/VR ?
111,PRJNA798446,50,Shotgun metagenome of microbial community in m...,Soil,,,No,No,no metadata


In [17]:
# Keep a local tab-separated snapshot of the sheet exactly as it was read in this run.
gsheet_df.to_csv(path_or_buf=gsheet_tsv, sep="\t", index=False)

In [18]:
# Get the leftmost column (first column)
bioproj_accession_column = gsheet_df.iloc[:, 0]  # Select the first column

In [19]:
# Distinct bioproject accessions from the sheet's first column, as a sorted list.
# A blank cell is read as NaN (a float). Dropping NaN first keeps the sort from raising
# TypeError on a str/float comparison and keeps NaN out of the MongoDB `$in` filter
# that is built from this list.
bioproj_accession_values = sorted(bioproj_accession_column.dropna().unique())

In [20]:
# Display the unique values
bioproj_accession_values[0:show_limit]

['PRJEB18675',
 'PRJEB27870',
 'PRJEB31530',
 'PRJEB34634',
 'PRJEB34883',
 'PRJEB35627',
 'PRJEB35640',
 'PRJEB35770',
 'PRJEB38290',
 'PRJEB38681']

In [21]:
len(bioproj_accession_values)

113

In [22]:
# Connection target. The URI deliberately carries no credentials.
host = MONGO_HOST
port = MONGO_PORT
connection_string = f"mongodb://{host}:{port}"

# When both credentials are configured they are passed to MongoClient as keyword options
# rather than interpolated into the URI: inside a URI, reserved characters such as "@",
# ":" or "/" in a username or password must be percent-escaped or the URI is misparsed.
# Keeping them out of the string also keeps them out of any displayed connection_string.
auth_kwargs = {}
if MONGO_USERNAME is not None and MONGO_PASSWORD is not None:
    auth_kwargs = {"username": MONGO_USERNAME, "password": MONGO_PASSWORD}

# Create the client connection.
client = MongoClient(connection_string, **auth_kwargs)

In [23]:
# --------------------------
# Select Database
# --------------------------

db = client[MONGO_DATABASE]  # Dynamically select database


In [24]:
biosamples_bioprojects_collection = db[BIOSAMPLES_BIOPROJECTS_COLLECTION]

In [25]:
# MongoDB query using the `$in` operator
query = {"bioproject_accession": {"$in": bioproj_accession_values}}

In [26]:
# Fetch matching documents
wishlist_biosamples_bioprojects = list(biosamples_bioprojects_collection.find(query))


In [27]:
len(wishlist_biosamples_bioprojects)

30161

In [28]:
# Preview the first `show_limit` biosample-to-bioproject link documents.
for link_doc in wishlist_biosamples_bioprojects[:show_limit]:
    print(link_doc)

{'_id': ObjectId('679b82323b7bd066cbcbb9f2'), 'biosample_accession': 'SAMEA30188668', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f3'), 'biosample_accession': 'SAMEA30189418', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f4'), 'biosample_accession': 'SAMEA30190168', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f5'), 'biosample_accession': 'SAMEA30190918', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f6'), 'biosample_accession': 'SAMEA30191668', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f7'), 'biosample_accession': 'SAMEA30192418', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f8'), 'biosample_accession': 'SAMEA30193168', 'bioproject_accession': 'PRJEB18675'}
{'_id': ObjectId('679b82323b7bd066cbcbb9f9'), 'biosample_accession': 'SAMEA30193918', 'bioproject_accession': 'PRJEB18675'}
{'_id': 

In [29]:
biosamples_collection = db[BIOSAMPLES_COLLECTION]

In [30]:
# Extract the biosample_accession values
biosample_accession_list = [item["biosample_accession"] for item in wishlist_biosamples_bioprojects]

In [31]:
biosample_accession_list[0:show_limit]

['SAMEA30188668',
 'SAMEA30189418',
 'SAMEA30190168',
 'SAMEA30190918',
 'SAMEA30191668',
 'SAMEA30192418',
 'SAMEA30193168',
 'SAMEA30193918',
 'SAMEA30194668',
 'SAMEA30195418']

In [32]:
# MongoDB query using `$in`
query = {"accession": {"$in": biosample_accession_list}}

In [33]:
# Fetch matching documents
wishlist_biosamples = list(biosamples_collection.find(query))

In [34]:
len(wishlist_biosamples)

30161

In [35]:
wishlist_biosamples[0]

{'_id': ObjectId('677f66eb0a6241ac79325d55'),
 'access': 'public',
 'publication_date': '2022-06-06T00:00:00.000',
 'last_update': '2024-06-26T10:36:46.000',
 'submission_date': '2022-09-23T14:20:12.056',
 'id': '30986217',
 'accession': 'SAMEA110022181',
 'Ids': {'Id': [{'content': 'SAMEA110022181',
    'db': 'BioSample',
    'is_primary': '1'},
   {'content': 'ERS11966019', 'db': 'SRA'}]},
 'Description': {'Title': {'content': '2'},
  'Organism': {'taxonomy_id': '256318',
   'taxonomy_name': 'metagenome',
   'OrganismName': {'content': 'metagenome'}}},
 'Owner': {'Name': {'content': 'EBI'}},
 'Models': {'Model': {'content': 'Generic'}},
 'Package': {'content': 'Generic.1.0', 'display_name': 'Generic'},
 'Attributes': {'Attribute': [{'content': 'GR',
    'attribute_name': 'Country',
    'harmonized_name': 'geo_loc_name',
    'display_name': 'geographic location'},
   {'content': 'ERC000022', 'attribute_name': 'ENA-CHECKLIST'},
   {'content': 'CEH', 'attribute_name': 'INSDC center name

In [36]:
# Map each biosample accession to its bioproject accession for constant-time lookup.
# If a biosample is linked to more than one bioproject, only the last link encountered is kept.
bioproject_dict = {
    link["biosample_accession"]: link["bioproject_accession"]
    for link in wishlist_biosamples_bioprojects
}


In [37]:
data = []

In [38]:
# Flatten each biosample document into one row: its bioproject plus the submitted values
# of the three environmental-context attributes (None when an attribute is absent).
# The list is rebuilt here so that re-running this cell cannot append duplicate rows.
data = []

for doc in wishlist_biosamples:
    biosample_accession = doc.get("accession")
    bioproject_accession = bioproject_dict.get(biosample_accession)

    # "Attributes" / "Attribute" may be missing or null; fall back to an empty list.
    # A lone attribute stored as a bare dict is wrapped so the loop below always sees a list.
    attributes = (doc.get("Attributes") or {}).get("Attribute") or []
    if isinstance(attributes, dict):
        attributes = [attributes]

    # Start every target name at None; if a name occurs more than once, the last value wins.
    attr_dict = dict.fromkeys(target_harmonized_names)
    for attr in attributes:
        harmonized_name = attr.get("harmonized_name")
        if harmonized_name in target_harmonized_names:
            attr_dict[harmonized_name] = attr.get("content")

    # Append the structured row
    data.append({
        "bioproject_accession": bioproject_accession,
        "biosample_accession": biosample_accession,
        "env_broad_scale": attr_dict["env_broad_scale"],
        "env_local_scale": attr_dict["env_local_scale"],
        "env_medium": attr_dict["env_medium"],
    })

In [39]:
# Convert to pandas DataFrame
triads_of_biosamples_frame = pd.DataFrame(data)

In [40]:
triads_of_biosamples_frame

Unnamed: 0,bioproject_accession,biosample_accession,env_broad_scale,env_local_scale,env_medium
0,PRJEB52753,SAMEA110022181,terrestrial biome,,soil
1,PRJEB52753,SAMEA110027444,terrestrial biome,,soil
2,PRJEB52452,SAMEA110646458,Trades Biome,,"particulate matter (ENVO:01000060), including ..."
3,PRJEB52452,SAMEA110646459,Trades Biome,,"particulate matter (ENVO:01000060), including ..."
4,PRJEB52452,SAMEA110646460,Trades Biome,,"particulate matter (ENVO:01000060), including ..."
...,...,...,...,...,...
30156,PRJNA1035643,SAMN38096308,xeric shrubland biome,terrestrial environmental zone,surface soil
30157,PRJNA1035643,SAMN38096309,xeric shrubland biome,terrestrial environmental zone,surface soil
30158,PRJNA1035643,SAMN38096310,xeric shrubland biome,terrestrial environmental zone,surface soil
30159,PRJNA1035643,SAMN38096311,xeric shrubland biome,terrestrial environmental zone,surface soil


In [41]:
triads_of_biosamples_frame.columns

Index(['bioproject_accession', 'biosample_accession', 'env_broad_scale',
       'env_local_scale', 'env_medium'],
      dtype='object')

In [42]:
duck_conn = duckdb.connect(NCBI_BIOSAMPLES_DUCKDB_FILE)

In [43]:
# Convert biosample_accession_list to a Pandas DataFrame
accession_df = pd.DataFrame(biosample_accession_list, columns=["accession"])


Get URIs mined from environmental triads from DuckDB

Much of this has since been moved into notebooks/studies_exploration/ncbi_annotation_mining

In [44]:
# Register the DataFrame as a temporary DuckDB table (in-memory, no writes to disk)
duck_conn.register("accession_filter", accession_df)

<duckdb.duckdb.DuckDBPyConnection at 0x77c1e848baf0>

In [45]:
# Define the query using the registered table
#
# Structure of the SQL below:
#   biosample_subset - rows of main.biosample whose accession is in the registered
#                      accession_filter table.
#   asserted_data    - CURIEs from main.curies_asserted, linked to the subset through
#                      main.contexts_to_normalized_strings; tagged source = 'asserted'.
#   ner_data         - CURIEs from main.curies_ner linked the same way, kept only where
#                      is_longest_match is TRUE and subsumed is FALSE; tagged source = 'NER'.
#   ranked_ner       - numbers the NER rows within each (id, harmonized_name) by
#                      coverage_sum, highest first.
#   final SELECT     - all asserted rows plus the rank-1 NER row of each group.
# NOTE(review): ROW_NUMBER() orders by coverage_sum only, so when several NER rows tie on
# coverage_sum the one that gets rank 1 is not determined by the query.
query = """
WITH biosample_subset AS (
    SELECT b.id, b.accession
    FROM main.biosample b
    JOIN accession_filter af
        ON b.accession = af.accession
),
asserted_data AS (
    SELECT
        b.id,
        b.accession,
        ctns.harmonized_name,
        ca.curie,
        'asserted' AS source
    FROM
        main.curies_asserted ca
    JOIN main.contexts_to_normalized_strings ctns
        ON ca.id = ctns.normalized_context_string_id
    JOIN biosample_subset b
        ON ctns.id = b.id
),
ner_data AS (
    SELECT
        b.id,
        b.accession,
        ctns.harmonized_name,
        cn.curie,
        cn.coverage_sum,
        'NER' AS source
    FROM
        main.curies_ner cn
    JOIN main.contexts_to_normalized_strings ctns
        ON cn.id = ctns.normalized_context_string_id
    JOIN biosample_subset b
        ON ctns.id = b.id
    WHERE
        cn.is_longest_match = TRUE
        AND cn.subsumed = FALSE
),
ranked_ner AS (
    SELECT
        fd.id,
        fd.accession,
        fd.harmonized_name,
        fd.curie,
        fd.source,
        ROW_NUMBER() OVER (PARTITION BY fd.id, fd.harmonized_name ORDER BY fd.coverage_sum DESC) AS rank
    FROM
        ner_data fd
)
SELECT id, accession, harmonized_name, curie, source
FROM asserted_data
UNION ALL
SELECT id, accession, harmonized_name, curie, source
FROM ranked_ner
WHERE rank = 1
ORDER BY id, harmonized_name, source;
"""


In [46]:
# Execute query and fetch results
id_harmonized_name_best_curies = duck_conn.execute(query).fetchdf()

In [47]:
id_harmonized_name_best_curies

Unnamed: 0,id,accession,harmonized_name,curie,source
0,2872651,SAMEA2591063,env_broad_scale,ENVO:00000447,NER
1,2872651,SAMEA2591063,env_broad_scale,ENVO:00000447,asserted
2,2872651,SAMEA2591063,env_local_scale,ENVO:00002042,NER
3,2872651,SAMEA2591063,env_local_scale,ENVO:00002042,asserted
4,2872651,SAMEA2591063,env_medium,ENVO:01000060,NER
...,...,...,...,...,...
41248,38096312,SAMN38096312,env_broad_scale,ENVO:01000218,NER
41249,38096312,SAMN38096312,env_local_scale,ENVO:01001199,NER
41250,38096312,SAMN38096312,env_medium,ENVO:02000059,NER
41251,42755192,SAMEA115861732,env_broad_scale,ENVO:00000873,NER


In [48]:
envo_adapter = get_adapter(envo_adapter_string)

In [49]:
# Extract unique EnvO CURIEs
all_mined_curies = id_harmonized_name_best_curies["curie"].unique()


In [50]:
# Fetch labels for all EnvO classes
all_mined_curie_labels = {}

In [51]:
# Ask the adapter for each mined CURIE's label; CURIEs for which it returns nothing
# (None or an empty string) get no entry in the lookup.
curie_label_pairs = ((curie, envo_adapter.label(curie)) for curie in all_mined_curies)
all_mined_curie_labels.update(
    {curie: label for curie, label in curie_label_pairs if label}
)

In [52]:
def _format_curie_label(curie):
    """Render a CURIE as "<label> [<curie>]", using 'Unknown' when no label was collected."""
    return f"{all_mined_curie_labels.get(curie, 'Unknown')} [{curie}]"


# Add a display column next to the raw CURIE.
id_harmonized_name_best_curies["curie_label"] = (
    id_harmonized_name_best_curies["curie"].map(_format_curie_label)
)

In [53]:
# Show the frame with its curie_label column ("<label> [<CURIE>]").
# When a label is missing the text 'Unknown' is used, not the CURIE itself.
id_harmonized_name_best_curies

Unnamed: 0,id,accession,harmonized_name,curie,source,curie_label
0,2872651,SAMEA2591063,env_broad_scale,ENVO:00000447,NER,marine biome [ENVO:00000447]
1,2872651,SAMEA2591063,env_broad_scale,ENVO:00000447,asserted,marine biome [ENVO:00000447]
2,2872651,SAMEA2591063,env_local_scale,ENVO:00002042,NER,surface water [ENVO:00002042]
3,2872651,SAMEA2591063,env_local_scale,ENVO:00002042,asserted,surface water [ENVO:00002042]
4,2872651,SAMEA2591063,env_medium,ENVO:01000060,NER,particulate environmental material [ENVO:01000...
...,...,...,...,...,...,...
41248,38096312,SAMN38096312,env_broad_scale,ENVO:01000218,NER,xeric shrubland biome [ENVO:01000218]
41249,38096312,SAMN38096312,env_local_scale,ENVO:01001199,NER,terrestrial environmental zone [ENVO:01001199]
41250,38096312,SAMN38096312,env_medium,ENVO:02000059,NER,surface soil [ENVO:02000059]
41251,42755192,SAMEA115861732,env_broad_scale,ENVO:00000873,NER,freshwater biome [ENVO:00000873]


In [54]:

def _join_distinct(labels):
    """Collapse a group's labels into one "; "-separated string of sorted, de-duplicated values."""
    return "; ".join(sorted(set(labels)))


# One row per (id, accession) and one column per harmonized_name; a biosample with no
# value for a harmonized_name gets an empty string in that column.
labels_by_context = id_harmonized_name_best_curies.groupby(
    ["id", "accession", "harmonized_name"]
)["curie_label"]
biosample_curie_pivot = (
    labels_by_context.apply(_join_distinct).unstack(fill_value="").reset_index()
)


In [55]:
# Ensure the correct column order, filling missing columns if needed
expected_columns = ["id", "accession", "env_broad_scale", "env_local_scale", "env_medium"]

In [56]:
# Add an empty-string column for every expected column the pivot did not produce
# (for example, a harmonized_name that never occurred in the query result).
missing_columns = [col for col in expected_columns if col not in biosample_curie_pivot.columns]
for col in missing_columns:
    biosample_curie_pivot[col] = ""

In [57]:
biosample_curie_pivot = biosample_curie_pivot[expected_columns]  # Reorder columns

In [58]:
# Clear the name carried by the columns axis (presumably "harmonized_name", left over from
# unstack()). The columns are a plain Index here, not a MultiIndex.
# This changes the existing frame's column index in place.
biosample_curie_pivot.columns.name = None  # Remove any lingering index name


In [59]:
# rename_axis(None, axis=1) also clears the columns-axis name, but returns a new frame
# rather than mutating the existing one.
biosample_curie_pivot = biosample_curie_pivot.rename_axis(None, axis=1)  # Remove any index label

In [60]:
biosample_curie_pivot

Unnamed: 0,id,accession,env_broad_scale,env_local_scale,env_medium
0,2872651,SAMEA2591063,marine biome [ENVO:00000447],surface water [ENVO:00002042],particulate environmental material [ENVO:01000...
1,2872652,SAMEA2611380,marine biome [ENVO:00000447],surface water [ENVO:00002042],particulate environmental material [ENVO:01000...
2,2872653,SAMEA2591077,marine biome [ENVO:00000447],deep chlorophyll maximum layer [ENVO:01000326],particulate environmental material [ENVO:01000...
3,2872654,SAMEA2591082,marine biome [ENVO:00000447],deep chlorophyll maximum layer [ENVO:01000326],particulate environmental material [ENVO:01000...
4,2872655,SAMEA2611378,marine biome [ENVO:00000447],surface water [ENVO:00002042],particulate environmental material [ENVO:01000...
...,...,...,...,...,...
14681,38096309,SAMN38096309,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]
14682,38096310,SAMN38096310,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]
14683,38096311,SAMN38096311,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]
14684,38096312,SAMN38096312,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]


In [61]:
# Full outer join of the submitted triads (left, keyed by "biosample_accession") with the
# CURIE-label pivot (right, keyed by "accession"); rows from either side are kept.
# The env_* columns exist on both sides and are told apart by suffix:
#   "_asserted" = the left frame's submitted text values,
#   "_mined"    = the right frame's "<label> [<CURIE>]" strings.
merged_df = pd.merge(
    left=triads_of_biosamples_frame,
    right=biosample_curie_pivot,
    how="outer",
    left_on="biosample_accession",
    right_on="accession",
    suffixes=("_asserted", "_mined"),
)


In [62]:
# After the outer join, a row that exists only in biosample_curie_pivot has no
# "biosample_accession". Copy its identifier over from "accession" before that column is
# dropped, so no row ends up without an accession. Where both sides matched, the two
# columns already hold the same value and nothing changes.
merged_df = merged_df.assign(
    biosample_accession=merged_df["biosample_accession"].fillna(merged_df["accession"])
).drop(columns=["accession"])

In [63]:
# Rename "id" to "biosample_id"
merged_df = merged_df.rename(columns={"id": "biosample_id"})

In [64]:
# Convert "biosample_id" to integer, handling potential NaNs (replace NaNs with -1 and convert)
merged_df["biosample_id"] = merged_df["biosample_id"].fillna(-1).astype(int)

In [65]:
# Put the identifying columns first ("biosample_id" right before "biosample_accession");
# every other column keeps its current relative order.
leading_columns = ["bioproject_accession", "biosample_id", "biosample_accession"]
column_order = leading_columns + [
    col for col in merged_df.columns if col not in leading_columns
]

In [66]:
merged_df = merged_df[column_order]

In [67]:
merged_df

Unnamed: 0,bioproject_accession,biosample_id,biosample_accession,env_broad_scale_asserted,env_local_scale_asserted,env_medium_asserted,env_broad_scale_mined,env_local_scale_mined,env_medium_mined
0,PRJEB52753,30986217,SAMEA110022181,terrestrial biome,,soil,terrestrial biome [ENVO:00000446],,soil [ENVO:00001998]
1,PRJEB52753,30986218,SAMEA110027444,terrestrial biome,,soil,terrestrial biome [ENVO:00000446],,soil [ENVO:00001998]
2,PRJEB52452,33205310,SAMEA110646458,Trades Biome,,"particulate matter (ENVO:01000060), including ...",biome [ENVO:00000428],,particulate environmental material [ENVO:01000...
3,PRJEB52452,33205311,SAMEA110646459,Trades Biome,,"particulate matter (ENVO:01000060), including ...",biome [ENVO:00000428],,particulate environmental material [ENVO:01000...
4,PRJEB52452,33205312,SAMEA110646460,Trades Biome,,"particulate matter (ENVO:01000060), including ...",biome [ENVO:00000428],,particulate environmental material [ENVO:01000...
...,...,...,...,...,...,...,...,...,...
30156,PRJNA1035643,38096308,SAMN38096308,xeric shrubland biome,terrestrial environmental zone,surface soil,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]
30157,PRJNA1035643,38096309,SAMN38096309,xeric shrubland biome,terrestrial environmental zone,surface soil,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]
30158,PRJNA1035643,38096310,SAMN38096310,xeric shrubland biome,terrestrial environmental zone,surface soil,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]
30159,PRJNA1035643,38096311,SAMN38096311,xeric shrubland biome,terrestrial environmental zone,surface soil,xeric shrubland biome [ENVO:01000218],terrestrial environmental zone [ENVO:01001199],surface soil [ENVO:02000059]


In [68]:
merged_df.to_csv(output_tsv, index=False, sep="\t")

In [69]:
# close duck_conn
duck_conn.close()

Get the SRA metadata for the ~30k wishlist biosamples from BigQuery (queried by biosample accession, then restricted to the wishlist bioprojects)


In [70]:
merged_df.shape

(30161, 9)

In [71]:
# Define batch size
BATCH_SIZE = 1000  # ~ 30_000 total

In [72]:
# TODO: either constrain the BigQuery query to the bioproject accessions from Simon's sheet (bioproj_accession_values) or filter the results afterward

In [73]:
# Chunk the accession list into consecutive slices of at most BATCH_SIZE items each.
batch_starts = range(0, len(biosample_accession_list), BATCH_SIZE)
batches = [biosample_accession_list[start:start + BATCH_SIZE] for start in batch_starts]

In [74]:
# Initialize empty DataFrame to store results
all_results = []

In [75]:
# Fetch SRA metadata rows for the wishlist biosamples, one parameterized query per batch.
# Timing note from an earlier run: about 2 minutes with batches of 1000.

# The SQL text is identical for every batch; only the array parameter changes.
query = """
    SELECT *
    FROM `nih-sra-datastore.sra.metadata`
    WHERE biosample IN UNNEST(@biosample_list)
    """

# Start from an empty list so that re-running this cell cannot accumulate duplicate batches.
all_results = []

for batch in batches:
    # The accessions travel as a query parameter, never as text spliced into the SQL.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("biosample_list", "STRING", batch)
        ]
    )

    # Run the query, wait for it to finish, and keep the batch as a DataFrame.
    query_job = bigquery_client.query(query, job_config=job_config)
    batch_df = query_job.result().to_dataframe()
    all_results.append(batch_df)

    print(f"Processed batch with {len(batch)} biosamples...")

Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...
Processed batch with 1000 biosamples...


In [76]:
# Concatenate all batch results
sra_metadata_frame = pd.concat(all_results, ignore_index=True)

In [77]:
sra_metadata_frame.shape

(67261, 37)

In [78]:
# destructively require that the bioprojects values come from Simon's Google sheet (bioproj_accession_values)

In [79]:
# Keep only rows whose bioproject is on the wishlist; the query matched on biosample, and
# some of the returned rows carry a bioproject that is not in the sheet.
# This rebinds sra_metadata_frame, so the unfiltered frame is no longer available afterward.
in_wishlist_bioproject = sra_metadata_frame["bioproject"].isin(bioproj_accession_values)
sra_metadata_frame = sra_metadata_frame[in_wishlist_bioproject]

In [80]:
sra_metadata_frame.shape

(61527, 37)

In [81]:
sra_metadata_frame.columns

Index(['acc', 'assay_type', 'center_name', 'consent', 'experiment',
       'sample_name', 'instrument', 'librarylayout', 'libraryselection',
       'librarysource', 'platform', 'sample_acc', 'biosample', 'organism',
       'sra_study', 'releasedate', 'bioproject', 'mbytes', 'loaddate',
       'avgspotlen', 'mbases', 'insertsize', 'library_name',
       'biosamplemodel_sam', 'collection_date_sam',
       'geo_loc_name_country_calc', 'geo_loc_name_country_continent_calc',
       'geo_loc_name_sam', 'ena_first_public_run', 'ena_last_update_run',
       'sample_name_sam', 'datastore_filetype', 'datastore_provider',
       'datastore_region', 'attributes', 'run_file_version', 'jattr'],
      dtype='object')

In [82]:
sra_metadata_frame

Unnamed: 0,acc,assay_type,center_name,consent,experiment,sample_name,instrument,librarylayout,libraryselection,librarysource,...,geo_loc_name_sam,ena_first_public_run,ena_last_update_run,sample_name_sam,datastore_filetype,datastore_provider,datastore_region,attributes,run_file_version,jattr
0,ERR3191551,WGS,MAX PLANCK INSTITUTE FOR DEVELOPMENTAL BIOLOGY,public,ERX3219331,SAMEA5393687,Illumina HiSeq 3000,PAIRED,RANDOM,METAGENOMIC,...,[],[2019-04-30],[2019-03-05],[NextMet68_707],"[fastq, run.zq, sra]","[ena, gs, ncbi, s3]","[ena, gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '5748832'}, {'k': 'bytes'...",1,"{""bases"": 5748832, ""bytes"": 2906796, ""run_file..."
1,ERR3191448,WGS,MAX PLANCK INSTITUTE FOR DEVELOPMENTAL BIOLOGY,public,ERX3219228,SAMEA5393583,Illumina HiSeq 3000,PAIRED,RANDOM,METAGENOMIC,...,[],[2019-04-30],[2019-03-05],[NextMet20_604],"[fastq, run.zq, sra]","[ena, gs, ncbi, s3]","[ena, gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '964935499'}, {'k': 'byte...",1,"{""bases"": 964935499, ""bytes"": 444915251, ""run_..."
2,ERR3191154,WGS,MAX PLANCK INSTITUTE FOR DEVELOPMENTAL BIOLOGY,public,ERX3218934,SAMEA5393289,Illumina HiSeq 3000,PAIRED,RANDOM,METAGENOMIC,...,[],[2019-04-30],[2019-03-05],[NextMet28_310],"[fastq, run.zq, sra]","[ena, gs, ncbi, s3]","[ena, gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '948865850'}, {'k': 'byte...",1,"{""bases"": 948865850, ""bytes"": 406827096, ""run_..."
3,ERR3191490,WGS,MAX PLANCK INSTITUTE FOR DEVELOPMENTAL BIOLOGY,public,ERX3219270,SAMEA5393625,Illumina HiSeq 3000,PAIRED,RANDOM,METAGENOMIC,...,[],[2019-04-30],[2019-03-05],[NextMet4_646],"[fastq, run.zq, sra]","[ena, gs, ncbi, s3]","[ena, gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '771458445'}, {'k': 'byte...",1,"{""bases"": 771458445, ""bytes"": 355104803, ""run_..."
4,ERR3191198,WGS,MAX PLANCK INSTITUTE FOR DEVELOPMENTAL BIOLOGY,public,ERX3218978,SAMEA5393333,Illumina HiSeq 3000,PAIRED,RANDOM,METAGENOMIC,...,[],[2019-04-30],[2019-03-05],[NextMet68_354],"[fastq, run.zq, sra]","[ena, gs, ncbi, s3]","[ena, gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '8343395'}, {'k': 'bytes'...",1,"{""bases"": 8343395, ""bytes"": 3887394, ""run_file..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67256,SRR26458657,WGS,SOUTH CHINA AGRICULTURE UNIVERSITY,public,SRX22162494,SQS3,Illumina HiSeq 4000,PAIRED,RANDOM,METAGENOMIC,...,"[China:Guangdong,Guangzhou]",[],[],[],"[fastq, run.zq, sra]","[gs, ncbi, s3]","[gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '8053725900'}, {'k': 'byt...",1,"{""bases"": 8053725900, ""bytes"": 2738159328, ""ru..."
67257,SRR24972674,WGS,CENTRAL QUEENSLAND UNIVERSITY,public,SRX20730338,P8W8,Illumina NovaSeq 6000,PAIRED,size fractionation,METAGENOMIC,...,[Australia],[],[],[],"[fastq, run.zq, sra]","[gs, ncbi, s3]","[gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '6530128800'}, {'k': 'byt...",1,"{""bases"": 6530128800, ""bytes"": 2144739401, ""ru..."
67258,SRR26458666,WGS,SOUTH CHINA AGRICULTURE UNIVERSITY,public,SRX22162485,SRN2,Illumina HiSeq 4000,PAIRED,RANDOM,METAGENOMIC,...,"[China:Guangdong,Guangzhou]",[],[],[],"[fastq, run.zq, sra]","[gs, ncbi, s3]","[gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '7498980900'}, {'k': 'byt...",1,"{""bases"": 7498980900, ""bytes"": 2589548114, ""ru..."
67259,SRR26458498,WGS,SOUTH CHINA AGRICULTURE UNIVERSITY,public,SRX22162653,JFKT2,Illumina HiSeq 4000,PAIRED,RANDOM,METAGENOMIC,...,"[China:Guangdong,Guangzhou]",[],[],[],"[fastq, run.zq, sra]","[gs, ncbi, s3]","[gs.us-east1, ncbi.public, s3.us-east-1]","[{'k': 'bases', 'v': '6323503200'}, {'k': 'byt...",1,"{""bases"": 6323503200, ""bytes"": 2154437936, ""ru..."


In [83]:
sra_metadata_frame_no_attrs = sra_metadata_frame.copy()

In [84]:
# Derive the attribute-free frame from sra_metadata_frame in one non-mutating step.
# An in-place drop raises KeyError if this cell is run a second time, because the columns
# are already gone; drop() without inplace returns a new frame and leaves the source intact.
sra_metadata_frame_no_attrs = sra_metadata_frame.drop(columns=["attributes", "jattr"])


In [85]:
sra_metadata_frame_no_attrs.shape

(61527, 35)

In [86]:
sra_metadata_frame_no_attrs.to_csv(sra_metadata_frame_tsv, sep="\t", index=False)
# 30 seconds -> large file?
# 10 sec without attributes and jattr

In [87]:
# Quality threshold: rows are kept only when their "avgspotlen" is at least this value.
min_spot_len = 150

In [88]:
# Quality threshold: rows are kept only when their "mbases" is at least this value.
min_mbases = 10

In [89]:
# Minimum number of quality-filtered rows a bioproject needs in order to be kept.
# NOTE(review): the count is over SRA metadata rows (presumably one per run), not over
# distinct biosamples, despite the variable's name — confirm which is intended.
min_num_samples = 50

In [90]:
# Values of the "platform" column that pass the quality filter (exact, case-sensitive match via isin()).
acceptable_platforms = [
    "ILLUMINA",
]

In [91]:
# A row passes when it meets the spot-length and megabase minimums and was produced on
# an accepted platform; all three conditions must hold.
long_enough = sra_metadata_frame_no_attrs["avgspotlen"] >= min_spot_len
deep_enough = sra_metadata_frame_no_attrs["mbases"] >= min_mbases
platform_accepted = sra_metadata_frame_no_attrs["platform"].isin(acceptable_platforms)

quality_filtered = sra_metadata_frame_no_attrs[long_enough & deep_enough & platform_accepted]


In [92]:
quality_filtered.shape

(53205, 35)

In [93]:
bioproject_counts = quality_filtered['bioproject'].value_counts()


In [94]:
bioproject_counts

bioproject
PRJNA385736    23712
PRJEB62460      3527
PRJNA781406     2226
PRJNA656268     2126
PRJEB34634      2076
               ...  
PRJNA476799       50
PRJNA798446       50
PRJNA691683       50
PRJNA450295       42
PRJEB55522         7
Name: count, Length: 107, dtype: int64

In [95]:
# Bioprojects with at least `min_num_samples` rows left after quality filtering
# (the comparison is inclusive: a count equal to the minimum is kept).
has_enough_rows = bioproject_counts >= min_num_samples
bioprojects_to_keep = bioproject_counts[has_enough_rows].index

In [96]:

# Restrict the quality-filtered rows to the bioprojects that met the minimum row count.
in_kept_bioproject = quality_filtered["bioproject"].isin(bioprojects_to_keep)
quality_filtered_expected_bioprojs = quality_filtered[in_kept_bioproject]


In [97]:
quality_filtered_expected_bioprojs.shape

(53156, 35)

In [98]:
columns_to_count = ['assay_type', 'instrument', 'librarylayout',
                    'libraryselection', 'librarysource', 'platform', 'organism']

In [99]:
value_counts_dict = {col: quality_filtered_expected_bioprojs[col].value_counts() for col in columns_to_count}

In [100]:
# Convert value_counts Series to dictionaries
value_counts_dict_cleaned = {col: value_counts_dict[col].to_dict() for col in value_counts_dict}


In [101]:
def _by_count_desc(counts):
    """Return a copy of a value->count mapping ordered by count, largest first."""
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))


# Columns in alphabetical order; within each column, values ordered by descending count.
sorted_value_counts = {
    col: _by_count_desc(value_counts_dict_cleaned[col])
    for col in sorted(value_counts_dict_cleaned)
}


In [102]:
# Write the value-count report to YAML.
# Dump the sorted mapping built in the previous cell (columns alphabetical, values by
# descending count); dumping value_counts_dict_cleaned left that sorting unused.
# sort_keys=False stops PyYAML from re-sorting keys alphabetically, which would discard
# the by-count ordering of the inner mappings.
with open(value_counts_yaml, "w", encoding="utf-8") as f:
    yaml.dump(sorted_value_counts, f, default_flow_style=False, sort_keys=False)

In [103]:
# Surprise 1: library source is METAGENOMIC, yet the organism name does not contain
# "metagenome" (case-insensitive; a missing organism counts as not containing it).
source_is_metagenomic = quality_filtered_expected_bioprojs["librarysource"] == "METAGENOMIC"
organism_mentions_metagenome = (
    quality_filtered_expected_bioprojs["organism"].str.lower().str.contains("metagenome", na=False)
)

surprise_1_frame = quality_filtered_expected_bioprojs[
    source_is_metagenomic & ~organism_mentions_metagenome
]


In [104]:
surprise_1_frame.shape

(1252, 35)

In [105]:
# Surprise 2: the organism name contains "metagenome" (case-insensitive), yet the
# library source is something other than METAGENOMIC.
source_not_metagenomic = quality_filtered_expected_bioprojs["librarysource"] != "METAGENOMIC"
organism_is_metagenome = (
    quality_filtered_expected_bioprojs["organism"].str.lower().str.contains("metagenome", na=False)
)

surprise_2_frame = quality_filtered_expected_bioprojs[
    source_not_metagenomic & organism_is_metagenome
]

In [106]:
surprise_2_frame.shape

(20193, 35)

This BigQuery query includes some of the constraints but still returns 24 M out of 35 M rows:

```sql
WITH
  filtered_data AS (
  SELECT
    bioproject
  FROM
    `nih-sra-datastore.sra.metadata`
  WHERE
    avgspotlen >= 150
    AND mbases >= 10
    AND platform = 'ILLUMINA'
  GROUP BY
    bioproject
  HAVING
    COUNT(*) >= 50 )
SELECT
  *
FROM
  `nih-sra-datastore.sra.metadata`
WHERE
  bioproject IN (
  SELECT
    bioproject
  FROM
    filtered_data);
```

Use queries like this to identify the attribute names that are mapped to env_broad_scale, env_local_scale, and env_medium:

```js
db.getCollection("your_collection").aggregate(
  [
    { "$match": { "Attributes.Attribute.harmonized_name": "env_broad_scale" } },
    { "$unwind": "$Attributes.Attribute" },
    { "$match": { "Attributes.Attribute.harmonized_name": "env_broad_scale" } },
    { "$group": { "_id": "$Attributes.Attribute.attribute_name", "count": { "$sum": 1 } } },
    { "$project": { "_id": 0, "attribute_name": "$_id", "count": 1 } }
  ],
  { allowDiskUse: true }
);
```