In [1]:
from datetime import datetime, date

from google.cloud import bigquery
from pymongo import MongoClient

In [2]:
# Google Cloud project ID handed to the BigQuery client when it is constructed
nmdc_gcp_project = "nmdc-377118"

In [3]:
def serialize_row(row):
    """Convert a BigQuery row into a plain dict with date values as ISO strings.

    Args:
        row: A BigQuery ``Row``, or any mapping / iterable of key-value pairs
            that ``dict()`` accepts.

    Returns:
        A new ``dict`` in which every ``datetime`` or ``date`` value has been
        replaced by its ``isoformat()`` string. Values nested inside dicts
        and lists are converted as well; all other values are kept unchanged.
        The input is not modified.
    """

    def _to_serializable(value):
        # datetime is a subclass of date, so this single check covers both.
        if isinstance(value, date):
            return value.isoformat()
        # Nested record: rebuild it with converted values.
        if isinstance(value, dict):
            return {k: _to_serializable(v) for k, v in value.items()}
        # Repeated field: convert each element.
        if isinstance(value, list):
            return [_to_serializable(v) for v in value]
        return value

    return {key: _to_serializable(value) for key, value in dict(row).items()}

In [4]:
# BigQuery client bound to the project named by nmdc_gcp_project
bq_client = bigquery.Client(project=nmdc_gcp_project)



In [5]:
# Create a client for the MongoDB server at localhost, port 27017
mongo_client = MongoClient("mongodb://localhost:27017/")

In [6]:
# Database that holds the destination collection
db = mongo_client["ncbi_metadata"]  # Change database name as needed

In [7]:
# Destination collection that receives the serialized BigQuery rows
collection = db["filtered_sra_metadata"]  # Change collection name as needed

In [8]:
# WARNING: destructive. Drops the destination collection so the load below
# starts from an empty collection.
collection.drop()

In [9]:

# SQL sent to BigQuery. It selects every column except `jattr` from the SRA
# metadata table for rows that
#   (a) belong to a bioproject having at least 50 rows with
#       avgspotlen >= 150, mbases >= 10 and platform = 'ILLUMINA', and
#   (b) carry at least one attribute key from each of the three key lists
#       below (one EXISTS clause per list).
# The result is capped at 100000 rows by the trailing LIMIT.
query = """
WITH filtered_data AS (
  SELECT bioproject
  FROM `nih-sra-datastore.sra.metadata`
  WHERE avgspotlen >= 150
    AND mbases >= 10
    AND platform = 'ILLUMINA'
  GROUP BY bioproject
  HAVING COUNT(*) >= 50
)
SELECT * EXCEPT(jattr)
FROM `nih-sra-datastore.sra.metadata` m
WHERE m.bioproject IN (SELECT bioproject FROM filtered_data)
  AND EXISTS (
    SELECT 1
    FROM UNNEST(m.attributes) AS attr
    WHERE attr.k IN (
      'env_broad_scale_sam',
      'broad_scale_environmental_context_sam',
      'env_biome_sam',
      'biome_sam',
      'environment__biome__sam'
    )
  )
  AND EXISTS (
    SELECT 1
    FROM UNNEST(m.attributes) AS attr
    WHERE attr.k IN (
      'feature_sam',
      'env_feature_sam',
      'environment__feature__sam',
      'env_local_scale_sam'
    )
  )
  AND EXISTS (
    SELECT 1
    FROM UNNEST(m.attributes) AS attr
    WHERE attr.k IN (
      'env_material_sam',
      'environment__material__sam',
      'env_medium_sam',
      'environmental_medium_sam',
      'material_sam'
    )
  )
  limit 100000;
"""


In [10]:
# Submit the query to BigQuery; the returned job object is iterated later
# to obtain the result rows
query_job = bq_client.query(query)

In [11]:
# Results are processed in batches: this is the number of rows buffered
# in memory before each insert_many call
BATCH_SIZE = 10000  # Adjust based on performance testing

In [12]:
# Buffer of serialized rows waiting to be inserted into MongoDB
batch = []

In [13]:
# Iterate over the query job's rows one by one, buffering each serialized row
for row in query_job:
    batch.append(serialize_row(row))

    # Keep buffering until a full batch has accumulated
    if len(batch) < BATCH_SIZE:
        continue

    # Write the full batch to MongoDB, then empty the buffer in place
    collection.insert_many(batch)
    print(f"Inserted {len(batch)} records...")
    del batch[:]

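# Observed throughput: 10,000 rows in about 30 seconds; 100,000 rows in about 4 minutes.
# About 3 million rows are expected in total (presumably once the query's LIMIT is removed).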

Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...
Inserted 10000 records...


In [14]:
# Flush whatever is still buffered after the streaming loop has finished
if batch:
    collection.insert_many(batch)
    print(f"Inserted final {len(batch)} records...")
    # Empty the buffer so that re-running this cell (or the loop cell) cannot
    # insert the same documents again: insert_many has already added an _id
    # to each of them, so a second insert would fail with duplicate-key errors.
    batch.clear()

print("Data transfer complete!")

Data transfer complete!
