In [None]:
import time

from pymongo import MongoClient, UpdateOne
from pymongo.errors import BulkWriteError
from tqdm.notebook import tqdm


In [None]:
# def flatten_all_ncbi_biosample_harmonized_attributes(
#         mongo_uri="mongodb://localhost:27017/",
#         db_name="ncbi_metadata",
#         input_collection_name="biosamples",
#         output_collection_name="biosample_harmonized_attributes",
# ):
#     """
#     Processes biosample documents from a MongoDB collection, extracts harmonized attributes,
#     streams the results to a new MongoDB collection, and provides progress tracking in Jupyter Notebook.
#
#     :param mongo_uri: MongoDB connection URI.
#     :param db_name: Name of the database.
#     :param input_collection_name: Name of the input collection.
#     :param output_collection_name: Name of the output collection.
#     """
#     try:
#         client = MongoClient(mongo_uri)
#         db = client[db_name]
#         input_collection = db[input_collection_name]
#         output_collection = db[output_collection_name]
#
#         estimated_count = input_collection.estimated_document_count()
#         print(f"Estimated document count: {estimated_count}")
#
#         with tqdm(total=estimated_count, desc="Processing Biosamples") as pbar:
#             for sample in input_collection.find():
#                 biosample_entry = {"accession": sample.get("accession", "")}
#
#                 attributes_list = sample.get("Attributes", {}).get("Attribute", [])
#
#                 for attribute in attributes_list:
#                     if isinstance(attribute, dict):
#                         harmonized_name = attribute.get("harmonized_name")
#                         content = attribute.get("content", "")
#
#                         if harmonized_name:
#                             biosample_entry[harmonized_name] = content
#
#                 output_collection.insert_one(biosample_entry)
#                 pbar.update(1)
#
#         print(f"Processed and streamed biosamples from '{input_collection_name}' to '{output_collection_name}'.")
#
#     except Exception as e:
#         print(f"An error occurred: {e}")
#     finally:
#         if 'client' in locals() and client:
#             client.close()


In [None]:
# # Example usage (assuming MongoDB is running locally)
# flatten_all_ncbi_biosample_harmonized_attributes(
#     mongo_uri="mongodb://localhost:27017/",
#     db_name="ncbi_metadata",
#     input_collection_name="biosamples",
#     output_collection_name="biosample_harmonized_attributes",
# )
#
# # Observed runtime of this one-insert_one-per-document version: about four hours

In [None]:
def _extract_harmonized_attributes(sample):
    """
    Flatten one biosample document into ``{"accession": ..., <harmonized_name>: <content>}``.

    Only attribute sub-documents with a truthy ``harmonized_name`` contribute a field.

    :param sample: A biosample document (dict) as returned by the input cursor.
    :return: The flattened dict; it always contains the ``accession`` key.
    """
    flat_entry = {"accession": sample.get("accession", "")}

    # "Attributes" may be missing or null; anything that is not a dict has no
    # "Attribute" member to read, so the entry carries the accession only.
    attributes_node = sample.get("Attributes")
    if not isinstance(attributes_node, dict):
        return flat_entry

    attribute_items = attributes_node.get("Attribute")
    if isinstance(attribute_items, dict):
        # A lone attribute stored as a bare sub-document instead of a one-element
        # list: wrap it so it is not silently dropped by the loop below.
        attribute_items = [attribute_items]
    elif not isinstance(attribute_items, (list, tuple)):
        # Missing, null, or scalar value: nothing to extract.
        attribute_items = []

    for attribute in attribute_items:
        if not isinstance(attribute, dict):
            continue
        harmonized_name = attribute.get("harmonized_name")
        if harmonized_name:
            flat_entry[harmonized_name] = attribute.get("content", "")

    return flat_entry


def _flush_bulk_operations(output_collection, bulk_operations):
    """
    Send the pending operations as one unordered bulk write.

    A ``BulkWriteError`` is reported on stdout and not re-raised, so a failing
    batch does not abort the rest of the run. An empty list is a no-op.

    :param output_collection: Collection object the operations are written to.
    :param bulk_operations: List of pending ``UpdateOne`` operations.
    """
    if not bulk_operations:
        return
    try:
        output_collection.bulk_write(bulk_operations, ordered=False)
    except BulkWriteError as bwe:
        print(f"Bulk write error: {bwe.details}")


def flatten_all_ncbi_biosample_harmonized_attributes(
    mongo_uri="mongodb://localhost:27017/",
    db_name="ncbi_metadata",
    input_collection_name="biosamples",
    output_collection_name="biosample_harmonized_attributes",
    batch_size=10000,
    drop_existing=True,
    create_index=True
):
    """
    Processes biosample documents from a MongoDB collection, extracts harmonized attributes,
    and streams the results to a new MongoDB collection using batched bulk upserts.

    Each input document becomes one flat output document keyed by ``accession``.
    Documents without an ``accession`` are all upserted under the empty-string
    accession. Any exception raised during the run is printed rather than
    propagated, and the client connection is always closed.

    :param mongo_uri: MongoDB connection URI.
    :param db_name: Name of the database.
    :param input_collection_name: Name of the input collection.
    :param output_collection_name: Name of the output collection.
    :param batch_size: Cursor batch size and number of upserts sent per bulk write.
    :param drop_existing: Whether to drop the existing output collection.
    :param create_index: Whether to create a unique index on the accession field.
    :return: None. Progress and a summary are printed.
    """
    # perf_counter is monotonic, so the elapsed time cannot go negative if the
    # wall clock is adjusted during a multi-hour run.
    start_time = time.perf_counter()

    try:
        # The context manager closes the client on every exit path.
        with MongoClient(mongo_uri) as client:
            db = client[db_name]
            input_collection = db[input_collection_name]

            # Drop existing collection if requested
            if drop_existing and output_collection_name in db.list_collection_names():
                print(f"Dropping existing collection: {output_collection_name}")
                db.drop_collection(output_collection_name)

            output_collection = db[output_collection_name]

            # Create index on accession field if requested
            if create_index:
                output_collection.create_index("accession", unique=True)

            estimated_count = input_collection.estimated_document_count()
            print(f"Estimated document count: {estimated_count}")

            bulk_operations = []
            processed_count = 0

            with tqdm(total=estimated_count, desc="Processing Biosamples") as pbar:
                # Fetch only the fields needed for flattening.
                cursor = input_collection.find(
                    {},
                    projection={"accession": 1, "Attributes.Attribute": 1}
                ).batch_size(batch_size)

                for sample in cursor:
                    accession = sample.get("accession", "")
                    biosample_entry = _extract_harmonized_attributes(sample)

                    # Use upsert to handle potential duplicates
                    bulk_operations.append(
                        UpdateOne(
                            {"accession": accession},
                            {"$set": biosample_entry},
                            upsert=True
                        )
                    )

                    processed_count += 1
                    pbar.update(1)

                    # Execute bulk operation when batch size is reached
                    if len(bulk_operations) >= batch_size:
                        _flush_bulk_operations(output_collection, bulk_operations)
                        bulk_operations = []

                # Process any remaining operations
                _flush_bulk_operations(output_collection, bulk_operations)

            elapsed_time = time.perf_counter() - start_time
            print(f"Processed {processed_count} biosamples in {elapsed_time:.2f} seconds")
            # Guard against a zero-length interval on very small inputs.
            if elapsed_time > 0:
                print(f"Speed: {processed_count / elapsed_time:.2f} documents/second")

    except Exception as e:
        print(f"An error occurred: {e}")


In [None]:
# Run the bulk-upsert flattening job against a MongoDB instance on localhost,
# writing into a separate "_2" output collection.
flatten_all_ncbi_biosample_harmonized_attributes(
    output_collection_name="biosample_harmonized_attributes_2",
    input_collection_name="biosamples",
    db_name="ncbi_metadata",
    mongo_uri="mongodb://localhost:27017/",
)

# Recorded runtime for this run: 1:45 hours