In [2]:
from pymongo import MongoClient
from tqdm.notebook import tqdm  # Import tqdm.notebook for Jupyter Notebook progress bars

In [None]:
def _flatten_biosample(sample: dict) -> dict:
    """
    Build one flat document from a single biosample document.

    The result always has an ``accession`` key (empty string when the input has
    none) plus one key per attribute that carries a non-empty ``harmonized_name``;
    the value is that attribute's ``content`` (empty string when absent).

    :param sample: A biosample document as returned by the input collection.
    :return: The flattened document, ready to be inserted.
    """
    flat_entry = {"accession": sample.get("accession", "")}

    attributes_node = sample.get("Attributes")
    if not isinstance(attributes_node, dict):
        # Missing, null, or unexpectedly shaped "Attributes": keep only the accession
        # instead of raising and aborting the whole run.
        return flat_entry

    attributes = attributes_node.get("Attribute")
    if isinstance(attributes, dict):
        # A lone attribute stored as a bare dict rather than a one-element list.
        # Iterating the dict directly would walk its keys and silently drop the value.
        attributes = [attributes]
    elif not isinstance(attributes, (list, tuple)):
        return flat_entry

    for attribute in attributes:
        if not isinstance(attribute, dict):
            continue
        harmonized_name = attribute.get("harmonized_name")
        if harmonized_name:
            flat_entry[harmonized_name] = attribute.get("content", "")

    return flat_entry


def flatten_all_ncbi_biosample_harmonized_attributes(
        mongo_uri: str = "mongodb://localhost:27017/",
        db_name: str = "ncbi_metadata",
        input_collection_name: str = "biosamples",
        output_collection_name: str = "biosample_harmonized_attributes",
        batch_size: int = 1000,
) -> None:
    """
    Processes biosample documents from a MongoDB collection, extracts harmonized attributes,
    streams the results to a new MongoDB collection, and provides progress tracking in Jupyter Notebook.

    Each input document becomes one output document holding its ``accession`` and one
    field per attribute that has a ``harmonized_name``. Output documents are written in
    batches of ``batch_size`` so that the run does not pay one server round trip per
    document.

    Any exception is caught and reported with ``print``; the function itself does not
    raise. The client connection is always closed before returning.

    Note: the function only inserts. Running it twice against the same output collection
    appends a second copy of every document.

    :param mongo_uri: MongoDB connection URI.
    :param db_name: Name of the database.
    :param input_collection_name: Name of the input collection.
    :param output_collection_name: Name of the output collection.
    :param batch_size: Number of flattened documents to buffer before each bulk insert.
        Values below 1 are treated as 1.
    """
    client = None
    try:
        batch_size = max(1, int(batch_size))

        client = MongoClient(mongo_uri)
        db = client[db_name]
        input_collection = db[input_collection_name]
        output_collection = db[output_collection_name]

        estimated_count = input_collection.estimated_document_count()
        print(f"Estimated document count: {estimated_count}")

        # Only these two fields are read below, so ask the server for nothing else.
        projection = {"accession": 1, "Attributes.Attribute": 1}

        pending = []
        with tqdm(total=estimated_count, desc="Processing Biosamples") as pbar:
            for sample in input_collection.find({}, projection):
                pending.append(_flatten_biosample(sample))
                if len(pending) >= batch_size:
                    output_collection.insert_many(pending)
                    # Advance the bar only once the batch has actually been written.
                    pbar.update(len(pending))
                    pending = []

            # Flush the final, partially filled batch (insert_many rejects an empty list).
            if pending:
                output_collection.insert_many(pending)
                pbar.update(len(pending))

        print(f"Processed and streamed biosamples from '{input_collection_name}' to '{output_collection_name}'.")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if client is not None:
            client.close()


In [None]:
# # Example usage (assuming MongoDB is running locally)
# flatten_all_ncbi_biosample_harmonized_attributes(
#     mongo_uri="mongodb://localhost:27017/",
#     db_name="ncbi_metadata",
#     input_collection_name="biosamples",
#     output_collection_name="biosample_harmonized_attributes",
# )
#
# # Runtime note: the call above took about four hours.