In [103]:
from pymongo import MongoClient
import pprint  # For better printing of results
from tqdm.notebook import tqdm

In [2]:
# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")  # Adjust if needed

In [3]:
db = client["ncbi_metadata"]  # Replace with your database name

In [71]:
# Define the accession you want to search for
accession_id = "SAMN00000002"
# accession_id = "SAMN35108443"

#

In [72]:
# Query for the biosample_harmonized_attributes record
biosample = db.biosample_harmonized_attributes.find_one({"accession": accession_id}, {"_id": 0})


In [73]:
# Print the result
if biosample:
    pprint.pprint(biosample)
else:
    print("No matching biosample found.")

{'accession': 'SAMN00000002',
 'collection_date': 'not determined',
 'env_broad_scale': 'terrestrial biome [ENVO:00000446]',
 'env_local_scale': 'human-associated habitat [ENVO:00009003]',
 'env_medium': 'biological product [ENVO:02000043]',
 'env_package': 'missing',
 'estimated_size': '2550000',
 'geo_loc_name': 'not determined',
 'host': 'Homo sapiens',
 'host_taxid': '9606',
 'investigation_type': 'missing',
 'isol_growth_condt': 'not determined',
 'isolation_source': 'missing',
 'lat_lon': 'not determined',
 'num_replicons': 'not determined',
 'project_name': 'Alistipes putredinis DSM 17216',
 'ref_biomaterial': 'not determined',
 'source_material_id': 'DSM 17216, CCUG 45780, CIP 104286, ATCC 29800, Carlier '
                       '10203, VPI 3293',
 'strain': 'DSM 17216'}


biosample_harmonized_attributes -> unique_triad_values

In [74]:
# Aggregation pipeline
pipeline = [
    {"$match": {"accession": accession_id}},  # Match the given accession
    {"$project": {"_id": 0, "env_broad_scale": 1, "env_local_scale": 1, "env_medium": 1}},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_broad_scale",
        "foreignField": "content",
        "as": "broad_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_local_scale",
        "foreignField": "content",
        "as": "local_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_medium",
        "foreignField": "content",
        "as": "medium_match"
    }},
    {"$project": {
        "env_broad_scale": "$broad_scale_match",
        "env_local_scale": "$local_scale_match",
        "env_medium": "$medium_match"
    }}
]

In [75]:
# Run the aggregation query
results = list(db.biosample_harmonized_attributes.aggregate(pipeline))

In [76]:
# Print the results
if results:
    pprint.pprint(results[0])  # Print the first result
else:
    print("No matching biosample found.")

{'env_broad_scale': [{'_id': ObjectId('67cf606085d3a7550cf4a54a'),
                      'annotations_count': 1,
                      'content': 'terrestrial biome [ENVO:00000446]',
                      'content_len': 33,
                      'count': 20895,
                      'formula_like': False,
                      'parsed_annotations': [{'bioportal_prefix': True,
                                              'cleaned_label': 'terrestrial '
                                                               'biome',
                                              'cleaned_label_len': 17,
                                              'obo_prefix': True,
                                              'raw_component': 'terrestrial '
                                                               'biome '
                                                               '[ENVO:00000446]',
                                              'raw_curie': 'ENVO:00000446',
                          

now from biosample_harmonized_attributes, through unique_triad_values, to triad_components_labels

In [77]:
# Aggregation pipeline
pipeline = [
    {"$match": {"accession": accession_id}},  # Match the given accession
    {"$project": {"_id": 0, "env_broad_scale": 1, "env_local_scale": 1, "env_medium": 1}},

    # Lookup matching unique_triad_values
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_broad_scale",
        "foreignField": "content",
        "as": "broad_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_local_scale",
        "foreignField": "content",
        "as": "local_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_medium",
        "foreignField": "content",
        "as": "medium_match"
    }},

    # Extract parsed_annotations.cleaned_label from unique_triad_values
    {"$project": {
        "broad_scale_match": {
            "$ifNull": [{"$first": "$broad_scale_match.parsed_annotations.cleaned_label"}, []]
        },
        "local_scale_match": {
            "$ifNull": [{"$first": "$local_scale_match.parsed_annotations.cleaned_label"}, []]
        },
        "medium_match": {
            "$ifNull": [{"$first": "$medium_match.parsed_annotations.cleaned_label"}, []]
        }
    }},

    # Lookup matching triad_components_labels by component_label
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "broad_scale_match",
        "foreignField": "component_label",
        "as": "broad_scale_labels"
    }},
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "local_scale_match",
        "foreignField": "component_label",
        "as": "local_scale_labels"
    }},
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "medium_match",
        "foreignField": "component_label",
        "as": "medium_labels"
    }},

    # Final projection
    {"$project": {
        "env_broad_scale_labels": "$broad_scale_labels",
        "env_local_scale_labels": "$local_scale_labels",
        "env_medium_labels": "$medium_labels"
    }}
]

In [78]:
# Run the query
results = list(db.biosample_harmonized_attributes.aggregate(pipeline))

In [79]:
# Print the results
if results:
    pprint.pprint(results[0])  # Print the first result
else:
    print("No matching biosample found.")

{'env_broad_scale_labels': [{'_id': ObjectId('67cf60a685d3a7550cf77886'),
                             'component_label': 'terrestrial biome',
                             'count': 185397,
                             'oak_text_annotations': [{'curie': 'ENVO:00000446',
                                                       'predicate_id': 'rdfs:label'}]}],
 'env_local_scale_labels': [{'_id': ObjectId('67cf60a685d3a7550cf77887'),
                             'component_label': 'human associated habitat',
                             'count': 471324,
                             'partial_matches_vs_precedent': {'partial_matches_vs_precedent': [{'match_string': 'associated',
                                                                                                'object_id': 'PATO:0001668',
                                                                                                'object_label': 'associated',
                                                                    

now link the oak_text_annotations and partial_matches_vs_precedent to class_label_cache records

still need to follow URI mappings (which don't seem to be present)



In [80]:

# Aggregation pipeline
pipeline = [
    # Match biosample_harmonized_attributes by accession
    {"$match": {"accession": accession_id}},
    {"$project": {"_id": 0, "env_broad_scale": 1, "env_local_scale": 1, "env_medium": 1}},

    # Lookup matching unique_triad_values
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_broad_scale",
        "foreignField": "content",
        "as": "broad_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_local_scale",
        "foreignField": "content",
        "as": "local_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_medium",
        "foreignField": "content",
        "as": "medium_match"
    }},

    # Lookup triad_components_labels by component_label
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "broad_scale_match.parsed_annotations.cleaned_label",
        "foreignField": "component_label",
        "as": "broad_scale_labels"
    }},
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "local_scale_match.parsed_annotations.cleaned_label",
        "foreignField": "component_label",
        "as": "local_scale_labels"
    }},
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "medium_match.parsed_annotations.cleaned_label",
        "foreignField": "component_label",
        "as": "medium_labels"
    }},

    # Unwind partial_matches_vs_precedent.partial_matches_vs_precedent to get object_id
    {"$unwind": {"path": "$broad_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent", "preserveNullAndEmptyArrays": True}},
    {"$unwind": {"path": "$local_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent", "preserveNullAndEmptyArrays": True}},
    {"$unwind": {"path": "$medium_labels.partial_matches_vs_precedent.partial_matches_vs_precedent", "preserveNullAndEmptyArrays": True}},

    # Extract CURIEs and flatten them
    {"$project": {
        "broad_scale_curies": {
            "$reduce": {
                "input": {
                    "$setUnion": [
                        {"$ifNull": ["$broad_scale_match.parsed_annotations.repaired_curie", []]},
                        {"$ifNull": ["$broad_scale_labels.oak_text_annotations.curie", []]},
                        {"$ifNull": ["$broad_scale_labels.ols_text_annotation.obo_id", []]},
                        {"$ifNull": ["$broad_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id", []]}
                    ]
                },
                "initialValue": [],
                "in": {"$concatArrays": ["$$value", "$$this"]}
            }
        },
        "local_scale_curies": {
            "$reduce": {
                "input": {
                    "$setUnion": [
                        {"$ifNull": ["$local_scale_match.parsed_annotations.repaired_curie", []]},
                        {"$ifNull": ["$local_scale_labels.oak_text_annotations.curie", []]},
                        {"$ifNull": ["$local_scale_labels.ols_text_annotation.obo_id", []]},
                        {"$ifNull": ["$local_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id", []]}
                    ]
                },
                "initialValue": [],
                "in": {"$concatArrays": ["$$value", "$$this"]}
            }
        },
        "medium_curies": {
            "$reduce": {
                "input": {
                    "$setUnion": [
                        {"$ifNull": ["$medium_match.parsed_annotations.repaired_curie", []]},
                        {"$ifNull": ["$medium_labels.oak_text_annotations.curie", []]},
                        {"$ifNull": ["$medium_labels.ols_text_annotation.obo_id", []]},
                        {"$ifNull": ["$medium_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id", []]}
                    ]
                },
                "initialValue": [],
                "in": {"$concatArrays": ["$$value", "$$this"]}
            }
        }
    }},

    # Lookup class_label_cache by CURIEs to get labels
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "broad_scale_curies",
        "foreignField": "curie",
        "as": "broad_scale_class_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "local_scale_curies",
        "foreignField": "curie",
        "as": "local_scale_class_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "medium_curies",
        "foreignField": "curie",
        "as": "medium_class_labels"
    }},

    # Format final output as a list of {curie, label} pairs
    {"$project": {
        "env_broad_scale": {
            "$map": {
                "input": "$broad_scale_class_labels",
                "as": "item",
                "in": {"curie": "$$item.curie", "label": "$$item.label"}
            }
        },
        "env_local_scale": {
            "$map": {
                "input": "$local_scale_class_labels",
                "as": "item",
                "in": {"curie": "$$item.curie", "label": "$$item.label"}
            }
        },
        "env_medium": {
            "$map": {
                "input": "$medium_class_labels",
                "as": "item",
                "in": {"curie": "$$item.curie", "label": "$$item.label"}
            }
        }
    }}
]



In [81]:
# Run the query
results = list(db.biosample_harmonized_attributes.aggregate(pipeline))

In [83]:
# Print the results
if results:
    pprint.pprint(results[0])  # Print the first result
else:
    print("No matching biosample found.")

{'env_broad_scale': [{'curie': 'ENVO:00000446', 'label': 'terrestrial biome'}],
 'env_local_scale': [{'curie': 'ENVO:00009003',
                      'label': 'obsolete human-associated habitat'},
                     {'curie': 'ENVO:01000739', 'label': 'habitat'},
                     {'curie': 'NCBITaxon:9606', 'label': 'Homo sapiens'},
                     {'curie': 'PATO:0001668', 'label': 'associated with'}],
 'env_medium': [{'curie': 'ENVO:02000043', 'label': 'biological product'}]}


In [84]:

# Aggregation pipeline
pipeline = [
    {"$match": {"accession": accession_id}},
    {"$project": {
        "_id": 0,
        "accession": 1,
        "env_broad_scale": 1,
        "env_local_scale": 1,
        "env_medium": 1
    }},

    # Lookup matching unique_triad_values
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_broad_scale",
        "foreignField": "content",
        "as": "broad_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_local_scale",
        "foreignField": "content",
        "as": "local_scale_match"
    }},
    {"$lookup": {
        "from": "unique_triad_values",
        "localField": "env_medium",
        "foreignField": "content",
        "as": "medium_match"
    }},

    # Lookup triad_components_labels by component_label
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "broad_scale_match.parsed_annotations.cleaned_label",
        "foreignField": "component_label",
        "as": "broad_scale_labels"
    }},
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "local_scale_match.parsed_annotations.cleaned_label",
        "foreignField": "component_label",
        "as": "local_scale_labels"
    }},
    {"$lookup": {
        "from": "triad_components_labels",
        "localField": "medium_match.parsed_annotations.cleaned_label",
        "foreignField": "component_label",
        "as": "medium_labels"
    }},

    # Unwind partial_matches_vs_precedent.partial_matches_vs_precedent
    {"$unwind": {"path": "$broad_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent", "preserveNullAndEmptyArrays": True}},
    {"$unwind": {"path": "$local_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent", "preserveNullAndEmptyArrays": True}},
    {"$unwind": {"path": "$medium_labels.partial_matches_vs_precedent.partial_matches_vs_precedent", "preserveNullAndEmptyArrays": True}},

    # Lookup class_label_cache by CURIEs
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "broad_scale_labels.oak_text_annotations.curie",
        "foreignField": "curie",
        "as": "broad_scale_oak_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "broad_scale_labels.ols_text_annotation.obo_id",
        "foreignField": "curie",
        "as": "broad_scale_ols_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "broad_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id",
        "foreignField": "curie",
        "as": "broad_scale_partial_labels"
    }},

    {"$lookup": {
        "from": "class_label_cache",
        "localField": "local_scale_labels.oak_text_annotations.curie",
        "foreignField": "curie",
        "as": "local_scale_oak_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "local_scale_labels.ols_text_annotation.obo_id",
        "foreignField": "curie",
        "as": "local_scale_ols_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "local_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id",
        "foreignField": "curie",
        "as": "local_scale_partial_labels"
    }},

    {"$lookup": {
        "from": "class_label_cache",
        "localField": "medium_labels.oak_text_annotations.curie",
        "foreignField": "curie",
        "as": "medium_oak_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "medium_labels.ols_text_annotation.obo_id",
        "foreignField": "curie",
        "as": "medium_ols_labels"
    }},
    {"$lookup": {
        "from": "class_label_cache",
        "localField": "medium_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id",
        "foreignField": "curie",
        "as": "medium_partial_labels"
    }},

    # Final projection with structured fields (INCLUDING env_medium!)
    {"$project": {
        "accession": 1,
        "env_broad_scale": {
            "original_value": "$env_broad_scale",
            "parsed_annotations": {
                "$map": {
                    "input": "$broad_scale_oak_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "oak_text_annotations": {
                "$map": {
                    "input": "$broad_scale_oak_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "ols_text_annotation": {
                "$map": {
                    "input": "$broad_scale_ols_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "partial_matches_vs_precedent": {
                "$map": {
                    "input": "$broad_scale_partial_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            }
        },
        "env_local_scale": {
            "original_value": "$env_local_scale",
            "parsed_annotations": {
                "$map": {
                    "input": "$local_scale_oak_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "oak_text_annotations": {
                "$map": {
                    "input": "$local_scale_oak_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "ols_text_annotation": {
                "$map": {
                    "input": "$local_scale_ols_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "partial_matches_vs_precedent": {
                "$map": {
                    "input": "$local_scale_partial_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            }
        },
        "env_medium": {
            "original_value": "$env_medium",
            "parsed_annotations": {
                "$map": {
                    "input": "$medium_oak_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "oak_text_annotations": {
                "$map": {
                    "input": "$medium_oak_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "ols_text_annotation": {
                "$map": {
                    "input": "$medium_ols_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            },
            "partial_matches_vs_precedent": {
                "$map": {
                    "input": "$medium_partial_labels",
                    "as": "item",
                    "in": {"curie": "$$item.curie", "label": "$$item.label"}
                }
            }
        }
    }}
]

# Run the query
results = list(db.biosample_harmonized_attributes.aggregate(pipeline))

# Print the transformed results
if results:
    pprint.pprint(results[0])
else:
    print("No matching biosample found.")


{'accession': 'SAMN00000002',
 'env_broad_scale': {'oak_text_annotations': [{'curie': 'ENVO:00000446',
                                               'label': 'terrestrial biome'}],
                     'ols_text_annotation': [],
                     'original_value': 'terrestrial biome [ENVO:00000446]',
                     'parsed_annotations': [{'curie': 'ENVO:00000446',
                                             'label': 'terrestrial biome'}],
                     'partial_matches_vs_precedent': []},
 'env_local_scale': {'oak_text_annotations': [],
                     'ols_text_annotation': [],
                     'original_value': 'human-associated habitat '
                                       '[ENVO:00009003]',
                     'parsed_annotations': [],
                     'partial_matches_vs_precedent': [{'curie': 'ENVO:01000739',
                                                       'label': 'habitat'},
                                                      {'curie'

In [226]:
# Define the aggregation pipeline template
def get_pipeline(document):
    return [
        {"$match": {"accession": document["accession"]}},  # Use the document's accession
        {"$project": {
            "_id": 0,
            "accession": 1,
            "env_broad_scale": 1,
            "env_local_scale": 1,
            "env_medium": 1
        }},

        # Lookup matching unique_triad_values
        {"$lookup": {
            "from": "unique_triad_values",
            "localField": "env_broad_scale",
            "foreignField": "content",
            "as": "broad_scale_match"
        }},
        {"$lookup": {
            "from": "unique_triad_values",
            "localField": "env_local_scale",
            "foreignField": "content",
            "as": "local_scale_match"
        }},
        {"$lookup": {
            "from": "unique_triad_values",
            "localField": "env_medium",
            "foreignField": "content",
            "as": "medium_match"
        }},

        # Lookup triad_components_labels by component_label
        {"$lookup": {
            "from": "triad_components_labels",
            "localField": "broad_scale_match.parsed_annotations.cleaned_label",
            "foreignField": "component_label",
            "as": "broad_scale_labels"
        }},
        {"$lookup": {
            "from": "triad_components_labels",
            "localField": "local_scale_match.parsed_annotations.cleaned_label",
            "foreignField": "component_label",
            "as": "local_scale_labels"
        }},
        {"$lookup": {
            "from": "triad_components_labels",
            "localField": "medium_match.parsed_annotations.cleaned_label",
            "foreignField": "component_label",
            "as": "medium_labels"
        }},

        # Unwind partial_matches_vs_precedent.partial_matches_vs_precedent
        {"$unwind": {"path": "$broad_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent",
                     "preserveNullAndEmptyArrays": True}},
        {"$unwind": {"path": "$local_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent",
                     "preserveNullAndEmptyArrays": True}},
        {"$unwind": {"path": "$medium_labels.partial_matches_vs_precedent.partial_matches_vs_precedent",
                     "preserveNullAndEmptyArrays": True}},

        # Lookup class_label_cache by CURIEs
        {"$lookup": {
            "from": "class_label_cache",
            "localField": "broad_scale_labels.oak_text_annotations.curie",
            "foreignField": "curie",
            "as": "broad_scale_oak_labels"
        }},
        {"$lookup": {
            "from": "class_label_cache",
            "localField": "broad_scale_labels.ols_text_annotation.obo_id",
            "foreignField": "curie",
            "as": "broad_scale_ols_labels"
        }},
        {"$lookup": {
            "from": "class_label_cache",
            "localField": "broad_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id",
            "foreignField": "curie",
            "as": "broad_scale_partial_labels"
        }},

        {"$lookup": {
            "from": "class_label_cache",
            "localField": "local_scale_labels.oak_text_annotations.curie",
            "foreignField": "curie",
            "as": "local_scale_oak_labels"
        }},
        {"$lookup": {
            "from": "class_label_cache",
            "localField": "local_scale_labels.ols_text_annotation.obo_id",
            "foreignField": "curie",
            "as": "local_scale_ols_labels"
        }},
        {"$lookup": {
            "from": "class_label_cache",
            "localField": "local_scale_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id",
            "foreignField": "curie",
            "as": "local_scale_partial_labels"
        }},

        {"$lookup": {
            "from": "class_label_cache",
            "localField": "medium_labels.oak_text_annotations.curie",
            "foreignField": "curie",
            "as": "medium_oak_labels"
        }},
        {"$lookup": {
            "from": "class_label_cache",
            "localField": "medium_labels.ols_text_annotation.obo_id",
            "foreignField": "curie",
            "as": "medium_ols_labels"
        }},
        {"$lookup": {
            "from": "class_label_cache",
            "localField": "medium_labels.partial_matches_vs_precedent.partial_matches_vs_precedent.object_id",
            "foreignField": "curie",
            "as": "medium_partial_labels"
        }},

        # Final projection with structured fields (INCLUDING env_medium!)
        {"$project": {
            "accession": 1,
            "env_broad_scale": {
                "original_value": "$env_broad_scale",
                "parsed_annotations": {
                    "$map": {
                        "input": "$broad_scale_oak_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "oak_text_annotations": {
                    "$map": {
                        "input": "$broad_scale_oak_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "ols_text_annotation": {
                    "$map": {
                        "input": "$broad_scale_ols_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "partial_matches_vs_precedent": {
                    "$map": {
                        "input": "$broad_scale_partial_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                }
            },
            "env_local_scale": {
                "original_value": "$env_local_scale",
                "parsed_annotations": {
                    "$map": {
                        "input": "$local_scale_oak_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "oak_text_annotations": {
                    "$map": {
                        "input": "$local_scale_oak_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "ols_text_annotation": {
                    "$map": {
                        "input": "$local_scale_ols_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "partial_matches_vs_precedent": {
                    "$map": {
                        "input": "$local_scale_partial_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                }
            },
            "env_medium": {
                "original_value": "$env_medium",
                "parsed_annotations": {
                    "$map": {
                        "input": "$medium_oak_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "oak_text_annotations": {
                    "$map": {
                        "input": "$medium_oak_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "ols_text_annotation": {
                    "$map": {
                        "input": "$medium_ols_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                },
                "partial_matches_vs_precedent": {
                    "$map": {
                        "input": "$medium_partial_labels",
                        "as": "item",
                        "in": {"curie": "$$item.curie", "label": "$$item.label"}
                    }
                }
            }
        }},
        #
        # # Use $out to store the results directly in the collection
        # {"$out": output_collection_name}
    ]

In [245]:
output_collection_name = 'compact_mined_triads'

In [246]:
# Collection for storing results
output_collection = db[output_collection_name]


In [247]:
db[output_collection_name].drop()


In [248]:
biosample_count = 46_000_000
# roughly 30 minutes per million
# have ! 44 million

In [249]:
# Cursor to stream through first 100 biosample_harmonized_attributes documents directly
cursor = db.biosample_harmonized_attributes.find({}, {"accession": 1, "env_broad_scale": 1, "env_local_scale": 1,
                                                      "env_medium": 1}).limit(biosample_count)

In [None]:
# Wrap cursor with tqdm for progress tracking
for document in tqdm(cursor, total=biosample_count, desc="Processing Biosamples", unit="doc"):
    # Get the parameterized pipeline
    pipeline = get_pipeline(document)

    # Execute aggregation and get result
    result = list(db.biosample_harmonized_attributes.aggregate(pipeline))

    # Insert the single document result
    if result:
        output_collection.insert_one(result[0])  # Insert one document


Processing Biosamples:   0%|          | 0/46000000 [00:00<?, ?doc/s]