In [11]:
from pymongo import MongoClient
import pprint

In [2]:
# Client for the MongoDB server addressed by the localhost:27017 URI.
client = MongoClient(host="mongodb://localhost:27017/")

In [3]:
# Database that holds every collection used in this notebook.
db = client["ncbi_metadata"]

In [35]:
# Handles to the collections referenced by the cells below.
biosample_harmonized = db["biosample_harmonized_attributes"]
unique_triad_values = db["unique_triad_values"]
triad_components_labels = db["triad_components_labels"]
class_label_cache = db["class_label_cache"]

In [6]:
# One-time setup: index env_broad_scale (took about 2 minutes when it was run).
# db.biosample_harmonized_attributes.create_index({ "env_broad_scale": 1 })

'env_broad_scale_1'

In [7]:
# One-time setup: index env_local_scale (took about 2 minutes when it was run).
# db.biosample_harmonized_attributes.create_index({ "env_local_scale": 1 })

'env_local_scale_1'

In [8]:
# One-time setup: index env_medium (took about 2 minutes when it was run).
# db.biosample_harmonized_attributes.create_index({ "env_medium": 1 })

'env_medium_1'

In [22]:
# One-time setup: index accession (took about 2 minutes when it was run).
# db.biosample_harmonized_attributes.create_index({ "accession": 1 })

'accession_1'

In [39]:
# The three environmental triad attributes inspected for every biosample,
# in the order the scan visits them.
env_triad_fields = [
    "env_broad_scale",
    "env_local_scale",
    "env_medium",
]

In [100]:
# Maximum number of biosample documents the scan reads.
# Timing note: roughly 20 seconds per 1_000_000 documents.
biosample_count = 10**6

In [101]:
# Module-level tallies updated by the biosample scan; every one starts at zero.
# Counts of matching component documents, by which annotation key they carry:
global_oak_count = global_ols_count = global_precedent_count = 0
# Counts of places where the scan could not continue:
no_triad_count = no_matching_unique_triad_value = no_matching_triad_components_label = 0

In [102]:
# Scan the first `biosample_count` biosample_harmonized_attributes documents and
# tally, for every environmental triad field, how far its value resolves along
# unique_triad_values -> parsed_annotations -> triad_components_labels.

# Reset every tally in this cell so that re-running it starts from zero instead
# of adding to whatever totals a previous run left in the kernel.
global_oak_count = 0        # matching components carrying "oak_text_annotations"
global_ols_count = 0        # matching components carrying "ols_text_annotation"
global_precedent_count = 0  # matching components carrying "partial_matches_vs_precedent"
no_triad_count = 0                       # empty triad fields (per field, not per biosample)
no_matching_unique_triad_value = 0       # field value has no unique_triad_values document
no_parsed_annotations_count = 0          # triad document found, but it has no parsed annotations
no_matching_triad_components_label = 0   # cleaned label has no triad_components_labels document

# Only the triad fields are read below, so ask the server for nothing else.
triad_projection = dict.fromkeys(env_triad_fields, 1)

for biosample_doc in biosample_harmonized.find({}, triad_projection).limit(biosample_count):
    for field in env_triad_fields:
        field_value = biosample_doc.get(field)
        if not field_value:
            no_triad_count += 1
            continue

        # Resolve the raw field text to its unique_triad_values document.
        triad_doc = unique_triad_values.find_one({"content": field_value})
        if not triad_doc:
            no_matching_unique_triad_value += 1
            continue

        parsed_annotations = triad_doc.get("parsed_annotations", [])
        if not parsed_annotations:
            # Tracked separately from "label not found": this value never
            # produced a label to look up in the first place.
            no_parsed_annotations_count += 1
            continue

        for annotation in parsed_annotations:
            cleaned_label = annotation.get("cleaned_label")
            if not cleaned_label:
                continue

            # NOTE: CURIE labels (class_label_cache, keyed by `curie`) are not
            # consulted here; no counter depends on them.
            matched_any_component = False
            for component in triad_components_labels.find({"component_label": cleaned_label}):
                matched_any_component = True
                # A component may carry any combination of the three keys, so
                # each one is tested independently.
                if "oak_text_annotations" in component:
                    global_oak_count += 1
                if "ols_text_annotation" in component:
                    global_ols_count += 1
                if "partial_matches_vs_precedent" in component:
                    global_precedent_count += 1

            if not matched_any_component:
                no_matching_triad_components_label += 1



In [111]:
# Document cap that the scan was limited to.
biosample_count

1000000

In [112]:
# Empty triad fields seen by the scan; tallied per field, so a single biosample can contribute more than once.
no_triad_count

2986960

In [113]:
# no_triad_count is tallied once per empty triad field, so dividing by the number
# of triad fields expresses it in whole-biosample equivalents. A non-integer
# result means some biosamples are missing only part of their triad.
no_triad_count / len(env_triad_fields)

995653.3333333334

In [114]:
# Triad field values for which unique_triad_values had no document with a matching `content`.
no_matching_unique_triad_value

0

In [115]:
# Misses recorded under this name by the scan loop; check the loop for the exact condition that increments it.
no_matching_triad_components_label

0

In [116]:
# Matching triad_components_labels documents that contain an `oak_text_annotations` key, summed over every annotation processed.
global_oak_count


7594

In [117]:
# Matching triad_components_labels documents that contain an `ols_text_annotation` key, summed over every annotation processed.
global_ols_count


164

In [118]:
# Matching triad_components_labels documents that contain a `partial_matches_vs_precedent` key, summed over every annotation processed.
global_precedent_count

4557