In [1]:
import csv
import pprint
import re
from collections import Counter

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from pymongo import MongoClient
from rapidfuzz import process, fuzz


In [2]:
# For Ontology Access Kit (OAK)
envo_adapter_string = "sqlite:obo:envo"

In [3]:
BIOME = "ENVO:00000428"
ENV_MAT = "ENVO:00010483"
ABP = "ENVO:01000813"

In [4]:
# For the BBOP/NMDC MongoDB containing NCBI metadata

MONGO_USERNAME = None
MONGO_PASSWORD = None
MONGO_HOST = "localhost"
MONGO_PORT = 27017
MONGO_DATABASE = "ncbi_metadata"
BIOPROJECTS_COLLECTION = "bioprojects"
BIOSAMPLES_COLLECTION = "biosamples"
BIOSAMPLES_BIOPROJECTS_COLLECTION = "sra_biosamples_bioprojects"


In [5]:
if MONGO_USERNAME is not None and MONGO_PASSWORD is not None:
    # Replace these with your actual credentials and connection details.
    username = MONGO_USERNAME
    password = MONGO_PASSWORD
    host = MONGO_HOST
    port = MONGO_PORT

    # Build the connection string with authentication.
    connection_string = f"mongodb://{username}:{password}@{host}:{port}"
else:
    # Default connection to unauthenticated MongoDB.
    host = MONGO_HOST
    port = MONGO_PORT
    connection_string = f"mongodb://{host}:{port}"

# Create the client connection.
client = MongoClient(connection_string)

In [6]:
# --------------------------
# Select Database
# --------------------------

db = client[MONGO_DATABASE]  # Dynamically select database


In [7]:
bioprojects_collection = db[BIOPROJECTS_COLLECTION]

In [8]:
# --------------------------
# Search for NCBI BioProject records "about" EMP500
# --------------------------
# This query searches for the term "EMP500" using the $text operator,
# projects the text relevance score into the "score" field,
# and sorts the results by that relevance score.
cursor = bioprojects_collection.find(
    {"$text": {"$search": "EMP500"}},
    {"score": {"$meta": "textScore"}}
).sort([("score", {"$meta": "textScore"})])


In [9]:
emp500_candidate_bioprojects = []

In [10]:
# --------------------------
# Print the Results for Review
# --------------------------
for doc in cursor:
    pprint.pprint(doc)
    emp500_candidate_bioprojects.append(doc)

{'ProjectDescr': {'Description': 'The Earth Microbiome Project Multi-omics '
                                 'component (EMP500) involves amplicon and '
                                 'shotgun metagenomic sequencing and '
                                 'metabolomic profiling of over five hundred '
                                 'microbial communities from diverse '
                                 'environments on our planet. We developed new '
                                 'protocols for shotgun metagenomic sequencing '
                                 'and assembly, with the goal of applying this '
                                 'workflow to a range of environmental '
                                 'samples, combined with metabolomic '
                                 'profiling. 16S, 18S, and ITS amplicon '
                                 'sequencing was done in addition. We acquired '
                                 'a set of >500 fresh environmental samples '
     

Use the BioProject with the ProjectDescr.Description.Title 'Earth Microbiome Project Multi-omics (EMP500)'

I.e. the zeroth element in the list `emp500_candidate_bioprojects`.

What's the accession?

In [11]:
emp500_bioproject_accession = emp500_candidate_bioprojects[0]['ProjectID']['ArchiveID']['accession']

In [12]:
print(emp500_bioproject_accession)

PRJEB42019


In [13]:
# Select the collection 'sra_biosamples_bioprojects'
sra_biosamples_bioprojects_collection = db[BIOSAMPLES_BIOPROJECTS_COLLECTION]

# Define the query to find documents with bioproject_accession equal to "PRJEB42019"
query = {"bioproject_accession": "PRJEB42019"}

# Execute the query
cursor = sra_biosamples_bioprojects_collection.find(query)

emp500_biosample_accessions = set()

# Iterate over the cursor and print each document
for doc in cursor:
    emp500_biosample_accessions.add(doc['biosample_accession'])


In [14]:
print(len(emp500_biosample_accessions))

1024


In [15]:
biosamples_collection = db.biosamples  # Replace 'xxx' with your actual collection name

# Assuming emp500_biosample_accessions is a Python set containing 1024 values
# Use the $in operator to match documents where 'accession' is in your set.
query = {"accession": {"$in": list(emp500_biosample_accessions)}}

# Execute the query
cursor = biosamples_collection.find(query)

emp500_biosample_docs = []

# Iterate over the cursor and print each matching document
for doc in cursor:
    emp500_biosample_docs.append(doc)

In [16]:
print(len(emp500_biosample_docs))

1024


In [17]:
# Define the target harmonized names
target_harmonized_names = {"env_broad_scale", "env_local_scale", "env_medium"}

# Extract relevant data
emp500_env_triad_rows = []  # Store the env triads themselves
env_content_counter = Counter()  # Store counts of all strings used as env triad values

for doc in emp500_biosample_docs:
    accession = doc.get("accession", "")
    package_content = doc.get("Package", {}).get("content", "")

    # Extract harmonized values
    env_values = {key: "" for key in target_harmonized_names}
    attributes = doc.get("Attributes", {}).get("Attribute", [])

    for attr in attributes:
        harmonized_name = attr.get("harmonized_name")
        content_value = attr.get("content", "")
        if harmonized_name in target_harmonized_names:
            env_values[harmonized_name] = content_value
            env_content_counter[content_value] += 1  # Count occurrences

    # Add extracted values to the list
    emp500_env_triad_rows.append({
        "accession": accession,
        "package": package_content,
        "env_broad_scale": env_values["env_broad_scale"],
        "env_local_scale": env_values["env_local_scale"],
        "env_medium": env_values["env_medium"]
    })


In [18]:
pprint.pprint(emp500_env_triad_rows[0:3])

[{'accession': 'SAMEA7723388',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_medium': 'anaerobic sludge',
  'package': 'Generic.1.0'},
 {'accession': 'SAMEA7723389',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_medium': 'anaerobic sludge',
  'package': 'Generic.1.0'},
 {'accession': 'SAMEA7723390',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_medium': 'anaerobic sludge',
  'package': 'Generic.1.0'}]


The env triads consist of labels only, no CURIes

In [19]:
pprint.pprint(env_content_counter)

Counter({'urban biome': 285,
         'research facility': 191,
         'feces': 184,
         'sterile water': 178,
         'organic material': 166,
         'host-associated habitat': 133,
         'soil': 132,
         'marine benthic feature': 125,
         'temperate mixed forest biome': 112,
         'marine benthic biome': 90,
         'animal-associated habitat': 84,
         'marine coral reef biome': 51,
         'sediment': 45,
         'subpolar coniferous forest biome': 45,
         'marine biome': 42,
         'marine salt marsh biome': 40,
         'desert biome': 40,
         'marine sediment': 38,
         'marine reef biome': 30,
         'tropical broadleaf forest biome': 28,
         'insecta-associated habitat': 28,
         'lake sediment': 27,
         'montane desert': 26,
         'alpine soil': 26,
         'kelp forest': 25,
         'saline marsh': 24,
         'temperate coniferous forest biome': 22,
         'dense settlement biome': 22,
         'coral 

In [20]:
print(len(env_content_counter))

112


In [21]:
envo_adapter = get_adapter(envo_adapter_string)

In [22]:
biome_curies = list(envo_adapter.descendants(BIOME, predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants(ENV_MAT, predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants(ABP, predicates=[IS_A]))

In [23]:
non_biome_non_env_mat_abp_curies = set(abp_curies) - set(biome_curies) - set(env_mat_curies)

In [24]:
# use the OAK annotator to match the submitted env triad values to ENVO terms
# Don't panic about red error messages

# List to store all annotations
env_triad_terms_annotations_list = []

for content_value, count in env_content_counter.items():
    try:
        annotations = envo_adapter.annotate_text(content_value)
        for annotation in annotations:
            # Fetch the label for the object_id
            object_label = envo_adapter.label(annotation.object_id) if annotation.object_id else "Unknown"

            annotation_dict = {
                "content_value": content_value,
                "count": count,  # Frequency of this content value in the dataset
                "predicate_id": annotation.predicate_id,
                "object_id": annotation.object_id,
                "object_label": object_label,  # Looked up label
                "subject_start": annotation.subject_start,
                "subject_end": annotation.subject_end,
                "match_string": annotation.match_string,
                "matches_whole_text": annotation.matches_whole_text,
            }
            env_triad_terms_annotations_list.append(annotation_dict)

    except Exception as e:
        print(f"Error processing '{content_value}': {e}")


ERROR:root:Skipping statements(subject=ENVO:01001644,predicate=oio:hasDbXref,object=None,value=Carbonate which is formed as the result of some biological process.,datatype=None,language=None,); ValueError: Carbonate which is formed as the result of some biological process. is not a valid URI or CURIE


In [25]:
for annotation in env_triad_terms_annotations_list:
    annotation["match_string_len"] = len(annotation["match_string"]) if annotation["match_string"] else 0

In [26]:
# Pretty-print some sample results
for annotation in env_triad_terms_annotations_list[:10]:  # Show first 10 annotations
    pprint.pprint(annotation)

{'content_value': 'urban biome',
 'count': 285,
 'match_string': 'biome',
 'match_string_len': 5,
 'matches_whole_text': False,
 'object_id': 'ENVO:00000428',
 'object_label': 'biome',
 'predicate_id': 'rdfs:label',
 'subject_end': 11,
 'subject_start': 7}
{'content_value': 'urban biome',
 'count': 285,
 'match_string': 'urban biome',
 'match_string_len': 11,
 'matches_whole_text': True,
 'object_id': 'ENVO:01000249',
 'object_label': 'urban biome',
 'predicate_id': 'rdfs:label',
 'subject_end': 11,
 'subject_start': 1}
{'content_value': 'anaerobic bioreactor',
 'count': 9,
 'match_string': 'bioreactor',
 'match_string_len': 10,
 'matches_whole_text': False,
 'object_id': 'ENVO:00002123',
 'object_label': 'bioreactor',
 'predicate_id': 'rdfs:label',
 'subject_end': 20,
 'subject_start': 11}
{'content_value': 'anaerobic bioreactor',
 'count': 9,
 'match_string': 'anaerobic bioreactor',
 'match_string_len': 20,
 'matches_whole_text': True,
 'object_id': 'ENVO:00002124',
 'object_label': 

In [27]:
# Configurable match length cutoff
MATCH_LENGTH_CUTOFF = 3  # Exclude matches with length < 3 from needs_review_list

# Lists for categorization
perfect_match_list = []
needs_review_list = []
unmatched_content_values = set()  # Store content_values that are neither in perfect_match_list nor needs_review_list
non_perfect_match_content_values = set()  # Track all content_values that didn’t make it into perfect matches

# Step 1: Identify content_values that have at least one perfect match with predicate_id='rdfs:label'
perfect_match_content_values = set()
all_annotated_content_values = set()  # Track all content_values that were annotated

for annotation in env_triad_terms_annotations_list:
    content_value = annotation["content_value"]
    all_annotated_content_values.add(content_value)

    if annotation["matches_whole_text"] and annotation["predicate_id"] == "rdfs:label":
        perfect_match_list.append(annotation)
        perfect_match_content_values.add(content_value)

# Step 2: Identify content_values that have no perfect match and meet the length cutoff
needs_review_content_values = set()

for annotation in env_triad_terms_annotations_list:
    content_value = annotation["content_value"]
    annotation["match_string_len"] = len(annotation["match_string"]) if annotation["match_string"] else 0

    if (
            content_value not in perfect_match_content_values and
            annotation["match_string_len"] >= MATCH_LENGTH_CUTOFF
    ):
        needs_review_list.append(annotation)
        needs_review_content_values.add(content_value)

# Step 3: Identify content_values that are in env_content_counter but not in perfect_match_list or needs_review_list
for content_value in env_content_counter.keys():
    if content_value not in perfect_match_content_values and content_value not in needs_review_content_values:
        unmatched_content_values.add(content_value)

# Step 4: Save Perfect Matches to TSV
perfect_match_tsv = "emp500_perfect_matches.tsv"

with open(perfect_match_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=[
        "content_value", "count", "predicate_id", "object_id", "object_label",
        "subject_start", "subject_end", "match_string", "match_string_len", "matches_whole_text"
    ])
    writer.writeheader()
    writer.writerows(perfect_match_list)

print(f"Perfect Matches TSV saved to {perfect_match_tsv}")

# Step 5: Save Needs Review to TSV (only matches with length ≥ MATCH_LENGTH_CUTOFF)
needs_review_tsv = "emp500_needs_review.tsv"

with open(needs_review_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=[
        "content_value", "count", "predicate_id", "object_id", "object_label",
        "subject_start", "subject_end", "match_string", "match_string_len", "matches_whole_text"
    ])
    writer.writeheader()
    writer.writerows(needs_review_list)

print(f"Needs Review TSV saved to {needs_review_tsv} (excluding matches with length < {MATCH_LENGTH_CUTOFF})")

# Step 6: Save Unmatched Content Values to TSV
unmatched_tsv = "emp500_unmatched_content_values.tsv"

with open(unmatched_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter="\t")
    writer.writerow(["content_value", "count"])
    for content_value in unmatched_content_values:
        writer.writerow([content_value, env_content_counter[content_value]])

print(f"Unmatched content values TSV saved to {unmatched_tsv} (total: {len(unmatched_content_values)})")

# Step 7: Save Non-Perfect Matches (combining Needs Review + Unmatched)
non_perfect_match_tsv = "emp500_non_perfect_matches.tsv"

with open(non_perfect_match_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter="\t")
    writer.writerow(["content_value", "count"])

    # Include all needs_review content values
    for content_value in needs_review_content_values:
        writer.writerow([content_value, env_content_counter[content_value]])

    # Include all unmatched content values
    for content_value in unmatched_content_values:
        writer.writerow([content_value, env_content_counter[content_value]])

print(
    f"Non-Perfect Matches TSV saved to {non_perfect_match_tsv} (total: {len(needs_review_content_values) + len(unmatched_content_values)})")


Perfect Matches TSV saved to emp500_perfect_matches.tsv
Needs Review TSV saved to emp500_needs_review.tsv (excluding matches with length < 3)
Unmatched content values TSV saved to emp500_unmatched_content_values.tsv (total: 2)
Non-Perfect Matches TSV saved to emp500_non_perfect_matches.tsv (total: 24)


In [28]:
obsoletes_curies_labelled = list()

obsoletes_curies_envo = set(envo_adapter.obsoletes())
for curie in obsoletes_curies_envo:
    temp_dict = dict()
    temp_dict["curie"] = curie
    temp_dict["label"] = envo_adapter.label(curie)
    obsoletes_curies_labelled.append(temp_dict)

In [29]:
# Helper function: Normalize text (lowercase + remove extra spaces)
def normalize_text(text):
    return re.sub(r'\s+', ' ', text).strip().lower()  # Replace multiple spaces & trim


# Step 1: Create a lookup set of normalized obsolete labels
obsolete_label_set = {normalize_text(entry["label"]) for entry in obsoletes_curies_labelled}

# Step 2: Prepare result storage
exact_obsolete_matches = []

# Step 3: Check if "obsolete " + content_value exists in the obsolete labels
for content_value in needs_review_content_values | unmatched_content_values:  # Union of both sets
    obsolete_label_candidate = normalize_text(f"obsolete {content_value}")
    if obsolete_label_candidate in obsolete_label_set:
        # Find the corresponding CURIE using normalized comparison
        matching_entry = next(entry for entry in obsoletes_curies_labelled
                              if normalize_text(entry["label"]) == obsolete_label_candidate)
        exact_obsolete_matches.append({
            "content_value": content_value,  # ORIGINAL content_value
            "obsolete_label": matching_entry["label"],  # ORIGINAL obsolete label
            "obsolete_curie": matching_entry["curie"]
        })



In [30]:
pprint.pprint(exact_obsolete_matches)

[{'content_value': 'marine benthic feature',
  'obsolete_curie': 'ENVO:01000105',
  'obsolete_label': 'obsolete marine benthic feature'},
 {'content_value': 'insecta-associated habitat',
  'obsolete_curie': 'ENVO:00009004',
  'obsolete_label': 'obsolete insecta-associated habitat'},
 {'content_value': 'montane grasslands and shrubland biome',
  'obsolete_curie': 'ENVO:00000882',
  'obsolete_label': 'obsolete Montane grasslands and shrubland biome'},
 {'content_value': 'ocean water',
  'obsolete_curie': 'ENVO:00002151',
  'obsolete_label': 'obsolete ocean water'},
 {'content_value': 'animal-associated habitat',
  'obsolete_curie': 'ENVO:00006776',
  'obsolete_label': 'obsolete animal-associated habitat'}]


### the code below doesn't add much

In [31]:
# ✅ Configurable settings
include_exact_matches = True  # Set to False to exclude exact matches from fuzzy search
fuzzy_score_threshold = 85  # Set the minimum fuzzy match score

# Step 1: Build a set of exact match content values
exact_matched_content_values = {entry["content_value"] for entry in exact_obsolete_matches}

# Step 2: Select content values to run fuzzy matching on
if include_exact_matches:
    fuzzy_match_content_values = needs_review_content_values | unmatched_content_values | exact_matched_content_values
else:
    fuzzy_match_content_values = (needs_review_content_values | unmatched_content_values) - exact_matched_content_values

# Step 3: Create a mapping of stripped obsolete labels to original labels & CURIEs
stripped_obsolete_label_map = {
    entry["label"].replace("obsolete ", "").strip(): (entry["label"], entry["curie"])
    for entry in obsoletes_curies_labelled
}

# Step 4: Prepare result storage
fuzzy_obsolete_matches = []

# Step 5: Use fuzzy matching for selected content values
for content_value in fuzzy_match_content_values:
    match_result = process.extractOne(
        content_value,
        list(stripped_obsolete_label_map.keys()),  # Compare against stripped labels
        scorer=fuzz.token_sort_ratio
    )

    if match_result:
        best_match_label, score = match_result[:2]  # Extract match and score

        if score >= fuzzy_score_threshold:  # Apply score threshold
            original_label, curie = stripped_obsolete_label_map[best_match_label]  # Retrieve full label & CURIE

            fuzzy_obsolete_matches.append({
                "content_value": content_value,
                "matched_obsolete_label": original_label,  # Store full original label
                "score": score,
                "obsolete_curie": curie
            })

# Debugging: Print a few sample matches to confirm it's working
print(f"DEBUG: Found {len(fuzzy_obsolete_matches)} fuzzy matches (Score ≥ {fuzzy_score_threshold}).")
print(f"DEBUG: Sample fuzzy matches:")
for match in fuzzy_obsolete_matches[:5]:  # Print first 5 matches
    print(match)

# Step 6: Save fuzzy matches to a TSV
fuzzy_matches_tsv = "fuzzy_obsolete_matches.tsv"
with open(fuzzy_matches_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t",
                            fieldnames=["content_value", "matched_obsolete_label", "score", "obsolete_curie"])
    writer.writeheader()
    writer.writerows(fuzzy_obsolete_matches)

print(f"✅ Fuzzy obsolete matches saved to {fuzzy_matches_tsv} (Total: {len(fuzzy_obsolete_matches)})")


DEBUG: Found 5 fuzzy matches (Score ≥ 85).
DEBUG: Sample fuzzy matches:
{'content_value': 'marine benthic feature', 'matched_obsolete_label': 'obsolete marine benthic feature', 'score': 100.0, 'obsolete_curie': 'ENVO:01000105'}
{'content_value': 'insecta-associated habitat', 'matched_obsolete_label': 'obsolete insecta-associated habitat', 'score': 100.0, 'obsolete_curie': 'ENVO:00009004'}
{'content_value': 'host-associated habitat', 'matched_obsolete_label': 'obsolete insecta-associated habitat', 'score': 85.71428571428572, 'obsolete_curie': 'ENVO:00009004'}
{'content_value': 'ocean water', 'matched_obsolete_label': 'obsolete ocean water', 'score': 100.0, 'obsolete_curie': 'ENVO:00002151'}
{'content_value': 'animal-associated habitat', 'matched_obsolete_label': 'obsolete animal-associated habitat', 'score': 100.0, 'obsolete_curie': 'ENVO:00006776'}
✅ Fuzzy obsolete matches saved to fuzzy_obsolete_matches.tsv (Total: 5)


In [32]:

# Step 1: Extract unique obsolete CURIEs
obsolete_curies = {entry["obsolete_curie"] for entry in exact_obsolete_matches}

# obsolete_curies = {entry["curie"] for entry in obsoletes_curies_labelled}

# Step 2: Fetch entity metadata for each CURIE
entity_metadata_list = []

for curie in obsolete_curies:
    entity_metadata = envo_adapter.entity_metadata_map(curie)  # Fetch metadata
    if entity_metadata:
        entity_metadata_list.append(entity_metadata)



In [33]:
# Step 1: Initialize a counter to track key occurrences
key_counter = Counter()

# Step 2: Iterate through metadata dictionaries and count keys
for metadata in entity_metadata_list:
    if isinstance(metadata, dict):  # Ensure it's a dictionary
        key_counter.update(metadata.keys())

# Step 3: Print the results
print("✅ Key Appearance Counts in Metadata:")
for key, count in key_counter.most_common():
    print(f"{key}: {count}")




✅ Key Appearance Counts in Metadata:
id: 5
owl:deprecated: 5
rdfs:label: 5
sh:prefix: 5
schema:url: 5
rdfs:isDefinedBy: 5
oio:hasOBONamespace: 4
oio:id: 4
IAO:0000115: 3
oio:consider: 2
IAO:0100001: 1
oio:hasDbXref: 1
oio:hasExactSynonym: 1
oio:inSubset: 1
oio:created_by: 1
oio:creation_date: 1
rdfs:comment: 1


In [34]:
pprint.pprint(entity_metadata_list)

[{'IAO:0100001': ['ENVO:00002149'],
  'id': ['ENVO:00002151'],
  'oio:hasDbXref': ['https://en.wikipedia.org/wiki/Ocean_water'],
  'oio:hasOBONamespace': ['ENVO'],
  'oio:id': ['ENVO:00002151'],
  'owl:deprecated': [True],
  'rdfs:isDefinedBy': ['http://purl.obolibrary.org/obo/envo.owl'],
  'rdfs:label': ['obsolete ocean water'],
  'schema:url': ['http://purl.obolibrary.org/obo/ENVO_00002151'],
  'sh:prefix': ['ENVO']},
 {'IAO:0000115': ['A habitat that is in or on a living animal. Here "animal" '
                  'denotes an individual of a species that is a sub-taxon of '
                  'NCBITaxon:33208.'],
  'id': ['ENVO:00006776'],
  'oio:hasExactSynonym': ['metazoan-associated habitat'],
  'oio:hasOBONamespace': ['ENVO'],
  'oio:id': ['ENVO:00006776'],
  'oio:inSubset': ['obo:envo#EnvO-Lite-GSC'],
  'owl:deprecated': [True],
  'rdfs:isDefinedBy': ['http://purl.obolibrary.org/obo/envo.owl'],
  'rdfs:label': ['obsolete animal-associated habitat'],
  'schema:url': ['http://purl.o