In [1]:
import csv
import json
import pprint
import re
from collections import Counter
from urllib.parse import quote_plus

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from pymongo import MongoClient


In [2]:
# For Ontology Access Kit (OAK)
# Adapter selector string; it is handed to get_adapter() further down to open EnvO.
envo_adapter_string = "sqlite:obo:envo"

In [3]:
# EnvO root classes. Their IS_A descendants are fetched later and used to check
# which env triad slot a matched term is appropriate for.
BIOME = "ENVO:00000428"  # labelled 'biome' in the annotator output shown later in this notebook
ENV_MAT = "ENVO:00010483"  # presumably 'environmental material' -- TODO confirm label
ABP = "ENVO:01000813"  # presumably 'astronomical body part' -- TODO confirm label

In [4]:
# For the BBOP/NMDC MongoDB containing NCBI metadata

# Leave both credentials as None to connect without authentication; the
# connection cell only authenticates when both are set.
# NOTE(review): do not commit real credentials here.
MONGO_USERNAME = None
MONGO_PASSWORD = None
MONGO_HOST = "localhost"
MONGO_PORT = 27017
# Database and collection names used by the queries below.
MONGO_DATABASE = "ncbi_metadata"
BIOPROJECTS_COLLECTION = "bioprojects"
BIOSAMPLES_COLLECTION = "biosamples"
BIOSAMPLES_BIOPROJECTS_COLLECTION = "sra_biosamples_bioprojects"


In [5]:
# Build the MongoDB connection string and open the client.
# Authentication is used only when BOTH credentials are configured; otherwise
# the notebook connects to an unauthenticated server.
host = MONGO_HOST
port = MONGO_PORT

if MONGO_USERNAME is not None and MONGO_PASSWORD is not None:
    username = MONGO_USERNAME
    password = MONGO_PASSWORD

    # Percent-escape the credentials before embedding them in the URI:
    # characters such as "@", ":" or "/" in a username or password would
    # otherwise be parsed as URI delimiters and break the connection.
    connection_string = f"mongodb://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}"
else:
    connection_string = f"mongodb://{host}:{port}"

# Create the client connection.
client = MongoClient(connection_string)

In [6]:
# --------------------------
# Select Database
# --------------------------

# Handle on the NCBI metadata database named in the configuration cell.
db = client[MONGO_DATABASE]  # Dynamically select database


In [7]:
# Handle on the BioProject collection that is searched for EMP500 below.
bioprojects_collection = db[BIOPROJECTS_COLLECTION]

In [8]:
# --------------------------
# Find NCBI BioProject records that mention EMP500
# --------------------------
# Full-text search for "EMP500". Each hit carries its relevance in a "score"
# field, and hits come back ordered by that relevance.
# NOTE(review): $text queries need a text index on the collection; this
# notebook does not create one, so it is assumed to exist already.
emp500_text_filter = {"$text": {"$search": "EMP500"}}
text_score_meta = {"$meta": "textScore"}

cursor = bioprojects_collection.find(
    emp500_text_filter,
    {"score": text_score_meta},
).sort([("score", text_score_meta)])


In [9]:
# Accumulates the BioProject documents returned by the EMP500 text search.
emp500_candidate_bioprojects = []

In [10]:
# --------------------------
# Print the Results for Review
# --------------------------
# Draining the cursor both displays each hit and keeps it for the accession
# lookup in the next cells. The cursor is consumed by this loop.
for doc in cursor:
    pprint.pprint(doc)
    emp500_candidate_bioprojects.append(doc)

{'ProjectDescr': {'Description': 'The Earth Microbiome Project Multi-omics '
                                 'component (EMP500) involves amplicon and '
                                 'shotgun metagenomic sequencing and '
                                 'metabolomic profiling of over five hundred '
                                 'microbial communities from diverse '
                                 'environments on our planet. We developed new '
                                 'protocols for shotgun metagenomic sequencing '
                                 'and assembly, with the goal of applying this '
                                 'workflow to a range of environmental '
                                 'samples, combined with metabolomic '
                                 'profiling. 16S, 18S, and ITS amplicon '
                                 'sequencing was done in addition. We acquired '
                                 'a set of >500 fresh environmental samples '
     

Use the BioProject with the ProjectDescr.Description.Title 'Earth Microbiome Project Multi-omics (EMP500)'

I.e. the zeroth element in the list `emp500_candidate_bioprojects`.

What's the accession?

In [11]:
# Take the accession from the first (index 0) search hit, as chosen in the note above.
emp500_bioproject_accession = emp500_candidate_bioprojects[0]['ProjectID']['ArchiveID']['accession']

In [12]:
# Show the selected BioProject accession.
print(emp500_bioproject_accession)

PRJEB42019


In [13]:
# Collect the BioSample accessions linked to the EMP500 BioProject.
sra_biosamples_bioprojects_collection = db[BIOSAMPLES_BIOPROJECTS_COLLECTION]

# Filter on the accession selected earlier in the notebook instead of a
# hard-coded copy, so the two can never drift apart.
query = {"bioproject_accession": emp500_bioproject_accession}

# Execute the query
cursor = sra_biosamples_bioprojects_collection.find(query)

# A set: a BioSample that is linked more than once is kept only once.
emp500_biosample_accessions = {doc['biosample_accession'] for doc in cursor}


In [14]:
# Number of distinct BioSample accessions linked to the BioProject.
print(len(emp500_biosample_accessions))

1024


In [15]:
# Fetch the full BioSample documents for the EMP500 accessions.
# The collection name comes from the configuration cell, like every other
# collection handle in this notebook.
biosamples_collection = db[BIOSAMPLES_COLLECTION]

# $in needs a list (a Python set is not BSON-encodable), hence the conversion.
query = {"accession": {"$in": list(emp500_biosample_accessions)}}

# Execute the query
cursor = biosamples_collection.find(query)

# Materialise every matching document; later cells iterate this list repeatedly.
emp500_biosample_docs = list(cursor)

In [16]:
# Number of BioSample documents retrieved; compare with the accession count above.
print(len(emp500_biosample_docs))

1024


In [17]:
# The three harmonized attribute names that make up the env triad
target_harmonized_names = {"env_broad_scale", "env_local_scale", "env_medium"}

# One row per biosample holding its triad values, plus a tally of every
# string that was submitted in any triad slot.
emp500_env_triad_rows = []
env_content_counter = Counter()

for biosample in emp500_biosample_docs:
    # Every slot starts blank; a slot stays blank when the attribute is absent.
    triad = dict.fromkeys(target_harmonized_names, "")

    for attribute in biosample.get("Attributes", {}).get("Attribute", []):
        slot = attribute.get("harmonized_name")
        if slot not in target_harmonized_names:
            continue
        submitted_value = attribute.get("content", "")
        triad[slot] = submitted_value
        env_content_counter[submitted_value] += 1  # tally this usage

    emp500_env_triad_rows.append({
        "accession": biosample.get("accession", ""),
        "package": biosample.get("Package", {}).get("content", ""),
        "env_broad_scale": triad["env_broad_scale"],
        "env_local_scale": triad["env_local_scale"],
        "env_medium": triad["env_medium"]
    })


In [18]:
# Peek at the first three extracted rows.
pprint.pprint(emp500_env_triad_rows[0:3])

[{'accession': 'SAMEA7723388',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_medium': 'anaerobic sludge',
  'package': 'Generic.1.0'},
 {'accession': 'SAMEA7723389',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_medium': 'anaerobic sludge',
  'package': 'Generic.1.0'},
 {'accession': 'SAMEA7723390',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_medium': 'anaerobic sludge',
  'package': 'Generic.1.0'}]


The env triads consist of labels only, with no CURIEs.

In [19]:
# Usage count of every distinct string submitted in a triad slot.
pprint.pprint(env_content_counter)

Counter({'urban biome': 285,
         'research facility': 191,
         'feces': 184,
         'sterile water': 178,
         'organic material': 166,
         'host-associated habitat': 133,
         'soil': 132,
         'marine benthic feature': 125,
         'temperate mixed forest biome': 112,
         'marine benthic biome': 90,
         'animal-associated habitat': 84,
         'marine coral reef biome': 51,
         'sediment': 45,
         'subpolar coniferous forest biome': 45,
         'marine biome': 42,
         'marine salt marsh biome': 40,
         'desert biome': 40,
         'marine sediment': 38,
         'marine reef biome': 30,
         'tropical broadleaf forest biome': 28,
         'insecta-associated habitat': 28,
         'lake sediment': 27,
         'montane desert': 26,
         'alpine soil': 26,
         'kelp forest': 25,
         'saline marsh': 24,
         'temperate coniferous forest biome': 22,
         'dense settlement biome': 22,
         'coral 

In [20]:
# Number of distinct submitted triad values.
print(len(env_content_counter))

112


In [21]:
# Open EnvO through OAK using the selector string from the configuration cell.
envo_adapter = get_adapter(envo_adapter_string)

In [22]:
# CURIEs reachable from each root class via IS_A only (other predicates are excluded).
biome_curies = list(envo_adapter.descendants(BIOME, predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants(ENV_MAT, predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants(ABP, predicates=[IS_A]))

In [23]:
# ABP descendants that are neither biomes nor environmental materials;
# used below as the accepted set for env_local_scale.
non_biome_non_env_mat_abp_curies = set(abp_curies) - set(biome_curies) - set(env_mat_curies)

In [24]:
# Match each distinct submitted env triad value against EnvO with the OAK annotator.
# Red log lines (for example "ERROR:root:Skipping statements ...") may appear
# while this runs; don't panic about them.

# One flat record per (submitted value, annotation) pair
env_triad_terms_annotations_list = []

for content_value, count in env_content_counter.items():
    try:
        # The try covers the iteration itself, so a failure part-way through
        # keeps the records already collected for this value.
        for annotation in envo_adapter.annotate_text(content_value):
            matched_id = annotation.object_id
            env_triad_terms_annotations_list.append({
                "content_value": content_value,
                "count": count,  # how often this value occurs in the dataset
                "predicate_id": annotation.predicate_id,
                "object_id": matched_id,
                # Label looked up from the ontology; "Unknown" when there is no id
                "object_label": envo_adapter.label(matched_id) if matched_id else "Unknown",
                "subject_start": annotation.subject_start,
                "subject_end": annotation.subject_end,
                "match_string": annotation.match_string,
                "matches_whole_text": annotation.matches_whole_text,
            })

    except Exception as e:
        print(f"Error processing '{content_value}': {e}")


ERROR:root:Skipping statements(subject=ENVO:01001644,predicate=oio:hasDbXref,object=None,value=Carbonate which is formed as the result of some biological process.,datatype=None,language=None,); ValueError: Carbonate which is formed as the result of some biological process. is not a valid URI or CURIE


In [25]:
# Record how long each matched substring is (0 when there is no match string).
for annotation in env_triad_terms_annotations_list:
    annotation["match_string_len"] = len(annotation["match_string"] or "")

In [26]:
# Pretty-print some sample results
# Only a slice is shown to keep the output readable.
for annotation in env_triad_terms_annotations_list[:10]:  # Show first 10 annotations
    pprint.pprint(annotation)

{'content_value': 'urban biome',
 'count': 285,
 'match_string': 'biome',
 'match_string_len': 5,
 'matches_whole_text': False,
 'object_id': 'ENVO:00000428',
 'object_label': 'biome',
 'predicate_id': 'rdfs:label',
 'subject_end': 11,
 'subject_start': 7}
{'content_value': 'urban biome',
 'count': 285,
 'match_string': 'urban biome',
 'match_string_len': 11,
 'matches_whole_text': True,
 'object_id': 'ENVO:01000249',
 'object_label': 'urban biome',
 'predicate_id': 'rdfs:label',
 'subject_end': 11,
 'subject_start': 1}
{'content_value': 'anaerobic bioreactor',
 'count': 9,
 'match_string': 'bioreactor',
 'match_string_len': 10,
 'matches_whole_text': False,
 'object_id': 'ENVO:00002123',
 'object_label': 'bioreactor',
 'predicate_id': 'rdfs:label',
 'subject_end': 20,
 'subject_start': 11}
{'content_value': 'anaerobic bioreactor',
 'count': 9,
 'match_string': 'anaerobic bioreactor',
 'match_string_len': 20,
 'matches_whole_text': True,
 'object_id': 'ENVO:00002124',
 'object_label': 

In [27]:
# Shortest matched substring that is still worth a human look;
# anything shorter is left out of needs_review_list.
MATCH_LENGTH_CUTOFF = 3

# Annotation records, by category
perfect_match_list = []
needs_review_list = []

# Submitted values, by category
unmatched_content_values = set()  # neither perfectly matched nor queued for review
non_perfect_match_content_values = set()  # declared here but not filled in this cell
perfect_match_content_values = set()  # have a whole-text rdfs:label match
all_annotated_content_values = set()  # received at least one annotation
needs_review_content_values = set()  # no perfect match, but a long-enough partial match

# Pass 1: a perfect match is a whole-text match on the term's rdfs:label.
for annotation in env_triad_terms_annotations_list:
    submitted = annotation["content_value"]
    all_annotated_content_values.add(submitted)

    is_label_match = annotation["predicate_id"] == "rdfs:label"
    if annotation["matches_whole_text"] and is_label_match:
        perfect_match_list.append(annotation)
        perfect_match_content_values.add(submitted)

# Pass 2: annotations of values that never matched perfectly go to review,
# provided the matched substring reaches the cutoff.
for annotation in env_triad_terms_annotations_list:
    submitted = annotation["content_value"]
    matched_text = annotation["match_string"]
    annotation["match_string_len"] = len(matched_text) if matched_text else 0

    if submitted in perfect_match_content_values:
        continue
    if annotation["match_string_len"] < MATCH_LENGTH_CUTOFF:
        continue
    needs_review_list.append(annotation)
    needs_review_content_values.add(submitted)

# Pass 3: whatever is left over has no usable annotation at all.
unmatched_content_values.update(
    submitted
    for submitted in env_content_counter.keys()
    if submitted not in perfect_match_content_values
    and submitted not in needs_review_content_values
)


In [28]:
# Every obsolete EnvO class, paired with the label the ontology reports for it.
obsoletes_curies_envo = set(envo_adapter.obsoletes())

obsoletes_curies_labelled = [
    {"curie": obsolete_curie, "label": envo_adapter.label(obsolete_curie)}
    for obsolete_curie in obsoletes_curies_envo
]

In [29]:
def normalize_text(text):
    """Return ``text`` lowercased, with whitespace runs collapsed and ends trimmed.

    Every run of whitespace becomes a single space, leading and trailing
    whitespace is removed, and the result is lowercased, so labels can be
    compared regardless of case or spacing.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed.lower()


# Index the obsolete EnvO entries by normalized label.
# - Entries without a label are skipped: the label lookup can yield None,
#   and normalizing None would raise.
# - setdefault keeps the FIRST entry per normalized label, which is the entry
#   a front-to-back scan of the list would have found.
obsolete_entry_by_normalized_label = {}
for entry in obsoletes_curies_labelled:
    if not entry["label"]:
        continue
    obsolete_entry_by_normalized_label.setdefault(normalize_text(entry["label"]), entry)

# Set of normalized obsolete labels
obsolete_label_set = set(obsolete_entry_by_normalized_label)

# Submitted values whose text, prefixed with "obsolete ", is exactly the label
# of an obsolete EnvO class
exact_obsolete_matches = []

for content_value in needs_review_content_values | unmatched_content_values:  # Union of both sets
    obsolete_label_candidate = normalize_text(f"obsolete {content_value}")
    matching_entry = obsolete_entry_by_normalized_label.get(obsolete_label_candidate)
    if matching_entry is None:
        continue
    exact_obsolete_matches.append({
        "content_value": content_value,  # ORIGINAL content_value
        "obsolete_label": matching_entry["label"],  # ORIGINAL obsolete label
        "obsolete_curie": matching_entry["curie"]
    })


In [30]:
# Submitted values that correspond to obsolete EnvO classes.
pprint.pprint(exact_obsolete_matches)

[{'content_value': 'insecta-associated habitat',
  'obsolete_curie': 'ENVO:00009004',
  'obsolete_label': 'obsolete insecta-associated habitat'},
 {'content_value': 'montane grasslands and shrubland biome',
  'obsolete_curie': 'ENVO:00000882',
  'obsolete_label': 'obsolete Montane grasslands and shrubland biome'},
 {'content_value': 'animal-associated habitat',
  'obsolete_curie': 'ENVO:00006776',
  'obsolete_label': 'obsolete animal-associated habitat'},
 {'content_value': 'ocean water',
  'obsolete_curie': 'ENVO:00002151',
  'obsolete_label': 'obsolete ocean water'},
 {'content_value': 'marine benthic feature',
  'obsolete_curie': 'ENVO:01000105',
  'obsolete_label': 'obsolete marine benthic feature'}]


In [31]:
# Only the obsolete classes that submitters actually used (indirectly).
# To cover every obsolete class in EnvO instead, build this set from
# obsoletes_curies_labelled (key "curie").
obsolete_curies = {match["obsolete_curie"] for match in exact_obsolete_matches}

# Metadata map of each obsolete class; empty results are dropped.
obsoletes_metadata_list = [
    entity_metadata
    for entity_metadata in map(envo_adapter.entity_metadata_map, obsolete_curies)
    if entity_metadata
]


In [32]:
# Inspect the raw metadata of the obsolete classes that were used.
pprint.pprint(obsoletes_metadata_list)

[{'IAO:0000115': ['A habitat that is in or on a living animal. Here "animal" '
                  'denotes an individual of a species that is a sub-taxon of '
                  'NCBITaxon:33208.'],
  'id': ['ENVO:00006776'],
  'oio:hasExactSynonym': ['metazoan-associated habitat'],
  'oio:hasOBONamespace': ['ENVO'],
  'oio:id': ['ENVO:00006776'],
  'oio:inSubset': ['obo:envo#EnvO-Lite-GSC'],
  'owl:deprecated': [True],
  'rdfs:isDefinedBy': ['http://purl.obolibrary.org/obo/envo.owl'],
  'rdfs:label': ['obsolete animal-associated habitat'],
  'schema:url': ['http://purl.obolibrary.org/obo/ENVO_00006776'],
  'sh:prefix': ['ENVO']},
 {'IAO:0000115': ['OBSOLETE A prominent or distinctive aspect, quality, or '
                  'characteristic of environments occurring on or along marine '
                  'benthic environments.'],
  'id': ['ENVO:01000105'],
  'oio:consider': ['ENVO:01001788',
                   'ENVO:01001479',
                   'ENVO:00000447',
                   'ENVO:0

In [33]:
# How many of the obsolete classes carry each metadata predicate.
# Anything that is not a dict is ignored.
obsoletes_predicate_counter = Counter(
    predicate
    for obsolete_metadata in obsoletes_metadata_list
    if isinstance(obsolete_metadata, dict)
    for predicate in obsolete_metadata.keys()
)


In [34]:
# List the metadata predicates from most to least frequently present.
for key, count in obsoletes_predicate_counter.most_common():
    print(f"{key}: {count}")

id: 5
owl:deprecated: 5
rdfs:label: 5
sh:prefix: 5
schema:url: 5
rdfs:isDefinedBy: 5
oio:hasOBONamespace: 4
oio:id: 4
IAO:0000115: 3
oio:consider: 2
oio:hasExactSynonym: 1
oio:inSubset: 1
oio:created_by: 1
oio:creation_date: 1
rdfs:comment: 1
IAO:0100001: 1
oio:hasDbXref: 1


In [35]:
# High-level summary statistics, written to JSON for downstream use.
replacement_count = sum("IAO:0100001" in m for m in obsoletes_metadata_list)
consider_count = sum("oio:consider" in m for m in obsoletes_metadata_list)

summary = {
    # distinct values submitted
    "total_unique_terms": len(env_content_counter),
    # distinct values with a perfect match
    "perfect_matches": len(perfect_match_content_values),
    # distinct values queued for review
    "matches_need_review": len(needs_review_content_values),
    # distinct values without a usable OAK annotation
    "no_valid_oak_match": len(unmatched_content_values),
    # distinct values that exactly match an obsolete label
    "exact_obsolete_matches": len(exact_obsolete_matches),
    # obsolete classes carrying IAO:0100001 (a replacement)
    "obsolete_terms_with_replacement": replacement_count,
    # obsolete classes carrying oio:consider alternatives
    "obsolete_terms_with_consider": consider_count,
}

# Consistency check: the three categories must add up to the total.
categorized_total = (
    summary["perfect_matches"]
    + summary["matches_need_review"]
    + summary["no_valid_oak_match"]
)
if categorized_total != summary["total_unique_terms"]:
    print(
        f"WARNING: perfect_matches ({summary['perfect_matches']}) + matches_need_review ({summary['matches_need_review']}) "
        f"+ no_valid_oak_match ({summary['no_valid_oak_match']}) ≠ total_unique_terms ({summary['total_unique_terms']})"
    )

# Persist the summary
summary_json_file = "emp500_summary.json"
with open(summary_json_file, "w", encoding="utf-8") as json_file:
    json.dump(summary, json_file, indent=4)

print(f"Summary saved to {summary_json_file}")


Summary saved to emp500_summary.json


In [36]:
# Build one "legitimacy" row per distinct submitted env triad value: whether it
# matches an EnvO label perfectly, whether it corresponds to an obsolete class,
# and what that class's metadata suggests as a replacement. The rows are then
# written to a TSV.
term_legitimacy_data = []

# Index both lookups once instead of rescanning the lists for every value.
# setdefault keeps the FIRST record per content_value, which is the record a
# front-to-back scan would have returned.
perfect_match_by_value = {}
for entry in perfect_match_list:
    perfect_match_by_value.setdefault(entry["content_value"], entry)

obsolete_match_by_value = {}
for entry in exact_obsolete_matches:
    obsolete_match_by_value.setdefault(entry["content_value"], entry)

for content_value, emp500_usages in env_content_counter.items():
    # Determine perfect match status
    is_perfect_match = content_value in perfect_match_content_values

    # Label and CURIE of the perfectly matching term, if any
    perfect_match_entry = perfect_match_by_value.get(content_value)
    matched_label = perfect_match_entry["object_label"] if perfect_match_entry else ""
    matched_curie = perfect_match_entry["object_id"] if perfect_match_entry else ""

    # Determine obsolete status
    obsolete_entry = obsolete_match_by_value.get(content_value)
    is_obsolete = bool(obsolete_entry)
    obsolete_label = obsolete_entry["obsolete_label"] if obsolete_entry else ""

    # Use the CURIE stored on the exact-match record. Looking it up again by
    # testing whether the label is a SUBSTRING of other labels can select a
    # different class whose label merely contains this one, and fails on
    # entries that have no label.
    obsolete_curie = obsolete_entry["obsolete_curie"] if obsolete_entry else ""

    # Replacement information from the obsolete class's metadata
    replacement_curie = ""
    replacement_label = ""
    consider_replacements = ""

    if obsolete_curie:
        metadata = next((m for m in obsoletes_metadata_list if "id" in m and obsolete_curie in m["id"]), None)

        # IAO:0100001 holds the replacement; the first value is used.
        if metadata and "IAO:0100001" in metadata:
            replacement_curie = metadata["IAO:0100001"][0]
            replacement_label = envo_adapter.label(replacement_curie) if replacement_curie else ""

        # oio:consider holds alternatives; rendered as "CURIE/label | CURIE/label".
        if metadata and "oio:consider" in metadata:
            consider_list = metadata["oio:consider"]
            consider_replacements = " | ".join(
                f"{curie}/{envo_adapter.label(curie)}" for curie in consider_list
            )

    # Append data to the list
    term_legitimacy_data.append({
        "content_value": content_value,
        "emp500_usages": emp500_usages,
        "perfect_match": is_perfect_match,
        "matched_label": matched_label,
        "matched_curie": matched_curie,
        "is_obsolete": is_obsolete,
        "obsolete_label": obsolete_label,
        "obsolete_curie": obsolete_curie,
        "replacement_curie": replacement_curie,
        "replacement_label": replacement_label,
        "consider_replacements": consider_replacements
    })

# Define file path
detailed_tsv = "emp500_term_legitimacy.tsv"

# Write collected data to TSV using csv.DictWriter
with open(detailed_tsv, mode="w", newline="", encoding="utf-8") as file:
    fieldnames = [
        "content_value", "emp500_usages", "perfect_match", "matched_label", "matched_curie",
        "is_obsolete", "obsolete_label", "obsolete_curie", "replacement_curie", "replacement_label",
        "consider_replacements"
    ]
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=fieldnames)

    # Write header
    writer.writeheader()

    # Write data rows
    writer.writerows(term_legitimacy_data)


In [37]:
# Write the "needs review" vocabulary TSV: every partial annotation of a value
# that has no perfect match, plus one blank-annotation row per value that has
# no usable annotation at all. Values that correspond to obsolete classes are
# left out because they are reported separately.

# Submitted values flagged as obsolete in term_legitimacy_data
obsolete_content_values = {
    entry["content_value"] for entry in term_legitimacy_data if entry["is_obsolete"]
}

# All rows to be written, in output order
needs_review_data = []

# Partial-match annotations, minus obsolete values
for entry in needs_review_list:
    if entry["content_value"] in obsolete_content_values:
        continue
    needs_review_data.append({
        "content_value": entry["content_value"],
        # Read, do not pop: these dicts are shared with the annotation lists,
        # and removing "count" would make a re-run of this cell report 0.
        "emp500_usages": entry.get("count", 0),  # "count" is exported as "emp500_usages"
        "predicate_id": entry.get("predicate_id", ""),
        "object_id": entry.get("object_id", ""),
        "object_label": entry.get("object_label", ""),
        "subject_start": entry.get("subject_start", ""),
        "subject_end": entry.get("subject_end", ""),
        "match_string": entry.get("match_string", ""),
        "match_string_len": entry.get("match_string_len", ""),
        "matches_whole_text": entry.get("matches_whole_text", "")
    })

# Values without any usable annotation, minus obsolete values
filtered_unmatched_content_values = unmatched_content_values - obsolete_content_values

for content_value in filtered_unmatched_content_values:
    needs_review_data.append({
        "content_value": content_value,
        "emp500_usages": env_content_counter.get(content_value, 0),
        "predicate_id": "",
        "object_id": "",
        "object_label": "",
        "subject_start": "",
        "subject_end": "",
        "match_string": "",
        "match_string_len": "",
        "matches_whole_text": ""
    })

# Column order of the output file
fieldnames = [
    "content_value", "emp500_usages", "predicate_id", "object_id", "object_label",
    "subject_start", "subject_end", "match_string", "match_string_len", "matches_whole_text"
]

# Save Needs Review to TSV
needs_review_tsv = "emp500_env_context_vocabulary_for_review.tsv"

with open(needs_review_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(needs_review_data)

print(f"Needs Review TSV saved to {needs_review_tsv} (Filtered & ensures obsolete terms do NOT reappear)")


Needs Review TSV saved to emp500_env_context_vocabulary_for_review.tsv (Filtered & ensures obsolete terms do NOT reappear)


In [38]:
# Create a lookup dictionary from term_legitimacy_data using content_value as the key
# (keys are the submitted strings exactly as they were counted).
term_lookup = {entry["content_value"]: entry for entry in term_legitimacy_data}

In [39]:
# Turn each biosample's env triad labels into MIxS-style "label [CURIE]" values,
# flag whether each resolved term belongs to the value set expected for its
# slot, and log every value that could not be resolved.

# Transformed rows, one per biosample
emp500_mixs_triad_rows = []

# One record per (biosample, slot) whose value could not be resolved
missing_term_logs = []

# For each slot: the name of its boolean flag and the CURIEs accepted for it
subset_validations = {
    "env_broad_scale": ("is_biome", biome_curies),
    "env_local_scale": ("is_selected_abp", non_biome_non_env_mat_abp_curies),
    "env_medium": ("is_env_mat", env_mat_curies)
}

# Same mapping with the accepted CURIEs as sets, so the per-sample membership
# test does not rescan a list each time.
subset_membership = {
    slot: (flag_name, set(curies))
    for slot, (flag_name, curies) in subset_validations.items()
}

# Iterate over each biosample record
for sample in emp500_env_triad_rows:
    # Retain original data
    transformed_sample = sample.copy()

    # sorted(): iterating the bare set would visit the slots in an arbitrary
    # order, making the order of missing_term_logs vary between runs.
    for key in sorted(target_harmonized_names):
        raw_value = sample.get(key, "")
        original_value = raw_value.strip()

        # Store the whitespace-trimmed original value
        transformed_sample[key] = original_value

        # Name of the MIxS-style companion column
        mixs_key = f"{key}_mixs"

        if not original_value:
            # Blank slot: only the MIxS column is written; the obsolescence
            # and subset flags are deliberately left unset for this slot.
            transformed_sample[mixs_key] = ""
            continue

        # term_lookup is keyed on the values as submitted, so try the untrimmed
        # value first and fall back to the trimmed one.
        term_data = term_lookup.get(raw_value) or term_lookup.get(original_value, {})

        # Extract matching or replacement details
        matched_label = term_data.get("matched_label", "")
        matched_curie = term_data.get("matched_curie", "")
        replacement_label = term_data.get("replacement_label", "")
        replacement_curie = term_data.get("replacement_curie", "")
        consider_replacements = term_data.get("consider_replacements", "")

        # Build MIxS-style value; a replacement for an obsolete term takes
        # precedence over a direct match.
        mixs_value = ""
        via_obsolescence = False

        if replacement_label and replacement_curie:
            mixs_value = f"{replacement_label} [{replacement_curie}]"
            via_obsolescence = True  # resolved through an obsolete term's replacement
        elif matched_label and matched_curie:
            mixs_value = f"{matched_label} [{matched_curie}]"
        else:
            # Nothing usable: log it for manual review
            missing_term_logs.append({
                "accession": sample["accession"],
                "problematic_key": key,
                "problematic_value": original_value,
                "consider_replacements": consider_replacements
            })

        # Store the transformed MIxS-style value and the obsolescence flag
        transformed_sample[mixs_key] = mixs_value
        transformed_sample[f"{mixs_key}_via_obsolescence"] = via_obsolescence

        # Validate subset membership and set boolean flag
        curie = replacement_curie if via_obsolescence else matched_curie
        subset_key, expected_curies = subset_membership.get(key, (None, None))

        if subset_key:
            transformed_sample[subset_key] = curie in expected_curies if curie else False

    # Append transformed row
    emp500_mixs_triad_rows.append(transformed_sample)


In [40]:
# Define output file names
# Destination of the per-sample log of unresolved triad values.
missing_terms_tsv = "emp500_annotations_for_review.tsv"

In [41]:
# Write missing_term_logs to a TSV file
# Nothing is written (and nothing is printed) when there are no log entries.
if missing_term_logs:
    fieldnames = missing_term_logs[0].keys()  # Get all column headers from the first row

    with open(missing_terms_tsv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, delimiter="\t", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(missing_term_logs)

    print(f"Missing term logs saved to {missing_terms_tsv}")

Missing term logs saved to emp500_annotations_for_review.tsv


In [42]:
# Tally the MIxS-style env_medium values of rows whose is_env_mat flag is False.
# Rows with no is_env_mat flag are treated as True and therefore not counted;
# a row without an env_medium_mixs value is counted under None.
env_medium_mixs_counter = Counter(
    triad_row.get("env_medium_mixs", None)
    for triad_row in emp500_mixs_triad_rows
    if not triad_row.get("is_env_mat", True)
)



In [43]:
# env_medium values that are not environmental materials, with sample counts.
env_medium_mixs_counter

Counter({'feces [UBERON:0001988]': 184,
         'intestine environment [ENVO:2100002]': 15,
         '': 15,
         'water [CHEBI:15377]': 7,
         'mucus [UBERON:0000912]': 4,
         'sebum [UBERON:0001866]': 3,
         'biofilm [ENVO:00002034]': 2})

Feces, mucus and sebum are reasonable env_medium values for some samples. Should we just add them to the env_medium acceptable list (currently just environmental materials)?

TODO: check NMDC value sets/enums for biofilm

inappropriate/inadequate
- intestine environment
- ""

water should be replaced with EnvO's 'liquid water' ENVO:00002006


In [44]:
# Number of samples affected by each unresolved (slot, value) combination
problematic_counts = Counter(
    (log_entry["problematic_key"], log_entry["problematic_value"])
    for log_entry in missing_term_logs
)

# Same information as a list of flat records, ready for tabular output
problematic_summary = []
for (slot_name, slot_value), occurrences in problematic_counts.items():
    problematic_summary.append(
        {"problematic_key": slot_name, "problematic_value": slot_value, "count": occurrences}
    )


In [45]:
# Unresolved (slot, value) combinations with the number of samples affected.
problematic_summary

[{'problematic_key': 'env_broad_scale',
  'problematic_value': 'Small lake biome',
  'count': 20},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'marine benthic feature',
  'count': 125},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'host-associated habitat',
  'count': 133},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'animal-associated habitat',
  'count': 84},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'mangrove',
  'count': 1},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'intermittent pond',
  'count': 7},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'insecta-associated habitat',
  'count': 28},
 {'problematic_key': 'env_medium',
  'problematic_value': 'coastal water',
  'count': 9},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'bog',
  'count': 3},
 {'problematic_key': 'env_local_scale',
  'problematic_value': 'oil reservior',
  'count': 14},
 {'problema

In [46]:
# Write the per-(slot, value) counts of unresolved triad values to a TSV.
problematic_summary_tsv = "emp500_undefined_triad_slot_value_counts.tsv"

# Columns are named explicitly rather than read from the first row, so an
# empty summary produces a header-only file instead of an IndexError.
fieldnames = ["problematic_key", "problematic_value", "count"]

with open(problematic_summary_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(problematic_summary)

In [47]:
# GOLD metadata lives in a second database on the same MongoDB client.
# NOTE(review): these two constants are configuration; consider moving them
# next to the other MONGO_* settings near the top of the notebook.
LATEST_GOLD_MONGODB = "gold_metadata_studies_with_samples"
GOLD_SEQ_PROJ_COLLECTION = "projects"

gold_db = client[LATEST_GOLD_MONGODB]  # Dynamically select database

gold_seq_proj_collection = gold_db[GOLD_SEQ_PROJ_COLLECTION]

In [48]:
# retrieve emp500 projects from gold (PRJEB42019)
# The filter reuses the BioProject accession selected earlier in the notebook.
query = {"ncbiBioProjectAccession": emp500_bioproject_accession}

# Execute the query
cursor = gold_seq_proj_collection.find(query)


In [49]:
# Materialise the GOLD project documents; this consumes the cursor.
emp500_gold_seq_projs = list(cursor)

In [50]:
# Keep only the identifier fields of each GOLD project; a missing field
# becomes an empty string.
emp500_gold_seq_proj_summary = [
    {
        id_field: gold_proj.get(id_field, "")
        for id_field in (
            "biosampleGoldId",
            "ncbiBioSampleAccession",
            "ncbiBioProjectAccession",
            "studyGoldId",
        )
    }
    for gold_proj in emp500_gold_seq_projs
]

In [52]:
# Re-key the GOLD summaries by NCBI BioSample accession.
# When an accession occurs more than once the LAST entry wins; a notice is
# printed only if the colliding entries actually differ.
gold_summary_dict = {}
for entry in emp500_gold_seq_proj_summary:
    accession = entry["ncbiBioSampleAccession"]
    new_data = {
        field: value for field, value in entry.items() if field != "ncbiBioSampleAccession"
    }

    if accession in gold_summary_dict and gold_summary_dict[accession] != new_data:
        print(f"Duplicate found for accession {accession}. Updating with new data.")
        pprint.pprint(gold_summary_dict[accession])
        pprint.pprint(new_data)

    # New or duplicate, the most recent entry is the one that is stored.
    gold_summary_dict[accession] = new_data

In [53]:
# Attach the GOLD identifiers to each MIxS-style triad row.
# Rows whose accession is unknown to GOLD are copied unchanged.
merged_rows = [
    {**triad_row, **gold_summary_dict.get(triad_row["accession"], {})}
    for triad_row in emp500_mixs_triad_rows
]

In [54]:
# Final deliverable: one row per biosample with original and MIxS-style triad
# values, validation flags and GOLD identifiers.
emp500_mixs_style_triads_plus_gold_ids_tsv = "emp500_mixs_style_triads_plus_gold_ids.tsv"

# Define column order based on user specification
# Keys absent from a row are written as empty cells (csv.DictWriter default).
ordered_columns = [
    "accession", "package",
    "ncbiBioProjectAccession",
    "biosampleGoldId",
    "studyGoldId",
    "env_broad_scale", "env_broad_scale_mixs", "env_broad_scale_mixs_via_obsolescence", "is_biome",
    "env_local_scale", "env_local_scale_mixs", "env_local_scale_mixs_via_obsolescence", "is_selected_abp",
    "env_medium", "env_medium_mixs", "env_medium_mixs_via_obsolescence", "is_env_mat"
]

# Write merged data to a TSV file
with open(emp500_mixs_style_triads_plus_gold_ids_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=ordered_columns)
    writer.writeheader()
    writer.writerows(merged_rows)

print(f"Merged MIxS-style environmental triad data saved to {emp500_mixs_style_triads_plus_gold_ids_tsv}")

Merged MIxS-style environmental triad data saved to emp500_mixs_style_triads_plus_gold_ids.tsv
