In [1]:
import pprint
import re
import string
import urllib.parse
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import requests_cache
import yaml
from curies import Converter
from dotenv import dotenv_values
from prefixmaps.io.parser import load_multi_context
from pymongo import MongoClient
from tqdm.notebook import tqdm


In [4]:
def normalize_string(s):
    """
    Normalizes a string by lowercasing, trimming, and removing duplicate whitespace.
    """
    if not isinstance(s, str):
        return ""  # handle non string input.
    s = s.lower()
    s = s.strip()
    s = re.sub(r'\s+', ' ', s)  # Replace multiple whitespace with single space
    return s

In [8]:
def repair_curie(raw_curie: str) -> str:
    """
    Attempt to normalize or 'repair' the CURIE-like string:
      1. Normalize namespace to uppercase if it looks like 'ENVO'.
      2. Replace underscores or periods with a colon (':') between prefix & ID.
      3. Strip extra whitespace around the ID.
    """
    curie = raw_curie.strip()

    # Attempt to split at the first occurrence of [.:_]
    match = re.match(r'^([A-Za-z0-9]+)([\.:_])\s*(.*)$', curie) # todo not sensitive enough to the length and letter/number composition of prefix and local portion
    if not match:
        # If it doesn't match, just return the stripped version
        return curie

    prefix, sep, rest = match.groups()

    # Rebuild as prefix:rest with no extra spaces
    # todo repair should include normalizing the case of the prefix
    repaired = f"{prefix}:{rest.strip()}" # todo excessive replacement of underscores with colons in insensitive curies above
    return repaired

In [9]:
def split_annotations(raw_content: str):
    """
    Split a single raw line into sub-annotations, returning a list of dicts:
      [
         {
           'raw_chunk': "...",         # substring after splitting
           'raw_curie': "...",         # curie as found in text (if any)
           'repaired_curie': "...",    # normalized curie (if any)
           'raw_label': "...",         # substring minus the matched curie portion
           'cleaned_label': "..."      # lowercased + extra whitespace removed
         },
         ...
      ]
    """
    # Split on major delimiters (pipe, semicolon, slash, comma)
    components = re.split(DELIMITERS, raw_content)

    results = []
    for component in components:
        component = component.strip()
        if not component:
            continue

        match = CURIE_PATTERN.search(component) # todo pattern is over-eager in detecting CURIes
        if match:
            raw_curie = match.group(1)
            repaired = repair_curie(raw_curie)

            # Remove the matched substring from the chunk to guess the label
            label_guess = component[:match.start()] + component[match.end():]
            # Also remove leftover brackets or parentheses
            label_guess = re.sub(r'[\(\)\[\]]+', ' ', label_guess).strip()

            cleaned_label = re.sub(f"[{re.escape(string.punctuation)}]", " ", label_guess.lower())
            cleaned_label = re.sub(f"\d+", " ", cleaned_label)
            cleaned_label = re.sub(r'\s+', ' ', cleaned_label.strip())

        else:
            # No curie found; treat entire chunk as label
            raw_curie = ''
            repaired = ''
            cleaned_label = re.sub(f"[{re.escape(string.punctuation)}]", " ", component.lower())
            cleaned_label = re.sub(f"\d+", " ", cleaned_label)
            cleaned_label = re.sub(r'\s+', ' ', cleaned_label.strip())
            label_guess = component

        results.append({
            'raw_component': component,
            'raw_curie': raw_curie,
            'repaired_curie': repaired,
            'repaired_prefix': repaired.split(":")[0],
            'raw_label': label_guess,
            'cleaned_label': cleaned_label,
            'cleaned_label_len': len(cleaned_label),
        })

    return results

In [12]:
def analyze_ontology_importance(ontology_matches):
    # Step 1: Compute term frequencies across all ontologies
    term_counts = Counter()
    for terms in ontology_matches.values():
        term_counts.update(terms)

    # Step 2: Identify unique terms and count them per ontology
    unique_terms_per_ontology = defaultdict(set)
    unique_counts_per_ontology = {}

    for ontology, terms in ontology_matches.items():
        unique_terms = {term for term in terms if term_counts[term] == 1}
        unique_terms_per_ontology[ontology] = unique_terms
        unique_counts_per_ontology[ontology] = len(unique_terms)

    # Step 3: Sort ontologies by unique term count in descending order
    ranked_ontologies = sorted(unique_counts_per_ontology.items(), key=lambda x: x[1], reverse=True)

    return ranked_ontologies, dict(unique_terms_per_ontology)  # Ensure it returns a dictionary, not defaultdict


----

## For determining which ontologies to include in the OAK annotation process
or just trust `prefix_count_frame`?

enhance that by checking `id` in https://raw.githubusercontent.com/OBOFoundry/OBOFoundry.github.io/refs/heads/master/registry/ontologies.yml

will have to normalize cases

In [79]:
# Specify the path to your .env file
env_path = "../../../local/.env"
# Load variables into a dictionary
env_vars = dotenv_values(env_path)
BIOPORTAL_API_KEY = env_vars['BIOPORTAL_API_KEY']

In [80]:
# BIOPORTAL_SEARCH_URL = "https://data.bioontology.org/search"

In [81]:
# BIOPORTAL_PAGE_SIZE = 100

In [82]:
# # Counter to store occurrences of each ontology
# ontology_counter = Counter()
#
# # Iterate over each query
# for query in queries:
#     print(f"\nSearching for: {query}")
#     page = 1  # Start from page 1
#
#     while True:  # Keep iterating until there are no more pages
#         params = {
#             "q": query,
#             "apikey": BIOPORTAL_API_KEY,
#             "page": page,
#             "pagesize": BIOPORTAL_PAGE_SIZE,
#             "require_exact_match": True
#         }
#
#         # Build the full URL
#         request = requests.Request('GET', BIOPORTAL_SEARCH_URL, params=params)
#         prepared = request.prepare()
#
#         # Print the full URL
#         print(prepared.url)
#
#         # Now make the actual request
#         response = requests.get(BIOPORTAL_SEARCH_URL, params=params)
#
#         if response.status_code == 200:
#             data = response.json()
#             if "collection" not in data or not data["collection"]:
#                 break  # No more results
#
#             for result in data["collection"]:
#                 ontology = result["links"]["ontology"]
#                 ontology_counter[ontology] += 1  # Increment count for this ontology
#
#             # Check if there is a next page
#             if "nextPage" in data["links"]:
#                 page += 1  # Move to the next page
#                 print(f"{page = }")
#             else:
#                 break  # No more pages
#         else:
#             print(f"Error for query '{query}', page {page}: {response.status_code} {response.text}")
#             break  # Exit loop on error
#


In [83]:
# ontology_hit_counts = {k:len(v) for k,v in ontology_matches.items()}

In [84]:
# ontology_hit_counts = dict(sorted(ontology_hit_counts.items(), key=lambda item: item[1], reverse=True))

In [85]:
# # Invert the mapping
# query_matches = defaultdict(set)
# for ontology, classes in ontology_matches.items():
#     for cls in classes:
#         query_matches[cls].add(ontology)

In [86]:
def analyze_ontology_importance_from_mongo(db_name, collection_name, mongo_uri="mongodb://localhost:27017/"):
    # Connect to MongoDB
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]

    ontology_matches = defaultdict(set)

    for doc in collection.find():
        component_label = doc["component_label"]
        ols_hits = doc.get("ols_hits", [])

        for hit in ols_hits:
            ontology = hit["ontology"]
            ontology_matches[ontology].add(component_label)

    return dict(ontology_matches)

In [87]:
hits_by_ontology = analyze_ontology_importance_from_mongo(
    db_name="ncbi_metadata",
    collection_name="triad_components_labels"
)

In [88]:
hits_by_ontology

{'envo': {'ditch',
  'pharmacy',
  'administrative region',
  'agricultural facility',
  'sheepfold',
  'metallic material',
  'mediterranean shrubland biome',
  'saline spring',
  'salt lake',
  'mouth',
  'ice mass',
  'material primarily composed of biogenic carbonates',
  'lake water',
  'waterbody',
  'island',
  'culturing environment',
  'biosphere',
  'chicken manure',
  'terrestrial environmental zone',
  'alluvial soil',
  'shrimp pond',
  'horse manure',
  'mesoplastic particle',
  'pancake ice',
  'microcosm',
  'hydrothermal vent',
  'town hall',
  'dense settlement biome',
  'tropical woodland biome',
  'sink',
  'brine channel',
  'biomass',
  'cryopeg',
  'stony soil',
  'glacier',
  'container of an intermittent lake',
  'precipitation process',
  'lakeshore',
  'flood fringe',
  'contaminated soil',
  'salt marsh',
  'university campus',
  'infralittoral zone',
  'polar biome',
  'wetness of soil',
  'mountain forest soil',
  'umbrisol',
  'hallway',
  'glacial proces

In [89]:
def invert_hits_by_ontology(hits_by_ontology):
    labels_to_ontologies = defaultdict(set)

    for ontology, labels in hits_by_ontology.items():
        for label in labels:
            labels_to_ontologies[label].add(ontology)

    return dict(labels_to_ontologies)


In [90]:
hits_by_component_label = invert_hits_by_ontology(hits_by_ontology)

In [91]:
hits_by_component_label

{'ditch': {'envo'},
 'pharmacy': {'envo', 'ncit', 'snomed'},
 'administrative region': {'envo'},
 'agricultural facility': {'envo'},
 'sheepfold': {'envo'},
 'metallic material': {'envo', 'snomed'},
 'mediterranean shrubland biome': {'envo'},
 'saline spring': {'envo'},
 'salt lake': {'envo'},
 'mouth': {'envo', 'ncit', 'uberon'},
 'ice mass': {'envo'},
 'material primarily composed of biogenic carbonates': {'envo'},
 'lake water': {'envo', 'snomed'},
 'waterbody': {'envo'},
 'island': {'envo', 'ncit', 'snomed'},
 'culturing environment': {'envo'},
 'biosphere': {'envo', 'exo'},
 'chicken manure': {'envo'},
 'terrestrial environmental zone': {'envo'},
 'alluvial soil': {'envo'},
 'shrimp pond': {'envo'},
 'horse manure': {'envo'},
 'mesoplastic particle': {'envo'},
 'pancake ice': {'envo'},
 'microcosm': {'envo'},
 'hydrothermal vent': {'envo'},
 'town hall': {'envo'},
 'dense settlement biome': {'envo'},
 'tropical woodland biome': {'envo'},
 'sink': {'envo'},
 'brine channel': {'envo

In [92]:
def find_exclusive_hits(hits_by_component_label):
    exclusive_hits = defaultdict(set)

    for component_label, ontologies in hits_by_component_label.items():
        if len(ontologies) == 1:  # If the label is found in only one ontology
            ontology = next(iter(ontologies))  # Get the single ontology
            exclusive_hits[ontology].add(component_label)

    return dict(exclusive_hits)

In [93]:
exclusive_hits_by_ontology = find_exclusive_hits(hits_by_component_label)


In [94]:
exclusive_hits_by_ontology

{'envo': {'ditch',
  'administrative region',
  'agricultural facility',
  'sheepfold',
  'mediterranean shrubland biome',
  'saline spring',
  'salt lake',
  'ice mass',
  'material primarily composed of biogenic carbonates',
  'waterbody',
  'culturing environment',
  'chicken manure',
  'terrestrial environmental zone',
  'alluvial soil',
  'shrimp pond',
  'horse manure',
  'mesoplastic particle',
  'pancake ice',
  'microcosm',
  'hydrothermal vent',
  'town hall',
  'dense settlement biome',
  'sink',
  'tropical woodland biome',
  'brine channel',
  'biomass',
  'cryopeg',
  'stony soil',
  'glacier',
  'container of an intermittent lake',
  'precipitation process',
  'lakeshore',
  'flood fringe',
  'contaminated soil',
  'salt marsh',
  'university campus',
  'infralittoral zone',
  'polar biome',
  'wetness of soil',
  'mountain forest soil',
  'umbrisol',
  'glacial process',
  'acid rock drainage',
  'cleanroom',
  'terrestrial ecosystem',
  'slush ice',
  'marine coral ree

In [95]:
def count_exclusive_hits(exclusive_hits_by_ontology):
    exclusive_hit_counts = {ontology: len(labels) for ontology, labels in exclusive_hits_by_ontology.items()}
    sorted_exclusive_hit_counts = dict(sorted(exclusive_hit_counts.items(), key=lambda x: x[1], reverse=True))
    return sorted_exclusive_hit_counts


In [96]:
exclusive_hit_counts = count_exclusive_hits(exclusive_hits_by_ontology)



display the number of unique hits per target ontology

In [97]:
exclusive_hit_counts

{'envo': 1578,
 'ncit': 724,
 'ncbitaxon': 375,
 'uberon': 232,
 'snomed': 225,
 'foodon': 172,
 'micro': 32,
 'po': 29,
 'agro': 27,
 'pato': 21,
 'obi': 16,
 'genepio': 7,
 'pco': 6,
 'mco': 6,
 'ohmi': 5,
 'exo': 3}

In [98]:
COMPONENT_COUNT_THRESHOLD

2

In [99]:
exclusive_hits_by_ontology['snomed']

{'acute pulpitis',
 'agar',
 'agar medium',
 'agarose',
 'ageing',
 'agrochemical',
 'altitude',
 'aluminium',
 'amoeba culture',
 'anaerobic chamber',
 'anaerobic culture',
 'anal swab',
 'animal hide',
 'avulsion',
 'bacterial culture',
 'bacterial genome',
 'baker',
 'bedroom',
 'biliary sludge',
 'blood culture',
 'brain tissue',
 'bran',
 'brass',
 'breast normal',
 'bronchial fluid',
 'burial',
 'canalisation',
 'caries active',
 'catarrhal gingivitis',
 'catheter tip',
 'cereal',
 'cervical swab',
 'chimney',
 'city environment',
 'coagulant',
 'coastal environment',
 'coastal sea',
 'cockroach',
 'cold water',
 'colonic contents',
 'complicated appendicitis',
 'concha bullosa',
 'controlled temperature',
 'coriaria thymifolia',
 'countryside',
 'cyst fluid',
 'cytology brush',
 'dermatophyte',
 'diabetic foot ulcer',
 'diarrhoea',
 'dining room',
 'douglas fir',
 'downstream',
 'duodenal',
 'duodenal biopsy sample',
 'duodenal juice',
 'ear discharge',
 'elevator',
 'endodontic

In [100]:
triad_components_labels_collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'ncbi_metadata'), 'triad_components_labels')

In [101]:
# Aggregation query
pipeline = [
    {
        "$project": {
            "_id": 0,
            "component_label": 1,
            "count": 1,
            "exact_something_match_count": {
                "$size": {
                    "$filter": {
                        "input": { "$ifNull": ["$ols_hits", []] },  # Ensure ols_hits is always an array
                        "as": "hit",
                        "cond": {"$eq": ["$$hit.exact_something_match", True]}
                    }
                }
            },
            "exact_label_match_count": {
                "$size": {
                    "$filter": {
                        "input": { "$ifNull": ["$ols_hits", []] },  # Ensure ols_hits is always an array
                        "as": "hit",
                        "cond": {"$eq": ["$$hit.exact_label_match", True]}
                    }
                }
            }
        }
    }
]

In [102]:
# Run the aggregation query
results = list(triad_components_labels_collection.aggregate(pipeline))

In [103]:
# Convert results to a Pandas DataFrame
df = pd.DataFrame(results)

In [104]:
df

Unnamed: 0,component_label,count,exact_something_match_count,exact_label_match_count
0,terrestrial biome,185397,1,1
1,human associated habitat,471324,0,0
2,biological product,1950,2,1
3,homo sapiens associated habitat,246,0,0
4,aquatic,43759,0,0
...,...,...,...,...
45152,illinois,2,2,2
45153,salar de pajonale,14,0,0
45154,human esophagus tissue,68,0,0
45155,intretidal zone,229,0,0


In [105]:
df.shape

(45157, 4)

In [166]:
followup_df = df.loc[(df['exact_label_match_count'] == 0) & (df['exact_something_match_count'] == 0) & (df['count'] > 1), ['component_label']]


In [167]:
followup_df

Unnamed: 0,component_label
1,human associated habitat
3,homo sapiens associated habitat
4,aquatic
5,pacific ocean
8,rainforest division
...,...
45150,dna isolation from enriched medium http
45153,salar de pajonale
45154,human esophagus tissue
45155,intretidal zone


In [168]:
followup_df.to_csv("common_components_with_no_exact_label_match.tsv", sep="\t", index=False)

In [109]:
df[df["count"] >= COMPONENT_COUNT_THRESHOLD].shape[0]

32402

In [110]:
# low return on investment in absolute terms but does give a good sense of which ontologioes to use
df[(df["count"] >= COMPONENT_COUNT_THRESHOLD) & (df["exact_label_match_count"] > 0)].shape[0]

4109

In [111]:
df[(df["count"] < COMPONENT_COUNT_THRESHOLD) & (df["exact_label_match_count"] > 0)].shape[0]

0

In [112]:
df[(df["count"] >= COMPONENT_COUNT_THRESHOLD) & (df["exact_label_match_count"] == 0)].shape[0]

28293

In [113]:
df[df["count"] == 1].shape[0]

12755

In [114]:
df[df["count"] == 2].shape[0]

4489

In [115]:
df[df["count"] == 3].shape[0]

2817

In [116]:
# # Aggregate to count unique occurrences
# pipeline = [
#     {"$unwind": "$parsed_annotations"},
#     {"$group": {"_id": "$parsed_annotations.repaired_prefix", "count": {"$sum": 1}}}
# ]

In [117]:
# results = list(triad_values_collection.aggregate(pipeline))

In [118]:
# # Convert to dictionary (optional)
# count_dict = {doc["_id"]: doc["count"] for doc in results if doc["_id"] is not None}

In [119]:
# # Convert to DataFrame
# count_frame = pd.DataFrame(list(count_dict.items()), columns=["Key", "Value"])

In [120]:
# count_frame

In [121]:
# see also prefix_count_frame ?
#count_frame only adds an explicit row for no prefix

In [122]:
# add mondo and or doid?

In [123]:
# isolate species names like human, mouse

In [124]:
# what happened to strings like envo:soil?

In [125]:
# look up CURIes

In [126]:
# Aggregate to count unique occurrences
pipeline = [
    {"$unwind": "$parsed_annotations"},
    {"$group": {"_id": "$parsed_annotations.repaired_curie", "count": {"$sum": 1}}}
]

In [127]:
results = list(triad_values_collection.aggregate(pipeline))

In [128]:
count_dict = {doc["_id"]: doc["count"] for doc in results if doc["_id"] is not None}

In [129]:
curie_count_frame = pd.DataFrame(list(count_dict.items()), columns=["CURIe", "count"])

In [130]:
curie_count_frame[["prefix", "local_id"]] = curie_count_frame["CURIe"].str.split(":", expand=True)

In [131]:
curie_count_frame["obo_or_bioportal"] = curie_count_frame["prefix"].str.lower().isin(legit_prefixes_lc)

In [132]:
context = load_multi_context(["merged"])


In [133]:
extended_prefix_map = context.as_extended_prefix_map()


In [134]:
converter = Converter.from_extended_prefix_map(extended_prefix_map)
# 20 seconds

In [135]:
# Function to safely expand CURIEs, ignoring invalid ones
def safe_expand(curie):
    if isinstance(curie, str) and ":" in curie:  # Ensure it's a CURIE
        return converter.expand(curie.upper())
    return None  # Return None for invalid CURIEs



In [136]:
# Apply the function to expand CURIEs
curie_count_frame["inferred_uri"] = curie_count_frame["CURIe"].apply(safe_expand)

In [137]:
curie_count_frame

Unnamed: 0,CURIe,count,prefix,local_id,obo_or_bioportal,inferred_uri
0,ENVO:00002632,2,ENVO,00002632,True,http://purl.obolibrary.org/obo/ENVO_00002632
1,seqMAT57:40,1,seqMAT57,40,False,
2,946:C,1,946,C,False,
3,mouse:0310,1,mouse,0310,False,
4,ENVO:00002316,5,ENVO,00002316,True,http://purl.obolibrary.org/obo/ENVO_00002316
...,...,...,...,...,...,...
38310,Horse:4N1,1,Horse,4N1,False,
38311,sample:620,1,sample,620,False,
38312,Rhizosphere:27,1,Rhizosphere,27,False,
38313,sample:606,1,sample,606,False,


Those counts may not be really useful because they're not multiplied out my the number of Biosample annotations that were split into a component from whihc that CURIe could be extracted

In [138]:
# Function to get the label from BioPortal safely
def get_bioportal_info(uri, prefix, BIOPORTAL_API_KEY):
    if not isinstance(uri, str) or not uri.startswith(("http://", "https://")):
        return None, None  # Ignore invalid URIs

    if not isinstance(prefix, str):
        return None, None

    # print(uri)

    # Upper-case the ontology prefix
    ontology = prefix.upper()

    # URL-encode the inferred URI
    encoded_uri = urllib.parse.quote(uri, safe="")

    # Correct API request URL format
    url = f"https://data.bioontology.org/ontologies/{ontology}/classes/{encoded_uri}?apikey={BIOPORTAL_API_KEY}"

    # # Print the URL before making the request
    # print(f"Querying BioPortal: {url}")

    try:
        response = requests.get(url, headers={"Authorization": f"apikey {BIOPORTAL_API_KEY}"})

        if response.status_code == 200:
            data = response.json()
            pref_label = data.get("prefLabel", "")
            # print(pref_label)
            obsolete = data.get("obsolete", False)  # Default to False if missing
            # print(obsolete)
            return pref_label, obsolete
        else:
            return f"Error: {response.status_code}"
    except Exception as e:
        return None, None  # todo better handling/reporting

In [139]:
# Filter rows where count > 1 and obo_or_bioportal is True
promising_uri_frame = curie_count_frame[
    (curie_count_frame["count"] > 1) & (curie_count_frame["obo_or_bioportal"] == True)
].copy()

In [323]:
promising_uri_frame

Unnamed: 0,CURIe,count,prefix,local_id,obo_or_bioportal,inferred_uri,bioportal_label,is_obsolete
0,ENVO:00002632,2,ENVO,00002632,True,http://purl.obolibrary.org/obo/ENVO_00002632,Error: 404,
4,ENVO:00002316,5,ENVO,00002316,True,http://purl.obolibrary.org/obo/ENVO_00002316,Error: 404,
10,ENVO:02000076,4,ENVO,02000076,True,http://purl.obolibrary.org/obo/ENVO_02000076,obsolete potato salad,True
22,UBERON:0003347,3,UBERON,0003347,True,http://purl.obolibrary.org/obo/UBERON_0003347,UBERON_0003347,True
28,ENVO:00000168,2,ENVO,00000168,True,http://purl.obolibrary.org/obo/ENVO_00000168,blowhole,False
...,...,...,...,...,...,...,...,...
38291,ENVO:00002394,3,ENVO,00002394,True,http://purl.obolibrary.org/obo/ENVO_00002394,Error: 404,
38292,ENVO:00000571,5,ENVO,00000571,True,http://purl.obolibrary.org/obo/ENVO_00000571,obsolete arboreal habitat,True
38294,ENVO:00001013,2,ENVO,00001013,True,http://purl.obolibrary.org/obo/ENVO_00001013,Error: 404,
38306,ENVO:03501290,2,ENVO,03501290,True,http://purl.obolibrary.org/obo/ENVO_03501290,supermarket,False


In [141]:
# Enable tqdm for Pandas in Jupyter Notebook
tqdm.pandas()

In [142]:
# Apply function to get BioPortal labels and obsolescence status
promising_uri_frame[["bioportal_label", "is_obsolete"]] = promising_uri_frame.progress_apply(
    lambda row: pd.Series(get_bioportal_info(row["inferred_uri"], row["prefix"], BIOPORTAL_API_KEY)),
    axis=1  # Apply function row-wise
)

  0%|          | 0/4545 [00:00<?, ?it/s]

----

In [143]:
# bioportal ncbi taxon URIs look like
# http://purl.bioontology.org/ontology/NCBITAXON/9606

# note most of the taxon code appear to be run-ons from 9606 human

# not
# http://purl.obolibrary.org/obo/NCBITaxon_7460

# i just dealt with that and NCIT CURIes in the curation below

In [144]:
promising_uri_frame.to_csv("non_singleton_prefiltered_bioportal_label_lookup.tsv", sep="\t", index=False)

In [145]:
# see CURIe	count	prefix	local_id	obo_or_bioportal	inferred_uri	bioportal_label	is_obsolete	manual_lookup	assuming_curie_drag	manual_lookup_curie

# in non_singleton_prefiltered_bioportal_label_lookup_curated.csv

In [146]:
# look for these in "parsed_annotations.repaired_curie" and return content

malformed_curies = [
    'ENVO:agricultural',
    'ENVO:alcohol',
    'ENVO:anaerobic',
    'ENVO:anthropogenic',
    'ENVO:coastal',
    'ENVO:cropland',
    'ENVO:desert',
    'ENVO:dry',
    'ENVO:ENVO',
    'ENVO:farm',
    'ENVO:food',
    'ENVO:forest',
    'ENVO:freshwater',
    'ENVO:grassland',
    'ENVO:human',
    'ENVO:marine',
    'ENVO:microbial',
    'ENVO:Montane',
    'ENVO:montane',
    'ENVO:ocean',
    'ENVO:PO',
    'ENVO:research',
    'ENVO:shrubland',
    'ENVO:small',
    'ENVO:surface',
    'ENVO:temperate',
    'ENVO:Temperate',
    'ENVO:tropical',
    'ENVO:Tropical',
    'ENVO:tundra',
    'ENVO:UBERON',
    'ENVO:urban',
    'ENVO:vinegar',
    'ENVO:wastewater',
    'ENVO:water',
]

In [147]:
# Query to find documents where 'parsed_annotations.repaired_curie' is in our list
query = {"parsed_annotations.repaired_curie": {"$in": malformed_curies}}

# Project to only return the 'content' field
projection = {"content": 1, "_id": 0}

# Fetch results
results = triad_values_collection.find(query, projection)

In [148]:
# Extract and print content
content_list = {doc["content"] for doc in results}


In [149]:
# Write to a text file
output_file = "contents_of_local_text_curies.tsv"
with open(output_file, "w", encoding="utf-8") as f:
    for content in content_list:
        f.write(content + "\n")

print(f"Content list written to {output_file}")

Content list written to contents_of_local_text_curies.tsv


----


In [301]:
curie_curations = dict()

split out the curie knowledge form automated searches and from curation

In [302]:
non_singleton_prefiltered_bioportal_label_lookup_curated_tsv = "non_singleton_prefiltered_bioportal_label_lookup_curated.tsv"

In [303]:
non_singleton_prefiltered_bioportal_label_lookup_curated_frame = pd.read_csv(non_singleton_prefiltered_bioportal_label_lookup_curated_tsv, sep="\t")

In [304]:
non_singleton_prefiltered_bioportal_label_lookup_curated_lod = non_singleton_prefiltered_bioportal_label_lookup_curated_frame.to_dict("records")

In [305]:
# Define the keys to extract
keys_to_include = [
    'bioportal_label', 'is_obsolete', 'manual_lookup_label',
    'manual_lookup_curie', 'curie_drag_label', 'curie_drag_curie'
]

# Iterate through the list of dictionaries and construct the curation dictionary
for record in non_singleton_prefiltered_bioportal_label_lookup_curated_lod:
    curie_key = record.get("curie")  # Extract the curie field

    if curie_key:  # Ensure curie is not None or missing
        curie_curations[curie_key] = {
            key: value
            for key, value in record.items()
            if key in keys_to_include
            and not pd.isna(value)  # Exclude NaN values
            and value not in (None, "")  # Exclude None and empty strings
            and not str(value).startswith("Error:")  # Exclude "Error:" values
        }


In [306]:
len(curie_curations)

4545

In [307]:
contents_of_local_text_curies_curated_tsv = "contents_of_local_text_curies_curated.tsv"

In [308]:
contents_of_local_text_curies_curated_frame = pd.read_csv(contents_of_local_text_curies_curated_tsv, sep="\t")

In [309]:
contents_of_local_text_curies_curated_lod = contents_of_local_text_curies_curated_frame.to_dict("records")

In [310]:
text_curations = dict()

In [311]:
# Define the keys to extract
keys_to_include = [
    'manual_lookup_curie', 'manual_lookup_label',
    'curie_drag_curie', 'curie_drag_label'
]

# Iterate through the list of dictionaries
for index, record in enumerate(contents_of_local_text_curies_curated_lod):
    if not isinstance(record, dict):  # Ensure it's a dictionary
        print(f"Skipping non-dictionary record at index {index}: {record}")
        continue

    triad_key = record.get("triad_component_label")  # Extract the new key

    if not triad_key:  # Check for missing or empty triad_component_label
        print(f"Skipping record at index {index}: Missing 'triad_component_label' field")
        continue

    # Extract only the necessary fields, filtering out None, empty strings, "Error:", and NaN values
    filtered_fields = {
        key: value
        for key, value in record.items()
        if key in keys_to_include
        and not pd.isna(value)  # Exclude NaN values
        and value not in (None, "")  # Exclude None and empty strings
        and not str(value).startswith("Error:")  # Exclude "Error:" values
    }

    if filtered_fields:  # Only add if there's valid data left
        text_curations[triad_key] = filtered_fields



In [312]:
len(text_curations)

79

In [313]:
# Get the 'curations' collection from the existing db object
curie_curations_collection = db["curie_curations"]

In [314]:
# dump scuration to JSON or YAML

In [315]:
# Prepare data for insertion by transforming curations dict into a list of documents
temp_documents = [
    {"curie": key, **curation_data}  # Expand inner dict into MongoDB document format
    for key, curation_data in curie_curations.items()
]

In [316]:
# Insert documents into MongoDB
if temp_documents:  # Ensure there's data to insert
    curie_curations_collection.insert_many(temp_documents)
    print(f"Inserted {len(temp_documents)} records.")
else:
    print("No records to insert.")


Inserted 4545 records.


In [317]:
text_curations_collection = db["text_curations"]

In [318]:
# Prepare data for insertion by transforming curations dict into a list of documents
temp_documents = [
    {"text": key, **curation_data}  # Expand inner dict into MongoDB document format
    for key, curation_data in text_curations.items()
]

In [319]:
# Insert documents into MongoDB
if temp_documents:  # Ensure there's data to insert
    text_curations_collection.insert_many(temp_documents)
    print(f"Inserted {len(temp_documents)} records .")
else:
    print("No records to insert.")

Inserted 79 records .


In [320]:
# Ensure indexes exist for efficient querying
# curations_collection.create_index([("text", 1)])

In [299]:
unmapped_component_labels = list(followup_df['component_label'])

In [300]:
len(unmapped_component_labels)

27224

In [174]:
# Load YAML into a dictionary
def load_yaml_to_dict(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    return data


In [188]:
organisms_anatomy_terms_yaml = "organisms_anatomy_terms.yaml"

In [189]:
organisms_anatomy_terms = load_yaml_to_dict(organisms_anatomy_terms_yaml)

In [190]:
def extract_leaf_values(data):
    leaf_values = []

    if isinstance(data, dict):
        for value in data.values():
            leaf_values.extend(extract_leaf_values(value))
    elif isinstance(data, list):
        for item in data:
            leaf_values.extend(extract_leaf_values(item))
    else:
        leaf_values.append(data)  # It's a scalar value

    return leaf_values



In [191]:
organisms_anatomy_terms = extract_leaf_values(organisms_anatomy_terms)

In [192]:
len(organisms_anatomy_terms)

267

In [231]:
def extract_longest_terms(unmapped_component_labels, organisms_anatomy_terms):
    # Sort terms by length (longest first) to maximize coverage with the fewest terms
    organisms_anatomy_terms = sorted(organisms_anatomy_terms, key=len, reverse=True)

    result = {}

    for label in unmapped_component_labels:
        label_lower = label.lower()
        extracted_terms = []
        remaining_text = label_lower

        for term in organisms_anatomy_terms:
            term_lower = term.lower()
            # Search for term in the label
            if term_lower in remaining_text:
                extracted_terms.append(term)
                # Remove the matched portion
                remaining_text = re.sub(r'\b' + re.escape(term_lower) + r'\b', '', remaining_text, count=1).strip()

        if len(extracted_terms) > 0:
            result[label] = {
                "extracted_terms": extracted_terms,
                "remaining": remaining_text if remaining_text and remaining_text != label_lower else None
            }

    return result

In [232]:
# # Example Usage
# unmapped_component_labels = ["Human liver tissue", "Bacterial cell wall", "Mouse brain sample"]
# organisms_anatomy_terms = ["liver", "human", "tissue", "cell wall", "bacterial", "brain", "mouse"]



In [233]:
organisms_anatomy_components_extracted = extract_longest_terms(unmapped_component_labels, organisms_anatomy_terms)


In [234]:
def dump_dict_to_yaml(data, file_path):
    """Dump a dictionary to a YAML file."""
    with open(file_path, "w") as file:
        yaml.dump(data, file, default_flow_style=False, allow_unicode=True)

In [236]:
dump_dict_to_yaml(organisms_anatomy_components_extracted, "organisms_anatomy_components_extracted.yaml")

In [237]:
organisms_anatomy_components_extracted

{'human associated habitat': {'extracted_terms': ['human'],
  'remaining': 'associated habitat'},
 'homo sapiens associated habitat': {'extracted_terms': ['homo sapiens'],
  'remaining': 'associated habitat'},
 'temperate grasslands': {'extracted_terms': ['rat'], 'remaining': None},
 'mammalia associated habitat': {'extracted_terms': ['mammalia'],
  'remaining': 'associated habitat'},
 'root soil interface of potato': {'extracted_terms': ['potato', 'root'],
  'remaining': 'soil interface of'},
 'temperate broadleaf and mixed forest biome': {'extracted_terms': ['leaf',
   'rat'],
  'remaining': None},
 'phyllosphere': {'extracted_terms': ['phyllosphere'], 'remaining': None},
 'mammillaria carnea rhizosphere': {'extracted_terms': ['rhizosphere'],
  'remaining': 'mammillaria carnea'},
 'human gut': {'extracted_terms': ['human', 'gut'], 'remaining': None},
 'mice gut': {'extracted_terms': ['mice', 'gut'], 'remaining': None},
 'near water': {'extracted_terms': ['ear'], 'remaining': None},
 

In [242]:
def accumulate_extracted_terms(extraction_dict):
    accumulated_set = set()

    for key, value in extraction_dict.items():
        accumulated_set.update(value["extracted_terms"])
        remaining = value["remaining"]
        # if remaining and remaining.strip() and remaining.lower() != key.lower():
        #     accumulated_set.add(remaining)

    return accumulated_set

In [243]:
unique_extractions = accumulate_extracted_terms(organisms_anatomy_components_extracted)

In [244]:
len(unique_extractions)

234

In [245]:
unique_extractions

{'acropora palmata',
 'agrostis capillaris',
 'airway',
 'airways',
 'amniotic fluid',
 'antecubital fossa',
 'anterior nares',
 'apis cerana',
 'apis mellifera',
 'apple',
 'apples',
 'arthropod',
 'ascites',
 'asian hornet',
 'aspirate fluid',
 'aurelia aurita',
 'axilla',
 'bark',
 'barley',
 'bee',
 'beech',
 'bees',
 'beetle',
 'betula pendula',
 'bile',
 'birch',
 'bird',
 'birds',
 'blood',
 'blueberry',
 'bos indicus',
 'bos taurus',
 'brachypodium distachyon',
 'brain',
 'breastmilk',
 'bumblebee',
 'caeca',
 'caecum',
 'canine',
 'cat',
 'cattle',
 'cecum',
 'ceratophyllum demersum',
 'cerebrospinal fluid',
 'cervical',
 'cervicovaginal fluid',
 'cervicovaginal secretion',
 'cervicovaginal secretions',
 'cervix',
 'cervus canadensis nelsoni',
 'channel island fox',
 'cheetah',
 'chicken',
 'chickens',
 'citrus',
 'colon',
 'coral',
 'corals',
 'corn',
 'coronal sulcus',
 'cotton',
 'cotton rat',
 'cow',
 'cows',
 'daphnia magna',
 'deer',
 'digestive tract',
 'dog',
 'ear',
 

these labels arent' necessarily the real labels of the terms

In [335]:
observed_cache = db["observed_cache"]

In [336]:
# Ensure the index on curie field for fast lookup
observed_cache.create_index("curie", unique=True)

'curie_1'

In [337]:
# Iterate through documents in triad_components_labels_collection
for doc in triad_components_labels_collection.find({}, {"ols_hits": 1}):
    if "ols_hits" in doc and isinstance(doc["ols_hits"], list):
        for ols_hit in doc["ols_hits"]:
            obo_id = ols_hit.get("obo_id")
            label = ols_hit.get("label")

            if obo_id and label:
                # Check if the curie already exists in observed_validated
                if not observed_cache.find_one({"curie": obo_id}):
                    # Insert new document
                    observed_cache.insert_one({"curie": obo_id, "label": label})

In [338]:
# Iterate through documents in triad_components_labels_collection
for doc in curie_curations_collection.find({}):
    if "curie" in doc and "bioportal_label" in doc:
        if not observed_cache.find_one({"curie": doc['curie']}):
            # Insert new document
            observed_cache.insert_one({"curie": doc['curie'], "label": doc['bioportal_label']})