In [1]:
from oaklib import get_adapter
import pandas as pd
import pprint

In [2]:

def annotate_text(adapter, text):
    """
    Annotate text using the adapter and return each annotation as a plain dictionary.

    Every dictionary is a shallow copy of the annotation object's attributes,
    so later steps can add or change keys without writing those changes back
    onto the objects produced by the adapter.

    :param adapter: The initialized adapter; must provide ``annotate_text(text)``.
    :param text: Input text to annotate.
    :return: List of annotation dictionaries, one per annotation yielded by the adapter.
    """
    # vars() hands back the object's live __dict__; dict() detaches the copy.
    return [dict(vars(annotation)) for annotation in adapter.annotate_text(text)]

def filter_non_contained_annotations(annotations):
    """
    Filter annotations to keep only those not nested inside a larger annotation.

    An annotation is dropped when another annotation covers its whole span and
    that other span is strictly larger. Annotations that share exactly the same
    span do not eliminate each other, so every candidate for that span is kept.

    Each input dictionary gets an ``"is_contained"`` boolean key recording the
    decision. The input list itself is not reordered.

    :param annotations: List of annotation dictionaries with ``subject_start`` and ``subject_end`` keys.
    :return: New list of the retained annotations, ordered by start position and then by decreasing span length.
    """
    # Sort a copy: earliest start first, and for equal starts the longest span first.
    ordered = sorted(
        annotations,
        key=lambda ann: (ann["subject_start"], ann["subject_start"] - ann["subject_end"]),
    )
    spans = [(ann["subject_start"], ann["subject_end"]) for ann in ordered]

    for annotation, (start, end) in zip(ordered, spans):
        # Excluding identical spans also excludes the annotation's own span,
        # so no separate "skip myself" check is needed.
        annotation["is_contained"] = any(
            other_start <= start and end <= other_end and (other_start, other_end) != (start, end)
            for other_start, other_end in spans
        )

    return [ann for ann in ordered if not ann["is_contained"]]

def filter_by_span_length(annotations, min_length=3):
    """
    Filter annotations to remove those with a span shorter than the specified minimum length.

    ``subject_end`` is treated as the position of the last matched character
    (inclusive), which is how the spans in this notebook's annotator output are
    laid out: the 7-character match 'part of' is reported as 42-48. The span
    length is therefore ``subject_end - subject_start + 1``.

    :param annotations: List of annotation dictionaries with ``subject_start`` and ``subject_end`` keys.
    :param min_length: Minimum number of characters a span must cover to be retained.
    :return: New list of the annotations whose span covers at least ``min_length`` characters.
    """
    retained = []
    for ann in annotations:
        # +1 because both endpoints belong to the span.
        span_length = ann["subject_end"] - ann["subject_start"] + 1
        if span_length >= min_length:
            retained.append(ann)
    return retained

def extract_unique_object_ids(annotations, ranked_predicates=None):
    """
    Extract the unique object IDs found in the annotations.

    When an object ID occurs more than once, the annotation whose predicate
    ranks best is tracked for that ID. A predicate that is not listed in
    ``ranked_predicates`` ranks below every listed one instead of raising.
    Only the IDs are returned, in order of first appearance.

    :param annotations: List of annotation dictionaries with ``object_id`` and ``predicate_id`` keys.
    :param ranked_predicates: List of predicates in order of preference (most preferred first).
    :return: List of unique object IDs.
    """
    if ranked_predicates is None:
        ranked_predicates = ["rdfs:label", "oio:hasExactSynonym", "oio:hasRelatedSynonym"]

    # If a predicate is listed twice its first position counts, as with list.index().
    rank_of = {}
    for position, predicate in enumerate(ranked_predicates):
        rank_of.setdefault(predicate, position)
    unranked = len(ranked_predicates)  # sorts after every ranked predicate

    best_by_id = {}
    for annotation in annotations:
        obj_id = annotation["object_id"]
        rank = rank_of.get(annotation["predicate_id"], unranked)
        current = best_by_id.get(obj_id)
        if current is None or rank < current[0]:
            # Re-assigning an existing key keeps its original insertion position.
            best_by_id[obj_id] = (rank, annotation)

    return list(best_by_id)

def report_unaccounted_items(annotations, ranked_predicates=None, ranked_namespaces=None):
    """
    Report the predicates and namespaces that occur in the annotations but are not in the ranked lists.

    A namespace is the part of ``object_id`` before its first colon; object IDs
    without a colon contribute no namespace. Both result lists are free of
    duplicates and follow the order in which items first appear in
    ``annotations``, so repeated runs give the same output.

    :param annotations: List of annotation dictionaries with ``object_id`` and ``predicate_id`` keys.
    :param ranked_predicates: List of predicates in order of preference.
    :param ranked_namespaces: List of namespaces in order of preference.
    :return: Tuple ``(unaccounted_predicates, unaccounted_namespaces)``, each a list.
    """
    if ranked_predicates is None:
        ranked_predicates = ["rdfs:label", "oio:hasExactSynonym", "oio:hasRelatedSynonym"]
    if ranked_namespaces is None:
        ranked_namespaces = ["ENVO", "CHEBI", "PATO"]

    known_predicates = set(ranked_predicates)
    known_namespaces = set(ranked_namespaces)

    # dict.fromkeys de-duplicates while remembering first-seen order.
    seen_predicates = dict.fromkeys(ann["predicate_id"] for ann in annotations)
    seen_namespaces = dict.fromkeys(
        ann["object_id"].split(":", 1)[0] for ann in annotations if ":" in ann["object_id"]
    )

    unaccounted_predicates = [p for p in seen_predicates if p not in known_predicates]
    unaccounted_namespaces = [ns for ns in seen_namespaces if ns not in known_namespaces]

    return unaccounted_predicates, unaccounted_namespaces

def process_annotations(adapter, text, ranked_predicates=None, ranked_namespaces=None, min_span_length=3):
    """
    Run the whole annotation pipeline on one piece of text.

    Steps, in order: annotate the text, drop contained annotations, drop
    annotations whose span is too short, collect the unique object IDs, and
    report predicates/namespaces missing from the ranked lists. The last two
    steps both work on the annotations that survived the filters.

    :param adapter: The initialized adapter.
    :param text: The input text to annotate.
    :param ranked_predicates: List of predicates in order of preference.
    :param ranked_namespaces: List of namespaces in order of preference.
    :param min_span_length: Minimum length of span to retain an annotation.
    :return: Tuple of (filtered annotations, unique object IDs, unaccounted predicates, unaccounted namespaces).
    """
    raw_annotations = annotate_text(adapter, text)
    kept = filter_by_span_length(
        filter_non_contained_annotations(raw_annotations),
        min_span_length,
    )

    object_ids = extract_unique_object_ids(kept, ranked_predicates)
    missing_predicates, missing_namespaces = report_unaccounted_items(
        kept, ranked_predicates, ranked_namespaces
    )

    return kept, object_ids, missing_predicates, missing_namespaces


def process_text_annotations(adapter, dataframe, text_column, ranked_predicates=None, ranked_namespaces=None, min_span_length=3):
    """
    Process annotations for a column of text in a pandas DataFrame.

    Each value of ``text_column`` is run through ``process_annotations`` and the
    resulting list of unique object IDs is stored in a ``"unique_object_ids"``
    column. The column is added to ``dataframe`` itself, which is also returned.

    :param adapter: The initialized adapter.
    :param dataframe: The pandas DataFrame containing text.
    :param text_column: The name of the column containing text.
    :param ranked_predicates: List of predicates in order of preference.
    :param ranked_namespaces: List of namespaces in order of preference.
    :param min_span_length: Minimum length of span to retain an annotation.
    :return: The same DataFrame with a new column for unique object IDs per text.
    """
    def _unique_ids(text):
        # process_annotations returns (filtered_annotations, unique_object_ids,
        # unaccounted_predicates, unaccounted_namespaces); only the IDs belong
        # in this column.
        _, unique_object_ids, _, _ = process_annotations(
            adapter, text, ranked_predicates, ranked_namespaces, min_span_length
        )
        return unique_object_ids

    dataframe["unique_object_ids"] = dataframe[text_column].apply(_unique_ids)
    return dataframe


def map_object_ids_to_labels(adapter, object_ids):
    """
    Look up the label of every object ID through the adapter.
    :param adapter: The initialized adapter; its ``label`` method is called once per ID.
    :param object_ids: List of object IDs.
    :return: List of dictionaries, each with an "object_id" key and a "label" key, in input order.
    """
    labelled = []
    for current_id in object_ids:
        entry = {"object_id": current_id, "label": adapter.label(current_id)}
        labelled.append(entry)
    return labelled


def get_annotation_span_from_processed_annotations(processed_annotations, object_id):
    """
    Find where a given object ID was matched in the text.

    Only the first annotation carrying the object ID is used.

    :param processed_annotations: List of filtered annotation dictionaries (non-contained, valid spans).
    :param object_id: The object ID for which to retrieve span information.
    :return: Dictionary with "object_id", "subject_start", "subject_end" and "span_text"
        (the annotation's "match_string", or None if it has none), or None if the
        object ID is not found.
    """
    first_match = next(
        (candidate for candidate in processed_annotations if candidate["object_id"] == object_id),
        None,
    )
    if first_match is None:
        return None

    span = {"object_id": object_id}
    span["subject_start"] = first_match["subject_start"]
    span["subject_end"] = first_match["subject_end"]
    span["span_text"] = first_match.get("match_string")
    return span


In [3]:
# Selector string passed to oaklib's get_adapter; the resulting adapter is
# reused by the cells below for annotation and label lookups.
envo_adapter_string = "sqlite:obo:envo"
envo_adapter = get_adapter(envo_adapter_string)

In [4]:
# Example text: a passage about soil organic matter used as annotator input.
# The triple-quoted literal keeps its leading and trailing newline.
text = """
Soil organic matter (SOM) is a critical part of the global carbon (C) cycle. Belowground ecosystems contain more C than stored in terrestrial vegetation and the atmosphere combined (1–3), and SOM is the largest and most biologically active portion of soil C. SOM decomposition is regulated by a complex and interacting set of factors including soil structure, moisture distribution, temperature, pH, and nutrient status; collectively, these factors determine accessibility, bioavailability, and rate kinetics of SOM (4). Despite the importance of SOM in the global C cycle, the drivers of SOM decomposition from molecular to continental scales are not well understood.
"""

# Run the full pipeline on the example passage.
# Both ranking lists are wider than the function defaults: they add the
# narrow-synonym predicate and the RO and BFO namespaces.
(
    filtered_annotations,
    unique_object_ids,
    unaccounted_predicates,
    unaccounted_namespaces,
) = process_annotations(
    adapter=envo_adapter,
    text=text,
    ranked_predicates=[
        "rdfs:label",
        "oio:hasExactSynonym",
        "oio:hasRelatedSynonym",
        "oio:hasNarrowSynonym",
    ],
    ranked_namespaces=["ENVO", "CHEBI", "PATO", "RO", "BFO"],
    min_span_length=3,
)


ERROR:root:Skipping statements(subject=ENVO:01001644,predicate=oio:hasDbXref,object=None,value=Carbonate which is formed as the result of some biological process.,datatype=None,language=None,); ValueError: Carbonate which is formed as the result of some biological process. is not a valid URI or CURIE


In [5]:
# Show the annotation dictionaries that survived the containment and span-length filters.
pprint.pprint(filtered_annotations)

[{'confidence': None,
  'info': None,
  'is_contained': False,
  'is_longest_match': None,
  'match_string': 'Soil organic matter',
  'match_type': None,
  'matches_whole_text': False,
  'object_aliases': [],
  'object_categories': [],
  'object_id': 'ENVO:04000008',
  'object_label': 'soil organic matter',
  'object_source': None,
  'predicate_id': 'rdfs:label',
  'subject_end': 20,
  'subject_label': None,
  'subject_source': None,
  'subject_start': 2,
  'subject_text_id': None},
 {'confidence': None,
  'info': None,
  'is_contained': False,
  'is_longest_match': None,
  'match_string': 'part of',
  'match_type': None,
  'matches_whole_text': False,
  'object_aliases': [],
  'object_categories': [],
  'object_id': 'BFO:0000050',
  'object_label': 'part of',
  'object_source': None,
  'predicate_id': 'rdfs:label',
  'subject_end': 48,
  'subject_label': None,
  'subject_source': None,
  'subject_start': 42,
  'subject_text_id': None},
 {'confidence': None,
  'info': None,
  'is_conta

In [6]:
# Print the unique object IDs returned by the pipeline
pprint.pprint(unique_object_ids)


['ENVO:04000008',
 'BFO:0000050',
 'ENVO:01001110',
 'ENVO:01000267',
 'PATO:0000586',
 'RO:0002334',
 'ENVO:09200009',
 'PATO:0000060',
 'PATO:0000146',
 'CHEBI:33284',
 'PATO:0000161',
 'ENVO:00000064',
 'CHEBI:25367',
 'ENVO:00000026']


In [7]:
# Map object IDs to their labels (one adapter.label lookup per ID)
mapped_labels = map_object_ids_to_labels(envo_adapter, unique_object_ids)


In [8]:
# Show each object ID next to the label the adapter returned for it.
pprint.pprint(mapped_labels)

[{'label': 'soil organic matter', 'object_id': 'ENVO:04000008'},
 {'label': 'part of', 'object_id': 'BFO:0000050'},
 {'label': 'ecosystem', 'object_id': 'ENVO:01001110'},
 {'label': 'atmosphere', 'object_id': 'ENVO:01000267'},
 {'label': 'increased size', 'object_id': 'PATO:0000586'},
 {'label': 'regulated by', 'object_id': 'RO:0002334'},
 {'label': 'structure of soil', 'object_id': 'ENVO:09200009'},
 {'label': 'spatial pattern', 'object_id': 'PATO:0000060'},
 {'label': 'temperature', 'object_id': 'PATO:0000146'},
 {'label': 'nutrient', 'object_id': 'CHEBI:33284'},
 {'label': 'rate', 'object_id': 'PATO:0000161'},
 {'label': 'road', 'object_id': 'ENVO:00000064'},
 {'label': 'molecule', 'object_id': 'CHEBI:25367'},
 {'label': 'well', 'object_id': 'ENVO:00000026'}]


In [9]:
# Predicates seen in the filtered annotations that are missing from ranked_predicates.
pprint.pprint(unaccounted_predicates)

[]


In [10]:
# Namespaces seen in the filtered annotations that are missing from ranked_namespaces.
pprint.pprint(unaccounted_namespaces)

[]


In [11]:
# Pick one of the annotated terms to look up where it was matched in the text.
object_id = "ENVO:00000064"  # Object ID for "road"

In [12]:
# Span of the first filtered annotation carrying this ID, or None when the ID is absent.
span_details = get_annotation_span_from_processed_annotations(filtered_annotations, object_id)

In [13]:
# Report the lookup result; an empty/None result means the ID was not annotated.
if not span_details:
    print(f"Object ID {object_id} not found in the annotations.")
else:
    print(f"Span details for object ID {object_id}:")
    pprint.pprint(span_details)


Span details for object ID ENVO:00000064:
{'object_id': 'ENVO:00000064',
 'span_text': 'drive',
 'subject_end': 584,
 'subject_start': 580}


In [14]:
# Small demonstration frame: one free-text passage per row.
data = {
    "id": [1, 2],
    "text": [
        "Soil organic matter is critical for the carbon cycle.",
        "Temperature and moisture affect decomposition rates.",
    ],
}
df = pd.DataFrame(data)

ranked_predicates = ["rdfs:label", "oio:hasExactSynonym", "oio:hasRelatedSynonym"]
ranked_namespaces = ["ENVO", "CHEBI", "PATO"]

# Annotate every row of the 'text' column.
# process_annotations returns, in order:
#   [0] filtered_annotations, [1] unique_object_ids,
#   [2] unaccounted_predicates, [3] unaccounted_namespaces
# Index [1] keeps only the unique object IDs; change that index to store one
# of the other results instead.
df["unique_object_ids"] = df["text"].apply(
    lambda text: process_annotations(
        envo_adapter, text, ranked_predicates, ranked_namespaces, 3
    )[1]
)

In [15]:
# Display the frame, which now includes the unique_object_ids column.
df

Unnamed: 0,id,text,unique_object_ids
0,1,Soil organic matter is critical for the carbon...,"[ENVO:04000008, ENVO:02500010]"
1,2,Temperature and moisture affect decomposition ...,"[PATO:0000146, PATO:0000161]"
