In [34]:
import re

from oaklib import get_adapter
from oaklib.datamodels.lexical_index import LexicalIndex
from oaklib.datamodels.lexical_index import LexicalTransformationPipeline
from oaklib.datamodels.lexical_index import RelationshipToTerm, LexicalGrouping, LexicalTransformation, \
    TransformationType
from oaklib.datamodels.synonymizer_datamodel import Synonymizer
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
from oaklib.utilities.lexical.lexical_indexer import apply_transformation
from oaklib.utilities.lexical.lexical_indexer import create_lexical_index
from oaklib.utilities.lexical.lexical_indexer import save_lexical_index

In [35]:
envo_adapter_string  = "sqlite:obo:envo"

In [36]:
po_adapter_string  = "sqlite:obo:po"

In [37]:
def are_pipelines_compatible(pipeline1, pipeline2):
  """
  Check if two pipelines are functionally equivalent

  Parameters
  ----------
  pipeline1 : LexicalTransformationPipeline
  pipeline2 : LexicalTransformationPipeline

  Returns
  -------
  bool
      True if the pipelines are functionally equivalent
  """
  # Check if they have the same number of transformations
  if len(pipeline1.transformations) != len(pipeline2.transformations):
      return False

  # Check if each transformation is the same type and has the same parameters
  for t1, t2 in zip(pipeline1.transformations, pipeline2.transformations):
      # Get type values safely, handling different object structures
      t1_type = getattr(t1.type, 'text', str(t1.type))
      t2_type = getattr(t2.type, 'text', str(t2.type))

      if t1_type != t2_type:
          return False

      # Compare params if they exist
      t1_params = getattr(t1, 'params', [])
      t2_params = getattr(t2, 'params', [])

      # Simple length comparison for params
      if len(t1_params) != len(t2_params):
          return False

  return True


In [38]:
def relationship_equals(rel1, rel2):
  """
  Compare two relationships for functional equality

  Parameters
  ----------
  rel1 : RelationshipToTerm
  rel2 : RelationshipToTerm

  Returns
  -------
  bool
      True if relationships are functionally equivalent
  """
  # Attributes that define uniqueness for a relationship
  key_attrs = ['predicate', 'element', 'element_term', 'source']

  return all(getattr(rel1, attr, None) == getattr(rel2, attr, None)
            for attr in key_attrs)

def deduplicate_lexical_index(lexical_index):
  """
  Remove duplicate relationships from all groupings in a lexical index

  Parameters
  ----------
  lexical_index : LexicalIndex
      The lexical index to deduplicate

  Returns
  -------
  LexicalIndex
      The same lexical index with deduplicated relationships
  """
  for term, grouping in lexical_index.groupings.items():
      unique_relationships = []

      # For each relationship, check if an equivalent one exists
      for rel in grouping.relationships:
          is_duplicate = False
          for unique_rel in unique_relationships:
              if relationship_equals(rel, unique_rel):
                  is_duplicate = True
                  break

          if not is_duplicate:
              unique_relationships.append(rel)

      # Replace with deduplicated list
      grouping.relationships = unique_relationships

  return lexical_index

In [39]:
def merge_lexical_indexes(index1, index2, validate_pipelines=True):
  """
  Merge two lexical indexes ensuring pipeline compatibility

  Parameters
  ----------
  index1 : LexicalIndex
      First lexical index to merge
  index2 : LexicalIndex
      Second lexical index to merge
  validate_pipelines : bool, default=True
      If True, verify that similarly named pipelines are compatible

  Returns
  -------
  LexicalIndex
      A new lexical index containing all entries from both inputs
  """
  merged_index = LexicalIndex()

  # Check pipeline compatibility if requested
  if validate_pipelines:
      for name, pipeline1 in index1.pipelines.items():
          if name in index2.pipelines:
              pipeline2 = index2.pipelines[name]
              if not are_pipelines_compatible(pipeline1, pipeline2):
                  raise ValueError(
                      f"Pipeline '{name}' is defined differently in the two indexes. "
                      "Set validate_pipelines=False to override this check."
                  )

  # Copy pipelines from the first index
  for name, pipeline in index1.pipelines.items():
      merged_index.pipelines[name] = pipeline

  # Add pipelines from second index (if not already present)
  for name, pipeline in index2.pipelines.items():
      if name not in merged_index.pipelines:
          merged_index.pipelines[name] = pipeline

  # Merge groupings - dict ensures no duplicate keys
  for term, grouping in index1.groupings.items():
      # Add the grouping directly with its relationships
      merged_index.groupings[term] = LexicalGrouping(term=term)
      merged_index.groupings[term].relationships = list(grouping.relationships)

  # Add or merge groupings from the second index
  for term, grouping in index2.groupings.items():
      if term not in merged_index.groupings:
          # New term, add the grouping directly
          merged_index.groupings[term] = LexicalGrouping(term=term)
          merged_index.groupings[term].relationships = list(grouping.relationships)
      else:
          # Term exists, append relationships
          merged_index.groupings[term].relationships.extend(grouping.relationships)

  # Deduplicate relationships
  return deduplicate_lexical_index(merged_index)


In [68]:
def add_obsolete_terms_to_lexical_index(oi, lexical_index):
    """
    Find all obsolete classes in an ontology, removes the 'obsolete ' prefix
    from their labels, and adds them to an existing lexical index.
    Also applies punctuation normalization.

    Parameters
    ----------
    oi : BasicOntologyInterface
        The ontology interface to search for obsolete classes
    lexical_index : LexicalIndex
        The existing lexical index to add the processed obsolete terms to

    Returns
    -------
    LexicalIndex
        The updated lexical index with obsolete terms added
    """

    # Get all obsolete classes using the proper method
    obsolete_classes = list(oi.obsoletes())
    print(f"Found {len(obsolete_classes)} obsolete classes")

    # Define punctuation normalization rule
    punctuation_rule = Synonymizer(
        description="Replace all punctuation with spaces",
        match=r"[^\w\s]|_",  # Match any non-alphanumeric, non-whitespace character or underscore
        replacement=r" "  # Replace with a space
    )

    # Process each obsolete class
    for obsolete_entity in obsolete_classes:
        # Get the original label
        orig_label = oi.label(obsolete_entity)
        if not orig_label:
            continue  # Skip entities without labels

        # Remove "obsolete " prefix if present
        if orig_label.lower().startswith("obsolete "):
            clean_label = re.sub(r'^obsolete\s+', '', orig_label, flags=re.IGNORECASE)
        else:
            clean_label = orig_label

        # Process the main label first with all pipelines
        for pipeline_name, pipeline in lexical_index.pipelines.items():
            # Start with the clean label (obsolete prefix removed)
            transformed_label = clean_label

            # First apply punctuation normalization
            punctuation_transformation = LexicalTransformation(
                TransformationType.Synonymization,
                params=[punctuation_rule]
            )
            result = apply_transformation(transformed_label, punctuation_transformation)
            if isinstance(result, tuple):
                transformed_label = result[1]
            else:
                transformed_label = result

            # Then apply the pipeline's regular transformations
            for transformation in pipeline.transformations:
                result = apply_transformation(transformed_label, transformation)
                if isinstance(result, tuple):
                    transformed_label = result[1]
                else:
                    transformed_label = result

            # Create relationship for the main label
            rel = RelationshipToTerm(
                predicate='rdfs:label',
                element=obsolete_entity,
                element_term=clean_label,
                pipeline=[pipeline_name],
                synonymized=False
            )

            # Add to lexical index
            if transformed_label not in lexical_index.groupings:
                lexical_index.groupings[transformed_label] = LexicalGrouping(term=transformed_label)
                lexical_index.groupings[transformed_label].relationships = [rel]
            else:
                # Check if this relationship already exists to avoid duplicates
                exists = False
                for existing_rel in lexical_index.groupings[transformed_label].relationships:
                    if (existing_rel.element == rel.element and
                            existing_rel.predicate == rel.predicate):
                        exists = True
                        break

                if not exists:
                    lexical_index.groupings[transformed_label].relationships.append(rel)

        # Now process all other aliases
        alias_map = oi.entity_alias_map(obsolete_entity)

        # For each pipeline and each alias (skipping the label we just processed)
        for pipeline_name, pipeline in lexical_index.pipelines.items():
            for predicate, aliases in alias_map.items():
                for alias in aliases:
                    # Skip the main label as we already processed it
                    if alias == orig_label and predicate == 'rdfs:label':
                        continue

                    # Remove "obsolete " prefix if present
                    if alias.lower().startswith("obsolete "):
                        clean_alias = re.sub(r'^obsolete\s+', '', alias, flags=re.IGNORECASE)
                    else:
                        clean_alias = alias

                    # Apply transformations including punctuation normalization
                    transformed_alias = clean_alias

                    # First apply punctuation normalization
                    result = apply_transformation(transformed_alias, punctuation_transformation)
                    if isinstance(result, tuple):
                        transformed_alias = result[1]
                    else:
                        transformed_alias = result

                    # Then apply the pipeline's regular transformations
                    for transformation in pipeline.transformations:
                        result = apply_transformation(transformed_alias, transformation)
                        if isinstance(result, tuple):
                            transformed_alias = result[1]
                        else:
                            transformed_alias = result

                    # Create relationship for the alias
                    rel = RelationshipToTerm(
                        predicate=predicate,
                        element=obsolete_entity,
                        element_term=clean_alias,
                        pipeline=[pipeline_name],
                        synonymized=False
                    )

                    # Add to lexical index
                    if transformed_alias not in lexical_index.groupings:
                        lexical_index.groupings[transformed_alias] = LexicalGrouping(term=transformed_alias)
                        lexical_index.groupings[transformed_alias].relationships = [rel]
                    else:
                        # Check if this relationship already exists to avoid duplicates
                        exists = False
                        for existing_rel in lexical_index.groupings[transformed_alias].relationships:
                            if (existing_rel.element == rel.element and
                                    existing_rel.predicate == rel.predicate):
                                exists = True
                                break

                        if not exists:
                            lexical_index.groupings[transformed_alias].relationships.append(rel)

    # Apply deduplication to the entire index
    return deduplicate_lexical_index(lexical_index)


In [54]:
def create_punctuation_insensitive_index(oi):
  """
  Create a lexical index that is insensitive to punctuation by replacing all punctuation with spaces

  Parameters
  ----------
  oi : BasicOntologyInterface
      The ontology interface

  Returns
  -------
  LexicalIndex
      A lexical index with punctuation normalization
  """
  # Define synonymizer rules for punctuation
  punctuation_rules = [
      # Replace all punctuation with spaces
      # This covers hyphens, periods, commas, semicolons, etc.
      Synonymizer(
          description="Replace all punctuation with spaces",
          match=r"[^\w\s]|_",  # Match any non-alphanumeric, non-whitespace character or underscore
          replacement=r" "      # Replace with a space
      )
  ]

  # Create a pipeline with punctuation normalization
  pipeline = LexicalTransformationPipeline(
      name="punctuation_insensitive",
      transformations=[
          # First apply synonymization for punctuation
          LexicalTransformation(TransformationType.Synonymization, params=punctuation_rules),
          # Then standard normalizations - WhitespaceNormalization will consolidate multiple spaces
          LexicalTransformation(TransformationType.CaseNormalization),
          LexicalTransformation(TransformationType.WhitespaceNormalization)
      ]
  )

  # Create the lexical index with our pipeline and rules
  lexical_index = create_lexical_index(
      oi,
      pipelines=[pipeline],
      synonym_rules=punctuation_rules
  )

  return lexical_index



In [41]:
envo_adapter = get_adapter(envo_adapter_string)

In [42]:
po_adapter = get_adapter(envo_adapter_string)

In [50]:
# envo_po_tai = TextAnnotatorInterface()
# envo_po_tai.lexical_index = updated_index

In [51]:
# annotation_results = envo_po_tai.annotate_text("the is some agricultural soil in my desert biome. I better check the root system width in my human associated habitat")

In [52]:
# annotation_results = list(annotation_results)

In [53]:
# annotation_results

[TextAnnotation(predicate_id='oio:hasBroadSynonym', object_id='ENVO:00000172', object_label='desert', object_categories=[], object_source=None, confidence=None, match_string='desert', is_longest_match=None, matches_whole_text=False, match_type=None, info=None, object_aliases=[], subject_start=37, subject_end=42, subject_label=None, subject_source=None, subject_text_id=None),
 TextAnnotation(predicate_id='oio:hasBroadSynonym', object_id='ENVO:00000173', object_label='desert', object_categories=[], object_source=None, confidence=None, match_string='desert', is_longest_match=None, matches_whole_text=False, match_type=None, info=None, object_aliases=[], subject_start=37, subject_end=42, subject_label=None, subject_source=None, subject_text_id=None),
 TextAnnotation(predicate_id='oio:hasBroadSynonym', object_id='ENVO:00000183', object_label='desert', object_categories=[], object_source=None, confidence=None, match_string='desert', is_longest_match=None, matches_whole_text=False, match_type=

In [74]:
envo_pi_ix = create_punctuation_insensitive_index(envo_adapter)

ERROR:root:Skipping statements(subject=ENVO:01001644,predicate=oio:hasDbXref,object=None,value=Carbonate which is formed as the result of some biological process.,datatype=None,language=None,); ValueError: Carbonate which is formed as the result of some biological process. is not a valid URI or CURIE


In [75]:
envo_pi_with_obsoletes_ix = add_obsolete_terms_to_lexical_index(envo_adapter, envo_pi_ix)

Found 460 obsolete classes


In [76]:
# annotation_results
po_pi_ix = create_punctuation_insensitive_index(po_adapter)

ERROR:root:Skipping statements(subject=ENVO:01001644,predicate=oio:hasDbXref,object=None,value=Carbonate which is formed as the result of some biological process.,datatype=None,language=None,); ValueError: Carbonate which is formed as the result of some biological process. is not a valid URI or CURIE


In [78]:
envo_po_ix = None

try:
    # Merge with pipeline validation
    envo_po_ix = merge_lexical_indexes(envo_pi_with_obsoletes_ix, po_pi_ix)

except ValueError as e:
    print(f"Merging failed: {e}")
    # Handle pipeline incompatibility


In [79]:
save_lexical_index(envo_po_ix, "envo_inc_obsoletes_po_punct_free_index.yaml")