In [1]:
from json import JSONDecodeError

import pandas as pd
from IPython.core.debugger import prompt

In [2]:
import os

# PyTorch chooses between "raise" and "fall back to CPU" for ops the MPS backend
# lacks when the library is first loaded, so the flag must be in the environment
# before the first import of torch (transformers pulls torch in as well).
# setdefault keeps any value that was already exported in the shell.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

from transformers import AutoTokenizer, AutoModel
import torch

In [3]:
# One checkpoint name shared by tokenizer and encoder so the two cannot drift apart.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Weights are requested explicitly as float32.
model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)

# Example tokenizer call (kept for reference, not executed):
# inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [4]:
import os
# Ask for CPU fallback on ops the MPS backend is missing; setdefault leaves an
# already-exported value untouched.
# NOTE(review): presumably this only has an effect when it runs before torch is
# first imported in the kernel - confirm against the cell order.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Use the MPS backend when torch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Move the encoder to the chosen device, then switch it to eval mode.
model.to(device)
model.eval()

@torch.no_grad()
def meanpooling(model_output, attention_mask):
    """Average the token vectors of each sequence, leaving out masked positions.

    model_output: indexable whose first item holds the per-token hidden states
        (documented by the original author as [B, L, H]).
    attention_mask: [B, L] mask; zeros mark positions excluded from the mean.
    returns: tensor [B, H] - masked sum divided by the masked token count.
    """
    hidden_states = model_output[0]
    # Trailing axis lets the mask broadcast over H; type_as aligns it with the
    # hidden states so the product below needs no further casting.
    weights = attention_mask.unsqueeze(-1).type_as(hidden_states)
    masked_total = torch.sum(hidden_states * weights, dim=1)
    # Lower bound keeps an all-zero mask from dividing by zero.
    kept_tokens = torch.clamp(weights.sum(dim=1), min=1e-9)
    return masked_total / kept_tokens

@torch.no_grad()
def embed(texts):
    """
    Encode text into mean-pooled sentence embeddings.

    texts: str or List[str]
    returns: np.ndarray [B, d] float32

    The whole function runs under torch.no_grad(): the result leaves as a NumPy
    array, so recording an autograd graph for the forward pass would only cost
    memory.
    """
    if isinstance(texts, str):
        texts = [texts]
    enc = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # move inputs to the same device as the model (MPS when available, else CPU)
    enc = {k: v.to(device) for k, v in enc.items()}
    # forward in fp32 (no autocast on MPS)
    outputs = model(**enc)
    embs = meanpooling(outputs, enc['attention_mask'])      # torch [B, d] on `device`
    # IMPORTANT: materialize on CPU + contiguous before numpy
    return embs.detach().to('cpu').contiguous().numpy().astype(np.float32)

# Smoke test: a single string is wrapped into a one-item batch, so one row is expected.
embed('EGFR inhibition reduces tumor growth in glioblastoma; however, resistance via PTEN loss emerges').shape

(1, 384)

In [5]:
from pathlib import Path
from knowledge_engine.ontology_manager import OntologyManager

# Directory handed to OntologyManager as `ontology_dir`.
ONTOLOGY_DIR = Path('../data/ontologies')

# pronto backend on, owlready backend off.
om = OntologyManager(
    ontology_dir=ONTOLOGY_DIR,
    use_pronto=True,
    use_owlready=False,
)

# Hand the manager the sentence encoder defined earlier in the notebook.
om.register_embedder(embed)


Loading ontologies...
Loading mondo with pronto...


  self.pronto_ontologies[onto_name] = pronto.Ontology(str(obo_file))
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ mondo: 56695 terms
Loading hp with pronto...
2025-10-11 17:17:48,386 - chardet.charsetprober - DEBUG - EUC-TW Taiwan prober hit error at byte 989
2025-10-11 17:17:48,399 - chardet.charsetprober - DEBUG - utf-8  confidence = 0.505
2025-10-11 17:17:48,400 - chardet.charsetprober - DEBUG - SHIFT_JIS Japanese confidence = 0.01
2025-10-11 17:17:48,400 - chardet.charsetprober - DEBUG - EUC-JP Japanese confidence = 0.01
2025-10-11 17:17:48,400 - chardet.charsetprober - DEBUG - GB2312 Chinese confidence = 0.01
2025-10-11 17:17:48,400 - chardet.charsetprober - DEBUG - EUC-KR Korean confidence = 0.01
2025-10-11 17:17:48,401 - chardet.charsetprober - DEBUG - CP949 Korean confidence = 0.01
2025-10-11 17:17:48,401 - chardet.charsetprober - DEBUG - Big5 Chinese confidence = 0.01
2025-10-11 17:17:48,401 - chardet.charsetprober - DEBUG - EUC-TW not active
2025-10-11 17:17:48,401 - chardet.charsetprober - DEBUG - Johab Korean confidence = 0.01
2025-10-11 17:17:48,402 - chardet.charsetprober - DEBUG -

  self.pronto_ontologies[onto_name] = pronto.Ontology(str(obo_file))


✅ hp: 19653 terms
Loading go with pronto...
✅ go: 48106 terms
Loading cl with pronto...
❌ Failed to load cl with pronto: expected EOL, QuotedString, RFC3987_IriPctEncoded, or RFC3987_IriUCSChar (cl.obo, line 181636)
Loading uberon with pronto...


  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_clause_header(clause, metadata, self.ont)
  process_cl

❌ Failed to load uberon with pronto: expected EOL, QuotedString, RFC3987_IriPctEncoded, or RFC3987_IriUCSChar (uberon.obo, line 166503)
Building unified ontology graph...
✅ Loaded 3 pronto + 0 owlready2 ontologies
📊 Total terms in graph: 115313


In [6]:
# Toy abstract and the entity mentions picked out of it, used by later cells.
abstract = "EGFR inhibition reduces tumor growth in glioblastoma; however, resistance via PTEN loss emerges..."
mentions = [
    "glioblastoma",
    "EGFR",
    "PTEN",
]

In [7]:
# Shape check for both accepted input forms: a bare string, then a list of strings.
for sample in ('EGFR inhibition ...', ['a','b','c']):
    print(embed(sample).shape)      # -> (1, d), then (3, d)

(1, 384)
(3, 384)


In [8]:
# NOTE(review): this calls a private OntologyManager method; presumably it prepares
# the per-term text that later gets embedded - confirm in ontology_manager.
om._build_text_corpus()

In [9]:
# om.precompute_embeddings(batch_size=15)

In [10]:
# np.savez_compressed("./data/ontology_embeddings.npz",
#                     ids=np.array(om._node_ids),
#                     embs=om._node_embeddings)

In [11]:
# Restore the cached term ids and embedding matrix onto the manager.
# np.load returns an archive object for .npz files that keeps the file open
# until it is closed, so both arrays are read inside a context manager.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load caches you produced yourself. If `ids` was
# saved as a plain string array the flag can probably be dropped (confirm).
# NOTE(review): this path starts with ./data while other cells read from ../data -
# confirm which directory is intended.
with np.load("./data/ontology_embeddings.npz", allow_pickle=True) as data:
    om._node_ids = list(data['ids'])
    om._node_embeddings = data['embs']

In [12]:
# Values passed as `allowed_biolink_types` / `allowed_relations` below.
allowed_types = {"Disease", "ChemicalSubstance", "BiologicalProcess", "MolecularActivity", "Cell"}
allowed_rels = {"is_a", "part_of", "regulates", "positively_regulates", "negatively_regulates", "has_part"}

# Tunable knobs gathered in one place so the call itself stays short.
subgraph_params = dict(
    top_k_retrieval=1500,
    seed_top_k=80,
    expand_hops=2,
    final_top_k=160,
    allowed_biolink_types=allowed_types,
    allowed_relations=allowed_rels,
)

latent = om.build_latent_subgraph(text=abstract, mentions=mentions, **subgraph_params)

# node_ids, node_feats, edges = om.pack_for_model(latent)

In [21]:
import bioc


def parse_bioc_document(doc):
    """Flatten one BioC document into its text plus gold entities and relations.

    doc: object exposing `passages` (each with `text` and `annotations`) and
        `relations`; annotations and relations each carry an `infons` mapping.
    returns: dict with
        "text": passage texts concatenated in order (a missing text counts as ""),
        "entities": {identifier: {"text", "type"}}; when several annotations
            share an identifier the last one wins,
        "relations": list of (head_entity, relation_type, tail_entity, novel).

    Relations whose head or tail identifier has no matching annotation are
    skipped. Previously only an unknown head was skipped while an unknown tail
    raised KeyError. Annotations without an "identifier" infon are ignored,
    since no relation could refer to them.
    """
    # NOTE(review): passages are joined with no separator, so consecutive
    # passages run together - confirm whether a space is wanted.
    full_text = "".join(p.text or "" for p in doc.passages)

    entities = {}
    for passage in doc.passages:
        for ann in passage.annotations:
            eid = ann.infons.get("identifier")
            if eid is None:
                continue
            entities[eid] = {
                "text": ann.text,
                "type": ann.infons.get("type", "Entity"),
            }

    relations = []
    for rel in doc.relations:
        # .get on both ends: an unknown or missing id yields None and the
        # relation is dropped by the check below.
        head = entities.get(rel.infons.get("entity1"))
        tail = entities.get(rel.infons.get("entity2"))
        predicate = rel.infons.get("type", "related_to")
        # Any value other than the literal 'No' counts as novel; a missing
        # flag defaults to 'No'.
        novel = rel.infons.get("novel", 'No') != 'No'
        if head and tail and predicate:
            relations.append((head, predicate, tail, novel))

    return {
        "text": full_text,
        "entities": entities,       # gold supervision
        "relations": relations      # gold supervision
    }

# Path of the BioC XML file to read.
biored_train_path = '../data/BioRED/Train.BioC.XML'
with open(biored_train_path, 'r', encoding='utf-8') as xml_file:
    collection = bioc.load(xml_file)

# Parse the first document and show its relations as a quick check.
first_doc = collection.documents[0]
test = parse_bioc_document(first_doc)
test['relations']

[({'text': 'HNF-6', 'type': 'GeneOrGeneProduct'},
  'Association',
  {'text': 'Type II diabetes', 'type': 'DiseaseOrPhenotypicFeature'},
  False),
 ({'text': 'glucose', 'type': 'ChemicalEntity'},
  'Positive_Correlation',
  {'text': 'insulin', 'type': 'GeneOrGeneProduct'},
  False),
 ({'text': 'glucose', 'type': 'ChemicalEntity'},
  'Association',
  {'text': 'Type II diabetes', 'type': 'DiseaseOrPhenotypicFeature'},
  False)]