# HPO ontology loading

In [1]:
import json
import os

In [2]:
RESOURCES_DIR="../../resources"

In [3]:
with open(os.path.join(RESOURCES_DIR, "hpo_es.json"), "r") as fp:
    hpo = json.load(fp)

In [4]:
#Read the desired fields of the ontology
fields = ["esp_name", "esp_def", "is_a"]
hpo_dict = {}

for element in hpo:
    hpo_dict[element["id"]] = {field:element[field] for field in fields if field in element}

Procesar linaje

In [5]:
def clean_lineage(s):
    return s.split('!')[0].strip()

In [6]:
#clean lineage
for k,v in hpo_dict.items():
    if "is_a" in v:
        if isinstance(v["is_a"], list):
            for i, parent in enumerate(v["is_a"]):
                v["is_a"][i] = clean_lineage(parent)
        else:
            v["is_a"] = clean_lineage(v["is_a"] )

In [8]:
#clean lineage
def find_parent(hpo_code, hpo_dict=hpo_dict):
    lineage = hpo_dict[hpo_code]["is_a"]
    if isinstance(lineage, list):
        parents = set(lineage)
        for parent in lineage:
            parents.update(find_parent(parent))
        return parents
    
    if "is_a" not in hpo_dict[lineage]:
        return []
    
    return [lineage] + list(find_parent(lineage))
    

_ = {v.update({"lineage": find_parent(k)}) for k,v in hpo_dict.items() if "is_a" in v}

Creating info for chroma db

In [39]:
documents_text = []
metadata_list = []
for hpo_code, hpo_values in hpo_dict.items():
    metadata = {"hpo_id":hpo_code}
    cleaned_info = []
    if "esp_name" in hpo_values:
        cleaned_info.append(hpo_values["esp_name"])
    if "esp_def" in hpo_values:
        cleaned_info.append(hpo_values["esp_def"])
    # if "esp_synonyms" in hpo_values:
    #     cleaned_info + hpo_values["synonyms"]
    cleaned_info = [s.strip() + "." if not s.strip().endswith(".") else s.strip() for s in cleaned_info]
    cleaned_info = " ".join(cleaned_info)
    documents_text.append(cleaned_info)
    if "lineage" in hpo_values:
        metadata["lineage"] = "->".join(hpo_values["lineage"])
    metadata_list.append(metadata)

In [36]:
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
MODEL_NAME = "BAAI/bge-small-en-v1.5"

embeddings = FastEmbedEmbeddings(model_name=MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:11<00:00,  2.22s/it]


In [40]:
from langchain_chroma import Chroma
chroma_db = Chroma.from_texts(documents_text, embeddings, metadata_list
                              , collection_name = 'hpo_ontology', persist_directory= "../../chroma_db")

In [47]:
chroma_db.max_marginal_relevance_search("Presencia de múltiples quistes en el riñón izquierdo")

[Document(id='3eac21dd-316f-4c65-bb92-bd75111f1d23', metadata={'hpo_id': 'HP:0005562', 'lineage': 'HP:0000107->HP:0012210->HP:0000077->HP:0010935->HP:0000079->HP:0000119->HP:0000118'}, page_content='Quistes renales múltiples. Presencia de numerosos quistes en el riñón.'),
 Document(id='64109213-992c-45f6-a92e-7aa50b3cf401', metadata={'hpo_id': 'HP:0034946', 'lineage': 'HP:0000492->HP:0030669->HP:0032039->HP:0000315->HP:0000271->HP:0000234->HP:0000152->HP:0000118'}, page_content='Quistes múltiples en el borde del párpado.'),
 Document(id='894c7af0-1b8b-46b8-8ad8-c3ec213bc513', metadata={'hpo_id': 'HP:0001571', 'lineage': 'HP:0011079->HP:0000706->HP:0006292->HP:0000164->HP:0000163->HP:0031816->HP:0000153->HP:0000271->HP:0000234->HP:0000152->HP:0000118'}, page_content='Múltiples dientes impactados. La presencia de múltiples dientes impactados.'),
 Document(id='d03a976d-9480-4891-bcc1-4bd1ec6e90a1', metadata={'hpo_id': 'HP:0000105', 'lineage': 'HP:0012210->HP:0000077->HP:0010935->HP:000007