# HPO ontology loading

In [None]:
import os
import time 
import tqdm 
import json
import chromadb
import voyageai
import pickle as pkl
from dotenv import load_dotenv
from rapidfuzz import process, fuzz
from langchain_chroma import Chroma
from rapidfuzz.utils import default_process
from langchain_core.documents import Document
from langchain.retrievers import BM25Retriever
from langchain_voyageai import VoyageAIEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv(override=True)
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")
PROJECT_DIR=os.environ["PROJECT_DIR"]

In [3]:
with open(os.path.join(PROJECT_DIR, "resources", "hpo_es.json"), "r") as fp:
    hpo = json.load(fp)

In [34]:
#Read the desired fields of the ontology
fields = ["esp_name", "esp_def", 'esp_synonyms', "esp_addons"] # "is_a",
hpo_dict = {}

for element in hpo:
    hpo_dict[element["id"]] = {field:element[field] for field in fields if field in element}

In [35]:
def_count = 0
name_count = 0
synonym_count = 0
addon_count = 0
for k,v in hpo_dict.items():
    if "esp_def" in v:
        def_count += 1
    if "esp_name" in v:
        name_count += 1
    if "esp_synonyms" in v:
        synonym_count += 1
    if "esp_addons" in v:
        addon_count += 1

print(f"""Total docs: {len(hpo_dict)}
Total elements with a spanish name: {name_count}
Total elements with a spanish definition: {def_count}
Total elements with a spanish synonym: {synonym_count}
Total elements with a spanish addon: {addon_count}
""")

Total docs: 19077
Total elements with a spanish name: 19077
Total elements with a spanish definition: 16504
Total elements with a spanish synonym: 10852
Total elements with a spanish addon: 1764



## Process lineage

In [20]:
def clean_lineage(s):
    return s.split('!')[0].strip()

In [21]:
#clean lineage
for k,v in hpo_dict.items():
    if "is_a" in v:
        if isinstance(v["is_a"], list):
            for i, parent in enumerate(v["is_a"]):
                v["is_a"][i] = clean_lineage(parent)
        else:
            v["is_a"] = clean_lineage(v["is_a"] )

In [22]:
#clean lineage
def find_parent(hpo_code, hpo_dict=hpo_dict):
    lineage = hpo_dict[hpo_code]["is_a"]
    if isinstance(lineage, list):
        parents = set(lineage)
        for parent in lineage:
            parents.update(find_parent(parent))
        return parents
    
    if "is_a" not in hpo_dict[lineage]:
        return []
    
    return [lineage] + list(find_parent(lineage))
    

_ = {v.update({"lineage": find_parent(k)}) for k,v in hpo_dict.items() if "is_a" in v}

## Create chunks

In [25]:
def add_to_names_dict(terms, hpo_code, names_dict):
    for term in terms:
        term = term.lower()
        if term in names_dict and hpo_code not in names_dict[term]:
            names_dict[term] += [hpo_code]
        else:
            names_dict[term] = [hpo_code]
    return names_dict

In [None]:
documents_text = []
metadata_list = []
names_dict = {}
for hpo_code, hpo_values in hpo_dict.items():
    metadata = {"hpo_id":hpo_code}
    cleaned_info = []
    if "esp_name" in hpo_values:
        cleaned_info.append(hpo_values["esp_name"])
        names_dict = add_to_names_dict([hpo_values["esp_name"]], hpo_code, names_dict)
    if "esp_synonyms" in hpo_values:
        syn_list = hpo_values["esp_synonyms"] if isinstance (hpo_values["esp_synonyms"], list) else [hpo_values["esp_synonyms"]]
        syn_list = [str(s) for s in syn_list]
        cleaned_info += syn_list
        names_dict = add_to_names_dict(syn_list, hpo_code, names_dict)
    if "esp_def" in hpo_values:
        cleaned_info.append(hpo_values["esp_def"])
    cleaned_info = [str(s) for s in cleaned_info]
    cleaned_info = [s.strip() + "." if not s.strip().endswith(".") else s.strip() for s in cleaned_info]
    cleaned_info = " ".join(cleaned_info)
    documents_text.append(cleaned_info)
    if "lineage" in hpo_values:
        metadata["lineage"] = "->".join(hpo_values["lineage"])
    metadata_list.append(metadata)
ids_list = [v['hpo_id'] for v in metadata_list]

## Create Voyage Embeddings

In [None]:
MODEL_NAME = "voyage-3"
embeddings_model = VoyageAIEmbeddings(voyage_api_key=VOYAGE_API_KEY,model="voyage-3")

In [None]:
idx_to_delete = [i for i,doc in enumerate(documents_text) if not isinstance(doc,str)][0]
documents_text.pop(idx_to_delete)
metadata.pop(idx_to_delete)

In [None]:
vo = voyageai.Client(api_key=VOYAGE_API_KEY)
batch_size = 1000
embeddings= []
for i in tqdm.tqdm(range(len(embeddings), len(documents_text), batch_size), desc="Batch: " ):       
    response= vo.embed(
        documents_text[i:i + batch_size], model=MODEL_NAME, input_type="document"
    )
    embeddings += response.embeddings

Batch: 100%|██████████| 23/23 [00:30<00:00,  1.32s/it]


In [None]:
#guardar los resultados 
with open( os.path.join(PROJECT_DIR, "./resources/Voyage Embeddings/embeddings_w_synonyms.pkl"), "wb") as fp:
    pkl.dump(embeddings, fp)

with open( os.path.join(PROJECT_DIR, "./resources/Voyage Embeddings/docs_w_synonyms.pkl"), "wb") as fp:
    pkl.dump(documents_text, fp)

In [None]:
#cargar los resultados
with open(f"{PROJECT_DIR}/resources/Voyage Embeddings/embeddings_w_synonyms.pkl", "rb") as fp:
    embeddings = pkl.load(fp)
with open(f"{PROJECT_DIR}/resources/Voyage Embeddings/docs_w_synonyms.pkl", "rb") as fp:
    docs = pkl.load(fp)

## Load data into chroma client

In [5]:
chroma_client = chromadb.HttpClient(host='localhost', port=8001)

In [None]:
# chroma_client = chromadb.PersistentClient(path="../../chroma_db/Voyage3")
collection = chroma_client.get_or_create_collection("hpo_ontology_esp_FULL")
ids_list = [metadata[i]['hpo_id'] + str(i) for i in list(range(len(embeddings)))]
BATCH_SIZE = 1000
for i in range(0, len(embeddings), BATCH_SIZE):
        collection.add(
                embeddings=embeddings[i: i+BATCH_SIZE],
                documents=documents_text[i: i+BATCH_SIZE],
                metadatas=metadata[i: i+BATCH_SIZE],
                ids = ids_list[i: i+BATCH_SIZE]
        )

In [9]:
langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="hpo_ontology_esp_FULL",
    embedding_function=embeddings_model,
)

In [10]:
print("There are", langchain_chroma._collection.count(), "documents in the collection")

There are 19077 documents in the collection


In [29]:
langchain_chroma.as_retriever().invoke("Tiene dolor en el riñon izquierdo")

[Document(id='HP:0100542', metadata={'hpo_id': 'HP:0100542', 'lineage': 'HP:0012210->HP:0000077->HP:0010935->HP:0000079->HP:0000119->HP:0000118'}, page_content='Localización anormal de los riñones. Localización anormal de los riñones. Un lugar anormal del riñón.'),
 Document(id='HP:0008738', metadata={'hpo_id': 'HP:0008738', 'lineage': 'HP:0000075->HP:0000119->HP:0005217->HP:0001438->HP:0010935->HP:0000118->HP:0025031->HP:0012210->HP:0000077->HP:0000079'}, page_content='Riñón parcialmente duplicado. Riñón parcialmente duplicado. La presencia de un riñón parcialmente duplicado.'),
 Document(id='HP:0030157', metadata={'hpo_id': 'HP:0030157', 'lineage': 'HP:0012531->HP:0025142->HP:0000118'}, page_content='Dolor de costado. Dolor de costado. Dolor de riñón. Sensación desagradable caracterizada por molestias físicas (como pinchazos, palpitaciones o dolores) y que se percibe como originada en el flanco.'),
 Document(id='HP:0000085', metadata={'lineage': 'HP:0100542->HP:0012210->HP:0000077->H

## Load data into BM25 Retriever

In [None]:
docs_list = []
for id, metadata, page_content in zip(ids_list, metadata_list, documents_text):
    docs_list.append(Document(id=id, metadata=metadata, page_content=page_content))

keyword_retriever = BM25Retriever.from_documents(docs_list)
keyword_retriever.invoke("convulsiones inducibles")

In [150]:
with open("../../resources/keyword_retriever.pkl", 'wb') as fp:
    pkl.dump(keyword_retriever, fp)

## Fuzzy matching

In [None]:
# Build search list: [(text, phenotype_id), ...]
search_entries = []
ids_list2 = []
for id, docs in zip(ids_list, documents_text):
    for doc in docs:
        search_entries.append(doc)
        ids_list2.append(id)

In [None]:
class FuzzyRetriever:
    search_entries = search_entries
    ids_list = ids_list2

    def invoke(self, query):
        results = process.extract(query, search_entries,
                                  scorer=fuzz.QRatio, limit=10,
                                  processor=default_process)
        return [(self.ids_list[result[2]], result[0], result[1]) for result in results]

In [34]:
fuzzyretriever = FuzzyRetriever()
fuzzyretriever.invoke("convulsiones inducibles")

[('HP:0007332', 'Convulsiones hemifaciales.', 79.16666666666666),
 ('HP:0007332', 'Convulsiones hemifaciales.', 79.16666666666666),
 ('HP:0007359', 'Convulsiones focales.', 79.06976744186046),
 ('HP:0007359', 'Convulsiones focales.', 79.06976744186046),
 ('HP:0002373', 'Convulsiones febriles.', 77.27272727272727),
 ('HP:0002373', 'Convulsiones febriles.', 77.27272727272727),
 ('HP:0002373', 'Convulsiones inducidas por fiebre.', 75.0),
 ('HP:0033349', 'Convulsiones crecientes.', 73.91304347826086),
 ('HP:0010819', 'Convulsiones atónicas.', 72.72727272727273),
 ('HP:0033349', 'Convulsiones en serie.', 72.72727272727273)]

In [55]:
with open("../../resources/fuzzy_retriever.pkl", 'wb') as fp:
    pkl.dump({"search_entries": search_entries, "ids":ids_list2}, fp)