# HPO ontology loading

In [123]:
import json
import os
import pickle as pkl
import time

import chromadb
import tqdm
import voyageai
from chromadb.utils.batch_utils import create_batches
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_voyageai import VoyageAIEmbeddings

In [124]:
# Read settings from the local .env file; override=True lets values from the
# file replace variables that are already present in the environment.
load_dotenv(override=True)

# Folder holding the ontology export and the generated artifacts.
RESOURCES_DIR = "../../resources"
# None when the variable is not set.
VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY")

In [134]:
# Load the Spanish HPO ontology export. The encoding is stated explicitly
# because the file carries accented Spanish text and open() would otherwise
# fall back to the platform's locale encoding.
with open(os.path.join(RESOURCES_DIR, "hpo_es.json"), "r", encoding="utf-8") as fp:
    hpo = json.load(fp)

In [135]:
# Keep only the fields of interest for each ontology entry, keyed by its id.
# A field that an entry does not have is simply left out of that entry's dict.
fields = ["esp_name", "esp_def", "esp_synonyms", "is_a"]
hpo_dict = {
    entry["id"]: {key: entry[key] for key in fields if key in entry}
    for entry in hpo
}

In [136]:
# Count how many entries carry each of the optional Spanish fields.
entries = list(hpo_dict.values())
name_count = sum("esp_name" in entry for entry in entries)
def_count = sum("esp_def" in entry for entry in entries)
synonym_count = sum("esp_synonyms" in entry for entry in entries)

print(f"""Total docs: {len(hpo_dict)}
Total elements with a spanish name: {name_count}
Total elements with a spanish definition: {def_count}
Total elements with a spanish synonym: {synonym_count}
""")

Total docs: 19077
Total elements with a spanish name: 19077
Total elements with a spanish definition: 16504
Total elements with a spanish synonym: 10852



Process lineage

In [137]:
def clean_lineage(s):
    """Return the part of ``s`` that precedes the first ``!``, without surrounding whitespace.

    When ``s`` contains no ``!`` the whole string is returned, stripped.
    """
    code, _separator, _label = s.partition("!")
    return code.strip()

In [138]:
# Reduce every "is_a" reference to the text before its first "!".
for entry in hpo_dict.values():
    if "is_a" not in entry:
        continue
    parents = entry["is_a"]
    if isinstance(parents, list):
        # Slice assignment rewrites the existing list object in place.
        parents[:] = [clean_lineage(parent) for parent in parents]
    else:
        entry["is_a"] = clean_lineage(parents)

In [139]:
# Collect the ancestors of an HPO term
def find_parent(hpo_code, hpo_dict=hpo_dict):
    """Return every ancestor of ``hpo_code`` reachable through ``is_a`` links.

    Ancestors are listed nearest first (direct parents, then their parents,
    and so on) and each one appears exactly once. Terms that have no ``is_a``
    entry of their own, i.e. the top of the hierarchy, are left out of the
    result.

    Args:
        hpo_code: Identifier of the term whose ancestors are wanted.
        hpo_dict: Mapping from term id to its fields; ``is_a`` may hold a
            single parent id or a list of parent ids. Defaults to the
            notebook-level ``hpo_dict`` as bound when this function was
            defined.

    Returns:
        list: Ancestor ids; empty when the term has no ``is_a`` entry.

    Raises:
        KeyError: If ``hpo_code`` or a referenced parent is not a key of
            ``hpo_dict``.
    """
    ancestors = []
    # Seeding with the start term keeps a cyclic reference from listing the
    # term as its own ancestor or looping forever.
    seen = {hpo_code}
    level = [hpo_code]
    while level:
        next_level = []
        for code in level:
            parents = hpo_dict[code].get("is_a", [])
            if not isinstance(parents, list):
                parents = [parents]
            for parent in parents:
                # Shared ancestors are reached through several paths; expand
                # each of them only once.
                if parent in seen:
                    continue
                seen.add(parent)
                # A parent with no "is_a" of its own is not reported and has
                # nothing further to expand.
                if "is_a" not in hpo_dict[parent]:
                    continue
                ancestors.append(parent)
                next_level.append(parent)
        level = next_level
    return ancestors
    

# Store the ancestors of every term that declares at least one parent under
# the "lineage" key. A plain loop is used because the assignment is wanted
# only for its side effect; no throwaway collection is built and `_` is left
# alone.
for hpo_code, hpo_values in hpo_dict.items():
    if "is_a" in hpo_values:
        hpo_values["lineage"] = find_parent(hpo_code)

Creating info for chroma db

In [120]:
def add_to_names_dict(terms, hpo_code, names_dict):
    """Register ``hpo_code`` under the lower-cased form of each term.

    Args:
        terms: Iterable of strings (names or synonyms) that refer to the code.
        hpo_code: Identifier to associate with every term.
        names_dict: Mapping from lower-cased term to the list of codes that
            use it. It is updated in place.

    Returns:
        dict: The same ``names_dict`` object that was passed in.
    """
    for term in terms:
        codes = names_dict.setdefault(term.lower(), [])
        # Registering a code a second time must leave the codes that other
        # terms already stored under the same text untouched.
        if hpo_code not in codes:
            codes.append(hpo_code)
    return names_dict

In [146]:
# Build, in matching order, the text of every document, its metadata and the
# term -> codes lookup stored in names_dict.
documents_text, metadata_list, names_dict = [], [], {}

for hpo_code, hpo_values in hpo_dict.items():
    fragments = []

    # The name goes into the document text and into names_dict.
    if "esp_name" in hpo_values:
        name = hpo_values["esp_name"]
        fragments.append(name)
        names_dict = add_to_names_dict([name], hpo_code, names_dict)

    # Synonyms may be a single value or a list; normalise to a list of strings.
    if "esp_synonyms" in hpo_values:
        raw_synonyms = hpo_values["esp_synonyms"]
        if not isinstance(raw_synonyms, list):
            raw_synonyms = [raw_synonyms]
        synonyms = list(map(str, raw_synonyms))
        fragments.extend(synonyms)
        names_dict = add_to_names_dict(synonyms, hpo_code, names_dict)

    if "esp_def" in hpo_values:
        fragments.append(hpo_values["esp_def"])

    # Every fragment becomes a stripped sentence that ends with a period.
    sentences = []
    for fragment in fragments:
        text = str(fragment).strip()
        sentences.append(text if text.endswith(".") else text + ".")
    documents_text.append(" ".join(sentences))

    metadata = {"hpo_id": hpo_code}
    if "lineage" in hpo_values:
        metadata["lineage"] = "->".join(hpo_values["lineage"])
    metadata_list.append(metadata)

ids_list = [meta["hpo_id"] for meta in metadata_list]

Create Voyage Embeddings

In [48]:
# Name of the embedding model; the batch embedding cell passes the same
# constant to the Voyage client.
# MODEL_NAME = "BAAI/bge-small-en-v1.5"
MODEL_NAME = "voyage-3"

# embeddings = FastEmbedEmbeddings(model_name=MODEL_NAME)
# MODEL_NAME is passed instead of repeating the literal so the LangChain
# wrapper cannot drift to a different model than the one named above.
embeddings_model = VoyageAIEmbeddings(voyage_api_key=VOYAGE_API_KEY, model=MODEL_NAME)

In [None]:
# Start with an empty list of embedding vectors.
embeddings = list()

In [90]:
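# Earlier, rate-limited version of the embedding loop (batches of 50 with a
# pause after every request), kept commented out for reference.
# vo = voyageai.Client(api_key=VOYAGE_API_KEY)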

# batch_size = 50
# tokens=0
# starttime = time.time()

# for i in tqdm.tqdm(range(len(embeddings), len(documents_text), batch_size), desc="Batch: " ):       
#     if tokens >= 9000:
#         while time.time() < starttime + 61:
#             time.sleep(1)
#         tokens = 0
#         starttime = time.time()

#     response= vo.embed(
#         documents_text[i:i + batch_size], model=MODEL_NAME, input_type="document"
#     )
#     tokens += response.total_tokens 
#     embeddings += response.embeddings

#     time.sleep(20)

Batch: 100%|██████████| 150/150 [51:56<00:00, 20.77s/it]


In [55]:
# Embed every document text with the Voyage API, batch_size texts per request.
vo = voyageai.Client(api_key=VOYAGE_API_KEY)
batch_size = 1000
embeddings = []
# The list is rebuilt from scratch on every run, so batching always starts at 0.
for start in tqdm.tqdm(range(0, len(documents_text), batch_size), desc="Batch: "):
    response = vo.embed(
        documents_text[start:start + batch_size], model=MODEL_NAME, input_type="document"
    )
    embeddings.extend(response.embeddings)

# The vectors are paired with documents by position further down, so a count
# mismatch must not go unnoticed.
assert len(embeddings) == len(documents_text), (
    f"expected {len(documents_text)} embeddings, got {len(embeddings)}"
)


Batch: 100%|██████████| 20/20 [00:37<00:00,  1.89s/it]


In [56]:
# Number of embedding vectors currently held in memory.
len(embeddings)

19533

In [60]:
# Persist the ids and the document texts (same order) under the resources
# folder; the path is built from RESOURCES_DIR like the ontology load.
docs_path = os.path.join(RESOURCES_DIR, "Voyage Embeddings", "docs_w_synonyms.pkl")
with open(docs_path, "wb") as fp:
    pkl.dump({"ids": ids_list, "docs": documents_text}, fp)

In [58]:
# Persist the embedding vectors under the resources folder; the path is built
# from RESOURCES_DIR like the ontology load.
embeddings_path = os.path.join(RESOURCES_DIR, "Voyage Embeddings", "embeddings_w_synonyms.pkl")
with open(embeddings_path, "wb") as fp:
    pkl.dump(embeddings, fp)

In [None]:
# Reload the cached vectors from the location the save cell writes to, rather
# than from the current working directory.
# NOTE: pickle.load can execute code embedded in the file; only load pickles
# produced by this notebook.
embeddings_path = os.path.join(RESOURCES_DIR, "Voyage Embeddings", "embeddings_w_synonyms.pkl")
with open(embeddings_path, "rb") as fp:
    embeddings = pkl.load(fp)

In [11]:
# NOTE(review): "docs.pkl" is resolved against the working directory and no
# cell visible here writes it (the save cell writes "docs_w_synonyms.pkl"
# under the resources folder) — confirm this is the intended file.
with open("docs.pkl", mode="rb") as docs_file:
    docs = pkl.load(docs_file)

In [62]:
# Open (or create) the persistent collection and insert every document with
# its precomputed embedding and metadata.
chroma_client = chromadb.PersistentClient(path="../../chroma_db/Voyage3")
collection = chroma_client.get_or_create_collection("hpo_ontology_esp_FULL")

# The four lists are matched by position; check them before writing so a
# mismatch cannot leave the collection partially filled.
assert len(embeddings) == len(documents_text) == len(metadata_list) == len(ids_list), (
    "ids, embeddings, metadatas and documents must have the same length"
)

# Chroma rejects an add() call larger than the client's maximum batch size,
# so the records are inserted in batches sized by the client itself.
for batch_ids, batch_embeddings, batch_metadatas, batch_documents in create_batches(
    api=chroma_client,
    ids=ids_list,
    embeddings=embeddings,
    metadatas=metadata_list,
    documents=documents_text,
):
    collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
        documents=batch_documents,
    )

In [122]:
# Persist the term -> HPO codes lookup under the resources folder; the path is
# built from RESOURCES_DIR like the ontology load.
names_dict_path = os.path.join(RESOURCES_DIR, "names_dict.pkl")
with open(names_dict_path, "wb") as fp:
    pkl.dump(names_dict, fp)

In [63]:
# LangChain wrapper around the collection, reusing the client opened above.
langchain_chroma = Chroma(
    collection_name="hpo_ontology_esp_FULL",
    embedding_function=embeddings_model,
    client=chroma_client,
)

In [64]:
# Report how many documents the wrapped collection currently holds.
doc_count = langchain_chroma._collection.count()
print("There are", doc_count, "documents in the collection")

There are 19533 documents in the collection


In [132]:
# Open the persisted collection directly from its directory on disk.
vectordb = Chroma(
    collection_name="hpo_ontology_esp_FULL",
    embedding_function=embeddings_model,
    persist_directory="../../chroma_db/Voyage3",
)

In [None]:
# search_kwargs has to be a dict of keyword arguments for the search call, not
# a string; where_document keeps only documents whose text contains the term.
vectordb.as_retriever(search_kwargs={"where_document": {"$contains": "mareos"}})

In [23]:
# Example query against the vector store using max-marginal-relevance search.
query = "Tiene dolor en el riñon izquierdo"
vectordb.max_marginal_relevance_search(query)

[Document(id='HP:0008738', metadata={'hpo_id': 'HP:0008738', 'lineage': 'HP:0000075->HP:0001438->HP:0000077->HP:0000119->HP:0000118->HP:0010935->HP:0005217->HP:0025031->HP:0000079->HP:0012210'}, page_content='Riñón parcialmente duplicado. La presencia de un riñón parcialmente duplicado.'),
 Document(id='HP:0430044', metadata={'hpo_id': 'HP:0430044', 'lineage': 'HP:0012836->HP:0012830->HP:0012823'}, page_content='Radiación en el brazo izquierdo. Se refiere a un dolor o molestia que se percibe desde el pecho hacia el brazo izquierdo.'),
 Document(id='HP:0012784', metadata={'hpo_id': 'HP:0012784', 'lineage': 'HP:0000123->HP:0011277->HP:0000077->HP:0010978->HP:0012211->HP:0000119->HP:0002715->HP:0010935->HP:0012647->HP:0012649->HP:0000118->HP:0000079'}, page_content='Perinefritis. Inflamación de los tejidos conjuntivo y adiposo que rodean al riñón.'),
 Document(id='HP:0011126', metadata={'hpo_id': 'HP:0011126', 'lineage': 'HP:0100542->HP:0012210->HP:0000077->HP:0010935->HP:0000079->HP:0000

BM25 Retriever

In [147]:
from langchain_core.documents import Document

# Wrap each text, with its id and metadata, in a LangChain Document. The
# comprehension keeps its variables local, so the builtin `id` is not
# overwritten at notebook scope.
docs_list = [
    Document(id=doc_id, metadata=doc_metadata, page_content=text)
    for doc_id, doc_metadata, text in zip(ids_list, metadata_list, documents_text)
]

In [148]:
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

# Keyword (BM25) retriever built over the same documents as the vector store.
keyword_retriever = BM25Retriever.from_documents(docs_list)

In [149]:
# Number of documents held by the BM25 retriever.
len(keyword_retriever.docs)

19077

In [90]:
# Combine the vector-store retriever and the BM25 retriever; the weights are
# given in the same order as the retrievers.
semantic_retriever = vectordb.as_retriever()
ensemble_retriever = EnsembleRetriever(
    retrievers=[semantic_retriever, keyword_retriever],
    weights=[0.6, 0.4],
)

In [150]:
# Persist the BM25 retriever under the resources folder; the path is built
# from RESOURCES_DIR like the ontology load.
keyword_retriever_path = os.path.join(RESOURCES_DIR, "keyword_retriever.pkl")
with open(keyword_retriever_path, "wb") as fp:
    pkl.dump(keyword_retriever, fp)