In [None]:
pip install -r /Users/mastorga/Documents/BTE-LLM/requirements.txt

In [None]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

In [None]:
from langchain_openai import ChatOpenAI
from scispacy.linking import EntityLinker
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from typing import Annotated
import re
import spacy
import requests
import time

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Load the small scispaCy biomedical model and append the scispaCy entity
# linker, configured to resolve abbreviations and to link against UMLS.
# The entity-extraction functions in this notebook read this pipeline through
# the module-level name `nlp`.
# NOTE(review): the "scispacy_linker" factory is presumably registered by the
# `from scispacy.linking import EntityLinker` import — confirm before removing
# that otherwise-unused import.
nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

In [None]:
def BioNERToolLLM(query: str):
    """Extract biomedical entities from a query and map each to a UMLS CUI.

    Steps:
      1. GPT-4o lists the biomedical entities mentioned in ``query``.
      2. A second call asks whether the list is complete; unless the answer is
         "true", a third call asks for the missing terms, which are appended.
      3. The resulting text is run through the module-level scispaCy pipeline
         ``nlp``; its linker proposes candidate CUIs for each entity.
      4. For every linked entity the candidate descriptions (cut off at "TUI")
         are printed with their cosine similarity to the query, and GPT-4o is
         asked to pick the single best CUI from that entity's candidates.

    Relies on ``nlp``, ``remove_after_character`` and ``getCosineSimilarity``
    being defined elsewhere in the notebook before this function is called.

    Args:
        query: Free-text biomedical question.

    Returns:
        dict: entity text -> ``"UMLS:<CUI>"`` (or ``""`` when the model's reply
        contains no CUI). If no entity was linked, returns
        ``{"message": "No entities found"}``.
    """
    # `nlp` is only read here, so no `global` declaration is required.
    linker = nlp.get_pipe("scispacy_linker")

    llm = ChatOpenAI(temperature=0, model="gpt-4o")

    bioIDs = {}

    ent_prompt = f"""You are a helpful assistant that can extract biological terms/entities from a given query. 
                    These might include diseases (such as malaria, early onset dementia), genes, proteins, biological entities (such as viruses), etc.

                    Do NOT include common nouns/phrases such as "gene", "compound", "pathogen", "associated with", etc. 
                    Your response must ONLY include discrete proper noun entities such as "early onset dementia" or "sickle cell anemia". 
                    Make sure to include symptoms such as "fever" or "cough".

                    If extracting terms concerning proteins/genes such as "ESR1 upregulation", only return the gene/protein name ("ESR1").
                    You must always return the full phrase/long form of each biomedical entity 
                    (for example, "type I and type II diabetes" should result in "type I diabetes" and "type II diabetes"; "DNA polymerases of human vs. mice" should result in "human DNA polymerase" and "mouse DNA polymerase")
                    Return results as a list.
                    Here is your query: {query}"""

    entities = llm.invoke(ent_prompt).content

    verify_prompt = f"""Given the following query, have all relevant biomedical terms been extracted?
                    {query}

                    Here are the extracted biomedical terms: {entities}
                    
                    Return TRUE if all biomedical entities were extracted. Return FALSE if there are any missing biomedical entities from the list.
                    Your response MUST be boolean"""

    verified = llm.invoke(verify_prompt).content

    # Normalize the verdict: the model may answer "TRUE", "True." or pad the
    # reply with whitespace, none of which should trigger a retry.
    if verified.strip().rstrip(".").lower() != "true":
        retry_prompt = f"""Given the following query, which biomedical terms are missing from the list?
                    {query}

                    Here are the extracted biomedical terms: {entities}

                    Your response must ONLY include discrete proper noun entities such as "early onset dementia" or "sickle cell anemia". 
                    Make sure to include symptoms such as "fever" or "cough".
                    
                    Return a list of ONLY the missing biomedical terms"""

        addtl_ents = llm.invoke(retry_prompt).content
        entities = entities + " " + addtl_ents

    print(entities)

    doc = nlp(entities)

    for ent in doc.ents:
        if not ent._.kb_ents:  # Skip entities without linked knowledge base IDs
            continue

        # Fresh candidate map per entity, so the selection prompt only offers
        # CUIs that the linker proposed for *this* entity.
        candidates = {}
        for kb_ent in ent._.kb_ents:
            cui = kb_ent[0]
            description = remove_after_character(str(linker.kb.cui_to_entity[cui]), "TUI")
            candidates[cui] = description

            # Diagnostic output: candidate text and its similarity to the query.
            print(description)

            print('\n')

            print(getCosineSimilarity(query, description))

            print("\n\n")

        select_prompt = f"""You are a smart biomedical assistant that can understand the context and the intent behind a query. 
                        Be careful when choosing IDs for entities that can refer to different concepts (for example, HIV can refer either to the virus or the disease; you MUST choose the most appropriate concept/definition based on the query). 
                        Use the context and the intent behind the query to choose the most appropriate ID. 
                        Here is the complete query: {query}
                        Select the one most appropriate ID/CUI for {ent.text} from the list below:
                        {candidates}
                        If none of the choices are appropriate, return "".
                        Otherwise, return only the ID/CUI.
                        """

        # LLM selects most appropriate ID from list
        selectedID = llm.invoke(select_prompt).content.strip()

        # Extract just the UMLS CUI using regex
        match = re.search(r"C\d{7}", selectedID)
        bioIDs[ent.text] = "UMLS:" + match.group(0) if match else ""

        print(ent.text + " - " + bioIDs[ent.text])

    return bioIDs if bioIDs else {"message": "No entities found"}

In [None]:
def BioNERToolcosine(query: str):
    """Extract biomedical entities from a query and map each to a UMLS CUI.

    Variant of the LLM pipeline that also prints, for every candidate concept,
    its cosine similarity to the query. The similarity is diagnostic output
    only; the final CUI is still chosen by GPT-4o.

    Steps:
      1. GPT-4o lists the biomedical entities mentioned in ``query``.
      2. A second call asks whether the list is complete; unless the answer is
         "true", a third call (which is shown the original extraction
         instructions) asks for the missing terms, which are appended.
      3. The resulting text is run through the module-level scispaCy pipeline
         ``nlp``; its linker proposes candidate CUIs for each entity.
      4. GPT-4o picks one CUI per entity from that entity's candidates.

    Relies on ``nlp``, ``remove_after_character`` and ``getCosineSimilarity``
    being defined elsewhere in the notebook before this function is called.

    Args:
        query: Free-text biomedical question.

    Returns:
        dict: entity text -> ``"UMLS:<CUI>"`` (or ``""`` when the model's reply
        contains no CUI). If no entity was linked, returns
        ``{"message": "No entities found"}``.
    """
    # `nlp` is only read here, so no `global` declaration is required.
    linker = nlp.get_pipe("scispacy_linker")

    llm = ChatOpenAI(temperature=0, model="gpt-4o")

    bioIDs = {}

    ent_prompt = f"""You are a helpful assistant that can extract biological terms/entities from a given query. 
                    These might include diseases (such as malaria, early onset dementia, parkinsonism), genes, proteins, biological entities (such as viruses), etc.

                    Do NOT include common nouns/phrases such as "gene", "compound", "pathogen", "associated with", etc. 
                    Your response must ONLY include discrete proper noun entities such as "early onset dementia" or "sickle cell anemia". 
                    Make sure to include symptoms such as "fever" or "cough".

                    If extracting terms concerning proteins/genes such as "ESR1 upregulation", only return the gene/protein name ("ESR1").
                    You must always return the full phrase/long form of each biomedical entity 
                    (for example, "type I and type II diabetes" should result in "type I diabetes" and "type II diabetes"; "DNA polymerases of human vs. mice" should result in "human DNA polymerase" and "mouse DNA polymerase")
                    Return results as a list.
                    Here is your query: {query}"""

    entities = llm.invoke(ent_prompt).content

    verify_prompt = f"""Given the following query, have all relevant biomedical terms been extracted?
                    {query}

                    Here are the extracted biomedical terms: {entities}
                    
                    Return TRUE if all biomedical entities were extracted. Return FALSE if there are any missing biomedical entities from the list.
                    Your response MUST be boolean"""

    verified = llm.invoke(verify_prompt).content

    # Normalize the verdict: the model may answer "TRUE", "True." or pad the
    # reply with whitespace, none of which should trigger a retry.
    if verified.strip().rstrip(".").lower() != "true":
        retry_prompt = f"""Given the following query, which biomedical terms are missing from the list?
                    {query}

                    Here are the extracted biomedical terms: {entities}

                    Here were the extraction instructions: {ent_prompt}
                    
                    Return a list of ONLY the missing biomedical terms"""

        addtl_ents = llm.invoke(retry_prompt).content
        entities = entities + " " + addtl_ents

    print(entities)

    doc = nlp(entities)

    for ent in doc.ents:
        if not ent._.kb_ents:  # Skip entities without linked knowledge base IDs
            continue

        # Fresh candidate map per entity, so the selection prompt only offers
        # CUIs that the linker proposed for *this* entity.
        candidates = {}
        for kb_ent in ent._.kb_ents:
            cui = kb_ent[0]
            description = remove_after_character(str(linker.kb.cui_to_entity[cui]), "TUI")
            candidates[cui] = description

            # Diagnostic output: candidate text and its similarity to the query.
            # (A truncated debug `print(... .get(` statement that followed here
            # was a syntax error and has been removed.)
            print(description)

            print('\n')

            print(getCosineSimilarity(query, description))

            print("\n\n")

        select_prompt = f"""You are a smart biomedical assistant that can understand the context and the intent behind a query. 
                        Be careful when choosing IDs for entities that can refer to different concepts (for example, HIV can refer either to the virus or the disease; you MUST choose the most appropriate concept/definition based on the query). 
                        Use the context and the intent behind the query to choose the most appropriate ID. 
                        Here is the complete query: {query}
                        Select the one most appropriate ID/CUI for {ent.text} from the list below:
                        {candidates}
                        If none of the choices are appropriate, return "".
                        Otherwise, return only the ID/CUI.
                        """

        # LLM selects most appropriate ID from list
        selectedID = llm.invoke(select_prompt).content.strip()

        # Extract just the UMLS CUI using regex
        match = re.search(r"C\d{7}", selectedID)
        bioIDs[ent.text] = "UMLS:" + match.group(0) if match else ""

        print(ent.text + " - " + bioIDs[ent.text])

    return bioIDs if bioIDs else {"message": "No entities found"}

In [None]:
def BioNERTool(query: str):
    """Extract biological entities from a query and return them with their IDs.

    Steps:
      1. GPT-4o lists the biomedical entities mentioned in ``query``.
      2. A second call asks whether the list is complete; unless the answer is
         "true", a third call (which is shown the original extraction
         instructions) asks for the missing terms, which are appended.
      3. The resulting text is run through the module-level scispaCy pipeline
         ``nlp``; its linker proposes candidate CUIs for each entity.
      4. GPT-4o picks one CUI per entity from that entity's candidates, and the
         entity, its ID and the chosen candidate's description are printed.

    Relies on ``nlp`` being defined elsewhere in the notebook.

    Args:
        query: Free-text biomedical question.

    Returns:
        dict: entity text -> ``"UMLS:<CUI>"`` (or ``""`` when the model's reply
        contains no CUI). If no entity was linked, returns
        ``{"message": "No entities found"}``, matching the sibling tools.
    """

    def remove_TUI(text):
        # Keep only the part of the description that precedes the first "TUI".
        return text.split("TUI", 1)[0]

    # `nlp` is only read here, so no `global` declaration is required.
    linker = nlp.get_pipe("scispacy_linker")

    bioIDs = {}

    llm = ChatOpenAI(temperature=0, model="gpt-4o")

    ent_prompt = f"""You are a helpful assistant that can extract biological terms/entities from a given query. 
                    These might include diseases (such as malaria, early onset dementia, parkinsonism), genes, proteins, biological entities (such as viruses), etc.

                    Do NOT include common nouns/phrases such as "gene", "compound", "pathogen", "associated with", etc. 
                    Your response must ONLY include discrete proper noun entities such as "early onset dementia" or "sickle cell anemia". 
                    Make sure to include symptoms such as "fever" or "cough".

                    If extracting terms concerning proteins/genes such as "ESR1 upregulation", only return the gene/protein name ("ESR1").
                    You must always return the full phrase/long form of each biomedical entity 
                    (for example, "type I and type II diabetes" should result in "type I diabetes" and "type II diabetes"; "DNA polymerases of human vs. mice" should result in "human DNA polymerase" and "mouse DNA polymerase")
                    Return results as a list.
                    Here is your query: {query}"""

    entities = llm.invoke(ent_prompt).content

    verify_prompt = f"""Given the following query, have all relevant biomedical terms been extracted?
                    {query}

                    Here are the extracted biomedical terms: {entities}
                    
                    Return TRUE if all biomedical entities were extracted. Return FALSE if there are any missing biomedical entities from the list.
                    Your response MUST be boolean"""

    verified = llm.invoke(verify_prompt).content

    # Normalize the verdict: the model may answer "TRUE", "True." or pad the
    # reply with whitespace, none of which should trigger a retry.
    if verified.strip().rstrip(".").lower() != "true":
        retry_prompt = f"""Given the following query, which biomedical terms are missing from the list?
                    {query}

                    Here are the extracted biomedical terms: {entities}

                    Here were the extraction instructions: {ent_prompt}
                    
                    Return a list of ONLY the missing biomedical terms"""

        addtl_ents = llm.invoke(retry_prompt).content
        entities = entities + " " + addtl_ents

    print(entities + "\n")

    doc = nlp(entities)

    for ent in doc.ents:
        if not ent._.kb_ents:  # Skip entities without linked knowledge base IDs
            continue

        # Fresh candidate map per entity, so the selection prompt only offers
        # CUIs that the linker proposed for *this* entity.
        candidates = {
            kb_ent[0]: remove_TUI(str(linker.kb.cui_to_entity[kb_ent[0]]))
            for kb_ent in ent._.kb_ents
        }

        select_prompt = f"""You are a smart biomedical assistant that can understand the context and the intent behind a query. 
                        Be careful when choosing IDs for entities that can refer to different concepts (for example, HIV can refer either to the virus or the disease; you MUST choose the most appropriate concept/definition based on the query). 
                        Use the context and the intent behind the query to choose the most appropriate ID. 
                        Here is the complete query: {query}
                        Select the one most appropriate ID/CUI for {ent.text} from the list below:
                        {candidates}
                        If none of the choices are appropriate, return "".
                        Otherwise, return only the ID/CUI.
                        """

        # LLM selects most appropriate ID from list
        selectedID = llm.invoke(select_prompt).content.strip()

        # Extract just the UMLS CUI using regex
        match = re.search(r"C\d{7}", selectedID)
        if match:
            cui = match.group(0)
            bioIDs[ent.text] = "UMLS:" + cui
            # The model may return a CUI that was not among the candidates;
            # fall back to an empty description instead of raising KeyError.
            definition = candidates.get(cui, "")
        else:
            bioIDs[ent.text] = ""
            # Always bind `definition` so the print below cannot hit an unbound
            # name or reuse the previous entity's description.
            definition = ""

        print(ent.text + " - " + bioIDs[ent.text] + '\n' + definition)

    return bioIDs if bioIDs else {"message": "No entities found"}

            

In [None]:
def getCosineSimilarity(str1: str, str2: str):
    """Return the cosine similarity between the keyword sets of two strings.

    Each string is tokenized with NLTK's ``word_tokenize`` and tokens found in
    the English stopword list are dropped. The remaining distinct tokens are
    treated as binary vectors over the combined vocabulary, so the similarity
    is ``|A & B| / sqrt(|A| * |B|)``.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        float: Similarity in ``[0.0, 1.0]``. Returns ``0.0`` when either text
        has no tokens left after stopword removal (this previously raised
        ``ZeroDivisionError``).
    """
    # A set gives O(1) membership tests while filtering tokens.
    stop_words = set(stopwords.words('english'))

    # NOTE(review): filtering and matching are case-sensitive and punctuation
    # tokens are kept; the NLTK stopword list appears to be lowercase, so
    # capitalized stopwords such as "What" are not removed — confirm whether
    # that is intended.
    keywords1 = {tok for tok in word_tokenize(str1) if tok not in stop_words}
    keywords2 = {tok for tok in word_tokenize(str2) if tok not in stop_words}

    # With an empty keyword set the denominator below would be zero.
    if not keywords1 or not keywords2:
        return 0.0

    # For binary vectors the dot product is the number of shared keywords and
    # each squared norm is the size of the corresponding keyword set.
    shared = len(keywords1 & keywords2)
    return shared / float((len(keywords1) * len(keywords2)) ** 0.5)

In [None]:
# Sample concept description (CUI C0019682, "HIV", the virus) used below as
# input to getCosineSimilarity.
sampleent1 = """CUI: C0019682, Name: HIV
Definition: Human immunodeficiency virus. A non-taxonomic and historical term referring to any of two species, specifically HIV-1 and/or HIV-2. Prior to 1986, this was called human T-lymphotropic virus type III/lymphadenopathy-associated virus (HTLV-III/LAV). From 1986-1990, it was an official species called HIV. Since 1991, HIV was no longer considered an official species name; the two species were designated HIV-1 and HIV-2.
"""

In [None]:
# Sample concept description (CUI C0019693, "HIV Infections", the disease)
# used below as input to getCosineSimilarity.
sampleent2 = """CUI: C0019693, Name: HIV Infections
Definition: Includes the spectrum of human immunodeficiency virus infections that range from asymptomatic seropositivity, thru AIDS-related complex (ARC), to acquired immunodeficiency syndrome (AIDS).
"""

In [None]:
# Sample concept description (CUI C0019704, "HIV-1") used below as input to
# getCosineSimilarity.
sampleent3 = """CUI: C0019704, Name: HIV-1
Definition: The type species of LENTIVIRUS and the etiologic agent of AIDS. It is characterized by its cytopathic effect and affinity for the T4-lymphocyte.
"""

In [None]:
# Sample concept description (CUI C0019707, "HIV-2") used below as input to
# getCosineSimilarity.
sampleent4 = """CUI: C0019707, Name: Human immunodeficiency virus 2 (HIV-2)
Definition: An HIV species related to HIV-1 but carrying different antigenic components and with differing nucleic acid composition. It shares serologic reactivity and sequence homology with the simian Lentivirus SIMIAN IMMUNODEFICIENCY VIRUS and infects only T4-lymphocytes expressing the CD4 phenotypic marker.
"""

In [None]:
# Receptor-binding query scored against the "HIV" (virus) sample description.
getCosineSimilarity("What receptor does the HIV virus bind to?", sampleent1)

In [None]:
# Receptor-binding query scored against the "HIV Infections" sample description.
getCosineSimilarity("What receptor does the HIV virus bind to?", sampleent2)

In [None]:
# Receptor-binding query scored against the "HIV-1" sample description.
getCosineSimilarity("What receptor does the HIV virus bind to?", sampleent3)

In [None]:
# Receptor-binding query scored against the "HIV-2" sample description.
getCosineSimilarity("What receptor does the HIV virus bind to?", sampleent4)

In [None]:
# Treatment query scored against the "HIV" (virus) sample description.
getCosineSimilarity("What drugs can treat HIV?", sampleent1)

In [None]:
# Treatment query scored against the "HIV Infections" sample description.
getCosineSimilarity("What drugs can treat HIV?", sampleent2)

In [None]:
# Treatment query scored against the "HIV-1" sample description.
getCosineSimilarity("What drugs can treat HIV?", sampleent3)

In [None]:
# Treatment query scored against the "HIV-2" sample description.
getCosineSimilarity("What drugs can treat HIV?", sampleent4)

In [None]:
# Manual run of BioNERTool on a sample treatment question.
BioNERTool("What drugs can treat Parkinsonism?")

In [None]:
# Manual run of BioNERToolLLM on a sample receptor-binding question.
# NOTE(review): BioNERToolLLM calls remove_after_character and
# getCosineSimilarity, so the cells defining them must have been run first.
BioNERToolLLM("What receptor does HIV bind to?")

In [None]:
import nltk

# Download the NLTK "punkt_tab" and "stopwords" resources.
# NOTE(review): these are presumably what word_tokenize and
# stopwords.words('english') in getCosineSimilarity load at call time, so this
# cell should run before that function is used — confirm.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
def remove_after_character(text, character):
    """
    Truncate a string at the first occurrence of a separator.

    Args:
        text (str): The string to truncate.
        character (str): The separator to cut at; may be longer than one
            character (the notebook passes "TUI").

    Returns:
        str: Everything in ``text`` that precedes the first occurrence of
             ``character``, or ``text`` unchanged when it does not occur.
    """
    # A single split yields the prefix first; any remainder is discarded.
    head, *_rest = text.split(character, 1)
    return head