## self-ask with search - info gathering on Rett disease

In [40]:
from langchain.agents import initialize_agent,Tool, load_tools
#from langchain.tools import BaseTool
from langchain.llms import OpenAI
from langchain import SerpAPIWrapper
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import os

In [9]:
os.environ['SERPAPI_API_KEY']="40f6fba6120a66c379436fe451a4117f8ffea1edffbb935fef773be799b9bac4"

In [43]:
# extract entities from paragraph
def extract_entities(para, entities):
    llm = OpenAI(temperature=0.0)
    extract_template = """
    Given the following paragraph: {paragraph}, extract the relevant {entities}
    """
    prompt_template = PromptTemplate(
        input_variables=["paragraph", "entities"],
        template=extract_template
    )
    extract_chain = LLMChain(
        llm=llm, prompt=prompt_template, output_key="list", verbose=False
    )

    output = extract_chain({"paragraph": para, "entities": entities})
    return output.get('list')


In [51]:
# init search tool
search = SerpAPIWrapper()
tools = [
    Tool(
        name="Symptoms",
        func=lambda x: search.run(f'Inflammatory pathways involved in {x}'),
        description="useful for when you want the high level mechanism of the disease symptoms"
    ),
    Tool(
        name="Molecule Search",
        func= lambda x: search.run(f'Molecules involved in {x}'), 
        description = "Useful for when you need specific information about the molecules involved in a disease"
    ),
    Tool(
        name="Drug Search",
        func = lambda x: search.run(f'Drugs used to treat {x}'),
        description = "useful for when you want to know what drugs are used to treat a disease"
    ),
    Tool(
        name="Similar Disease Search",
        func=lambda x: search.run(f'Diseases phenotypically similar to {x}'),
        description="useful for when you want information on similar diseases"
    )
    
    #TODO: add tool to search for disease in reactome api
]

In [53]:
agent = initialize_agent(tools, OpenAI(temperature=0),agent="zero-shot-react-description",verbose=True)

In [55]:
agent.run("Rett disease inflammatory pathway")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find out what molecules are involved in the inflammatory pathway of Rett disease
Action: Molecule Search
Action Input: Rett disease inflammatory pathway[0m
Observation: [33;1m[1;3mRett syndrome (RTT) is a neurological disorder caused by mutations in the X-linked gene methyl-CpG-binding protein 2 (MECP2), a ubiquitously ...[0m
Thought:[32;1m[1;3m I need to find out what drugs are used to treat Rett disease
Action: Drug Search
Action Input: Rett disease[0m
Observation: [38;5;200m[1;3mNo medications are available specifically for treatment of Rett syndrome (RS). Antiepileptic drugs (AEDs) may be prescribed to control seizurelike activity. Antireflux agents may be given to treat gastroesophageal reflux (GER). There is some evidence that levocarnitine may be effective.[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: No medications are available specifically for treatment of Rett syndrome (RS

'No medications are available specifically for treatment of Rett syndrome (RS). Antiepileptic drugs (AEDs) may be prescribed to control seizurelike activity. Antireflux agents may be given to treat gastroesophageal reflux (GER). There is some evidence that levocarnitine may be effective.'

## define agent to search for 

In [45]:
para = "Rett syndrome is caused by mutations in X-linked MECP2, encoding methyl-CpG-binding protein 2. No medications are available specifically for treatment of Rett syndrome (RS). Antiepileptic drugs (AEDs) may be prescribed to control seizurelike activity. Antireflux agents may be given to treat gastroesophageal reflux (GER). There is some evidence that levocarnitine may be effective. Similar disorders include intellectual disability (ID), autism spectrum disorder, cerebral palsy-like motor encephalopathy, and epilepsy."

extract_entities(para, "drugs")


'\nAnswer: Antiepileptic drugs (AEDs), Antireflux agents, Levocarnitine'

In [22]:
agent.run("What is the molecular basis for CIDP?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find out what molecules are involved in CIDP.
Action: Molecule Search
Action Input: CIDP[0m
Observation: [36;1m[1;3mChronic inflammatory demyelinating polyneuropathy (CIDP) is a neurological disorder that involves progressive weakness and reduced senses in the arms and legs. It is caused by damage to the fat-based protective covering on nerves called the myelin sheath. Symptoms of CIDP are: Tingling or no feeling in fingers and toes.[0m
Thought:[32;1m[1;3m I need to find out what molecules are responsible for the damage to the myelin sheath.
Action: Molecule Search
Action Input: Myelin Sheath[0m
Observation: [36;1m[1;3mMyelin is an insulating layer, or sheath that forms around nerves, including those in the brain and spinal cord. It is made up of protein and fatty substances. This myelin sheath allows electrical impulses to transmit quickly and efficiently along the nerve cells. If myelin is damaged, these

'The molecular basis for CIDP is damage to the myelin sheath, which is composed of protein and fatty substances.'

### change action input to a query that returns more specific results

In [None]:
# prompt template

prefix = """Answer the following question. You have access to the following tools {tools}:
Ask yourself
Are following questions needed?

"""

# embed sota in chunks

In [56]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [65]:
df = pd.read_csv('tests/uptodate.csv',on_bad_lines='skip')
df.head()


Unnamed: 0,text_chunk
0,Chronic inflammatory demyelinating polyneuropa...
1,"In the classic form of CIDP, motor involvement..."
2,CIDP generally responds to immunosuppressive o...
3,Early administration of effective treatment is...
4,The treatment approach depends on the severity...


In [77]:
def get_embedding(text, model=EMBEDDING_MODEL):
    result = openai.Embedding.create(
        model = model,
        input=text
    )
    return result["data"][0]["embedding"]

def compute_doc_embeddings(df):
    return {
        idx: get_embedding(r.content) for idx, r in df.iterrows()
    }

In [81]:
get_embedding(text="Chronic inflammatory demyelinating polyneuropathy (CIDP; also known as chronic inflammatory demyelinating polyradiculoneuropathy) is an entity that describes a group of related neuropathies, all having chronicity, demyelination, inflammation, and immune mediation in common.",model=EMBEDDING_MODEL)


ServiceUnavailableError: The server is overloaded or not ready yet.

In [76]:
def load_embeddings(fname):
    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns])
    return {
        (r.text_chunk): [r[str(i)] for i in range(max_dim +1)] for _, r in df.iterrows()
    }


## TODO: change csv format to allow document embeddings to work

In [78]:
document_embeddings = load_embeddings('tests/uptodate.csv')

ValueError: invalid literal for int() with base 10: 'index'

In [75]:
def vector_similarity(x,y):
    return np.dot(np.array(x),np.array(y))

def order_doc_sections_by_query(query:str, contexts:dict[(str,str),np.array]) -> list[(float,(str,str))]:
    query_embedding = get_embedding(query)
    document_similarities = sorted([
        (vector_similarity(query_embedding,doc_embedding),doc_index) for doc_index,doc_embedding in contexts.items()
    ],reverse=True)
    return document_similarities
