In [None]:
%%capture
%pip install torch torch_geometric stark-qa neo4j python-dotenv pcst_fast datasets pandas transformers langchain langchain-openai langchain-community

## Setup

load env variables

In [1]:
from dotenv import load_dotenv
import os

load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

Get stark-qa dataset

In [2]:
from stark_qa import load_qa, load_skb

dataset_name = 'prime'

# Load the retrieval dataset
qa_dataset = load_qa(dataset_name)

Use file from /Users/zachblumenfeld/.cache/huggingface/hub/datasets--snap-stanford--stark/snapshots/e3387ba9c873528521e125c3da687092d9872104/qa/prime/stark_qa/stark_qa_human_generated_eval_uncleaned.csv.


In [3]:
qa_dataset.data

Unnamed: 0,id,query,answer_ids
0,0,Could you identify any skin diseases associate...,[95886]
1,1,What drugs target the CYP3A4 enzyme and are us...,[15450]
2,2,What is the name of the condition characterize...,"[98851, 98853]"
3,3,What drugs are used to treat epithelioid sarco...,[15698]
4,4,Can you supply a compilation of genes and prot...,"[7161, 22045]"
...,...,...,...
11199,11199,Which gene or protein is not expressed in fema...,[2414]
11200,11200,Could you identify a biological pathway in whi...,[128199]
11201,11201,Is there an interaction between genes or prote...,"[127611, 62903]"
11202,11202,Which pharmacological agents that stimulate os...,[20180]


## Vector Retriever

In [4]:
import neo4j
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

#create text embedder
embedder = OpenAIEmbeddings()

In [5]:
from neo4j_graphrag.retrievers import VectorRetriever

vector_retriever = VectorRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    return_properties=["details", "name", "nodeId"],
)

In [6]:
from neo4j_graphrag.llm import OpenAILLM as LLM
from neo4j_graphrag.generation import RagTemplate
from neo4j_graphrag.generation.graphrag import GraphRAG

llm=LLM(
    model_name="gpt-4o",
    model_params={
        "response_format": {"type": "json_object"}, # use json_object formatting for best results
        "temperature": 0, # turning temperature down for more deterministic results
        "seed": 42
    }
)

rag_template = RagTemplate(template='''
Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned. 

Return result as JSON using the following format:
{{"nodeIds": [list of nodeIds containing the correct answers in order of probability highest to smallest], "explanation": "written explanation as to why the node id answers were provided"}}

# Question:
{query_text}
 
# Context:
{context}

# Answer:
''', expected_inputs=['query_text', 'context'])

v_rag  = GraphRAG(llm=llm, retriever=vector_retriever, prompt_template=rag_template)

In [7]:
q=qa_dataset.data['query'][0]
print(q)

v_res = v_rag.search(q, return_context=True, retriever_config={"top_k": 10})

Could you identify any skin diseases associated with epithelial skin neoplasms? I've observed a tiny, yellowish lesion on sun-exposed areas of my face and neck, and I suspect it might be connected.


In [8]:
import json

print(json.dumps(eval(v_res.answer), indent=1))

{
 "nodeIds": [
  96054,
  96057,
  98637,
  39254
 ],
 "explanation": "The tiny, yellowish lesion observed on sun-exposed areas of the face and neck could be associated with basal cell carcinoma, a type of epithelial skin neoplasm. The context provides information on various types of basal cell carcinoma, which often develop on sun-exposed parts of the body, especially the head and neck. These lesions can appear as pearly white, skin-colored, or pink bumps that are translucent, and tiny blood vessels are often visible. The nodeIds 96054, 96057, 98637, and 39254 all describe different variants of basal cell carcinoma, which are relevant to the symptoms described."
}


## Dynamic Cypher Retriever

### MATCH Statement Generator

In [9]:
# Uncomment to Re-get Schema
#from neo4j_graphrag.schema import get_schema, get_structured_schema

#print(get_schema(driver))

In [10]:
schema = """(:Disease)-[:ASSOCIATED_WITH]->(:GeneOrProtein)
(:Disease)-[:PARENT_CHILD]->(:Disease)
(:Disease)-[:PHENOTYPE_ABSENT]->(:EffectOrPhenotype)
(:Disease)-[:PHENOTYPE_PRESENT]->(:EffectOrPhenotype)
(:Disease)-[:CONTRAINDICATION]->(:Drug)
(:Disease)-[:INDICATION]->(:Drug)
(:Disease)-[:OFF_LABEL_USE]->(:Drug)
(:Disease)-[:LINKED_TO]->(:Exposure)
(:GeneOrProtein)-[:ASSOCIATED_WITH]->(:Disease)
(:GeneOrProtein)-[:ASSOCIATED_WITH]->(:EffectOrPhenotype)
(:GeneOrProtein)-[:PPI]->(:GeneOrProtein)
(:GeneOrProtein)-[:EXPRESSION_PRESENT]->(:Anatomy)
(:GeneOrProtein)-[:INTERACTS_WITH]->(:MolecularFunction)
(:GeneOrProtein)-[:INTERACTS_WITH]->(:Pathway)
(:GeneOrProtein)-[:INTERACTS_WITH]->(:BiologicalProcess)
(:GeneOrProtein)-[:INTERACTS_WITH]->(:CellularComponent)
(:GeneOrProtein)-[:INTERACTS_WITH]->(:Exposure)
(:GeneOrProtein)-[:TARGET]->(:Drug)
(:GeneOrProtein)-[:EXPRESSION_ABSENT]->(:Anatomy)
(:GeneOrProtein)-[:TRANSPORTER]->(:Drug)
(:GeneOrProtein)-[:ENZYME]->(:Drug)
(:GeneOrProtein)-[:CARRIER]->(:Drug)
(:MolecularFunction)-[:PARENT_CHILD]->(:MolecularFunction)
(:MolecularFunction)-[:INTERACTS_WITH]->(:GeneOrProtein)
(:MolecularFunction)-[:INTERACTS_WITH]->(:Exposure)
(:Drug)-[:CARRIER]->(:GeneOrProtein)
(:Drug)-[:ENZYME]->(:GeneOrProtein)
(:Drug)-[:TARGET]->(:GeneOrProtein)
(:Drug)-[:TRANSPORTER]->(:GeneOrProtein)
(:Drug)-[:CONTRAINDICATION]->(:Disease)
(:Drug)-[:INDICATION]->(:Disease)
(:Drug)-[:SYNERGISTIC_INTERACTION]->(:Drug)
(:Drug)-[:SIDE_EFFECT]->(:EffectOrPhenotype)
(:Drug)-[:OFF_LABEL_USE]->(:Disease)
(:Pathway)-[:PARENT_CHILD]->(:Pathway)
(:Pathway)-[:INTERACTS_WITH]->(:GeneOrProtein)
(:Anatomy)-[:PARENT_CHILD]->(:Anatomy)
(:Anatomy)-[:EXPRESSION_PRESENT]->(:GeneOrProtein)
(:Anatomy)-[:EXPRESSION_ABSENT]->(:GeneOrProtein)
(:EffectOrPhenotype)-[:PARENT_CHILD]->(:EffectOrPhenotype)
(:EffectOrPhenotype)-[:PHENOTYPE_PRESENT]->(:Disease)
(:EffectOrPhenotype)-[:PHENOTYPE_ABSENT]->(:Disease)
(:EffectOrPhenotype)-[:ASSOCIATED_WITH]->(:GeneOrProtein)
(:EffectOrPhenotype)-[:SIDE_EFFECT]->(:Drug)
(:BiologicalProcess)-[:PARENT_CHILD]->(:BiologicalProcess)
(:BiologicalProcess)-[:INTERACTS_WITH]->(:GeneOrProtein)
(:BiologicalProcess)-[:INTERACTS_WITH]->(:Exposure)
(:CellularComponent)-[:PARENT_CHILD]->(:CellularComponent)
(:CellularComponent)-[:INTERACTS_WITH]->(:GeneOrProtein)
(:CellularComponent)-[:INTERACTS_WITH]->(:Exposure)
(:Exposure)-[:PARENT_CHILD]->(:Exposure)
(:Exposure)-[:INTERACTS_WITH]->(:GeneOrProtein)
(:Exposure)-[:INTERACTS_WITH]->(:BiologicalProcess)
(:Exposure)-[:INTERACTS_WITH]->(:MolecularFunction)
(:Exposure)-[:INTERACTS_WITH]->(:CellularComponent)
(:Exposure)-[:LINKED_TO]->(:Disease)"""

In [12]:
access_patterns = [
    {
        "pattern": "MATCH (n1:EffectOrPhenotype)-[r1:PHENOTYPE_ABSENT]->(n2:Disease)<-[r2:CONTRAINDICATION]-(n3:Drug)",
        "questionType": "Find diseases with zero indication drug and are associated with <effect/phenotype>"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:CONTRAINDICATION]->(n2:Disease)<-[r2:ASSOCIATED_WITH]-(n3:GeneOrProtein)",
        "questionType": "Identify diseases associated with <gene/protein> and are contraindicated with <drug>"
    },
    {
        "pattern": "MATCH (n1:Anatomy)-[r1:EXPRESSION_PRESENT]->(n2:GeneOrProtein)<-[r2:EXPRESSION_ABSENT]-(n3:Anatomy)",
        "questionType": "What gene or protein is expressed in <anatomy1> while is absent in <anatomy2>?"
    },
    {
        "pattern": "MATCH (n1:Anatomy)-[r1:EXPRESSION_ABSENT]->(n2:GeneOrProtein)<-[r2:EXPRESSION_ABSENT]-(n3:Anatomy)",
        "questionType": "What gene/protein is absent in both <anatomy1> and <anatomy2>?"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:CARRIER]->(n2:GeneOrProtein)<-[r2:CARRIER]-(n3:Drug)",
        "questionType": "Which target genes are shared carriers between <drug1> and <drug2>?"
    },
    {
        "pattern": "MATCH (n1:Anatomy)-[r1:EXPRESSION_PRESENT]->(n2:GeneOrProtein)-[r2:TARGET]->(n3:Drug)",
        "questionType": "What is the drug that targets the genes or proteins which are expressed in <anatomy>?"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:SIDE_EFFECT]->(n2:EffectOrPhenotype)-[r2:SIDE_EFFECT]->(n3:Drug)",
        "questionType": "What drug has common side effects as <drug>?"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:CARRIER]->(n2:GeneOrProtein)-[r2:CARRIER]->(n3:Drug)",
        "questionType": "What is the drug that has common gene/protein carrier with <drug>?"
    },
    {
        "pattern": "MATCH (n1:Anatomy)-[r1:EXPRESSION_PRESENT]->(n2:GeneOrProtein)-[r2:ENZYME]->(n3:Drug)",
        "questionType": "What is the drug that some genes or proteins act as an enzyme upon, where the genes or proteins are expressed in <anatomy>?"
    },
    {
        "pattern": "MATCH p=(n1:CellularComponent)-[r1:INTERACTS_WITH]->(n2:GeneOrProtein)-[r2:CARRIER]->(n3:Drug)",
        "questionType": "What is the drug carried by genes or proteins that interact with <cellular_component>?"
    },
    {
        "pattern": "MATCH (n1:MolecularFunction)-[r1:INTERACTS_WITH]->(n2:GeneOrProtein)-[r2:TARGET]->(n3:Drug)",
        "questionType": "What drug targets the genes or proteins that interact with <molecular_function>?"
    },
    {
        "pattern": "MATCH (n1:EffectOrPhenotype)-[r1:SIDE_EFFECT]->(n2:Drug)-[r2:SYNERGISTIC_INTERACTION]->(n3:Drug)",
        "questionType": "What drug has a synergistic interaction with the drug that has <effect/phenotype> as a side effect?"
    },
    {
        "pattern": "MATCH p=(n1:Disease)-[r1:INDICATION]->(n2:Drug)-[r2:CONTRAINDICATION]->(n3:Disease)",
        "questionType": "What disease is a contraindication for the drugs indicated for <disease>?"
    },
    {
        "pattern": "MATCH (n1:Disease)-[r1:PARENT_CHILD]->(n2:Disease)-[r2:PHENOTYPE_PRESENT]->(n3:EffectOrPhenotype)",
        "questionType": "What effect or phenotype is present in the sub type of <disease>?"
    },
    {
        "pattern": "MATCH (n1:GeneOrProtein)-[r1:TRANSPORTER]->(n2:Drug)-[r2:SIDE_EFFECT]->(n3:EffectOrPhenotype)",
        "questionType": "What effect or phenotype is a [side effect] of the drug transported by <gene/protein>?"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:TRANSPORTER]->(n2:GeneOrProtein)-[r2:INTERACTS_WITH]->(n3:Exposure)",
        "questionType": "What exposure may affect <drug>s efficacy by acting on its transporter genes?"
    },
    {
        "pattern": "MATCH (n1:Pathway)-[r1:INTERACTS_WITH]->(n2:GeneOrProtein)-[r2:PPI]->(n3:GeneOrProtein)",
        "questionType": "What gene/protein interacts with the gene/protein that related to <pathway>?"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:SYNERGISTIC_INTERACTION]->(n2:Drug)-[r2:TRANSPORTER]->(n3:GeneOrProtein)",
        "questionType": "What gene or protein transports the drugs that have a synergistic interaction with <drug>?"
    },
    {
        "pattern": "MATCH (n1:BiologicalProcess)-[r1:INTERACTS_WITH]->(n2:GeneOrProtein)-[r2:INTERACTS_WITH]->(n3:BiologicalProcess)",
        "questionType": "What biological process has the common interactino pattern with gene or proteins as <biological_process>?"
    },
    {
        "pattern": "MATCH (n1:EffectOrPhenotype)-[r1:ASSOCIATED_WITH]->(n2:GeneOrProtein)-[r2:INTERACTS_WITH]->(n3:BiologicalProcess)",
        "questionType": "What biological process interacts with the gene/protein associated with <effect/phenotype>?"
    },
    {
        "pattern": "MATCH p=(n1:Drug)-[r1:TRANSPORTER]->(n2:GeneOrProtein)-[r2:EXPRESSION_PRESENT]->(n3:Anatomy)",
        "questionType": "What anatomy expressesed by the gene/protein that affect the transporter of <drug>?"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:TARGET]->(n2:GeneOrProtein)-[r2:INTERACTS_WITH]->(n3:CellularComponent)",
        "questionType": "What cellular component interacts with genes or proteins targeted by <drug>?"
    },
    {
        "pattern": "MATCH (n1:BiologicalProcess)-[r1:INTERACTS_WITH]->(n2:GeneOrProtein)-[r2:EXPRESSION_ABSENT]->(n3:Anatomy)",
        "questionType": "What anatomy does not express the genes or proteins that interacts with <biological_process>?"
    },
    {
        "pattern": "MATCH (n1:EffectOrPhenotype)-[r1:ASSOCIATED_WITH]->(n2:GeneOrProtein)-[r2:EXPRESSION_ABSENT]->(n3:Anatomy)",
        "questionType": "What anatomy does not express the genes or proteins associated with <effect/phenotype>?"
    },
    {
        "pattern": "MATCH (n1:Drug)-[r1:INDICATION]->(n2:Disease)-[r2:INDICATION]->(n3:Drug), (n1:Drug)-[r3:SYNERGISTIC_INTERACTION]->(n3:Drug)",
        "questionType": "Find drugs that has a synergistic interaction with <drug> and both are indicated for the same disease."
    },
    {
        "pattern": "MATCH (n1:Pathway)-[r1:INTERACTS_WITH]->(n2:GeneOrProtein)-[r2:INTERACTS_WITH]->(n3:Pathway), (n1:Pathway)-[r3:PARENT_CHILD]->(n3:Pathway)",
        "questionType": "Find pathway that is related with <pathway> and both can [interacts with] the same gene/protein."
    },
    {
        "pattern": "MATCH (n1:GeneOrProtein)-[r1:ASSOCIATED_WITH]->(n2:Disease)-[r2:ASSOCIATED_WITH]->(n3:GeneOrProtein), (n1:GeneOrProtein)-[r3:PPI]->(n3:GeneOrProtein)",
        "questionType": "Find gene/protein that can interect with <gene/protein> and both are associated with the same disease."
    },
    {
        "pattern": "MATCH (n1:GeneOrProtein)-[r1:ASSOCIATED_WITH]->(n2:EffectOrPhenotype)-[r2:ASSOCIATED_WITH]->(n3:GeneOrProtein), (n1:GeneOrProtein)-[r3:PPI]->(n3:GeneOrProtein)",
        "questionType": "Find gene/protein that can interect with <gene/protein> and both are associated with the same effect/phenotype."
    }
]

In [36]:
from langchain_core.prompts import PromptTemplate

#TODO: Allow for Creation of New Patterns too
MATCH_TEMPLATE = PromptTemplate.from_template("""
Task: From AccessPatterns, choose a questionType that best matches the Below Question. 
Return:
- the questionType
- associated pattern
- nodeSearchTerms: For each node in `pattern`, determine if the node requires a specific search phrase/filter based on the question.  Return a separate nodeSearchTerms mapping in the JSON form of {{"d1":"<search_prompt>","d2":"<search_prompt>"...}}  For each node that requires a search phrase/filter. Not all nodes will require one based on the question. Avoid generic filters like "condition", "disease", or "pathway".
- goodMatchFound
- feedback


If no match seems appropriate, provide feedback explaining why. 
Focus on matching intent and key concepts from the Question.


# Question
{question}

# AccessPatterns
{accessPatterns}

# Chosen MATCH Statement:
""")

In [37]:
from langchain_openai import ChatOpenAI


cypher_llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0, # turning temperature down for more deterministic results
    seed=42)

In [38]:
from typing import Dict
from pydantic import BaseModel, Field

class AccessPattern(BaseModel):
    """Access Pattern (Cypher Match Statement)"""
    pattern: str = Field(description="The Match statement")
    nodeSearchTerms: str = Field(description="Search terms to use for each node, decomposed from the question")
    questionType: str = Field(description="the type of question")
    goodMatchFound: bool = Field(description="Does the MATCH Statement fit the question well?")
    matchFeedback: str = Field(description="feedback explaining why.")

match_llm = cypher_llm.with_structured_output(AccessPattern)

In [39]:
def gen_match_pattern(question:str):
    prompt = MATCH_TEMPLATE.format(question=question, accessPatterns=json.dumps(access_patterns, indent=4), schema=schema)
    #print(prompt)
    match_pattern = match_llm.invoke(prompt)
    return match_pattern

In [40]:
# test
access_pattern_list = list()
for q in qa_dataset.data["query"][:10]:
    print('==================')
    print(q)
    print('----')
    ap = gen_match_pattern(q)
    access_pattern_list.append(ap)
    print(ap)

Could you identify any skin diseases associated with epithelial skin neoplasms? I've observed a tiny, yellowish lesion on sun-exposed areas of my face and neck, and I suspect it might be connected.
----
pattern='MATCH (n1:Disease)-[r1:PARENT_CHILD]->(n2:Disease)-[r2:PHENOTYPE_PRESENT]->(n3:EffectOrPhenotype)' nodeSearchTerms='{"n1":"skin diseases","n2":"epithelial skin neoplasms","n3":"lesion"}' questionType='What effect or phenotype is present in the sub type of <disease>?' goodMatchFound=True matchFeedback='The question is about identifying skin diseases associated with epithelial skin neoplasms, which aligns with the pattern that looks for diseases and their associated phenotypes.'
What drugs target the CYP3A4 enzyme and are used to treat strongyloidiasis?
----
pattern='MATCH (n1:Drug)-[r1:TARGET]->(n2:GeneOrProtein)-[r2:ENZYME]->(n3:Drug)' nodeSearchTerms='{"n1":"CYP3A4","n2":"strongyloidiasis"}' questionType='What drugs target the CYP3A4 enzyme and are used to treat strongyloidias

### Cypher Query Maker

In [28]:
from typing import Tuple, Optional
from pydantic import BaseModel
import re

p="(n1:EffectOrPhenotype)-[r1:PHENOTYPE_ABSENT]->(n2:Disease)<-[r2:!INDICATION]-(n3:Drug)"

class Head(BaseModel):
    node: str = None
    rel: Tuple[str, str, str, str] = None

    def from_string(s:str):

        if ":" not in s:
            return Head()
        else:
            node = s[:s.index(":")]
            rel_pattern = re.findall('\)(.*)\[(.*):(.*)](.*)', s)
            if len(rel_pattern) > 0:
                return Head(node=node, rel=rel_pattern[0])
            else:
                return Head(node=node)

    def make_where_part(self, ind:int) -> str:
        return f'(elementId({self.node}) IN nodeIdsLists[{ind}])'


    def make_fact_part(self) -> str:
        if self.node is None:
            return ''
        else:
            fact_part = f'{self.node}.name +  "{{" + "nodeId:" + {self.node}.nodeId + "}}"'
            if self.rel is not None:
                fact_part = fact_part + f'+ " {self.rel[0]} " + type({self.rel[1]}) + " {self.rel[3]} "'

        return fact_part

    def make_details_part(self) -> str:
        return f'collect(DISTINCT {self.node}.name +  "{{" + "nodeId:" + {self.node}.nodeId + ", details:" +  {self.node}.details + "}}") AS {self.node}TextDetails'


In [140]:
from typing import List

QUERY_START = '''
UNWIND $queryVectors AS q
CALL(q) {
  CALL db.index.vector.queryNodes("text_embeddings", 5000, q.queryVector) YIELD node AS n
  WHERE apoc.label.exists(n, q.label)
  WITH n LIMIT 100
  RETURN collect(elementId(n)) AS nodeIds
}
WITH collect(nodeIds) AS nodeIdsLists'''

def find_label(node:str, pattern: str) -> str:
    labels = re.findall(f'\({node}:(.*?)\)', pattern)
    if len(labels) > 0:
        return labels[0]
    else:
        return '__Entity__'

class GraphPattern(BaseModel):
    heads: List[Head] = []
    nodeSearchPrompts: List[Dict[str, str]] = []
    pattern:str

    def from_access_pattern(ap: AccessPattern):
        node_search_terms: Dict = eval(ap.nodeSearchTerms)
        node_search_prompts = [{'searchPrompt':v, 'label':find_label(k, ap.pattern), 'node':k} for k,v in node_search_terms.items()]
        return GraphPattern.from_string(s=ap.pattern, node_search_prompts=node_search_prompts)


    def from_string(s:str, node_search_prompts: List[Dict[str, str]] = []):
        heads = []
        for split in s.split("("):
            if len(split) > 0:
                h = Head.from_string(split)
                if h.node is not None:
                    heads.append(Head.from_string(split))
        return GraphPattern(heads=heads, pattern=s, nodeSearchPrompts=node_search_prompts)

    def make_where(self) -> str:
        if len(self.nodeSearchPrompts) < 1:
            return ''
        res_list = []
        filter_nodes = [d['node'] for d in self.nodeSearchPrompts]
        nodes_used=[]
        iter = 0
        for head in self.heads:
            if (head.node not in nodes_used) & (head.node in filter_nodes):
                res_list.append(head.make_where_part(iter))
                nodes_used.append(head.node)
                iter+=1
        return "WHERE " + " AND ".join(res_list)

    def make_facts(self) -> str:
        fact_list = []
        for head in self.heads:
            fact_part = head.make_fact_part()
            if len(fact_part) > 1:
                fact_list.append(fact_part)
        fact_ls = " + ".join(fact_list)
        
        return f"collect({fact_ls}) AS facts"

    def make_details(self) -> str:
        res_list = []
        nodes_used=[]
        for head in self.heads:
            if head.node not in nodes_used:
                res_list.append(head.make_details_part())
                nodes_used.append(head.node)
        return ",\n ".join(res_list)

    def make_query(self) -> str:
        if len(self.nodeSearchPrompts) < 1:
            return f'''
{self.pattern}
RETURN {self.make_facts()} AS facts,
{self.make_details()}
'''
        return f'''
{QUERY_START}
{self.pattern}
{self.make_where()}
RETURN {self.make_facts()},
{self.make_details()}
'''

        

In [141]:
access_pattern_list[3]

AccessPattern(pattern='MATCH (n1:Drug)-[r1:CONTRAINDICATION]->(n2:Disease)<-[r2:ASSOCIATED_WITH]-(n3:GeneOrProtein)', nodeSearchTerms='{"n2":"epithelioid sarcoma","n3":"EZH2"}', questionType='Identify diseases associated with <gene/protein> and are contraindicated with <drug>', goodMatchFound=True, matchFeedback="The chosen pattern effectively identifies drugs that are contraindicated for diseases associated with the EZH2 gene product, which aligns with the question's focus on drugs for epithelioid sarcoma.")

In [142]:
access_pattern_ex = access_pattern_list[3]
node_search_terms_ex = eval(access_pattern_ex.nodeSearchTerms)

In [143]:
# test

gp = GraphPattern.from_access_pattern(access_pattern_ex)
print(gp.make_query())



UNWIND $queryVectors AS q
CALL(q) {
  CALL db.index.vector.queryNodes("text_embeddings", 5000, q.queryVector) YIELD node AS n
  WHERE apoc.label.exists(n, q.label)
  WITH n LIMIT 100
  RETURN collect(elementId(n)) AS nodeIds
}
WITH collect(nodeIds) AS nodeIdsLists
MATCH (n1:Drug)-[r1:CONTRAINDICATION]->(n2:Disease)<-[r2:ASSOCIATED_WITH]-(n3:GeneOrProtein)
WHERE (elementId(n2) IN nodeIdsLists[0]) AND (elementId(n3) IN nodeIdsLists[1])
RETURN collect(n1.name +  "{" + "nodeId:" + n1.nodeId + "}"+ " - " + type(r1) + " -> " + n2.name +  "{" + "nodeId:" + n2.nodeId + "}"+ " <- " + type(r2) + " - " + n3.name +  "{" + "nodeId:" + n3.nodeId + "}") AS facts,
collect(DISTINCT n1.name +  "{" + "nodeId:" + n1.nodeId + ", details:" +  n1.details + "}") AS n1TextDetails,
 collect(DISTINCT n2.name +  "{" + "nodeId:" + n2.nodeId + ", details:" +  n2.details + "}") AS n2TextDetails,
 collect(DISTINCT n3.name +  "{" + "nodeId:" + n3.nodeId + ", details:" +  n3.details + "}") AS n3TextDetails



In [144]:
gp.nodeSearchPrompts

[{'searchPrompt': 'epithelioid sarcoma', 'label': 'Disease', 'node': 'n2'},
 {'searchPrompt': 'EZH2', 'label': 'GeneOrProtein', 'node': 'n3'}]

In [145]:
gp = GraphPattern.from_string(p.pattern)
print(gp.make_query())
print(gp.nodeSearchPrompts)


MATCH (n1:Drug)-[r1:CONTRAINDICATION]->(n2:Disease)<-[r2:ASSOCIATED_WITH]-(n3:GeneOrProtein)
RETURN collect(n1.name +  "{" + "nodeId:" + n1.nodeId + "}"+ " - " + type(r1) + " -> " + n2.name +  "{" + "nodeId:" + n2.nodeId + "}"+ " <- " + type(r2) + " - " + n3.name +  "{" + "nodeId:" + n3.nodeId + "}") AS facts AS facts,
collect(DISTINCT n1.name +  "{" + "nodeId:" + n1.nodeId + ", details:" +  n1.details + "}") AS n1TextDetails,
 collect(DISTINCT n2.name +  "{" + "nodeId:" + n2.nodeId + ", details:" +  n2.details + "}") AS n2TextDetails,
 collect(DISTINCT n3.name +  "{" + "nodeId:" + n3.nodeId + ", details:" +  n3.details + "}") AS n3TextDetails

[]


### Data Retriever

In [146]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.graphs import Neo4jGraph

embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
graph = Neo4jGraph()

def retrieve(gp:GraphPattern):
    query_vectors = [{'queryVector':embedding_model.embed_query(sp['searchPrompt']), 'label':sp['label'] } for sp in gp.nodeSearchPrompts]
    query_results = graph.query(gp.make_query(), params={"queryVectors": query_vectors})
    return query_results

In [147]:
gp = GraphPattern.from_access_pattern(access_pattern_ex)
retrieve(gp)

[{'facts': [], 'n1TextDetails': [], 'n2TextDetails': [], 'n3TextDetails': []}]

In [154]:
PROMPT = PromptTemplate.from_template('''
Yu are a medical professional using your expertise to Answer the Question with the following Facts and Details knowledge.

Return result as JSON using the following format:
{{"nodeIds": [list of integer nodeIds containing the correct answers in order of probability highest to smallest], "explanation": "written explanation as to why the node id answers were provided"}}

# Question:
{question}
 
# Facts:
{facts}

# Details
{details}

# Answer:
''')

response_llm = ChatOpenAI(
    model_name="gpt-4o",
    response_format={"type": "json_object"}, # use json_object formatting for best results
    temperature=0, # turning temperature down for more deterministic results
    seed=42
)


def answer(question: str):
    
    #print(f"Starting CHain to Answer Question:\n{question}")
    
    #print('\n\nFinding Access Pattern')
    match_pattern = gen_match_pattern(question)
    #print(match_pattern)
    
    #print("\n\nCreating Cypher Query Graph Pattern")
    gp = GraphPattern.from_access_pattern(match_pattern)
    #print(f"CypherQuery:\n{gp.make_query()}")
    #print(f"SearchPrompts:\n{gp.nodeSearchPrompts}")

    #print("\n\nRetrieving Data")
    context = retrieve(gp)
    #print(f"context Len (should be 1): {len(context)}")
    #print(context)
    
    print("\n\nGenerating Response")
    if len(context) > 0:
        prompt = PROMPT.format(question=question, facts=context[0]['facts'], details={k:v for k,v in context[0].items() if k != 'facts'})
    else:
        prompt = PROMPT.format(question=question, facts='', details='')
    return llm.invoke(prompt), context

In [155]:

res_list = []
for q in qa_dataset.data["query"][:10]:
    try:
        res, context  = answer(q)
        res_list.append((res, context))
    except Exception as e:
        print(str(e))



Generating Response


Generating Response


Generating Response


Generating Response


Generating Response


Generating Response


Generating Response


Generating Response


Generating Response


Generating Response


In [156]:
for res in res_list:
    print(res[0].content)

{"nodeIds": [1, 2, 3], "explanation": "The description of a tiny, yellowish lesion on sun-exposed areas of the face and neck is suggestive of a condition known as actinic keratosis, which is a type of epithelial skin neoplasm. Actinic keratosis is a precancerous skin condition caused by long-term exposure to ultraviolet (UV) light, typically from the sun. It is characterized by rough, scaly patches on the skin that can appear yellowish or reddish. This condition is considered a precursor to squamous cell carcinoma, a type of skin cancer. Other epithelial skin neoplasms include basal cell carcinoma and seborrheic keratosis, but the description provided aligns most closely with actinic keratosis. Therefore, the most probable skin disease associated with the described lesion is actinic keratosis, followed by other potential neoplasms like basal cell carcinoma and seborrheic keratosis."}
{"nodeIds": [], "explanation": "Strongyloidiasis is a parasitic infection caused by the nematode Strong

In [153]:
for res in res_list:
    print(res[1][0]['facts'])

[]
[]
['transposition of the great arteries{nodeId:38925} - PARENT_CHILD -> congenitally corrected transposition of the great arteries{nodeId:33170} - PHENOTYPE_PRESENT -> Cardiac conduction abnormality{nodeId:23921}']
[]
[]
['congenital tricuspid malformation{nodeId:33663} - PARENT_CHILD -> Ebstein anomaly (disease){nodeId:28145} - PHENOTYPE_PRESENT -> Fatigue{nodeId:25620}', 'congenital heart disease{nodeId:35453} - PARENT_CHILD -> congenital heart defects, multiple types{nodeId:27908} - PHENOTYPE_PRESENT -> Anxiety{nodeId:22447}', 'atrioventricular septal defect{nodeId:28501} - PARENT_CHILD -> congenital heart defects, multiple types{nodeId:27908} - PHENOTYPE_PRESENT -> Anxiety{nodeId:22447}', 'congenital heart malformation{nodeId:37792} - PARENT_CHILD -> congenital heart defects, multiple types{nodeId:27908} - PHENOTYPE_PRESENT -> Anxiety{nodeId:22447}']
['PFKFB3{nodeId:4859} - ASSOCIATED_WITH -> bipolar disorder{nodeId:38242} - ASSOCIATED_WITH -> AKT1{nodeId:1057}PFKFB3{nodeId:485

### (Optional) Gen Match Patterns from Paper Relationql Patterns
This step is just to format the access_patterns for match stamtent generation.  Only need to run once, not for online graphRAG pipeline


In [None]:
access_logics = {
    "(effect/phenotype → [phenotype absent] → disease ← [!indication] ← drug)": "Find diseases with zero indication drug and are associated with <effect/phenotype>",
    "(drug → [contraindication] → disease ← [associated with] ← gene/protein)": "Identify diseases associated with <gene/protein> and are contraindicated with <drug>",
    "(anatomy → [expression present] → gene/protein ← [expression absent] ← anatomy)": "What gene or protein is expressed in <anatomy1> while is absent in <anatomy2>?",
    "(anatomy → [expression absent] → gene/protein ← [expression absent] ← anatomy)": "What gene/protein is absent in both <anatomy1> and <anatomy2>?",
    "(drug → [carrier] → gene/protein ← [carrier] ← drug)": "Which target genes are shared carriers between <drug1> and <drug2>?",
    "(anatomy → [expression present] → gene/protein → [target] → drug)": "What is the drug that targets the genes or proteins which are expressed in <anatomy>?",
    "(drug → [side effect] → effect/phenotype → [side effect] → drug)": "What drug has common side effects as <drug>?",
    "(drug → [carrier] → gene/protein → [carrier] → drug)": "What is the drug that has common gene/protein carrier with <drug>?",
    "(anatomy → [expression present] → gene/protein → enzyme → drug)": "What is the drug that some genes or proteins act as an enzyme upon, where the genes or proteins are expressed in <anatomy>?",
    "(cellular_component → [interacts with] → gene/protein → [carrier] → drug)": "What is the drug carried by genes or proteins that interact with <cellular_component>?",
    "(molecular_function → [interacts with] → gene/protein → [target] → drug)": "What drug targets the genes or proteins that interact with <molecular_function>?",
    "(effect/phenotype → [side effect] → drug → [synergistic interaction] → drug)": "What drug has a synergistic interaction with the drug that has <effect/phenotype> as a side effect?",
    "(disease → [indication] → drug → [contraindication] → disease)": "What disease is a contraindication for the drugs indicated for <disease>?",
    "(disease → [parent-child] → disease → [phenotype present] → effect/phenotype)": "What effect or phenotype is present in the sub type of <disease>?",
    "(gene/protein → [transporter] → drug → [side effect] → effect/phenotype)": "What effect or phenotype is a [side effect] of the drug transported by <gene/protein>?",
    "(drug → [transporter] → gene/protein → [interacts with] → exposure)": "What exposure may affect <drug>s efficacy by acting on its transporter genes?",
    "(pathway → [interacts with] → gene/protein → [ppi] → gene/protein)": "What gene/protein interacts with the gene/protein that related to <pathway>?",
    "(drug → [synergistic interaction] → drug → [transporter] → gene/protein)": "What gene or protein transports the drugs that have a synergistic interaction with <drug>?",
    "(biological_process → [interacts with] → gene/protein → [interacts with] → biological_process)": "What biological process has the common interactino pattern with gene or proteins as <biological_process>?",
    "(effect/phenotype → [associated with] → gene/protein → [interacts with] → biological_process)": "What biological process interacts with the gene/protein associated with <effect/phenotype>?",
    "(drug → [transporter] → gene/protein → [expression present] → anatomy)": "What anatomy expressesed by the gene/protein that affect the transporter of <drug>?",
    "(drug → [target] → gene/protein → [interacts with] → cellular_component)": "What cellular component interacts with genes or proteins targeted by <drug>?",
    "(biological_process → [interacts with] → gene/protein → [expression absent] → anatomy)": "What anatomy does not express the genes or proteins that interacts with <biological_process>?",
    "(effect/phenotype → [associated with] → gene/protein → [expression absent] → anatomy)": "What anatomy does not express the genes or proteins associated with <effect/phenotype>?",
    "(drug → [indication] → disease → [indication] → drug) & (drug → [synergistic interaction] → drug)": "Find drugs that has a synergistic interaction with <drug> and both are indicated for the same disease.",
    "(pathway → [interacts with] → gene/protein → [interacts with] → pathway) & (pathway → [parent-child] → pathway)": "Find pathway that is related with <pathway> and both can [interacts with] the same gene/protein.",
    "(gene/protein → [associated with] → disease → [associated with] → gene/protein) & (gene/protein → [ppi] → gene/protein)": "Find gene/protein that can interect with <gene/protein> and both are associated with the same disease.",
    "(gene/protein → [associated with] → effect/phenotype → [associated with] → gene/protein) & (gene/protein → [ppi] → gene/protein)": "Find gene/protein that can interect with <gene/protein> and both are associated with the same effect/phenotype."
}

In [None]:
examples = """
## Example 1
### AccessLogic
(disease → [parent-child] → disease → [phenotype present] → effect/phenotype)
#### MATCH Statement
MATCH p=(n1:Disease)-[r1:PARENT_CHILD]->(n2:Disease)-[r2:PHENOTYPE_PRESENT]->(n3:EffectOrPhenotype)

## Example 2
### AccessLogic
(effect/phenotype → [phenotype absent] → disease ← [!indication] ← drug)
#### MATCH Statement
MATCH p=(n1:EffectOrPhenotype)-[r1:PHENOTYPE_ABSENT]->(n2:Disease)<-[r2:!INDICATION]-(n3:Drug)

## Example 3
### AccessLogic
(pathway → [interacts with] → gene/protein → [interacts with] → pathway) & (pathway → [parent-child] → pathway)
#### MATCH Statement
MATCH (n1:Pathway)-[r1:INTERACTS_WITH]->(n2:GeneOrProtein)-[r2:INTERACTS_WITH]->(n3:Pathway), (n1:Pathway)-[r3:PARENT_CHILD]->(n3:Pathway)

## Example 4
### AccessLogic
(gene/protein → [associated with] → effect/phenotype → [associated with] → gene/protein) & (gene/protein → [ppi] → gene/protein)
#### MATCH Statement
MATCH (n1:GeneOrProtein)-[r1:ASSOCIATED_WITH]->(n2:EffectOrPhenotype)-[r2:ASSOCIATED_WITH]->(n3:GeneOrProtein), (n1:GeneOrProtein)-[r3:PPI]->(n3:GeneOrProtein)
"""

In [None]:
CYPHER_TEMPLATE = """
Task: Translate the below AccessLogic into a MATCH Statement in Cypher, Neo4j's Graph Query language, based on the supplied GraphSchema. 
Follow the format used in the Examples. 


# AccessLogic
{accessLogic}

# GraphSchema:
{schema}

# Examples:
{examples}


- Do not use any labels or relationships not included in the GraphSchema.
- The symbol ! should be respected and maintained in translation as long as it is valid syntax
- When an "&" is used in AccessLogic, it usually means no new nodes are being introduced after the "&" symbol. Instead reuse node variables assuming no cyclical relationships.  See examples.
- Do not include triple backticks ``` or any additional text except the generated Cypher MATCH Statement in your response.

# MATCH Statement:
"""

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI


cypher_llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0, # turning temperature down for more deterministic results
    seed=42
)



def gen_match_pattern(access_logic:str):
    prompt = CYPHER_TEMPLATE.format(accessLogic = access_logic, schema=schema, examples=examples)
    match_pattern = cypher_llm.invoke(prompt)
    gp = GraphPattern.from_string(match_pattern.content)
    return match_pattern.content #gp.make_query()

In [None]:
access_patterns = []
for k, v in access_logics.items():
    print('\n=========================')
    print(k)
    print('-------------------')
    access_patterns.append({"pattern": gen_match_pattern(k), "description": v})

In [None]:
print(json.dumps(access_patterns, indent=4))