# End-to-End Example

In [None]:
%%capture
%pip install fsspec langchain-text-splitters openai python-dotenv numpy torch

In [None]:
%%capture
%pip install -U git+https://github.com/neo4j/neo4j-graphrag-python

In [None]:
#%capture
#%pip install -U ../demo/neo4j-graphrag-python

In [1]:
from dotenv import load_dotenv
import os

# load neo4j credentials (and openai api key in background)
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

pdf_file_path = "pgpm-13-39.pdf"

## KG Building


In [2]:
import neo4j
from neo4j_graphrag.llm import OpenAILLM

driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

extractor_llm=OpenAILLM(
    model_name="gpt-4o-mini",
    model_params={
        "max_tokens": 16_000,
        "response_format": {"type": "json_object"}, # use json_object formatting for best results
        "temperature": 0 # turning temperature down for more deterministic results
    }
)

In [3]:
from neo4j_graphrag.experimental.components.schema import (
    SchemaEntity,
    SchemaProperty,
    SchemaRelation,
)

# define entity types (a.k.a node labels)
basic_node_labels = ["Object",
                     "Entity",
                     "Group",
                     "Person",
                     "Organization",
                     "Place"
                     ]
academic_node_labels = ["ArticleOrPaper",
                        "PublicationOrJournal"]

medical_node_labels = ["Anatomy",
                       "BiologicalProcess",
                       "Cell",
                       "CellularComponent",
                       "CellType",
                       "Condition",
                       "Disease",
                       "Drug",
                       "EffectOrPhenotype",
                       "Exposure",
                       "GeneOrProtein",
                       "Molecule",
                       "MolecularFunction",
                       "Pathway"]

node_labels = basic_node_labels + academic_node_labels + medical_node_labels

# define relationship types (a.k.a how entities can be related)
rel_types = ["ACTIVATES",
             "AFFECTS",
             "ASSESSES",
             "ASSOCIATED_WITH",
             "AUTHORED",
             "CAUSES",
             "CITES",
             "CLASSIFIES",
             "COLLABORATES_WITH"
             "CONTRIBUTES_TO",
             "CORRELATES_WITH",
             "DESCRIBES",
             "DEVELOPED",
             "DISCUSSES",
             "EXHIBITS",
             "EXPRESSES",
             "HAS_EFFECT",
             "HAS_SYMPTOM",
             "INCLUDES",
             "INDUCES",
             "INTERACTS_WITH",
             "INVOLVES",
             "LEADS_TO",
             "LINKED_TO",
             "LOCATED_IN",
             "MANIFESTS_AS",
             "OBSERVED_IN",
             "PARTICIPATES_IN",
             "PART_OF",
             "PRODUCES",
             "PUBLISHED_IN",
             "REACTS_WITH",
             "REDUCES",
             "RELATED_TO",
             "RESULTS_IN",
             "TARGETS",
             "TREATMENT_FOR",
             "TRIGGERS",
             "USED_FOR",
             "USED_WITH",
             "USES"]


# define properties. In this case we will allow for the same two properties on both nodes and relationships. 
properties = [SchemaProperty(name="name", type="STRING"), SchemaProperty(name="details", type="STRING")]

# build the schema objects

entities = [SchemaEntity(label=i, properties=properties) for i in node_labels],
relations = [SchemaRelation(label=i, properties=properties) for i in rel_types],
potential_schema = [] #this can be used to define which relationships types should connect to which entity types. We skip here, but recommended for refining knowledge graphs. 


In [4]:
prompt_template = '''
You are a medical researcher tasks with extracting information from papers 
and structuring it in a property graph to inform further medical and research Q&A

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. the relationship direction goes from the start node to the end node. 


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity", "details":" brief description of entity (dont include info about relationships)" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "brief description of relationship if needed"}} }}] }}

Use only the information from the Input text.  Do not add any additional information.  If the input text is empty, return empty Json. 
Make sure to create as many nodes and relationships as needed to offer rich medical context for further research.
An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions. 

Use only fhe following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

In [5]:
from langchain_text_splitters import CharacterTextSplitter
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder_pdf = SimpleKGPipeline(
    llm=extractor_llm,
    driver=driver,
    entities=node_labels,
    relations=rel_types,
    potential_schema=potential_schema,
    prompt_template=prompt_template,
    text_splitter=LangChainTextSplitterAdapter(CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator=".")),
    from_pdf=True
)

In [6]:
pdf_result = await kg_builder_pdf.run_async(file_path=pdf_file_path)
print(f"PDF Processing Result: {pdf_result}")

Created a chunk of size 563, which is longer than the specified 500
Created a chunk of size 646, which is longer than the specified 500
Created a chunk of size 732, which is longer than the specified 500


PDF Processing Result: run_id='ae54d809-a9c6-4953-984e-45e3c288f6e2' result={'resolver': {'number_of_nodes_to_resolve': 1041, 'number_of_created_nodes': 854}}


In [7]:
with neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as driver:
    with driver.session(database="neo4j") as session:
        session.run('''
        MATCH (n:Chunk) WHERE size(n.text) <> 0
        WITH collect(n) AS nodes, toInteger(rand()*$numberOfBatches) AS partition
        CALL (nodes){
            CALL genai.vector.encodeBatch([node IN nodes| node.text], "OpenAI", { token: $token})
            YIELD index, vector
            CALL db.create.setNodeVectorProperty(nodes[index], "embedding", vector)
        } IN TRANSACTIONS OF 1 ROW''', parameters={'token':os.getenv('OPENAI_API_KEY'), 'numberOfBatches':2})

## KG Retrieval
Now lets make some knowledge graph retrievers which we will later use for a GraphRAG pipeline

We will leverage Neo4j's vector search capabilities here. To do this we need to begin by creating a vector index on the text in our CHunk nodes

In [8]:
from neo4j_graphrag.indexes import create_vector_index

create_vector_index(driver, name="text_embeddings", label="Chunk", embedding_property="embedding", dimensions=1536, similarity_fn="cosine")

Neo4jIndexError: Neo4j vector index creation failed: An equivalent index already exists, 'Index( id=3, name='text_embeddings', type='VECTOR', schema=(:Chunk {embedding}), indexProvider='vector-2.0' )'.

Now that the index is set up we will start simple with a VectorRetriever.  The VectorRetriever just queries Chunk nodes, brining back the text and some metadata

In [9]:
from neo4j_graphrag.retrievers import VectorRetriever

from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

embedder = OpenAIEmbeddings()

vector_retriever = VectorRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    return_properties=["text"],
)

In [10]:
vector_res = vector_retriever.get_search_results(query_text = "Which biomarkers are associated with predicting organ damage in lupus patients?")

for i in vector_res.records: print("====\n" + i.data()['node']['text'])

====
Ideally every
patient would undergo an initial evaluation that would
proﬁle his/her disease, assessing the main pathophysiolo-
gic pathway through biomarkers, therefore predicting risk
of speci ﬁc organ damage, most adequate treatment, and
would allow better follow-up and ﬂare prediction.
In this review, we will outline the pathological processes
in lupus in general terms with particular emphasis on neu-
ropsychiatric and renal involvement
====
2017 ;43:531 –548. doi:10.1016/j.rdc.2017.06.003
49. Hanly J, Urowitz MB, Su L, et al. Autoantibodies as biomarkers for
the prediction of neuropsychiatric events in systemic lupus
erythematosus. Ann Rheum Dis .2011 ;10:1726 –1732. doi:10.1136/
ard.2010.148502
50. McGlasson S, Wiseman S, Wardlaw J, Dhaun N, Hunt DPJ.
Neurological disease in lupus: toward a personalized medicine
approach. Front Immunol .2018 ;9(1146):1 –12. doi:10.3389/ ﬁmmu.
2018.01146
51. Tchessalova D, Posillico CK, Tronson NC
====
Use of blood
markers notably anti-dsDNA a

In [11]:
vector_res

RawSearchResult(records=[<Record node={'text': 'Ideally every\npatient would undergo an initial evaluation that would\nproﬁle his/her disease, assessing the main pathophysiolo-\ngic pathway through biomarkers, therefore predicting risk\nof speci ﬁc organ damage, most adequate treatment, and\nwould allow better follow-up and ﬂare prediction.\nIn this review, we will outline the pathological processes\nin lupus in general terms with particular emphasis on neu-\nropsychiatric and renal involvement'} score=0.93621826171875>, <Record node={'text': '2017 ;43:531 –548. doi:10.1016/j.rdc.2017.06.003\n49. Hanly J, Urowitz MB, Su L, et al. Autoantibodies as biomarkers for\nthe prediction of neuropsychiatric events in systemic lupus\nerythematosus. Ann Rheum Dis .2011 ;10:1726 –1732. doi:10.1136/\nard.2010.148502\n50. McGlasson S, Wiseman S, Wardlaw J, Dhaun N, Hunt DPJ.\nNeurological disease in lupus: toward a personalized medicine\napproach. Front Immunol .2018 ;9(1146):1 –12. doi:10.3389/ ﬁmmu

The GraphRAG Python Package offers a whole host of other useful retrieval covering different graphRAG patterns (link) including (examples).  and if none of those fit perfectly you can impliment your own custom retrievers (link). 

Below we will use the VectorCypherRetriever which allows you to run a graph traversal after finding text chunks.  We will use the Cypher Query language (link) to define the logic to traverse the graph.  

As a simple starting point, lets traverse three hops out from each chunk and textualize the different relationships we pick up.  We will use something called a quantified path pattern to accomplish in this. 

In [17]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

vc_retriever=VectorCypherRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    retrieval_query="""
MATCH(node)<-[:FROM_CHUNK]-()-[rl:!FROM_CHUNK]-{1,3}()
UNWIND rl AS r
WITH DISTINCT r
MATCH (n)-[r]->(m)
// return textualize relations
RETURN n.name + '(' + coalesce(n.details, '') + ')'+ 
    ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' + 
    m.name + '(' + coalesce(m.details, '') + ')' AS fact
        """,
)


In [18]:
vc_res = vc_retriever.get_search_results(query_text = "Which biomarkers are associated with predicting organ damage in lupus patients?")
for i in vc_res.records: print("====\n" + i.data()['fact']) 

====
lupus(A disease characterized by inflammation and damage to various organs.) - ASSOCIATED_WITH(Lupus is associated with neuropsychiatric involvement.) -> neuropsychiatric involvement(The impact of lupus on neurological and psychiatric health.)
====
lupus(A disease characterized by inflammation and damage to various organs.) - ASSOCIATED_WITH(Lupus is associated with renal involvement.) -> renal involvement(The impact of lupus on kidney function.)
====
BILAG score(A scoring system used to assess disease activity in patients.) - ASSOCIATED_WITH(BILAG score is associated with renal involvement.) -> renal involvement(The impact of lupus on kidney function.)
====
lupus(A disease characterized by inflammation and damage to various organs.) - ASSOCIATED_WITH(Flares in lupus are probably linked to epigenetic dis-equilibrium.) -> epigenetic dis-equilibrium(A disruption in the normal epigenetic regulation of gene expression.)
====
epigenetic dis-equilibrium(A disruption in the normal epigen

Different retrievers will return different types of data which may be more informative for your us4e case. in the examples above the vectorRetriever just returned some text chunks, but it wasn't clear if that fully answered the search prompt.  The VectorCypherRetriever returned much more information (using the entites and rels connected to the text chunks) some of those "rules" seem like they will be useful.  Lets put these into full GraphRAG pipelines below to understand how they will perform. 

## GraphRAG Pipelines
 You can construct GraphRAG pipelines VectorCypherRetriever with the `GraphRAG` class.  At minimum you will need to pass the constructor an LLM and a retriever. You can also pass the LLM a custome propmt template, but for now we will just use the default.
 
We construct two retrievers below.  One using vectorRetriever, the other using VectorCypherRetriever.  We will compare the results of each iwth a few questions.  

For the LLM we will use OpenAI gpt-4o for both, it as stated previously you can use [.....]

In [19]:
from neo4j_graphrag.llm import OpenAILLM as LLM
from neo4j_graphrag.generation.graphrag import GraphRAG

llm = LLM(model_name="gpt-4o",  model_params={"temperature": 0})

vector_rag = GraphRAG(llm=llm, retriever=vector_retriever)
vc_rag = GraphRAG(llm=llm, retriever=vc_retriever)

Lets start with  simple question and compare the answers for each

In [20]:
q = "What organ systems are most commonly affected by lupus? - provide a list"
print(f"Vector Response: \n{vector_rag.search(q, retriever_config={'top_k':3}).answer}")
print(f"\n\nVector + Cypher Response: \n{vc_rag.search(q, retriever_config={'top_k':3}).answer}")

Vector Response: 
The organ systems most commonly affected by lupus include:

1. Skin
2. Joints
3. Kidneys (lupus nephritis)
4. Blood cells
5. Brain and central nervous system (neuropsychiatric involvement)
6. Heart
7. Lungs


Vector + Cypher Response: 
Lupus can affect various organ systems, including:

1. Skin
2. Joints
3. Kidneys
4. Blood cells
5. Brain and central nervous system
6. Heart
7. Lungs


Now lets try a bit more of a complex question

In [21]:
q = "Which biomarkers are associated with predicting organ damage in lupus patients? - provide a list"
print(f"Vector Response: \n{vector_rag.search(q).answer}")
print(f"\n\nVector + Cypher Response: \n{vc_rag.search(q).answer}")

Vector Response: 
The context does not provide a specific list of biomarkers associated with predicting organ damage in lupus patients. However, it mentions the use of biomarkers to assess the main pathophysiologic pathway, predict risk of specific organ damage, and guide treatment and follow-up in lupus patients. For more detailed information, further research into specific studies or reviews on lupus biomarkers would be necessary.


Vector + Cypher Response: 
The context provided does not explicitly mention specific biomarkers associated with predicting organ damage in lupus patients. However, it does mention that biomarkers express the condition of lupus, which may imply their potential role in assessing disease risk and progression, including organ damage. Additionally, the context discusses the use of autoantibodies as biomarkers for predicting neuropsychiatric events in systemic lupus erythematosus, which could be related to organ damage in the context of neuropsychiatric involve