# GraphRAG Python
End-to-End Example

In [None]:
%%capture
%pip install fsspec langchain-text-splitters openai python-dotenv numpy torch

In [None]:
%%capture
%pip install -U git+https://github.com/neo4j/neo4j-graphrag-python

In [None]:
#%capture
#%pip install -U ../demo/neo4j-graphrag-python

In [None]:
from dotenv import load_dotenv
import os

# Read settings from the local .env file; values found there take precedence
# over variables already present in the environment (override=True).
# The OpenAI API key is expected to come from the same file.
load_dotenv('.env', override=True)

NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(setting)
    for setting in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)

# PDF that the knowledge graph is built from.
pdf_file_path = "pgpm-13-39.pdf"

## KG Building


In [None]:
import neo4j
from neo4j_graphrag.llm import OpenAILLM

# Notebook-wide Neo4j driver, authenticated with the credentials loaded above.
driver = neo4j.GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
)

# LLM used for entity/relationship extraction when building the graph.
extractor_llm = OpenAILLM(
    model_name="gpt-4o-mini",
    model_params=dict(
        max_tokens=16_000,
        response_format=dict(type="json_object"),  # json_object formatting for best results
        temperature=0,  # low temperature for more deterministic results
    ),
)

In [None]:
# Node labels the extractor is allowed to use, grouped by domain.
basic_node_labels = [
    "Object",
    "Entity",
    "Group",
    "Person",
    "Organization",
    "Place",
]

academic_node_labels = [
    "ArticleOrPaper",
    "PublicationOrJournal",
]

medical_node_labels = [
    "Anatomy",
    "BiologicalProcess",
    "Cell",
    "CellularComponent",
    "CellType",
    "Condition",
    "Disease",
    "Drug",
    "EffectOrPhenotype",
    "Exposure",
    "GeneOrProtein",
    "Molecule",
    "MolecularFunction",
    "Pathway",
]

node_labels = basic_node_labels + academic_node_labels + medical_node_labels

# Relationship types the extractor is allowed to use.
# Every entry ends with a comma: a missing comma makes Python silently
# concatenate two adjacent string literals into a single bogus type.
rel_types = [
    "ACTIVATES",
    "AFFECTS",
    "ASSESSES",
    "ASSOCIATED_WITH",
    "AUTHORED",
    "CAUSES",
    "CITES",
    "CLASSIFIES",
    "COLLABORATES_WITH",
    "CONTRIBUTES_TO",
    "CORRELATES_WITH",
    "DESCRIBES",
    "DEVELOPED",
    "DISCUSSES",
    "EXHIBITS",
    "EXPRESSES",
    "HAS_EFFECT",
    "HAS_SYMPTOM",
    "INCLUDES",
    "INDUCES",
    "INTERACTS_WITH",
    "INVOLVES",
    "LEADS_TO",
    "LINKED_TO",
    "LOCATED_IN",
    "MANIFESTS_AS",
    "OBSERVED_IN",
    "PARTICIPATES_IN",
    "PART_OF",
    "PRODUCES",
    "PUBLISHED_IN",
    "REACTS_WITH",
    "REDUCES",
    "RELATED_TO",
    "RESULTS_IN",
    "TARGETS",
    "TREATMENT_FOR",
    "TRIGGERS",
    "USED_FOR",
    "USED_WITH",
    "USES",
]


In [None]:
# Extraction prompt handed to the KG builder.
# `{schema}`, `{examples}` and `{text}` are placeholders filled in later;
# literal JSON braces are doubled (`{{` / `}}`) so they survive
# str.format-style substitution.
prompt_template = '''
You are a medical researcher tasked with extracting information from papers
and structuring it in a property graph to inform further medical and research Q&A.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity", "details":" brief description of entity (don't include info about relationships)" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "brief description of relationship if needed"}} }}] }}

Use only the information from the Input text.  Do not add any additional information.  If the input text is empty, return empty JSON.
Make sure to create as many nodes and relationships as needed to offer rich medical context for further research.
An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions.

Use only the following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

In [None]:
from langchain_text_splitters import CharacterTextSplitter
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

# Configure the KG-building pipeline with the extraction LLM, the Neo4j driver,
# the allowed node labels / relationship types and the custom prompt.
# from_pdf=True: the run cell passes a PDF file path as input.
kg_builder_pdf = SimpleKGPipeline(
    driver=driver,
    llm=extractor_llm,
    prompt_template=prompt_template,
    entities=node_labels,
    relations=rel_types,
    # Optional custom chunking (this is what the splitter imports above are for):
    # text_splitter=LangChainTextSplitterAdapter(CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator=".")),
    from_pdf=True,
)

In [None]:
# Run the KG pipeline on the PDF and print whatever result object it returns.
# Uses top-level `await`, so this cell relies on the notebook's running event loop.
pdf_result = await kg_builder_pdf.run_async(file_path=pdf_file_path)
print(f"PDF Processing Result: {pdf_result}")

In [None]:
# Compute an embedding for every Chunk with non-empty text and store it in the
# node's `embedding` property. Chunks are split at random into
# `numberOfBatches` groups; each group is encoded with one
# genai.vector.encodeBatch call and written in its own transaction.
#
# Reuse the notebook-wide `driver` here. Opening a second driver with
# `with ... as driver` would rebind the name and close it when the block exits,
# leaving the index-creation and retriever cells below with a closed driver.
with driver.session(database="neo4j") as session:
    session.run('''
    MATCH (n:Chunk) WHERE size(n.text) <> 0
    WITH collect(n) AS nodes, toInteger(rand()*$numberOfBatches) AS partition
    CALL (nodes){
        CALL genai.vector.encodeBatch([node IN nodes| node.text], "OpenAI", { token: $token})
        YIELD index, vector
        CALL db.create.setNodeVectorProperty(nodes[index], "embedding", vector)
    } IN TRANSACTIONS OF 1 ROW''',
        parameters={'token': os.getenv('OPENAI_API_KEY'), 'numberOfBatches': 2},
    ).consume()  # run the query to completion here so failures surface in this cell

## KG Retrieval
Now let's build some knowledge graph retrievers, which we will later use in a GraphRAG pipeline.

We will leverage Neo4j's vector search capabilities here. To do this, we first need to create a vector index on the embeddings of the text in our Chunk nodes.

In [None]:
from neo4j_graphrag.indexes import create_vector_index

# Vector index over the `embedding` property of Chunk nodes.
create_vector_index(
    driver,
    name="text_embeddings",
    label="Chunk",
    embedding_property="embedding",
    dimensions=1536,  # presumably the size of the stored OpenAI embeddings — confirm if the model changes
    similarity_fn="cosine",
)

Now that the index is set up, we will start simple with a VectorRetriever. The VectorRetriever just queries Chunk nodes, bringing back the text and some metadata.

In [9]:
from neo4j_graphrag.retrievers import VectorRetriever

from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

# Embedding client shared by the retrievers in this notebook.
embedder = OpenAIEmbeddings()

# Plain vector search over the "text_embeddings" index, returning only each
# matched node's `text` property.
vector_retriever = VectorRetriever(
    driver,
    embedder=embedder,
    index_name="text_embeddings",
    return_properties=["text"],
)

In [10]:
vector_res = vector_retriever.get_search_results(
    query_text="Which biomarkers are associated with predicting organ damage in lupus patients?"
)

# Print the text of every returned node, each preceded by a separator line.
for record in vector_res.records:
    print("====\n" + record.data()['node']['text'])

====
se
with active disease] or C3 levels [which fall when the disease
is active] provide help to the clinician when assessing disease
activity. Scoring is determined based on new, improving,
worse and similar categories. The art of the physician is to
identify the activity features clinically and apply the appro-
priate management. Renal lupus can, rarely, be active without
the presence of other SLE disease involvement. Cerebral lupus
is scored in 20 separate sections from seizures to demyelina-
tion. In patients with subacute presentations, it may be dif ﬁcult
to assess and treat. Anti-dsDNA antibodies and complement
can be normal in a proportion of cases.
SLEDAI scoring systems are more limited in represen-
tation of the features of lupus that are present. It provides
a global score rather than distinguishing the organ systems
and does not distinguish patients who are partly respond-
ing, from those whose activity is unchanged or worse.
Both SLEDAI and BILAG have been validated74,75

The GraphRAG Python Package offers a whole host of other useful retrievers covering different GraphRAG patterns (text2cypher, vector and/or full text + Cypher template, etc.). If none of those fit perfectly, you can implement your own custom retriever.

Below we will use the VectorCypherRetriever which allows you to run a graph traversal after finding text chunks.  We will use the Cypher Query language to define the logic to traverse the graph.  

As a simple starting point, let's traverse up to 2 hops out from each chunk and textualize the different relationships we pick up. We will use something called a quantified path pattern to accomplish this.

In [28]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

# Vector search followed by a graph traversal.
# The retrieval query starts from `node` (the vector-search hit) and:
#   1. finds nodes linked to it via FROM_CHUNK and walks 1-2 hops along any
#      relationship that is not FROM_CHUNK (quantified path pattern),
#   2. de-duplicates the traversed relationships,
#   3. re-matches each relationship's start node to the Document(s) it came
#      from (entity -[:FROM_CHUNK]-> chunk -[:FROM_DOCUMENT]-> Document),
#   4. returns one textual `fact` per relationship, including its source paths.
# Note the colon in `[:FROM_DOCUMENT]`: without it, Cypher treats the name as a
# variable and matches relationships of any type.
vc_retriever = VectorCypherRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    retrieval_query="""
MATCH (node)<-[:FROM_CHUNK]-()-[rl:!FROM_CHUNK]-{1,2}()
UNWIND rl AS r
WITH DISTINCT r
MATCH (sourceDoc:Document)<-[:FROM_DOCUMENT]-()<-[:FROM_CHUNK]-(n)-[r]->(m)
WITH n,r,m, apoc.text.join(collect(DISTINCT sourceDoc.path), ', ') AS sources
// return textualize relations
RETURN n.name + '(' + coalesce(n.details, '') + ')'+
    ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' +
    m.name + '(' + coalesce(m.details, '') + ')' + ' [sourced from: ' + sources + ']' AS fact
        """,
)


In [29]:
vc_res = vc_retriever.get_search_results(
    query_text="How do environmental factors influence systemic lupus erythematosus?"
)

# Print every textualized fact returned by the traversal, each preceded by a separator line.
for record in vc_res.records:
    print("====\n" + record.data()['fact'])

====
Systemic lupus erythematosus(An autoimmune disease characterized by the body's immune system attacking its own tissues.) - DESCRIBES(The paper discusses the role of sex hormones in SLE.) -> Sex hormones in acquired immunity and autoimmune disease(A paper discussing the role of sex hormones in immunity and autoimmune diseases.) [sourced from: pgpm-13-39.pdf]
====
Systemic lupus erythematosus(An autoimmune disease characterized by the body's immune system attacking its own tissues.) - DESCRIBES(The paper explores the relationship between stress and SLE.) -> The role of stress in the mosaic of autoimmunity: an overlooked association(A paper exploring the relationship between stress and autoimmunity.) [sourced from: pgpm-13-39.pdf]
====
Systemic lupus erythematosus(An autoimmune disease characterized by the body's immune system attacking its own tissues.) - DESCRIBES(The review discusses the interaction between viruses and SLE.) -> Viruses and autoimmunity: a review on the potential i

In [30]:
# Number of records the vector + Cypher retriever returned for the query above.
len(vc_res.records)

123

## GraphRAG Pipelines
 You can construct GraphRAG pipelines with the `GraphRAG` class.  At minimum, you will need to pass the constructor an LLM and a retriever. You can also pass a custom prompt template, but for now we will just use the default.
 

In [31]:
from neo4j_graphrag.llm import OpenAILLM as LLM
from neo4j_graphrag.generation.graphrag import GraphRAG

# Answer-generation model; temperature 0 is passed through as a model parameter.
llm = LLM(
    model_name="gpt-4o",
    model_params=dict(temperature=0),
)

# GraphRAG pipeline pairing the LLM with the vector + Cypher retriever (default prompt).
vc_rag = GraphRAG(retriever=vc_retriever, llm=llm)

Let's start with a simple question and compare the answers for each.

In [39]:
q = "What organ systems are most commonly affected by lupus?"

# Ask the pipeline, passing top_k=3 to the retriever, then print the generated answer.
vc_response = vc_rag.search(q, retriever_config={'top_k': 3})
print(f"\n\nVector + Cypher Response: \n{vc_response.answer}")



Vector + Cypher Response: 
Systemic Lupus Erythematosus (SLE) is a chronic autoimmune disease that can affect multiple organ systems. The most commonly affected organ systems include:

1. **Skin and Mucous Membranes**: SLE often causes skin rashes, including the characteristic "butterfly rash" on the face, as well as photosensitivity and oral ulcers.

2. **Musculoskeletal System**: Joint pain and arthritis are common symptoms, affecting the joints and muscles.

3. **Renal System**: Lupus nephritis, a significant form of kidney inflammation, develops in up to 50% of SLE patients.

4. **Nervous System**: Neuropsychiatric SLE can affect both the central and peripheral nervous systems, leading to a variety of neurological and psychiatric symptoms.

5. **Cardiovascular System**: SLE can lead to inflammation of the heart and blood vessels, increasing the risk of cardiovascular disease.

6. **Respiratory System**: Inflammation of the lungs and pleura can occur, leading to pleuritis and othe

Now let's try a slightly more complex question.

In [37]:
q = "How does precision medicine help in treating systemic lupus erythematosus (SLE)?"

# Ask the pipeline with its default retriever settings, then print the generated answer.
vc_response = vc_rag.search(q)
print(f"\n\nVector + Cypher Response: \n{vc_response.answer}")



Vector + Cypher Response: 
Precision medicine helps in treating systemic lupus erythematosus (SLE) by tailoring treatment based on individual patient characteristics. This approach involves assessing disease activity using blood markers and other patient-specific factors to customize therapy. Precision medicine aims to achieve clinical and immunological remission, where there is no clinical activity and normal antibody levels. By focusing on the unique aspects of each patient's condition, precision medicine seeks to improve treatment outcomes and manage SLE more effectively.
