In [None]:
# Install required packages using pip
%pip install pypdf langchain_community langchain langchain_openai neo4j_genai langchain_experimental IPython neo4j yfiles_jupyter_graphs yfiles_jupyter_graphs_for_neo4j==1.3.1

## Initialization
This section initializes the notebook by importing necessary libraries and loading environment variables.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
import ast
from IPython.display import clear_output

# Check if running in Google Colab and enable custom widget manager if true
try:
    import google.colab
    from google.colab import output
except ImportError:
    # Not running in Colab: there is nothing to enable.
    pass
else:
    try:
        output.enable_custom_widget_manager()
    except Exception as exc:
        # Stay best-effort (the notebook still runs without the widget manager),
        # but report the failure instead of hiding it; unlike a bare `except:`,
        # this lets KeyboardInterrupt/SystemExit propagate.
        print(f"Could not enable the Colab custom widget manager: {exc}")

In [None]:
# Load environment variables from 'credentials.env' if it exists
if os.path.exists('credentials.env'):
    load_dotenv('credentials.env', override=True)
else:
    print("File 'credentials.env' not found.")

# Read the settings from the process environment in both cases, so the names
# below are always defined (e.g. when the variables were exported by the shell
# or a hosted environment rather than by 'credentials.env').

# Neo4j credentials
uri = os.getenv('NEO4J_URI')
username = os.getenv('NEO4J_USERNAME')
password = os.getenv('NEO4J_PASSWORD')
database = os.getenv('NEO4J_DATABASE')

# OpenAI credentials
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY is not None:
    # os.environ only accepts strings; assigning None would raise TypeError.
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# Report missing settings by name only (never print the secret values).
_missing_settings = [
    name
    for name, value in (
        ('NEO4J_URI', uri),
        ('NEO4J_USERNAME', username),
        ('NEO4J_PASSWORD', password),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
    )
    if not value
]
if _missing_settings:
    print("Missing settings: " + ", ".join(_missing_settings))

## Create Text Chunks and Generate Embeddings
This section splits the PDF document into chunks and generates embeddings for each chunk using OpenAI.

In [None]:
# Configure the recursive splitter; sizes are measured with len(), i.e. in characters
chunk_size, chunk_overlap = 800, 100
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    is_separator_regex=False,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
# Load the PDF page by page and split every page into chunks
doc_path = 'microsoft-blogpost.pdf'

loader = PyPDFLoader(doc_path)
# load() returns the documents produced by the loader without further splitting.
# load_and_split() would first run them through a default text splitter, so the
# "pages" count printed below would be wrong and the text would be split twice
# with two different configurations.
pages = loader.load()

# Flatten the per-page chunks into a single list, preserving document order.
all_chunks = [
    chunk
    for page in pages
    for chunk in text_splitter.split_text(page.page_content)
]

print(f"Parsed: {doc_path}")
print(f"Chunked {len(pages)} pages into {len(all_chunks)} chunks")

In [None]:
# Check the first two chunks to ensure correct splitting.
# zip() stops at the shorter sequence, so a document that yields fewer than two
# chunks prints what exists instead of raising IndexError.
for label, preview_chunk in zip(('First', 'Second'), all_chunks):
    print(f'\n{label} chunk: \n' + preview_chunk)

In [None]:
# Generate embeddings for each chunk using OpenAI model
model = 'text-embedding-3-small'
embeddings_model = OpenAIEmbeddings(
    model = model,
    openai_api_key = OPENAI_API_KEY
)
# embed_documents() embeds the whole list in batched requests and returns one
# vector per input text, in input order; this avoids one API round trip per chunk.
embeddings = embeddings_model.embed_documents(all_chunks)

In [None]:
# Sanity check: show the first chunk next to a slice of its embedding vector
print(all_chunks[0])
print('\nFirst embedding (sample): ', str(embeddings[0][1:100]), sep='\n')

## Write Data to Neo4j
This section shows how to clear the existing database, create document nodes, and link text chunks with their embeddings in Neo4j.

In [None]:
# Open the Neo4j driver, then wipe the database.
# WARNING: the query below deletes every node and relationship it matches.
driver = GraphDatabase.driver(
    uri,
    database=database,
    auth=(username, password),
)
driver.execute_query('MATCH (n) DETACH DELETE n')
print("Database cleared.")

In [None]:
# Create the vector index over Chunk.embedding (skipped if it already exists).
vector_index_query = """
CREATE VECTOR INDEX vecindex IF NOT EXISTS
FOR (m:Chunk)
ON m.embedding
OPTIONS {indexConfig: {
 `vector.dimensions`: 1536,
 `vector.similarity_function`: 'cosine'
}}"""
driver.execute_query(vector_index_query)

In [None]:
# Create a document node for the PDF.
# The name is passed as a query parameter rather than concatenated into the
# Cypher text, so quotes or backslashes in the path cannot break the query.
driver.execute_query('CREATE (d:Document {name: $name})', name=doc_path)
print('Document created.')

In [None]:
# Create chunk nodes and link them to the document node.
# Text and embeddings are sent as query parameters: interpolating the chunk text
# into the Cypher string breaks as soon as a chunk contains a quote or backslash.
assert len(all_chunks) == len(embeddings), "expected exactly one embedding per chunk"

chunk_rows = [
    {'index': index, 'text': chunk, 'embedding': embedding}
    for index, (chunk, embedding) in enumerate(zip(all_chunks, embeddings))
]

# A single UNWIND creates all chunks in one round trip instead of one query per chunk.
driver.execute_query(
    """
    MATCH (d:Document)
    WHERE d.name = $doc_name
    UNWIND $rows AS row
    CREATE (d)-[:HAS_CHUNK]->(c:Chunk)
    SET c.text = row.text
    SET c.embedding = row.embedding
    SET c.index = row.index
    """,
    parameters_={'doc_name': doc_path, 'rows': chunk_rows},
)

print('Embeddings set.')

## Visualize the Graph
Use yFiles Jupyter graphs for Neo4j to visualize the document and its chunks.

In [None]:
# Render the Document -> Chunk structure stored in Neo4j
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget

document_chunks_query = "MATCH (d:Document)-[r]->(c:Chunk) RETURN d,r,c"
widget = Neo4jGraphWidget(driver, context_start_with=None, overview_enabled=False)
widget.show_cypher(document_chunks_query)

## Extract Graph from Text
Use LangChain to extract entities and relationships from the text chunks and build a graph from them.

In [None]:
### Graph Schema
# Node labels and relationship types handed to the graph transformer.
# either one can be set to 'None' to let the model infer data.
allowed_nodes = [
    "Capability",
    "Service",
    "Organization",
    "Department",
    "Industry",
    "DataSource",
    "Person",
    "Article",
]
allowed_relationships = [
    "PROVIDED_BY",
    "HAS_SERVICE",
    "ENABLES_CAPABILITY",
    "USED_BY",
    "USES_DATA_FROM",
    "BELONGS_TO_DEPARTMENT",
    "CONNECTS_TO",
]


In [None]:
# Set up LangChain's LLMGraphTransformer, restricted to the schema defined above
import os
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
llm_transformer = LLMGraphTransformer(
    llm=llm,
    allowed_relationships=allowed_relationships,
    allowed_nodes=allowed_nodes,
)

# Accumulators for the extracted nodes and relationships
nodes, rels = {}, {}

# Define functions to generate unique, reproducible ids for nodes and relationships
import hashlib


def _stable_hash(*parts):
    """Return a signed 64-bit integer derived deterministically from `parts`.

    The built-in hash() is salted per interpreter process for strings, so ids
    based on it change on every kernel restart. SHA-256 gives the same value on
    every run. Each part is length-prefixed before hashing so that different
    splits of the same characters (e.g. "a:b" + "c" versus "a" + "b:c") cannot
    produce the same key. The digest is truncated to 8 bytes and read as a
    signed integer so the result stays within the signed 64-bit range.
    """
    encoded = ''.join(f'{len(part)}:{part}' for part in map(str, parts))
    digest = hashlib.sha256(encoded.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big', signed=True)


def get_node_hash(node):
    """Return a stable integer id for a node, derived from its `id` and `type`."""
    return _stable_hash(node.id, node.type)


def get_rel_hash(rel):
    """Return a stable integer id for a relationship.

    The id is derived from the source node's id/type, the relationship type and
    the target node's id/type.
    """
    return _stable_hash(
        rel.source.id, rel.source.type, rel.type, rel.target.id, rel.target.type
    )

# Run the transformer on one chunk at a time and fold the results into `nodes` / `rels`
for index, chunk in enumerate(all_chunks):
    graph_document = llm_transformer.convert_to_graph_documents(
        [Document(page_content=chunk)]
    )[0]

    # Nodes: the first sighting creates the record; every sighting records the chunk index
    for node in graph_document.nodes:
        node_key = get_node_hash(node)
        node_record = nodes.setdefault(
            node_key,
            {'id': node_key, 'name': node.id, 'label': node.type, 'chunks': []},
        )
        node_record['chunks'].append(index)

    # Relationships: same bookkeeping, with endpoints stored as node hashes
    for rel in graph_document.relationships:
        rel_key = get_rel_hash(rel)
        if rel_key not in rels:
            rels[rel_key] = {
                'id': rel_key,
                'source': get_node_hash(rel.source),
                'target': get_node_hash(rel.target),
                'type': rel.type,
                'chunks': [],
            }
        rels[rel_key]['chunks'].append(index)

    print(f"Loaded chunk {index+1}/{len(all_chunks)} Current nodes: {len(nodes)}, relationships: {len(rels)}...")

print('Complete')

In [None]:
# Write nodes to Neo4j and link them to the corresponding chunks.
# All values travel as query parameters, so names containing quotes cannot break
# (or inject into) the Cypher text. The label is still interpolated into the
# query text, so it is backtick-quoted with embedded backticks doubled.
## TODO -> batching (e.g. one UNWIND per label) would reduce round trips further.
for node in nodes.values():
    quoted_label = '`' + str(node['label']).replace('`', '``') + '`'
    driver.execute_query(
        f"""
        CREATE (n:{quoted_label})
        SET n.name = $name
        SET n.id = $id
        WITH n
        UNWIND $chunks AS chunk_index
        MATCH (c:Chunk)
        WHERE c.index = chunk_index
        CREATE (n)-[:IN]->(c)
        """,
        parameters_={
            'name': node['name'],
            # Stored as a string, as before; the relationship cell matches on it.
            'id': str(node['id']),
            'chunks': node['chunks'],
        },
    )

print('Nodes created and linked to chunks.')

In [None]:
# Write relationships to Neo4j.
# Values are passed as query parameters instead of being formatted into the
# Cypher text. The relationship type is still interpolated into the query text,
# so it is backtick-quoted with embedded backticks doubled.
## TODO -> batching would reduce round trips further.
for rel in rels.values():
    quoted_type = '`' + str(rel['type']).replace('`', '``') + '`'
    driver.execute_query(
        f"""
        MATCH (n), (m)
        WHERE n.id = $source AND m.id = $target
        CREATE (n)-[r:{quoted_type}]->(m)
        SET r.chunks = $chunks
        SET r.id = $id
        """,
        parameters_={
            # Node ids are stored as strings, so compare against strings here.
            'source': str(rel['source']),
            'target': str(rel['target']),
            'chunks': rel['chunks'],
            # The relationship id stays an integer, as in the original query.
            'id': rel['id'],
        },
    )

print('Relationships created.')

In [None]:
# Show the extracted entity graph, leaving out the IN / HAS_CHUNK bookkeeping edges
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget

entity_graph_query = "MATCH (d)-[r]->(c) WHERE type(r) <> 'IN' AND type(r) <> 'HAS_CHUNK' RETURN d,r,c"
widget = Neo4jGraphWidget(driver, context_start_with=None, overview_enabled=False)
widget.show_cypher(entity_graph_query)

## Time for Querying
We now demonstrate how to use both regular VectorRAG and GraphRAG to query the database:
- The VectorRAG implementation uses only the chunked text and their embeddings.
- The GraphRAG implementation also uses the context around the documents (extracted entities and their relationships) to generate an answer.

In [None]:
from neo4j import GraphDatabase
from neo4j_genai.retrievers import VectorRetriever, VectorCypherRetriever
from neo4j_genai.llm import OpenAILLM
from neo4j_genai.generation import GraphRAG
# Imported under an alias: the notebook already imports a different class named
# OpenAIEmbeddings from langchain_openai. Rebinding that name here would make the
# embedding-generation cell use the wrong class if it is re-run after this cell.
from neo4j_genai.embeddings.openai import OpenAIEmbeddings as Neo4jOpenAIEmbeddings

index_name = "vecindex"
### Set up the RAG framework

# 1. Connect to Neo4j database
driver = GraphDatabase.driver(uri, auth=(username, password), database=database)

# 2. Create Embedder object, needed to convert the user question (text) to a vector
embedder = Neo4jOpenAIEmbeddings(model="text-embedding-3-small")

# 3. LLM
llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})

In [None]:
# Vector RAG: retrieval goes through the vector index only
retriever = VectorRetriever(driver=driver, index_name=index_name, embedder=embedder)
# Build the RAG pipeline (note: this is just using plain vector RAG without context)
rag = GraphRAG(llm=llm, retriever=retriever)

# Ask the question, retrieving the 3 best-matching items
query_text = "Which Microsoft service can Reps on the road use?"
response = rag.search(
    query_text,
    retriever_config={"top_k": 3},
)
print(response.answer)

In [None]:
# Context-aware GraphRAG: the retrieval query below runs on the vector search
# results (`node`, `score`) and returns the linked entities and their relationships.
neighbourhood_query = """
    // Retrieve Neighbourhood (Context)
    MATCH path=(node)<-[r:IN]-(node2)
    OPTIONAL MATCH (node2)-[r2]-(node3) 
    WHERE type(r2) <> "IN"
    RETURN node.index as chunkindex, node2.id as sourceid, node2.name as source, 
    toString(r2.id) as relid, type(r2) as reltype, 
    node3.id as targetid, node3.name as targetname, 
    score"""

retriever = VectorCypherRetriever(
    driver,
    index_name=index_name,
    retrieval_query=neighbourhood_query,
    embedder=embedder,
)

# Build the RAG pipeline on top of the graph-aware retriever
rag = GraphRAG(llm=llm, retriever=retriever)

# Ask the same question; return_context=True keeps the retrieved items on the response
response = rag.search(
    query_text,
    retriever_config={"top_k": 3},
    return_context=True,
)
print(response.answer)

In [None]:
import re

# Regex for the key=value pairs found in each retrieved item's text content
pattern = re.compile(r"(\w+\.\w+|type\(r2\)|\w+)=('[^']*'|\d+\.\d+|\d+|[^' \n]+)")

# One dict per retrieved item: key -> value with single quotes stripped
used_context = [
    {key: value.replace("'", "") for key, value in pattern.findall(item.content)}
    for item in response.retriever_result.items
]

In [None]:
# Show the chunk -> entity -> entity paths that match the retrieved context rows
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget

used_context_query = """
UNWIND $context as row
MATCH (c:Chunk)-[r1]-(n)-[r2]-(m) 
WHERE c.index = toInteger(row['chunkindex']) AND n.id = row['sourceid'] AND r2.id = toInteger(row['relid']) AND m.id = row['targetid']
RETURN c, r1, n, r2, m

"""
widget = Neo4jGraphWidget(driver, context_start_with=None, overview_enabled=False)
widget.show_cypher(used_context_query, context=used_context)