In [1]:
# Install required packages using pip
%pip install pypdf langchain_community langchain langchain_openai neo4j_genai langchain_experimental IPython neo4j yfiles_jupyter_graphs yfiles_jupyter_graphs_for_neo4j==1.3.1

Collecting openai<2.0.0,>=1.32.0 (from langchain_openai)
  Using cached openai-1.40.6-py3-none-any.whl.metadata (22 kB)
Using cached openai-1.40.6-py3-none-any.whl (361 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.0
    Uninstalling openai-0.28.0:
      Successfully uninstalled openai-0.28.0
Successfully installed openai-1.40.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Initialization
This section initializes the notebook by importing necessary libraries and loading environment variables.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
import ast

# Check if running in Google Colab and enable the custom widget manager if so.
# Only a failed import means "not in Colab"; a bare `except:` would also hide
# KeyboardInterrupt and unrelated errors.
try:
    import google.colab
    from google.colab import output
except ImportError:
    pass  # Not running in Colab: nothing to enable.
else:
    try:
        output.enable_custom_widget_manager()
    except Exception as exc:
        # Best-effort: the notebook still runs without the widget manager,
        # but the graph widgets may not render.
        print(f"Could not enable the Colab custom widget manager: {exc}")

In [3]:
# Load environment variables from 'credentials.env' if it exists; otherwise fall back to
# whatever is already set in the process environment.
if os.path.exists('credentials.env'):
    load_dotenv('credentials.env', override=True)
else:
    print("File 'credentials.env' not found.")

# Neo4j credentials
uri = os.getenv('NEO4J_URI')
username = os.getenv('NEO4J_USERNAME')
password = os.getenv('NEO4J_PASSWORD')
database = os.getenv('NEO4J_DATABASE')

# OpenAI credentials.
# The value is read from os.environ, so writing it back is unnecessary (and assigning
# None to os.environ raises TypeError when the key is missing).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Report missing settings here instead of failing later with a NameError/TypeError.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ('NEO4J_URI', uri),
        ('NEO4J_USERNAME', username),
        ('NEO4J_PASSWORD', password),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
    )
    if not setting_value
]
if _missing_settings:
    print("Missing configuration values: " + ", ".join(_missing_settings))

## Create Chunk Nodes and Embeddings
Load the predefined chunks from the JSON file and compute an embedding for each chunk. The chunks are written to the graph in the next section.

In [4]:
# Configure the recursive character splitter: target chunk length and the overlap
# between consecutive chunks, both measured with len().
chunk_size = 800
chunk_overlap = 100
_splitter_options = {
    'chunk_size': chunk_size,
    'chunk_overlap': chunk_overlap,
    'length_function': len,
    'is_separator_regex': False,
}
text_splitter = RecursiveCharacterTextSplitter(**_splitter_options)

In [5]:
import json

# Location of the JSON export holding the predefined chunks
file_path = 'data_autobahn/D83.json'

# Read the raw records, then keep only the fields used downstream under normalised keys
with open(file_path, 'r', encoding='utf-8') as json_file:
    raw_records = json.load(json_file)

all_chunks = []
for record in raw_records:
    all_chunks.append({
        'group': record['Group'],
        'text': record["Concatenated Text"],
        'unit': record['unit'],
        'amount': record['amount'],
    })

In [6]:
# Work on the first ten chunks only, to keep the demo small
all_chunks = all_chunks[:10]

# Print the text of the first two chunks as a sanity check of the loaded data
first_chunk_text = all_chunks[0]['text']
second_chunk_text = all_chunks[1]['text']
print('\nFirst chunk: \n' + first_chunk_text)
print('\nSecond chunk: \n' + second_chunk_text)


First chunk: 
Sämtliche Verkehrsführungspläne einschließlich der kompletten Beschilderung (jeweils 22-fach in Papierform) erstellen. Grundlagen bilden die Baubeschreibung, das Verkehrsführungskonzept des AG, die Regelpläne der RSA und die ZTV-SA. Für jede der Bauphasen ist jeweils ein gesonderter Verkehrsführungsplan zu erstellen. Für die Verkehrsanordnung vor Baubeginn ist folgendes zu beachten: Die gesamten Unterlagen sind spätestens sechs Werktage nach Auftragserteilung zur Genehmigung vorzulegen. Gilt für BA 2.

Second chunk: 
Sämtliche Verkehrsführungspläne einschließlich der kompletten Beschilderung (jeweils 22-fach in Papierform) erstellen. Grundlagen bilden die Baubeschreibung, das Verkehrsführungskonzept des AG, die Regelpläne der RSA und die ZTV-SA. Für jede der Bauphasen ist jeweils ein gesonderter Verkehrsführungsplan zu erstellen. Für die Verkehrsanordnung vor Baubeginn ist folgendes zu beachten: Die gesamten Unterlagen sind spätestens sechs Werktage nach Auftragserteilun

In [7]:
# Embed the text of every chunk with the OpenAI embedding model (one request per chunk)
model = 'text-embedding-3-small'
embeddings_model = OpenAIEmbeddings(model=model, openai_api_key=OPENAI_API_KEY)
embeddings = [embeddings_model.embed_query(chunk['text']) for chunk in all_chunks]

In [8]:
# Show the first chunk next to a slice of its embedding vector as a sanity check
first_chunk = all_chunks[0]
embedding_sample = embeddings[0][1:100]
print(first_chunk)
print('\nFirst embedding (sample): \n' + str(embedding_sample))

{'group': '00000001', 'text': 'Sämtliche Verkehrsführungspläne einschließlich der kompletten Beschilderung (jeweils 22-fach in Papierform) erstellen. Grundlagen bilden die Baubeschreibung, das Verkehrsführungskonzept des AG, die Regelpläne der RSA und die ZTV-SA. Für jede der Bauphasen ist jeweils ein gesonderter Verkehrsführungsplan zu erstellen. Für die Verkehrsanordnung vor Baubeginn ist folgendes zu beachten: Die gesamten Unterlagen sind spätestens sechs Werktage nach Auftragserteilung zur Genehmigung vorzulegen. Gilt für BA 2.', 'unit': 'Psch', 'amount': '00000001000'}

First embedding (sample): 
[0.020278731361031532, 0.04067584127187729, -0.07306499034166336, -0.02708566002547741, -0.006173589266836643, 0.009002904407680035, -0.04088892787694931, -0.027061983942985535, 0.010358370840549469, 0.015768399462103844, -0.05355573818087578, 0.06013774126768112, -0.004317960701882839, 0.010908843949437141, -0.061132144182920456, -0.011518508195877075, 0.023510541766881943, -0.0145609099

## Write Data to Neo4j
This section shows how to clear the existing database, create document nodes, and link text chunks with their embeddings in Neo4j.

In [9]:
# Open a driver against the configured Neo4j instance and delete every node
# (together with its relationships) so the notebook starts from an empty graph.
driver = GraphDatabase.driver(
    uri,
    auth=(username, password),
    database=database,
)
wipe_graph_query = 'MATCH (n) DETACH DELETE n'
driver.execute_query(wipe_graph_query)
print("Database cleared.")

Database cleared.


In [10]:
# Vector index over Chunk.embedding, used by the retrievers further down.
# The dimension configured here has to match the embedding model's output size.
create_vector_index_query = """
CREATE VECTOR INDEX vecindex IF NOT EXISTS
FOR (m:Chunk)
ON m.embedding
OPTIONS {indexConfig: {
 `vector.dimensions`: 1536,
 `vector.similarity_function`: 'cosine'
}}"""
driver.execute_query(create_vector_index_query)
print("Vector index created")

Vector index created


In [11]:
# Property indexes backing the lookups on Chunk.index, Unit.name and Amount.value
property_index_queries = (
    """CREATE INDEX IF NOT EXISTS FOR (n:Chunk) ON (n.index) """,
    """CREATE INDEX IF NOT EXISTS FOR (n:Unit) ON (n.name) """,
    """CREATE INDEX IF NOT EXISTS FOR (n:Amount) ON (n.value) """,
)
for index_query in property_index_queries:
    driver.execute_query(index_query)
print("Indexes created.")

Indexes created.


In [12]:
# Create a document node for the source JSON file.
# The file name is passed as a query parameter rather than concatenated into the Cypher
# text, so quotes or backslashes in the path cannot break (or inject into) the query.
driver.execute_query('CREATE (d:Document{name:$name})', {'name': file_path})
print('Document created.')

Document created.


In [13]:

# One Unit node per distinct unit name (MERGE makes repeated values a no-op)
merge_unit_query = """
    MERGE (:Unit{name:$unit})
    """
for chunk in all_chunks:
    driver.execute_query(merge_unit_query, {'unit': chunk['unit']})

# One Amount node per distinct value; the raw amount is converted with toFloat and divided by 1000
merge_amount_query = """
    MERGE (:Amount{value:toFloat($amount)/1000.0})
    """
for chunk in all_chunks:
    driver.execute_query(merge_amount_query, {'amount': chunk['amount']})

In [14]:
# Create chunk nodes and link them to the document node.
# Every value is passed as a query parameter: interpolating the file path, group or
# embedding into the Cypher text breaks on quotes and allows Cypher injection.
for index, chunk in enumerate(all_chunks):
    driver.execute_query("""
    MATCH (d:Document)
    WHERE d.name = $doc_name
    CREATE (d)-[:HAS_CHUNK]->(c:Chunk)
    SET c.text = $text
    SET c.group = $group
    SET c.embedding = $embedding
    SET c.index = $index
    """, {
        'doc_name': file_path,
        'text': chunk['text'],
        'group': chunk['group'],
        'embedding': embeddings[index],  # embeddings[i] belongs to all_chunks[i]
        'index': index,
    })

print('Chunks created and embeddings set.')

Chunks created and embeddings set.


In [15]:
# Link every chunk to its Amount and Unit nodes.
# The Amount is matched with the same toFloat($amount)/1000.0 expression used when the
# Amount nodes were merged, so both sides resolve to the same value.
for index, chunk in enumerate(all_chunks):
    driver.execute_query("""
    MATCH (c:Chunk), (a:Amount), (u:Unit)
    WHERE c.index = $index AND a.value = toFloat($amount)/1000.0 AND u.name = $unit
    CREATE (c)-[:HAS_AMOUNT]->(a)
    CREATE (c)-[:HAS_UNIT]->(u)
    """, {'index': index, 'amount': chunk['amount'], 'unit': chunk['unit']})

print('Chunks linked to amounts and units')

Chunks linked to amounts and units


## Visualize the Graph
Use yFiles Jupyter graphs for Neo4j to visualize the document and its chunks.

In [16]:
# Render the document, its chunks and their direct neighbours with the yFiles widget
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget


def _chunk_label_style(node):
    """Return the label styling for a Chunk node, using the chunk text as the label."""
    return {
        'text': node['properties']['text'],
        'fontSize': 20,
        'maximumWidth': 500,
        'wrapping': 'word',
        'textAlignment': 'center'
    }


widget = Neo4jGraphWidget(driver, overview_enabled=False, context_start_with=None)
widget.add_node_configuration('Chunk', text=_chunk_label_style)
document_chunks_query = "MATCH path=(d:Document)-[r]->(c:Chunk)-[]->() RETURN path LIMIT 20"
widget.show_cypher(document_chunks_query)

GraphWidget(layout=Layout(height='680px', width='100%'))

## Extract Graph from Text
First, clear any previously extracted entities and relationships, in case this workflow has been run before.

In [17]:
# Reconnect to Neo4j, then remove only what the extraction step produced:
# first every relationship that is not part of the base chunk graph, then every node
# whose first label is not one of the base labels.
driver = GraphDatabase.driver(uri, auth=(username, password), database=database)

delete_extracted_relationships_query = """MATCH ()-[r]->() 
WHERE NOT type(r) IN ["HAS_UNIT","HAS_AMOUNT","HAS_CHUNK"]
DELETE r"""
delete_extracted_nodes_query = """MATCH (n)
WHERE NOT labels(n)[0] IN ["Chunk", "Document", "Amount", "Unit"]
DELETE n"""

# Relationships must go first: a plain DELETE fails on nodes that still have relationships.
driver.execute_query(delete_extracted_relationships_query)
driver.execute_query(delete_extracted_nodes_query)
print("Cleared all extracted entities.")

Cleared all extracted entities.


## Define Prompts for Entity Extraction
We customize the LangChain method code to allow for an annotated schema.
The idea is to merge these two approaches together:

https://api.python.langchain.com/en/latest/_modules/langchain_experimental/graph_transformers/llm.html#LLMGraphTransformer

https://github.com/neo4j-partners/neo4j-generative-ai-azure/blob/main/ingestion/ingestion.ipynb


In [18]:
%pip install openai==0.28
%pip install python-dotenv 
%pip install retry

Collecting openai==0.28
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.28.0-py3-none-any.whl (76 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.40.6
    Uninstalling openai-1.40.6:
      Successfully uninstalled openai-1.40.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.1.16 requires openai<2.0.0,>=1.32.0, but you have openai 0.28.0 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;

In [19]:
from retry import retry
import openai
from openai import OpenAI

client = OpenAI()


# Helper function for calling LLM with custom prompt.
@retry(tries=2, delay=5)
def call_llm(system, prompt):
    """Send a system + user message pair to gpt-4o and return the reply text.

    Args:
        system: Content of the system message.
        prompt: Content of the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    chat_messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,  # Try to be as deterministic as possible
        messages=chat_messages,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [20]:
# System prompt for knowledge-graph extraction. The string is assembled from adjacent
# literals, so every markdown bullet/heading needs its own trailing "\n" and every
# sentence break its own space; a missing one silently fuses two instructions.
system_prompt = (
    "# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph.\n"
    "Try to capture as much information from the text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text.\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    # Fix: terminate the bullet so "- **Node IDs**" starts on its own line.
    "like 'mathematician' or 'scientist'.\n"
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    # Fix: trailing space so the sentence does not read '"he"),always'.
    'but is referred to by different names or pronouns (e.g., "Joe", "he"), '
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination.")

In [21]:
# User-prompt template for entity/relationship extraction; `$text` is filled in with
# string.Template. The example output must itself be valid JSON (no trailing comma),
# because the model tends to mirror the example verbatim and the reply is parsed with
# json.loads.
template_prompt="""From the technical description of a construction task below, extract the following entities & relationships in the format described. The text is in German.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `name` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    
    Entity Types:
    
    label:'Unit',name:string // A unit of measurement, e.g. KG, M, Psc
    label:'Amount',unit:str, value:float // A numeric value with a unit.
    label:'Option',name:string // An option that is selected as part of the task.
    label:'Material',name:string // A material that is used in the task. E.g. metal, concrete, powder.
    label:'Norm',name:string // A norm or standard this is applied in this task.
    label:'Dimension',name:string // A dimension relevant to the task.
    label:'Relation',position_id:string // A relation to another task. For example, they may refer to the previous task. This should be embedded here.
    
2. Next, generate relationships between the entities. To refer the head (start) and tail (end) entity, use their respective `name` property. 
The relationship types can only be generated between pairs of node labels as specified here:
    
    Relationship Types:
    from:'Norm',to:'Option',relationship_type:'HAS_OPTION'


The output should look like this. IT MUST BE VALID JSON:
{
    "entities": [{"label":"Option","name":"Yellow Line"}],
    "relationships": [{"start_label":"Option", "start_name": "Yellow Line", "relationship_type":"RELATED_TO","end_label":"Option", "end_name": "Red Line"}]
}

For the following text, extract entities and relations:
$text
"""

In [22]:
from string import Template

def run_completion(text):
    """Extract entities and relationships from one chunk of text via the LLM.

    Args:
        text: The chunk text substituted into `template_prompt`.

    Returns:
        A dict with at least the keys 'entities' and 'relationships' (both lists).
        If the LLM call or the JSON parsing fails, the error is printed and both lists
        are empty, so callers can index the result without a None check.
    """
    try:
        prompt = Template(template_prompt).substitute(text=text)
        res = call_llm(system_prompt, prompt)
        cleaned = (
            res.replace('```json', '')   # strip a markdown code fence around the reply
               .replace('```', '')
               .replace("\\'", "'")      # \' is not a legal JSON escape; json.loads rejects it
        )
        parsed = json.loads(cleaned)
    except Exception as e:
        print(e)
        return {'entities': [], 'relationships': []}

    if not isinstance(parsed, dict):
        print(f"Unexpected LLM output (expected a JSON object): {type(parsed).__name__}")
        return {'entities': [], 'relationships': []}

    # Make sure both keys exist, even when the model omitted one of them.
    parsed.setdefault('entities', [])
    parsed.setdefault('relationships', [])
    return parsed

# Run the extraction once per chunk; chunk_entities[i] holds the result for all_chunks[i]
chunk_entities = []
for index, chunk in enumerate(all_chunks):
    extraction_result = run_completion(chunk['text'])
    chunk_entities.append(extraction_result)
    print(f"Loaded chunk {index+1}/{len(all_chunks)}")
    

Loaded chunk 1/10
Loaded chunk 2/10
Loaded chunk 3/10
Loaded chunk 4/10
Loaded chunk 5/10
Loaded chunk 6/10
Loaded chunk 7/10
Loaded chunk 8/10
Loaded chunk 9/10
Loaded chunk 10/10


In [24]:

# Write nodes to Neo4j and link them to the corresponding chunks.
# chunk_entities[i] is the extraction result for the chunk with index i.
## TODO -> this could be batched into a single query over all chunks.
for index, chunk_obj in enumerate(chunk_entities):
    # A failed extraction yields None (or a result without entities): skip it instead
    # of crashing on chunk_obj['entities'].
    entities = (chunk_obj or {}).get('entities') or []
    if not entities:
        continue
    # MERGE (not CREATE) for the IN relationship, so re-running this cell does not
    # duplicate the links between entities and chunks.
    driver.execute_query("""
    UNWIND $entities as entity
    CALL apoc.merge.node([entity.label],entity) YIELD node as n
    WITH n
    MATCH (c:Chunk)
    WHERE c.index = $index
    MERGE (n)-[:IN]->(c)
    """, {'index': index, 'entities': entities})

print('Nodes created and linked to chunks.')

Nodes created and linked to chunks.


In [27]:
# Write relationships to Neo4j and link them to the corresponding nodes.
# Start and end nodes are merged by label + name, so a relationship may reference a node
# that was not listed among the extracted entities.
## TODO -> this could be batched into a single query over all chunks.
for chunk_obj in chunk_entities:
    # A failed extraction yields None (or a result without relationships): skip it
    # instead of crashing on chunk_obj['relationships'].
    relationships = (chunk_obj or {}).get('relationships') or []
    if not relationships:
        continue
    driver.execute_query("""
    UNWIND $relationships as rel
    CALL apoc.merge.node([rel.start_label],{name:rel.start_name}) YIELD node as n
    WITH rel, n
    CALL apoc.merge.node([rel.end_label],{name:rel.end_name}) YIELD node as n2
    WITH rel, n, n2
    CALL apoc.merge.relationship(n, rel.relationship_type, {}, {}, n2)
    YIELD rel as created
    RETURN created
    """, {'relationships': relationships})

print('Rels created and linked to nodes.')

Rels created and linked to nodes.


In [29]:
# Show the extracted graph, leaving out the structural HAS_CHUNK / HAS_UNIT / IN relationships
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget

extracted_graph_query = "MATCH (d)-[r]->(c) WHERE NOT type(r) in ['HAS_CHUNK','HAS_UNIT','IN'] RETURN d,r,c LIMIT 2000"
widget = Neo4jGraphWidget(driver, overview_enabled=False, context_start_with=None)
widget.show_cypher(extracted_graph_query)

GraphWidget(layout=Layout(height='800px', width='100%'))

## Time for Querying
We now demonstrate how to query the database with both plain VectorRAG and GraphRAG:
- The VectorRAG implementation uses only the chunked text and their embeddings.
- The GraphRAG implementation also uses the context around the documents (extracted entities and their relationships) to generate an answer.

In [30]:
from neo4j import GraphDatabase
from neo4j_genai.retrievers import VectorRetriever, VectorCypherRetriever
from neo4j_genai.llm import OpenAILLM
from neo4j_genai.generation import GraphRAG
from neo4j_genai.embeddings.openai import OpenAIEmbeddings

### Set up the RAG framework

# Name of the vector index created earlier on Chunk.embedding
index_name = "vecindex"

# 1. Connect to Neo4j database
driver = GraphDatabase.driver(
    uri,
    auth=(username, password),
    database=database,
)

# 2. Embedder: turns the user question (text) into a vector for the index lookup
embedder = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. LLM that writes the final answer; temperature 0 to keep answers as stable as possible
llm_model_params = {"temperature": 0}
llm = OpenAILLM(model_name="gpt-4o", model_params=llm_model_params)

In [31]:
# Question passed to rag.search() in both the VectorRAG and the GraphRAG cells below.
query_text = "how much Bitumenemulsion do we need to spray per m2, and what is the Bindemittel code"

## VectorRAG
Illustrating RAG without using the graph...

In [32]:
# Vector RAG: retrieve the top-k chunks by embedding similarity only
retriever = VectorRetriever(driver, index_name, embedder)

# The GraphRAG pipeline class is reused here, but with a plain vector retriever it
# performs ordinary vector RAG without any graph context.
rag = GraphRAG(retriever=retriever, llm=llm)

vector_retriever_config = {"top_k": 3}
response = rag.search(query_text, retriever_config=vector_retriever_config)
print("Response with VectorRAG:")
print(response.answer)

Response with VectorRAG:
The provided context does not contain information about the amount of Bitumenemulsion needed per m² or the Bindemittel code. Please provide additional details or context related to Bitumenemulsion and Bindemittel for a more accurate answer.


## GraphRAG
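Now the same question is answered using the graph context: the entities extracted from each retrieved chunk and the relationships between them.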

In [39]:
# Now, context-aware GraphRAG.
# For each chunk returned by the vector index (bound as `node`, with its `score`), the
# retrieval query walks to the entities linked to it via IN and on to their neighbours.
neighbourhood_query = """
    // Retrieve Neighbourhood (Context)
    MATCH path=(node)<-[r:IN]-(node2)
    OPTIONAL MATCH (node2)-[r2]-(node3) 
    WHERE type(r2) <> "IN"
    RETURN 
        node.group as group,
        node.index as chunkindex, 
        //node.text as chunktext,

        labels(node2)[0] as sourcelabel,
        node2.name as source, 
        
        type(r2) as reltype, 
        
        labels(node3)[0] as destlabel,
        node3.name as dest, 
        
        
        score"""

retriever = VectorCypherRetriever(
    driver,
    index_name=index_name,
    retrieval_query=neighbourhood_query,
    embedder=embedder,
)

# Initialize the RAG pipeline
rag = GraphRAG(retriever=retriever, llm=llm)

# Query the graph; return_context=True keeps the retrieved rows on the response so
# they can be inspected afterwards
graph_retriever_config = {"top_k": 3}
response = rag.search(
    query_text,
    retriever_config=graph_retriever_config,
    return_context=True,
)
print("Response with GraphRAG:")
print(response.answer)

Response with GraphRAG:
The context provided does not contain specific information about the amount of Bitumenemulsion needed to spray per m² or the Bindemittel code. You may need to refer to specific project guidelines, technical specifications, or consult with a professional in the field for accurate details.


In [42]:
import re
# Regex for the key=value pairs in the textual form of each retrieved record
pattern = re.compile(r"(\w+\.\w+|type\(r2\)|\w+)=('[^']*'|\d+\.\d+|\d+|[^' \n]+)")

# One dict per retrieved item: every key=value match becomes an entry, with single
# quotes stripped from the value (all values stay strings).
used_context = [
    {key: value.replace("'", "") for key, value in pattern.findall(item.content)}
    for item in response.retriever_result.items
]

In [43]:
# Display the parsed retriever context: a list with one dict per retrieved item.
used_context

[{'group': '00010004',
  'chunkindex': '6',
  'sourcelabel': 'Option',
  'source': 'Verkehrssicherungsmaßnahmen',
  'reltype': 'HAS_OPTION',
  'destlabel': 'Option',
  'dest': 'LED-Anzeigetafel',
  'score': '0.6960674524307251'},
 {'group': '00010004',
  'chunkindex': '6',
  'sourcelabel': 'Option',
  'source': 'Verkehrssicherungsmaßnahmen',
  'reltype': 'HAS_OPTION',
  'destlabel': 'Option',
  'dest': 'Markierung',
  'score': '0.6960674524307251'},
 {'group': '00010004',
  'chunkindex': '6',
  'sourcelabel': 'Option',
  'source': 'Verkehrssicherungsmaßnahmen',
  'reltype': 'HAS_OPTION',
  'destlabel': 'Option',
  'dest': 'Lichtsignalanlage',
  'score': '0.6960674524307251'},
 {'group': '00010004',
  'chunkindex': '6',
  'sourcelabel': 'Option',
  'source': 'Verkehrssicherungsmaßnahmen',
  'reltype': 'HAS_OPTION',
  'destlabel': 'Option',
  'dest': 'Leitelement',
  'score': '0.6960674524307251'},
 {'group': '00010004',
  'chunkindex': '6',
  'sourcelabel': 'Option',
  'source': 'Verkeh

In [44]:
# Show the part of the graph behind the retrieved context: for every context row, the
# chunk with that index, the nodes pointing at it, and their non-IN neighbours.
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget

context_subgraph_query = """
UNWIND $context as row
MATCH (c:Chunk)<-[r1]-(n)-[r2]-(m) 
WHERE c.index = toInteger(row['chunkindex']) AND type(r2) <> "IN"
RETURN c, r1, n, r2, m LIMIT 500
"""
widget = Neo4jGraphWidget(driver, overview_enabled=False, context_start_with=None)
widget.show_cypher(context_subgraph_query, context=used_context)

GraphWidget(layout=Layout(height='800px', width='100%'))