In [1]:
import os
import time
from dotenv import load_dotenv
import textwrap
from pydantic import BaseModel, Field

from langchain.chains import GraphCypherQAChain

from langchain_community.graphs import Neo4jGraph
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Neo4jVector
from langchain_community.graphs import Neo4jGraph
from langchain_community.document_loaders import PDFPlumberLoader

from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document

from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts.prompt import PromptTemplate

from langchain_experimental.text_splitter import SemanticChunker
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Warning control: suppress every warning category for the whole kernel session.
# NOTE(review): a blanket "ignore" presumably also hides library deprecation
# notices — consider narrowing the filter while developing.
import warnings
warnings.filterwarnings("ignore")

# Load environment variables (the NEO4J_* settings read below via os.getenv
# are expected to be provided this way).
load_dotenv()

True

# Connect to the Neo4j Database

In [35]:
# Read the Neo4j connection settings from the environment.
# Note that the URI is taken from the variable named NEO4J_URL.
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE = (
    os.getenv(env_key)
    for env_key in ("NEO4J_URL", "NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_DATABASE")
)

# Open the connection used by every graph query in this notebook.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [36]:
# Disabled: PDF ingestion (load the PDF, then split it with SemanticChunker).
# The notebook currently works on the short inline sample text defined in a later cell.
# loader = PDFPlumberLoader("data/ISBANK2023.pdf")
# docs = loader.load()
# text_splitter = SemanticChunker(HuggingFaceEmbeddings())
# documents = text_splitter.split_documents(docs)
# # Check the number of pages
# print("Number of pages in the PDF:",len(docs))
# print("Number of documents after chunking:",len(documents))

# # # Merge multiple documents into a single input for unified graph generation
# # combined_content = "\n\n".join([doc.page_content for doc in documents[50:100]])
# # combined_content = [Document(page_content=combined_content)]


In [37]:
# Disabled: clean-up of the PDF chunks (only meaningful when the PDF loading cell is enabled).
# Strip newlines from every chunk that contains one.
# for doc in documents:
#     if "\n" in doc.page_content:
#         print(doc)
#         doc.page_content = doc.page_content.replace("\n", "")

# Drop chunks of 10 characters or fewer.
# NOTE: calling `documents.pop(i)` while enumerating the same list skips the
# element that follows each removal, so filter into a new list instead:
# documents = [doc for doc in documents if len(doc.page_content) > 10]

# print("Number of pages in the PDF:",len(docs))
# print("Number of documents after chunking:",len(documents))

In [51]:
# Alternative sample describing an invented person. It is NOT used in this cell
# (`documents` below is built from `text`).
# NOTE(review): presumably meant to be swapped in for `text` to check that answers
# come from the ingested graph rather than from the model's own knowledge — confirm.
fake_text = """
Umut Can Gulsen, born in 1998, was a Turkish and naturalised-French physicist and chemist who conducted pioneering research on quantum physics.
He was the first man to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields. His husband, Pierre Gulsen, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
He was, in 200, the first man to become a professor at the University of Paris. 
"""

# Sample passage that is wrapped into a Document and converted into a graph below.
text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris. 
"""
# Convert the text into documents (a single-element list holding the whole passage)
documents = [Document(page_content=text)]

In [52]:
# Initialize the language model for text-to-graph conversion.
# temperature=0 is requested so that extraction is as repeatable as possible.
llm = ChatOllama(model="llama3", temperature=0)
# Only `llm` is passed: no node or relationship restrictions are configured here,
# despite the "_filtered" suffix in the variable name.
llm_transformer_filtered = LLMGraphTransformer(llm=llm)

In [53]:
# Disabled: per-page conversion of the first 10 PDF pages.
# Requires `docs` from the (also disabled) PDF loading cell.
# # Convert multiple text documents into graph structures
# graph_documents = []
# for doc in docs[:10]:  # Iterate over multiple documents
#     graph_documents.extend(llm_transformer_filtered.convert_to_graph_documents([doc]))


In [54]:
# Convert the text into graph documents.
# Each resulting item exposes `.nodes` and `.relationships` (inspected in the next cells).
graph_documents = llm_transformer_filtered.convert_to_graph_documents(documents)

In [55]:
# Display the raw graph documents (the repr can be very long).
graph_documents

[GraphDocument(nodes=[Node(id='1867', type='Year', properties={}), Node(id='Curie family', type='Family', properties={}), Node(id='Marie Curie', type='Person', properties={}), Node(id='Nobel Prize', type='Award', properties={}), Node(id='University of Paris', type='Institution', properties={}), Node(id='None', type='None', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='radioactivity', type='Field of Study', properties={})], relationships=[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='1867', type='Year', properties={}), type='BORN_IN', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='radioactivity', type='Field of Study', properties={}), type='WORKED_ON', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Nobel Prize', type='Award', properties={}), type='WON_AWARD', properties={}), Relationship(source

In [56]:
# Print the nodes and relationships of the graph document at position `ix`
ix = 0
selected_graph_doc = graph_documents[ix]
print("Nodes:" + str(selected_graph_doc.nodes))
print("Relationships:" + str(selected_graph_doc.relationships))

Nodes:[Node(id='1867', type='Year', properties={}), Node(id='Curie family', type='Family', properties={}), Node(id='Marie Curie', type='Person', properties={}), Node(id='Nobel Prize', type='Award', properties={}), Node(id='University of Paris', type='Institution', properties={}), Node(id='None', type='None', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='radioactivity', type='Field of Study', properties={})]
Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='1867', type='Year', properties={}), type='BORN_IN', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='radioactivity', type='Field of Study', properties={}), type='WORKED_ON', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Nobel Prize', type='Award', properties={}), type='WON_AWARD', properties={}), Relationship(source=Node(id='Marie 

In [57]:
# Reset the database so the graph added below starts from an empty state.
# WARNING: this matches every node and deletes it together with its
# relationships, i.e. it wipes ALL data in the connected database.
cypher = """
MATCH (n)
DETACH DELETE n
"""
graph.query(cypher)

[]

In [58]:
# Add the generated graph into Neo4j
# NOTE(review): baseEntityLabel=True presumably adds a shared base label to every
# entity node, and include_source=True presumably also stores the source document
# and links it to its entities (the "Document" nodes with a "text" property that the
# vector index uses later would come from here) — confirm in the Neo4jGraph docs.
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)


## Short Solution
#### Use GraphCypherQAChain to generate Cypher queries and return the LLM's response

In [86]:
# Prompt that asks the LLM to translate a natural-language question into Cypher.
# {schema} and {question} are the two template variables declared on the
# PromptTemplate below.
# The few-shot examples must themselves be valid Cypher, because the model tends
# to imitate them: a label that contains spaces (e.g. `Field of Study`) has to be
# quoted with backticks — single quotes are a syntax error in that position.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:

# Who is Michael Scott?
MATCH (person:Person)-[:WON_AWARD]->(award:Award)
    WHERE person.id = 'Michael Scott'
RETURN person.id, award.id

# Who won the award?
MATCH (person:Person)-[:WON_AWARD]->(award:Award)
RETURN person.id, award.id

# What kind of research does Michael conduct?
  MATCH (person:Person)-[:WORKED_ON]->(field:`Field of Study`)
    WHERE person.id = 'Michael Scott'
  RETURN person.id, field.id

The question is:
{question}"""

# Wrap the template so it can be handed to the Cypher QA chain as its query-generation prompt.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], 
    template=CYPHER_GENERATION_TEMPLATE
)

In [87]:
# Build the Cypher QA chain from the graph connection, the LLM and the custom
# Cypher-generation prompt.
# temperature=0 matches the extraction model configured earlier and keeps the
# generated Cypher as repeatable as possible.
llm = ChatOllama(model="llama3", temperature=0)
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    llm=llm,
    verbose=True,
    allow_dangerous_requests=True,  # explicit opt-in: LLM-generated Cypher is run against the database
    # return_intermediate_steps=True,
    # return_direct=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)
# Backward-compatible alias. A later cell rebinds `chain` to a different pipeline,
# which is why the helper below refers to `cypher_chain` instead.
chain = cypher_chain
# response = chain.invoke({"query": "Who is Pierre Curie?"})


def prettyCypherChain(question: str) -> None:
    """Ask the Cypher QA chain a question and print the answer wrapped at 60 columns.

    Args:
        question: Natural-language question about the data stored in the graph.

    Returns:
        None. The answer is printed, not returned.
    """
    # `invoke` replaces the deprecated `run`; the chain takes the question under
    # the "query" key and returns the final answer under "result".
    response = cypher_chain.invoke({"query": question})
    print(textwrap.fill(str(response["result"]), 60))

# # To check the generated query manually
# cypher = """
# MATCH (person:Person)-[:WON_AWARD]->(award:Award) 
# WHERE person.id = 'Marie Curie' 
# RETURN person.id, award.id
# """
# graph.query(cypher)

In [89]:
# Ask a question through the Cypher QA chain (created with verbose=True, so the
# generated Cypher and the retrieved context are printed before the answer).
prettyCypherChain("What is Marie Curie's primary research field?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (person:Person)-[:WORKED_ON]->(field) 
WHERE person.id = 'Marie Curie' 
RETURN person.id, field.id[0m
Full Context:
[32;1m[1;3m[{'person.id': 'Marie Curie', 'field.id': 'radioactivity'}][0m

[1m> Finished chain.[0m
radioactivity.


## Longer Solution
#### Build a hybrid (graph + vector) retriever from scratch

In [63]:
# Create embeddings for more complex search queries
embed = OllamaEmbeddings(model="mxbai-embed-large")

# Build a hybrid vector index over the "Document" nodes already stored in the
# graph, embedding their "text" property into the "embedding" property.
# The connection settings are passed explicitly so that the index is guaranteed
# to use the same database as `graph`; without them Neo4jVector resolves its own
# settings from environment defaults, which may differ from the NEO4J_URL
# variable used for `graph`.
vector_index = Neo4jVector.from_existing_graph(
    embedding=embed,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
vector_retriever = vector_index.as_retriever()

# Define a model for the extracted entities from the text.
# (Documented with comments on purpose: the class is handed to
# `with_structured_output` below, so its contents are part of what the model sees.)
class Entities(BaseModel):
    # Entity names found in the input text
    names: list[str] = Field(..., description="All entities from the text")

# Define a prompt to extract entities from the input query.
# NOTE: the system message asks for organization AND person entities, while
# graph_retriever below only matches nodes labelled Person.
prompt = ChatPromptTemplate.from_messages([ 
    ("system", "Extract organization and person entities from the text."),
    ("human", "Extract entities from: {question}")
])

# Initialize the Ollama model for entity extraction with LLM (using "llama3").
# This rebinds `llm`; the same object is also used to generate the final answer
# in the chain assembled at the end of this cell.
llm = OllamaFunctions(model="llama3", format="json", temperature=0.4)

# Combine the prompt and LLM to create an entity extraction chain
# The output is structured to match the "Entities" model.
# include_raw=True is requested; graph_retriever below reads the 'raw' entry of the result.
entity_chain = prompt | llm.with_structured_output(Entities, include_raw=True)

def _extract_entity_names(response) -> list:
    """Return the entity names contained in an ``entity_chain`` response.

    The validated ``parsed`` object is preferred when present. Otherwise the raw
    tool call is inspected, accepting both argument shapes:
    ``{"names": [...]}`` and the schema-echoing ``{"properties": {"names": [...]}}``.
    An empty list is returned when no names can be found.
    """
    is_mapping = isinstance(response, dict)
    parsed = response.get("parsed") if is_mapping else None
    names = getattr(parsed, "names", None)

    if names is None:
        raw = response.get("raw") if is_mapping else response
        tool_calls = getattr(raw, "tool_calls", None) or []
        args = tool_calls[0].get("args") if tool_calls else None
        if isinstance(args, dict):
            names = args.get("names")
            if names is None and isinstance(args.get("properties"), dict):
                names = args["properties"].get("names")

    if names is None:
        return []
    if isinstance(names, str):
        # A single name returned as a bare string instead of a list.
        return [names]
    return [str(name) for name in names]


# Function to retrieve relationships of the extracted entities from Neo4j
def graph_retriever(question: str) -> str:
    """Describe the outgoing relationships of the entities mentioned in a question.

    Entities are extracted with ``entity_chain``; for each one, up to 50 outgoing
    relationships of the matching ``Person`` node are read from ``graph``.

    Args:
        question: The user's natural-language question.

    Returns:
        One ``source - RELATIONSHIP -> target`` line per relationship, each
        entity's lines terminated by a newline. Empty string when no entity is
        extracted or none of them has a match in the graph.
    """
    # Use the entity extraction chain to get entities from the question
    response = entity_chain.invoke({"question": question})
    # Tolerate missing tool calls and both argument shapes instead of raising
    entities = _extract_entity_names(response)
    print("-"*30)
    print("Retreived Entities")
    print(entities)

    sections = []
    for entity in entities:
        # Query Neo4j to get relationships for the given entity
        # (parameterized, so the entity text is never spliced into the Cypher)
        rows = graph.query(
            """MATCH (p:Person {id: $entity})-[r]->(e)
            RETURN p.id AS source_id, type(r) AS relationship, e.id AS target_id
            LIMIT 50""",
            {"entity": entity}
        )
        if not rows:
            # Nothing stored for this entity: do not emit a stray blank line
            continue
        sections.append(
            "\n".join(f"{row['source_id']} - {row['relationship']} -> {row['target_id']}" for row in rows) + "\n"
        )

    # Return the formatted results containing entity relationships
    return "".join(sections)

def full_retriever(question: str):
    """Build the combined context (graph facts + vector hits) for a question.

    Args:
        question: The user's natural-language question.

    Returns:
        A string with the graph relationships returned by ``graph_retriever``
        followed by the vector-search documents, each document on its own line
        and labelled ``#Document <n>:``.
    """
    # Retrieve graph data for the question using the graph_retriever function
    graph_data = graph_retriever(question)
    print("-"*30)
    print("Graph Data")
    print(graph_data)
    # Retrieve vector data by invoking the vector retriever with the question
    vector_data = [el.page_content for el in vector_retriever.invoke(question)]
    print("-"*30)
    print("Vector Data")
    print(vector_data)
    print("-"*30)
    # Label and number every document. Using '#Document ' as the join separator
    # would leave the first document unlabelled and none of them numbered.
    vector_text = "\n".join(
        f"#Document {position}: {content}"
        for position, content in enumerate(vector_data, start=1)
    )
    # Combine the graph data and vector data into a formatted string
    return f"Graph data: {graph_data}\nVector data: {vector_text}"

# Define a prompt template for generating a response based on context.
# {context} receives the text built by full_retriever, {question} the user's question.
template = """Answer the question based only on the following context:
{context}
Question: {question}
Answer:"""

# Create a prompt from the template, which takes the context and question as input.
# This rebinds `prompt`; `entity_chain` was already built from the earlier
# entity-extraction prompt, so it is not affected.
prompt = ChatPromptTemplate.from_template(template)

# Assemble the question-answering pipeline. For an incoming question it:
#   - builds the context with full_retriever,
#   - forwards the question unchanged (RunnablePassthrough),
#   - fills the prompt template with both values,
#   - lets the LLM produce the answer,
#   - converts the model output to a plain string (StrOutputParser).
retrieval_inputs = {
    "context": full_retriever,  # called with the question, returns the context string
    "question": RunnablePassthrough(),  # the question itself, unmodified
}
chain = retrieval_inputs | prompt | llm | StrOutputParser()

In [62]:
# Test the chain with a question.
# NOTE(review): the name asked about appears only in `fake_text`, while `documents`
# is built from `text` — an answer from the graph presumably requires ingesting
# `fake_text` first; confirm before relying on the recorded output.
response = chain.invoke(input="Who is Umut Can Gulsen?")
print("Final Answer")
print(response)



------------------------------
Retreived Entities
['Umut Can Gulsen']
------------------------------
Graph Data
Umut Can Gulsen - BECAME_PROFESSOR_AT -> University of Paris
Umut Can Gulsen - WON_AWARD -> Nobel Prize
Umut Can Gulsen - WORKED_ON -> quantum physics
Umut Can Gulsen - BORN_IN -> 1998

------------------------------
Vector Data
['\ntext: \nUmut Can Gulsen, born in 1998, was a Turkish and naturalised-French physicist and chemist who conducted pioneering research on quantum physics.\nHe was the first man to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields. His husband, Pierre Gulsen, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.\nHe was, in 200, the first man to become a professor at the University of Paris. \n']
------------------------------
Final Answer
Umut Can Gulsen is a T