In [6]:
import os
from dotenv import load_dotenv

load_dotenv()


class Settings:
    NEO4J_URI = os.getenv("LOCAL_NEO4J_URI")
    NEO4J_USERNAME = os.getenv("LOCAL_NEO4J_USERNAME")
    NEO4J_PASSWORD = os.getenv("LOCAL_NEO4J_PASSWORD")
    NEO4J_DATABASE = "graphragexp"
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    OPENAI_EMBEDDING_ENDPOINT = "https://api.openai.com/v1/embeddings"
    # PYDATA_API_KEY = os.getenv('PYDATA_API_KEY')


settings = Settings()

In [4]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o",
    api_key=settings.OPENAI_API_KEY,
)

llm_transformer = LLMGraphTransformer(llm=llm)

In [5]:
from IPython.display import Markdown, display


def display_markdown(text):
    """
    Display the given text in markdown format in a Jupyter notebook.

    Parameters:
    text (str): The text to be displayed in markdown format.
    """
    display(Markdown(text))

In [7]:
from langchain_community.graphs import Neo4jGraph

graph = Neo4jGraph(
    url=settings.NEO4J_URI,
    password=settings.NEO4J_PASSWORD,
    username=settings.NEO4J_USERNAME,
    database=settings.NEO4J_DATABASE,
)

In [26]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget

# default_cypher = "81`    "
default_cypher = "MATCH (n:Speaker) RETURN n LIMIT 5"


def showGraph(cypher: str = default_cypher):
    driver = GraphDatabase.driver(
        uri=settings.NEO4J_URI,
        auth=(settings.NEO4J_USERNAME, settings.NEO4J_PASSWORD),
        database=settings.NEO4J_DATABASE,
    )
    session = driver.session()
    widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = "id"
    # display(widget)
    return widget

### Data Loading

In [15]:
import hashlib
import uuid
from datetime import datetime
from typing import Final, Literal

import requests

from pydantic import BaseModel

PYDATA_BASE_URL: Final[str] = "https://london2024.pydata.org/api/events/cfp"
JINA_READER_BASE_URL: Final[str] = "https://r.jina.ai"


class Speaker(BaseModel):
    name: str
    biography: str
    id: str


class SubmissionInfo(BaseModel):
    id: str
    speaker: Speaker
    title: str
    submission_type: Literal["Talk", "Tutorial"]
    abstract: str
    state: Literal["confirmed"]
    description: str
    duration: int
    location: str
    date: str
    start_time: str
    end_time: str


class PyDataSubmissionResult(BaseModel):
    results: list[SubmissionInfo]


class ScrapedWebsite(BaseModel):
    title: str
    url: str
    content: str


def get_pydata_info():
    url = f"{PYDATA_BASE_URL}/submissions?limit=100"
    # headers = {"Authorization": f"Bearer {settings.PYDATA_API_KEY}"}

    # response = requests.request("GET", url, headers=headers)
    response = requests.request("GET", url)

    return response.json()


def extract_date_and_time(date_string: str) -> tuple[str, str]:
    parsed_date = datetime.fromisoformat(date_string)

    date_part = parsed_date.date().isoformat()
    time_part = parsed_date.time().isoformat()

    return date_part, time_part


def parse_data(data: dict) -> PyDataSubmissionResult:
    submissions = data.get("results")
    result: list[SubmissionInfo] = []
    for submission in submissions:
        for speaker in submission.get("speakers"):
            result.append(
                SubmissionInfo(
                    id=str(uuid.uuid4()),
                    speaker=Speaker(
                        id=str(uuid.uuid4()),
                        name=speaker.get("name"),
                        biography=speaker.get("biography") or "Not available",
                    ),
                    title=submission.get("title"),
                    submission_type=submission.get("submission_type").get("en"),
                    abstract=submission.get("abstract"),
                    state=submission.get("state"),
                    description=submission.get("description"),
                    duration=submission.get("duration"),
                    location=submission.get("slot").get("room").get("en"),
                    date=extract_date_and_time(submission.get("slot").get("start"))[0],
                    start_time=extract_date_and_time(
                        submission.get("slot").get("start")
                    )[1],
                    end_time=extract_date_and_time(submission.get("slot").get("end"))[
                        1
                    ],
                )
            )

    return PyDataSubmissionResult(results=result)


def scrape_website(url: str) -> ScrapedWebsite:
    url = f"{JINA_READER_BASE_URL}/{url}"
    headers = {"Accept": "application/json", "X-No-Cache": "true"}
    response = requests.request("GET", url, headers=headers)
    return ScrapedWebsite(**response.json().get("data"))


def generate_md5_hash(document_text: str) -> str:
    md5_hash = hashlib.md5()
    md5_hash.update(document_text.encode("utf-8"))

    return md5_hash.hexdigest()


def fetch_data() -> PyDataSubmissionResult:
    result = get_pydata_info()
    return parse_data(result)


def load_data_into_database():
    data = fetch_data()
    driver = GraphDatabase.driver(
        uri=settings.NEO4J_URI,
        auth=(settings.NEO4J_USERNAME, settings.NEO4J_PASSWORD),
        database=settings.NEO4J_DATABASE,
    )

    def create_nodes_and_relationships(tx, submission: SubmissionInfo):
        tx.run(
            "MERGE (s:Speaker {id: $id, name: $speaker_name, biography: $biography})",
            id=submission.speaker.id,
            speaker_name=submission.speaker.name,
            biography=submission.speaker.biography,
        )

        tx.run(
            """
            MERGE (submission:Submission {id:$id, title: $title, submission_type: $submission_type, abstract: $abstract, 
                                          state: $state, description: $description, duration: $duration, 
                                          location: $location, date: $date, start_time: $start_time, 
                                          end_time: $end_time
                                          })
            """,
            id=submission.id,
            title=submission.title,
            submission_type=submission.submission_type,
            abstract=submission.abstract,
            state=submission.state,
            description=submission.description,
            duration=submission.duration,
            location=submission.location,
            date=submission.date,
            start_time=submission.start_time,
            end_time=submission.end_time,
        )

        # Relationship between the Speaker and Submission Node
        tx.run(
            """
            MATCH (s:Speaker {id: $speaker_id}), 
                  (submission:Submission {id: $submission_id})
            MERGE (s)-[:PRESENTED]->(submission)
            """,
            speaker_id=submission.speaker.id,
            submission_id=submission.id,
        )

        document_text = f"""
                    This is a submission for a 2024 PyData Conference
                    The title for this submission is: {submission.title} and the abstract is: 
                    {submission.abstract}. And the description is: {submission.description}. The location for the 
                    {submission.submission_type} is at {submission.location} on {submission.date} from 
                    {submission.start_time} to {submission.end_time}.
                    The speaker for the {submission.submission_type} is {submission.speaker.name} and here is their 
                    biography {submission.speaker.biography}
                """

        tx.run(
            """
            MERGE (d:Document {id:$document_id, text: $document_text})
            """,
            document_id=generate_md5_hash(document_text),
            document_text=document_text,
        )

        # Relationship between document node and submission node
        tx.run(
            """
            MATCH (d:Document {id: $document_id}), 
                  (submission:Submission {id: $submission_id})
            MERGE (d)-[:MENTIONS]->(submission)
            """,
            document_id=generate_md5_hash(document_text),
            submission_id=submission.id,
        )

        # Relationship between the document node and Submission node
        tx.run(
            """
            MATCH (d:Document {id: $document_id}),
                  (speaker:Speaker {id: $submission_id})
            MERGE (d)-[:MENTIONS]->(submission)
            """,
            document_id=generate_md5_hash(document_text),
            submission_id=submission.id,
        )

        # Creating relationships between submissions based on type of submission, location and date
        tx.run(
            """
            MATCH (submission1:Submission {id: $submission_id})
            WITH submission1
            MATCH (submission2:Submission)
            WHERE submission1.location = submission2.location AND submission1 <> submission2
            MERGE (submission1)-[:ON_LOCATION]->(submission2)
            """,
            submission_id=submission.id,
        )
        tx.run(
            """
            MATCH (submission1:Submission {id: $submission_id})
            WITH submission1
            MATCH (submission2:Submission)
            WHERE submission1.date = submission2.date AND submission1 <> submission2
            MERGE (submission1)-[:ON_DATE]->(submission2)
            """,
            submission_id=submission.id,
        )
        tx.run(
            """
            MATCH (submission1:Submission {id: $submission_id})
            WITH submission1
            MATCH (submission2:Submission)
            WHERE submission1.submission_type = submission2.submission_type AND submission1 <> submission2
            MERGE (submission1)-[:ON_TYPE]->(submission2)
            """,
            submission_id=submission.id,
        )

    with driver.session() as session:
        for submission in data.results:
            session.write_transaction(create_nodes_and_relationships, submission)

    driver.close()

In [27]:
# load_data_into_database()

In [28]:
graph.refresh_schema()
print(graph.schema)

Node properties:
Speaker {id: STRING, biography: STRING, name: STRING}
Submission {state: STRING, submission_type: STRING, location: STRING, start_time: STRING, duration: INTEGER, end_time: STRING, title: STRING, abstract: STRING, date: STRING, description: STRING, id: STRING}
Document {id: STRING, text: STRING, embedding: LIST}
Relationship properties:

The relationships:
(:Speaker)-[:PRESENTED]->(:Submission)
(:Submission)-[:ON_LOCATION]->(:Submission)
(:Submission)-[:ON_DATE]->(:Submission)
(:Submission)-[:ON_TYPE]->(:Submission)
(:Document)-[:MENTIONS]->(:Submission)


In [29]:
showGraph()

GraphWidget(layout=Layout(height='500px', width='100%'))

In [23]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=settings.OPENAI_API_KEY,
)

vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    index_name="document_embedding",
    url=settings.NEO4J_URI,
    username=settings.NEO4J_USERNAME,
    password=settings.NEO4J_PASSWORD,
    database=settings.NEO4J_DATABASE,
)



In [24]:
graph.refresh_schema()

graph.query(
    """
  SHOW VECTOR INDEXES
  """
)

[{'id': 3,
  'name': 'document_embedding',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Document'],
  'properties': ['embedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

In [25]:
graph.refresh_schema()
graph.get_structured_schema

{'node_props': {'Speaker': [{'property': 'id', 'type': 'STRING'},
   {'property': 'biography', 'type': 'STRING'},
   {'property': 'name', 'type': 'STRING'}],
  'Submission': [{'property': 'state', 'type': 'STRING'},
   {'property': 'submission_type', 'type': 'STRING'},
   {'property': 'location', 'type': 'STRING'},
   {'property': 'start_time', 'type': 'STRING'},
   {'property': 'duration', 'type': 'INTEGER'},
   {'property': 'end_time', 'type': 'STRING'},
   {'property': 'title', 'type': 'STRING'},
   {'property': 'abstract', 'type': 'STRING'},
   {'property': 'date', 'type': 'STRING'},
   {'property': 'description', 'type': 'STRING'},
   {'property': 'id', 'type': 'STRING'}],
  'Document': [{'property': 'id', 'type': 'STRING'},
   {'property': 'text', 'type': 'STRING'},
   {'property': 'embedding', 'type': 'LIST'}]},
 'rel_props': {},
 'relationships': [{'start': 'Speaker',
   'type': 'PRESENTED',
   'end': 'Submission'},
  {'start': 'Submission', 'type': 'ON_LOCATION', 'end': 'Submi

In [30]:
from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain

cypher_chain = GraphCypherQAChain.from_llm(graph=graph, llm=llm, verbose=True)
response = cypher_chain.invoke(
    {
        "query": """How many submissions are there in total? 
                    Please group this submission into distinct talks and tutorials."""
    }
)
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Submission)
RETURN s.submission_type AS type, COUNT(s) AS count
[0m
Full Context:
[32;1m[1;3m[{'type': 'Tutorial', 'count': 16}, {'type': 'Talk', 'count': 51}][0m

[1m> Finished chain.[0m


{'query': 'How many submissions are there in total? \n                    Please group this submission into distinct talks and tutorials.',
 'result': 'There are a total of 67 submissions, grouped into 16 tutorials and 51 talks.'}

> cypher
> MATCH (s:Submission)
> RETURN s.submission_type AS type, COUNT(s) AS count

In [31]:
response = cypher_chain.invoke({"query": "What are the various locations where the conference is holding?"})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Submission)
RETURN DISTINCT s.location AS locations
[0m
Full Context:
[32;1m[1;3m[{'locations': 'Warwick'}, {'locations': 'Salisbury'}, {'locations': 'Minories'}, {'locations': 'Beaumont'}][0m

[1m> Finished chain.[0m


{'query': 'What are the various locations where the conference is holding?',
 'result': 'The various locations where the conference is holding are Warwick, Salisbury, Minories, and Beaumont.'}

In [32]:
from langchain.chains.retrieval_qa.base import RetrievalQA

qa_graph_chain = RetrievalQA.from_chain_type(
    llm, retriever=vector_index.as_retriever(), verbose = True
)

result = qa_graph_chain.invoke({"query": "Where and when is the talk about Graph database holding? Who is the speaker?"})
result["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'The talk about Graph databases is part of a tutorial titled "Graph databases and Retrieval Augmented Generation." It will be held at Minories on 2024-06-14 from 13:30:00 to 15:00:00. The speaker for this tutorial is Kehinde Richard Ogunyale.'

In [33]:
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain.tools import Tool
from langchain_community.chat_message_histories import Neo4jChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

tools = [
    Tool.from_function(
        name="Cypher Chain",
        description="For when you need to answer questions that can be answered by directly querying the database without having to use knowledge of vector embeddings. The question will be a string. Return a string.",
        func=cypher_chain.invoke,
    ),
    Tool.from_function(
        name="Natural QA Chain",
        description="For answering text based questions that are not easy or straight-forward to answer using Cypher queries but can be answered using vector embeddings. The question will be a String. Return a String",
        func=qa_graph_chain.invoke,
    ),
]


def get_memory(session_id):
    return Neo4jChatMessageHistory(session_id=session_id, graph=graph)


agent_prompt = hub.pull("hwchase17/react-chat")
agent = create_react_agent(llm, tools, agent_prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, handle_parsing_errors=True)

chat_agent = RunnableWithMessageHistory(
    agent_executor,
    get_memory,
    input_messages_key="input",
    history_messages_key="chat_history",
)

In [34]:
question = """I am a software engineer and I would like to learn more about graph databases and retrieval augmented generation, 
              which talk should I attend?"""

response = chat_agent.invoke(
    {"input": question},
    {"configurable": {"session_id": "random_id"}},
)
response["output"]





[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:Document)-[:MENTIONS]->(s:Submission)
WHERE d.text CONTAINS "graph databases" AND d.text CONTAINS "retrieval augmented generation"
RETURN s.title, s.description, s.start_time, s.end_time, s.location
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'A software engineer interested in learning more about graph databases and retrieval augmented generation should attend the tutorial titled "Graph databases and Retrieval Augmented Generation" by Kehinde Richard Ogunyale. This session will cover the integration of graph databases with Retrieval-Augmented Generation (RAG) technology, providing a detailed, step-by-step guide on building AI applications that leverage these technologies. The tutorial is scheduled to take place at Minories on 2024-06-14 from 13:30:00 to 15:00:00.'

In [35]:
question = "I want to attend all AI related talks, which talks should I attend?"

response = chat_agent.invoke(
    {"input": question},
    {"configurable": {"session_id": "random_id"}},
)
response["output"]



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Submission)-[:PRESENTED]-(sp:Speaker)
WHERE s.title CONTAINS 'AI' OR s.abstract CONTAINS 'AI' OR s.description CONTAINS 'AI'
RETURN s.title AS Title, sp.name AS Speaker, s.date AS Date, s.start_time AS StartTime, s.end_time AS EndTime, s.location AS Location
[0m
Full Context:
[32;1m[1;3m[{'Title': 'Analytics engineering without dbt? Building the composable Python data stack with Kedro and Ibis', 'Speaker': 'Deepyaman Datta', 'Date': '2024-06-14', 'StartTime': '13:30:00', 'EndTime': '15:00:00', 'Location': 'Warwick'}, {'Title': 'Analytics engineering without dbt? Building the composable Python data stack with Kedro and Ibis', 'Speaker': 'Juan Luis Cano Rodríguez', 'Date': '2024-06-14', 'StartTime': '13:30:00', 'EndTime': '15:00:00', 'Location': 'Warwick'}, {'Title': 'Building Multi-Agent Generative-AI Applications with AutoGen', 'Speaker': 'Victor Dibia', 'Date': '2024-06-15', 'StartTi

'Here are the AI-related talks you should attend:\n\n1. **Title:** Building Multi-Agent Generative-AI Applications with AutoGen\n   - **Speakers:** Victor Dibia, Chi Wang, Diego Colombo\n   - **Date:** 2024-06-15\n   - **Start Time:** 11:15:00\n   - **End Time:** 11:55:00\n   - **Location:** Minories\n\n2. **Title:** An Introduction to Retrieval Augmented Generation\n   - **Speaker:** Dan Gibson\n   - **Date:** 2024-06-14\n   - **Start Time:** 09:00:00\n   - **End Time:** 10:30:00\n   - **Location:** Minories\n\n3. **Title:** Taking LLMs out of the black box: A practical guide to human-in-the-loop distillation\n   - **Speaker:** Ines Montani\n   - **Date:** 2024-06-15\n   - **Start Time:** 12:00:00\n   - **End Time:** 12:40:00\n   - **Location:** Minories'

In [36]:
from langchain.text_splitter import TokenTextSplitter
from langchain_core.documents import Document
from tqdm import tqdm

list_of_websites = [
    "https://pydata.org/london2024",
    "https://pydata.org/london2024/about",
    "https://pydata.org/london2024/job-board",
    "https://pydata.org/london2024/humble",
    "https://pydata.org/london2024/sponsor",
]

scraped_data: list[ScrapedWebsite] = [
    scrape_website(url) for url in tqdm(list_of_websites, desc="Scraping website")
]
list_of_documents: list[Document] = [
    Document(
        page_content=data.content,
        metadata={
            "title": data.title,
            "url": data.url,
            "id": generate_md5_hash(data.content),
        },
    )
    for data in tqdm(scraped_data, desc="Processing documents")
]

text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=24)
documents = text_splitter.split_documents(list_of_documents)

print(documents)

Scraping website: 100%|██████████| 5/5 [00:14<00:00,  2.81s/it]
Processing documents: 100%|██████████| 5/5 [00:00<00:00, 3584.26it/s]

[Document(page_content='![Image 1](https://images.squarespace-cdn.com/content/v1/655270d4c839892af01aaf02/6a9c721d-5397-4349-a711-befa4be34f62/AdobeStock_132796360-dark.jpg)\n\nWHAT TO EXPECT\n--------------\n\nPyData London 2024 is a 3-day in-person event for the international community of data scientists, data engineers, and developers of data analysis tools to share ideas and learn from each other.\n\nDuring the conference, attendees will have the opportunity to attend live keynote sessions and talks, lightning talks, and get to know fellow members of the **Py****Data** Community.\n\n![Image 2](https://images.squarespace-cdn.com/content/v1/655270d4c839892af01aaf02/1699901673377-62BKP9F0DZ91L3D8PA26/promo+photo+with+overlay+1+-+small.png)\n\n![Image 3](https://images.squarespace-cdn.com/content/v1/655270d4c839892af01aaf02/d033c077-464f-4609-a990-4d841fe3b98f/magicpattern-mesh-gradient-1705596178730.png)\n\n**The event will be in-person at the** [**Leonardo Royal Hotel London Tower Br




In [37]:
from hashlib import md5
from langchain_community.graphs.graph_document import GraphDocument
from langchain_community.graphs.neo4j_graph import BASE_ENTITY_LABEL, Neo4jGraph, _get_rel_import_query
from langchain_core.embeddings import Embeddings

include_docs_query = (
    "MERGE (d:Document {id:$document.metadata.id}) "
    "SET d.text = $document.page_content "
    "SET d.embedding = $document.metadata.embedding "
    "WITH d "
)


def _get_node_import_query(baseEntityLabel: bool, include_source: bool) -> str:
    if baseEntityLabel:
        return (
            f"{include_docs_query if include_source else ''}"
            "UNWIND $data AS row "
            f"MERGE (source:`{BASE_ENTITY_LABEL}` {{id: row.id}}) "
            "SET source += row.properties "
            f"{'MERGE (d)-[:MENTIONS]->(source) ' if include_source else ''}"
            "WITH source, row "
            "CALL apoc.create.addLabels( source, [row.type] ) YIELD node "
            "RETURN distinct 'done' AS result"
        )
    else:
        return (
            f"{include_docs_query if include_source else ''}"
            "UNWIND $data AS row "
            "CALL apoc.merge.node([row.type], {id: row.id}, "
            "row.properties, {}) YIELD node "
            f"{'MERGE (d)-[:MENTIONS]->(node) ' if include_source else ''}"
            "RETURN distinct 'done' AS result"
        )


class ModifiedNeo4JGraph(Neo4jGraph):
    def add_graph_documents_with_embeddings(self, graph_documents: list[GraphDocument], embedding: Embeddings, include_source: bool = False,
                            baseEntityLabel: bool = False) -> None:
        """
        This method constructs nodes and relationships in the graph based on the
        provided GraphDocument objects.

        Parameters:
        - graph_documents (List[GraphDocument]): A list of GraphDocument objects
        that contain the nodes and relationships to be added to the graph. Each
        GraphDocument should encapsulate the structure of part of the graph,
        including nodes, relationships, and the source document information.
        - embedding (Embeddings): An embedding function to embed the chunked documents.
        - include_source (bool, optional): If True, stores the source document
        and links it to nodes in the graph using the MENTIONS relationship.
        This is useful for tracing back the origin of data. Merges source
        documents based on the `id` property from the source document metadata
        if available; otherwise it calculates the MD5 hash of `page_content`
        for merging process. Defaults to False.
        - baseEntityLabel (bool, optional): If True, each newly created node
        gets a secondary __Entity__ label, which is indexed and improves import
        speed and performance. Defaults to False.
        """

        if baseEntityLabel:  # Check if constraint already exists
            constraint_exists = any(
                [
                    el["labelsOrTypes"] == [BASE_ENTITY_LABEL]
                    and el["properties"] == ["id"]
                    for el in self.structured_schema.get("metadata", {}).get(
                    "constraint"
                )
                ]
            )
            if not constraint_exists:
                # Create constraint
                self.query(
                    f"CREATE CONSTRAINT IF NOT EXISTS FOR (b:{BASE_ENTITY_LABEL}) "
                    "REQUIRE b.id IS UNIQUE;"
                )
                self.refresh_schema()  # Refresh constraint information

        node_import_query = _get_node_import_query(baseEntityLabel, include_source)
        rel_import_query = _get_rel_import_query(baseEntityLabel)
        for document in graph_documents:
            if not document.source.metadata.get("id"):
                document.source.metadata["id"] = md5(
                    document.source.page_content.encode("utf-8")
                ).hexdigest()
                document.source.metadata["embedding"] = embedding.embed_query(text=document.source.page_content)

            # Import nodes
            self.query(
                node_import_query,
                {
                    "data": [el.__dict__ for el in document.nodes],
                    "document": document.source.__dict__,
                },
            )
            # Import relationships
            self.query(
                rel_import_query,
                {
                    "data": [
                        {
                            "source": el.source.id,
                            "source_label": el.source.type,
                            "target": el.target.id,
                            "target_label": el.target.type,
                            "type": el.type.replace(" ", "_").upper(),
                            "properties": el.properties,
                        }
                        for el in document.relationships
                    ]
                },
            )

In [38]:
graph_documents = llm_transformer.convert_to_graph_documents(documents)

print(f"Graph Documents: {graph_documents}")

Graph Documents: [GraphDocument(nodes=[Node(id='Pydata London 2024', type='Event'), Node(id='Data Scientists', type='Person'), Node(id='Data Engineers', type='Person'), Node(id='Developers Of Data Analysis Tools', type='Person'), Node(id='Pydata Community', type='Organization'), Node(id='Leonardo Royal Hotel London Tower Bridge', type='Place'), Node(id='45 Prescot St, London E1 8Gp, United Kingdom', type='Place'), Node(id='Dr. Rebecca Bilbro', type='Person'), Node(id='Yellowbrick', type='Software'), Node(id='Scikit-Learn', type='Software'), Node(id='Matplotlib', type='Software'), Node(id='Rotational Labs', type='Organization'), Node(id='University Of Illinois, Urbana-Champaign', type='Organization')], relationships=[Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Data Scientists', type='Person'), type='INCLUDES'), Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Data Engineers', type='Person'), type='INCLUDES'), Relations

In [39]:
graph = ModifiedNeo4JGraph(
    url=settings.NEO4J_URI,
    password=settings.NEO4J_PASSWORD,
    username=settings.NEO4J_USERNAME,
    database=settings.NEO4J_DATABASE
)

graph.add_graph_documents_with_embeddings(
    graph_documents=graph_documents,
    baseEntityLabel=True,
    include_source=True,
    embedding=embeddings,
)

In [41]:
from typing import Callable

def get_chat_agent(cypher_chain: Callable, qa_graph_chain: Callable) -> Callable:
    tools = [
        Tool.from_function(
            name="Cypher Chain",
            description="For when you need to answer questions that can be answered by directly querying the database without having to use knowledge of vector embeddings. The question will be a string. Return a string.",
            func=cypher_chain.invoke,
        ),
    
        Tool.from_function(
            name="Natural QA Chain",
            description="For answering text based questions that are not easy or straight-forward to answer using Cypher queries but can be answered using vector embeddings. The question will be a String. Return a String",
            func=qa_graph_chain.invoke
        )
    ]
    
    agent_prompt = hub.pull("hwchase17/react-chat")
    agent = create_react_agent(llm, tools, agent_prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, handle_parsing_errors=True)
    
    chat_agent = RunnableWithMessageHistory(
        agent_executor,
        get_memory,
        input_messages_key="input",
        history_messages_key="chat_history",
    )
    
    return chat_agent
    

def get_memory(session_id):
    return Neo4jChatMessageHistory(session_id=session_id, graph=graph)


In [42]:
vector_index_without_retrieval_query = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    index_name = "document_embedding",
    url=settings.NEO4J_URI,
    password=settings.NEO4J_PASSWORD,
    username=settings.NEO4J_USERNAME,
    database=settings.NEO4J_DATABASE
)



In [43]:
qa_graph_chain_without_retrieval_query = RetrievalQA.from_chain_type(
    llm, retriever=vector_index_without_retrieval_query.as_retriever(), verbose = True
)

result = qa_graph_chain.invoke({"query": "Who are the keynote speakers?"})
display_markdown(result["result"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The keynote speakers at the 2024 PyData Conference are:

1. Tania Allard
2. Dr. Rebecca Bilbro
3. Dr. Matthew Crooks

In [44]:
chat_agent = get_chat_agent(cypher_chain,qa_graph_chain_without_retrieval_query)
question = "What are the keynote speakers talking about?"
response = chat_agent.invoke(
    {
        "input": question
    },
    {"configurable": {"session_id": "randomId"}},
)
display_markdown(result['result'])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The keynote speakers at the 2024 PyData Conference are:

1. Tania Allard
2. Dr. Rebecca Bilbro
3. Dr. Matthew Crooks

In [45]:
vector_index_with_retrieval_query = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    index_name = "document_embedding",
    retrieval_query="""
    WITH node AS doc, score as similarity
    CALL {
        WITH doc
        MATCH (doc)-[*1..2]-(related)
        WITH doc, related, [key in keys(related) WHERE key <> 'embedding'] AS filtered_keys
        RETURN DISTINCT related, [key IN filtered_keys | key + ': ' + toString(related[key])] AS properties_list
    }
    WITH doc, similarity, properties_list
    RETURN coalesce(doc.text, '') + ' ' + coalesce(reduce(s = '', prop IN properties_list | s + ', ' + prop), '') AS text, similarity AS score, {source: doc.text} AS metadata
    LIMIT 25
    """
)



In [46]:
qa_graph_chain = RetrievalQA.from_chain_type(
    llm, retriever=vector_index_with_retrieval_query.as_retriever(), verbose = True
)

result = qa_graph_chain.invoke({"query": "Which speakers are software engineers? "})
display_markdown(result["result"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


I don't have information about the speakers to determine which ones are software engineers. If you can provide a list or more details, I may be able to help further.