# Module 3 - GraphRAG and Agents

This module has the following objectives:
- Experiment with queries for an Agent
- Define Tooling
- Create an agent with the available tools
- Chatbot for an Agent
- Text2Cypher (if we have time)

In [None]:
#!pip install graphdatascience neo4j python-dotenv openai langchain langchain-openai langgraph pydantic gradio

Import our usual suspects (and some more...)

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from langchain.schema import HumanMessage
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langgraph.prebuilt import create_react_agent
from openai import OpenAI
from typing import List, Optional
from pydantic import BaseModel, Field, validator
import functools
from langchain_core.tools import tool
import gradio as gr
import time

## Setup

Load env variables

In [None]:
env_file = 'ws.env'

In [None]:
# Load the workshop configuration from the env file (when present) and expose
# the connection settings and model names as notebook-level variables.
if os.path.exists(env_file):
    # override=True lets values from the file replace variables that are
    # already set in the process environment.
    load_dotenv(env_file, override=True)

    # Neo4j
    HOST = os.getenv('NEO4J_URI')
    USERNAME = os.getenv('NEO4J_USERNAME')
    PASSWORD = os.getenv('NEO4J_PASSWORD')
    DATABASE = os.getenv('NEO4J_DATABASE')

    # AI
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    if OPENAI_API_KEY is not None:
        os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
    else:
        # Assigning None to os.environ raises an unhelpful TypeError, so
        # report the missing key explicitly instead.
        print(f"OPENAI_API_KEY is not set (checked {env_file} and the environment).")
    LLM = os.getenv('LLM')
    EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')
else:
    print(f"File {env_file} not found.")

Connect to neo4j db

In [None]:
# Create the Neo4j driver for the configured URI, authenticating with the
# username/password read from the environment.
driver = GraphDatabase.driver(
    HOST,
    auth=(USERNAME, PASSWORD)
)

Test the connection

In [None]:
# Smoke-test the connection: count every node in the database.
# The result is converted with `to_df()` so the notebook renders it as a table.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

Check which indexes (including the ones backing our constraints) exist

In [None]:
# List the indexes defined in the database and keep the result so it can be
# inspected in the next cell.
schema_result_df  = driver.execute_query(
    'show indexes',
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [None]:
schema_result_df.head(100)

## Agent Thinking

Let's say we want to build an Agent with multiple tools. Let's try to provide the following functionality: 

1. Retrieve the skills of a person.
   - Input: Person
   - Output: Skills
   - Example: *What skills does Kristof Neys have?*
2. Retrieve similar skills to other skills.
   - Input: Skills
   - Output: Skills
   - Example: *What skills are similar to PowerBI and Data Visualization?*
3. Retrieve similar persons to a person specified.
   - Input: Person
   - Output: Person
   - Example: *Which persons have similar skills as Kristof Neys?*
4. Retrieve Persons based on a set of skills.
   - Input: Skills
   - Output: Person
   - Example: *Which persons have Python and AWS experience?*

In [None]:
embeddings = OpenAIEmbeddings(model=EMBEDDINGS_MODEL)

## 1 - Retrieve Skills of Person

Find the connected skills given a person name.

In [None]:
person_name = "Lucy Turner"

In [None]:
# Look up the person by exact name and gather the names of all skills they
# KNOW into one list. `person_name` is bound to the $person_name parameter.
person_skills_df = driver.execute_query(
    """
    MATCH (p:Person{name: $person_name})-[:KNOWS]->(s:Skill)
    RETURN p.name as name, COLLECT(s.name) as skills
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name = person_name
)

In [None]:
person_skills_df

## 2 - Retrieve similar skills

Retrieve skills based on a list of skills

In [None]:
skills = ['Contineous Delivery', 'Cloud Native', 'Security']
skills_vectors = embeddings.embed_documents(skills)

In [None]:
# For every embedded input skill:
#   1. ask the 'skill-embeddings' vector index for its 3 nearest nodes,
#   2. keep only matches with a score above 0.89,
#   3. add any Skill linked to a match through SIMILAR_SEMANTIC,
# then return the distinct skill names.
# NOTE: despite its name, the resulting frame holds skill names, not persons.
search_persons_with_skills_df = driver.execute_query(
    """
        UNWIND $skills_vectors AS v
        CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score
        WHERE score > 0.89
        OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)
        WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills
        WITH nodes + skills AS all_skills
        UNWIND all_skills AS skill
        RETURN DISTINCT skill.name as skill_name
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    skills_vectors = skills_vectors
)

In [None]:
search_persons_with_skills_df

## 3 - Person Similarity

### Strategy 3.1 - Communities

We can use the community here to find similar people

In [None]:
person_name_1 = "John Garcia"

In [None]:
# Step 1: collect the skills of the given person (p1).
# Step 2: match every person whose `leiden_community` equals p1's and collect
#         the skills of each of them.
# NOTE(review): nothing in the pattern excludes p1 from the p2 matches, so the
# person presumably appears in their own result list - confirm this is intended.
person_similarity_community_df = driver.execute_query(
    """
    MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)
    WITH p1, COLLECT(s.name) as s1
    MATCH (p2:Person {leiden_community: p1.leiden_community})-[:KNOWS]->(s2:Skill)
    RETURN p1.name AS person_1, s1 AS skills_1, p1.leiden_community AS community, p2.name AS person_2, COLLECT(s2.name) AS skills_2
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name_1 = person_name_1
)

In [None]:
person_similarity_community_df

You can find all Skills in the community in the browser:
```
MATCH p=(:Person{leiden_community: 88})-[:KNOWS]->(s:Skill)
RETURN p
```

### Strategy 3.2 - Similar Skillsets

We can use the SIMILAR_SKILLSET relationship to find similar persons

In [None]:
person_name_1 = "John Garcia"

In [None]:
# Persons connected to p1 by a SIMILAR_SKILLSET relationship whose `overlap`
# is greater than 1, listed with their distinct skills and ordered by that
# overlap (highest first).
person_similar_skillset_df = driver.execute_query(
    """
    MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)
    WITH p1, COLLECT(s.name) as s1
    MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)-[:KNOWS]->(s2:Skill)
    WHERE r.overlap > 1
    RETURN p1.name AS person_1, s1 AS skills_1, r.overlap AS score, p2.name AS person_2, COLLECT(DISTINCT s2.name) AS skills_2
    ORDER BY score DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name_1 = person_name_1
)

In [None]:
person_similar_skillset_df

### Strategy 3.3 Similar Skillsets and Semantic Meaning

Use the Semantic Meaning and Skill overlap to find people with similar skills

In [None]:
person_name_1 = "John Garcia"

In [None]:
# Score candidate persons from two sources and add the scores up per person:
#   - semantic: sum of the SIMILAR_SEMANTIC scores between p1's skills and
#     skills the other person knows,
#   - skillset: sum of the `overlap` values on SIMILAR_SKILLSET relationships
#     between p1 and the other person.
# Candidates with a combined score of at least 1 are returned together with
# their skills, best 5 first.
# NOTE(review): the two branches are combined with UNION, which removes
# duplicate rows; if both branches yield the same score for a person it would
# be counted once - confirm whether UNION ALL was intended.
person_similarity_df = driver.execute_query(
    """
    MATCH (p1:Person {name: $person_name_1})-[:KNOWS]->(s:Skill)
    WITH p1, COLLECT(s.name) as skills_1
    CALL (p1){
      MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)
      
      RETURN p1 as person_1, p2 as person_2, SUM(r.score) AS score
      UNION
      MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)
      RETURN p1 as person_1, p2 AS person_2, SUM(r.overlap) AS score
    }
    WITH person_1.name as person_1, skills_1, person_2, SUM(score) as score
    WHERE score >= 1
    MATCH (person_2)-[:KNOWS]->(s:Skill)
    RETURN person_1, skills_1,  person_2.name as person_2, COLLECT(s.name) as skills_2, score
    ORDER BY score DESC LIMIT 5
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    person_name_1 = person_name_1
)

In [None]:
person_similarity_df

## 4 - Recommendation of Persons based on skills

## Vector Index Search

In [None]:
skills = ['AWS', 'Security']

In [None]:
skills_vectors = embeddings.embed_documents(skills)

For each search vector `v` we fetch its approximate 3 nearest skill nodes from the vector index and keep those with a score above 0.85. In the query after that, the matched skills are put together in one list (`all_skills`) and the persons are ranked by the number of those skills they know.

In [None]:
# For each embedded skill, collect the names of its nearest skill nodes
# (at most 3, score above 0.85) next to the embedding itself.
nn_df = driver.execute_query(
    """UNWIND $skills_vectors AS v
    CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score
    WHERE score > 0.85
    WITH v as embedding, COALESCE(COLLECT(node.name), []) AS top
    RETURN *
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    skills_vectors = skills_vectors
)
# Attach the original skill strings and move that column to the front.
# NOTE(review): `skills` is assigned by position, which assumes one result row
# per input skill in input order. A skill without any match above the
# threshold would presumably lose its row and break the assignment - TODO confirm.
nn_df['skills'] = skills
cols = list(nn_df.columns)[-1:] + list(nn_df.columns)[:-1]
nn_df = nn_df[cols]

In [None]:
nn_df

In [None]:
# Expand the requested skills to matching Skill nodes (nearest vector-index
# hits with score above 0.85 plus their SIMILAR_SEMANTIC neighbours), then rank
# persons by how many distinct skills from that set they know (top 10).
find_persons_given_skills_df = driver.execute_query(
    """
    UNWIND $skills_vectors AS v
    CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score
    WHERE score > 0.85
    OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)
    WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills
    WITH nodes + skills AS all_skills
    UNWIND all_skills AS skill
    MATCH (p:Person)-[:KNOWS]->(skill)
    RETURN p.name AS person, COUNT(DISTINCT(skill)) AS skill_count, COLLECT(DISTINCT(skill.name)) as similar_skills
    ORDER BY skill_count DESC LIMIT 10
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df(),
    skills_vectors = skills_vectors
)

In [None]:
find_persons_given_skills_df

## Agents with GraphRAG

### Let's create a Retrieval agent

In [None]:
class Skill(BaseModel):
    """
    Represents a professional skill or knowledge of a person.
    """
    # This model is the element type of the `skills` argument of the
    # skill-based tools. The field description presumably ends up in the tool
    # schema shown to the LLM, so it should read correctly.
    name: str = Field(..., description="Shortened name of the skill")

### Tool 1

In [None]:
def retrieve_skills_of_person(person_name: str) -> pd.DataFrame:
    """Retrieve the skills of a person. Person is provided with it's name"""
    # The docstring is kept verbatim: this function is handed to the LLM as a
    # tool, which presumably uses the docstring as the tool description.

    # One row per matched person: the name plus the list of known skill names.
    cypher = """
        MATCH (p:Person{name: $person_name})-[:KNOWS]->(s:Skill)
        RETURN p.name as name, COLLECT(s.name) as skills
        """

    def _as_frame(result):
        # Convert the driver result into a DataFrame.
        return result.to_df()

    return driver.execute_query(
        cypher,
        person_name=person_name,
        result_transformer_=_as_frame,
        routing_=RoutingControl.READ,
        database_=DATABASE,
    )

In [None]:
retrieve_skills_of_person('Mia Nelson') 

### Tool 2

In [None]:
def find_similar_skills(skills: List[Skill]) -> pd.DataFrame:
    """Find similar skills to list of skills specified. Skills are specified by a list of their names"""
    # The docstring is kept verbatim: this function is handed to the LLM as a
    # tool, which presumably uses the docstring as the tool description.

    # Embed the plain skill names so they can be compared in the vector index.
    skill_names = [skill.name for skill in skills]
    vectors = embeddings.embed_documents(skill_names)

    # Nearest index hits (score above 0.89) plus their SIMILAR_SEMANTIC
    # neighbours, returned as distinct skill names.
    cypher = """
        UNWIND $skills_vectors AS v
        CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score
        WHERE score > 0.89
        OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)
        WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills
        WITH nodes + skills AS all_skills
        UNWIND all_skills AS skill
        RETURN DISTINCT skill.name as skill_name
    """

    def _as_frame(result):
        # Convert the driver result into a DataFrame.
        return result.to_df()

    return driver.execute_query(
        cypher,
        skills_vectors=vectors,
        result_transformer_=_as_frame,
        routing_=RoutingControl.READ,
        database_=DATABASE,
    )

In [None]:
find_similar_skills([Skill(name='Python')])

### Tool 3

In [None]:
def person_similarity(person_name: str) -> pd.DataFrame:
    """Find a similar person to the one specified based on their skill similarity. Persons are provided with their name"""
    # The docstring is kept verbatim: this function is handed to the LLM as a
    # tool, which presumably uses the docstring as the tool description.

    # Combines two signals per candidate (summed SIMILAR_SEMANTIC scores and
    # summed SIMILAR_SKILLSET overlap), keeps candidates scoring at least 1 and
    # returns the best 5 together with their skills.
    cypher = """
        MATCH (p1:Person {name: $person_name})-[:KNOWS]->(s:Skill)
        WITH p1, COLLECT(s.name) as skills_1
        CALL (p1){
          MATCH (p1)-[:KNOWS]->(s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)<-[:KNOWS]-(p2:Person)
          RETURN p1 as person_1, p2 as person_2, SUM(r.score) AS score
          UNION 
          MATCH (p1)-[r:SIMILAR_SKILLSET]-(p2:Person)
          RETURN p1 as person_1, p2 AS person_2, SUM(r.overlap) AS score
        }
        WITH person_1.name as person_1, skills_1, person_2, SUM(score) as score
        WHERE score >= 1
        MATCH (person_2)-[:KNOWS]->(s:Skill)
        RETURN person_1, skills_1,  person_2.name as person_2, COLLECT(s.name) as skills_2, score
        ORDER BY score DESC LIMIT 5
        """

    def _as_frame(result):
        # Convert the driver result into a DataFrame.
        return result.to_df()

    return driver.execute_query(
        cypher,
        person_name=person_name,
        result_transformer_=_as_frame,
        routing_=RoutingControl.READ,
        database_=DATABASE,
    )

In [None]:
person_similarity("Christopher Jackson")

### Tool 4

In [None]:
def find_person_based_on_skills(skills: List[Skill]) -> pd.DataFrame:
    """
    Find persons based on skills they have. Skills are specified by their names. 
    Note that similar skills can be found. These are considered similar. 
    """
    # The docstring is kept verbatim: this function is handed to the LLM as a
    # tool, which presumably uses the docstring as the tool description.

    # Embed the plain skill names so they can be compared in the vector index.
    skill_names = [skill.name for skill in skills]
    vectors = embeddings.embed_documents(skill_names)

    # Expand the request to matching and semantically similar skills, then
    # rank persons by the number of distinct skills from that set they know.
    cypher = """
        UNWIND $skills_vectors AS v
        CALL db.index.vector.queryNodes('skill-embeddings', 3, TOFLOATLIST(v)) YIELD node, score
        WHERE score > 0.89
        OPTIONAL MATCH (node)-[:SIMILAR_SEMANTIC]-(s:Skill)
        WITH COLLECT(node) AS nodes, COLLECT(DISTINCT s) AS skills
        WITH nodes + skills AS all_skills
        UNWIND all_skills AS skill
        MATCH (p:Person)-[:KNOWS]->(skill)
        RETURN p.name AS person, COUNT(DISTINCT(skill)) AS score, COLLECT(DISTINCT(skill.name)) as similar_skills
        ORDER BY score DESC LIMIT 10
        """

    def _as_frame(result):
        # Convert the driver result into a DataFrame.
        return result.to_df()

    return driver.execute_query(
        cypher,
        skills_vectors=vectors,
        result_transformer_=_as_frame,
        routing_=RoutingControl.READ,
        database_=DATABASE,
    )

In [None]:
find_person_based_on_skills([Skill(name='Security'), Skill(name='Pandas')])

## Setting up the Agent

In [None]:
llm = ChatOpenAI(model_name=LLM, temperature=0)

In [None]:
response = llm.invoke([HumanMessage(content="hi!")])
response.content

In [None]:
# The four retrieval functions defined above serve as the agent's tools.
tools = [
    retrieve_skills_of_person, 
    find_similar_skills,
    person_similarity,
    find_person_based_on_skills,
]

# Bind the tools to the chat model so that it can answer with tool calls.
llm_with_tools = llm.bind_tools(tools)

In [None]:
response = llm_with_tools.invoke([HumanMessage(content="What skills does Kristof Neys have?")])

print(f"ContentString: {response.content}")
print(f"ToolCalls: {response.tool_calls}")

In [None]:
response = llm_with_tools.invoke([HumanMessage(content="What skills are similar to PowerBI and Data Visualization?")])

print(f"ContentString: {response.content}")
print(f"ToolCalls: {response.tool_calls}")

In [None]:
response = llm_with_tools.invoke([HumanMessage(content="Which persons have similar skills as Kristof Neys?")])

print(f"ContentString: {response.content}")
print(f"ToolCalls: {response.tool_calls}")

In [None]:
response = llm_with_tools.invoke([HumanMessage(content="Which persons have Python and AWS experience?")])

print(f"ContentString: {response.content}")
print(f"ToolCalls: {response.tool_calls}")

We can see that there's now no text content, but there is a tool call! The model wants us to call one of the graph tools we bound to it. This isn't calling that tool yet - it's just telling us to. In order to actually call it, we'll want to create our agent.

## Running Agents with LangGraph

In [None]:
agent_executor = create_react_agent(llm, tools)

In [None]:
response = agent_executor.invoke({"messages": [HumanMessage(content="hi!")]})

In [None]:
response["messages"]

#### Run some examples! 

In [None]:
def ask_to_agent(question):
    """Send a single question to the agent and pretty-print each streamed step."""
    payload = {"messages": [HumanMessage(content=question)]}
    for state in agent_executor.stream(payload, stream_mode="values"):
        # Print only the newest message of every streamed state.
        newest = state["messages"][-1]
        newest.pretty_print()

In [None]:
question = "What skills does Kristof Neys have?"

In [None]:
ask_to_agent(question)

In [None]:
question = "What skills are similar to PowerBI and Data Visualization?"

In [None]:
ask_to_agent(question)

In [None]:
question = "Which persons have similar skills as Daniel Hill?"

In [None]:
ask_to_agent(question)

In [None]:
question = "Which persons have Python and AWS experience?"

In [None]:
ask_to_agent(question)

## Chatbot

Now create a chatbot with the agent providing the responses

In [None]:
def user(user_message, history):
    """Append the user's message to the chat history.

    Returns an empty string (used to clear the text box) and the history.
    A missing history (``None``) is replaced by a new list; an existing list
    is extended in place.
    """
    chat_log = [] if history is None else history
    chat_log.append(dict(role="user", content=user_message))
    return "", chat_log

def get_answer(history):
    """Ask the agent for an answer to the conversation so far.

    The whole history is flattened into one "Role: content" transcript and
    sent as a single human message. Every streamed step is pretty-printed and
    the content of the last one is returned.
    """
    transcript_lines = []
    for msg in history:
        transcript_lines.append(f"{msg['role'].capitalize()}: {msg['content']}")
    full_prompt = "\n".join(transcript_lines)

    request = {"messages": [HumanMessage(content=full_prompt)]}
    contents = []
    for state in agent_executor.stream(request, stream_mode="values"):
        newest = state["messages"][-1]
        newest.pretty_print()
        contents.append(newest.content)

    return contents[-1]

def bot(history):
    """Stream the agent's answer into the chat history one character at a time.

    An empty assistant message is appended to ``history`` and then grown
    character by character, yielding the history after each addition.
    """
    answer = get_answer(history)
    placeholder = {"role": "assistant", "content": ""}
    history.append(placeholder)

    for ch in answer:
        history[-1]["content"] += ch
        # Short pause between characters to produce a typing effect.
        time.sleep(0.01)
        yield history

# Build the chat UI: a chat window, a message box and a clear button.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        label="Chatbot on a Graph",
        avatar_images=[
            "https://png.pngtree.com/png-vector/20220525/ourmid/pngtree-concept-of-facial-animal-avatar-chatbot-dog-chat-machine-illustration-vector-png-image_46652864.jpg",
            "https://d-cb.jc-cdn.com/sites/crackberry.com/files/styles/larger/public/article_images/2023/08/openai-logo.jpg"
        ],
        type="messages", 
    )
    msg = gr.Textbox(label="Message")
    clear = gr.Button("Clear")

    # On submit: `user` records the message (its first return value, "",
    # goes back into the text box), then `bot` streams the answer into the
    # chat window.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot], chatbot
    )

    # Reset the chat window to an empty history.
    clear.click(lambda: [], None, chatbot, queue=False)

demo.queue()
# NOTE(review): share=True presumably publishes a publicly reachable link -
# confirm that this is wanted outside the workshop.
demo.launch(share=True)

If you want light mode for the chatbot, append the following to the URL: `/?__theme=light`

### Text2Cypher

If time allows we can still experiment with the Text2Cypher functionality. 

In [None]:
text2cypher_prompt =  PromptTemplate.from_template(
    """
    Task: Generate a Cypher statement for querying a Neo4j graph database from a user input. 
    - Do not include triple backticks ``` or ```cypher or any additional text except the generated Cypher statement in your response.
    - Do not use any properties or relationships not included in the schema.
    
    Schema:
    {schema}
    
    #User Input
    {question}
    
    Cypher query:
    """
)

In [None]:
annotated_schema = """
    Nodes:
      Person:
        description: "A person in our talent pool."
        properties:
          name:
            type: "string"
            description: "The full name of the person. serves as a unique identifier."
          email:
            type: "string"
            description: "The email address of the person."
          leiden_community:
            type: "integer"
            description: "The talent community for the person.  People in the same talent segment share similar skills."
      Skill:
        description: "A professional skill."
        properties:
          name:
            type: "string"
            description: "The unique name of the skill."
    Relationships:
        KNOWS:
            description: "A person knowing a skill."
            query_pattern: "(:Person)-[:KNOWS]->(:Skill)"
    """

In [None]:
text2cypher_llm = ChatOpenAI(model=LLM, temperature=0)

In [None]:
@tool
def perform_aggregation_query(question: str) -> pd.DataFrame:
    """
    perform an aggregation query on the Neo4j graph database and obtain the results.
    """
    # Let the LLM translate the question into Cypher, guided by the annotated schema.
    prompt = text2cypher_prompt.invoke({'schema': annotated_schema, 'question': question})
    query = text2cypher_llm.invoke(prompt).content.strip()

    # The prompt asks for a bare statement, but a model may still wrap it in a
    # Markdown code fence (``` or ```cypher). Remove the fence so that only
    # the Cypher text is sent to the database.
    if query.startswith("```"):
        query = query.strip("`").strip()
        first_line, _, remainder = query.partition("\n")
        if first_line.strip().lower() == "cypher":
            query = remainder.strip()

    print(f"executing Cypher query:\n{query}")
    # NOTE(review): the statement is model-generated and therefore untrusted.
    # It is only sent with READ routing; confirm that this is sufficient
    # protection for the target database.
    return driver.execute_query(
        query,
        database_=DATABASE,
        routing_=RoutingControl.READ,
        result_transformer_= lambda r: r.to_df()
    )

In [None]:
# `perform_aggregation_query` is decorated with @tool, so it is a tool object
# rather than a plain function; run it through `invoke` instead of calling it.
perform_aggregation_query.invoke('describe communities by skills')

In [None]:
# `perform_aggregation_query` is decorated with @tool, so it is a tool object
# rather than a plain function; run it through `invoke` instead of calling it.
perform_aggregation_query.invoke('how many people share skills with Isabella Allen, and what are the skills')

In [None]:
# `perform_aggregation_query` is decorated with @tool, so it is a tool object
# rather than a plain function; run it through `invoke` instead of calling it.
perform_aggregation_query.invoke('Can you list me a 5 random person name from the database?')