# Module objectives
- Exact lookups
- Semantic search/vector index
- Combine Exact and Semantic search + graph expansion for better context
- The R in GraphRAG (and maybe agents)

In [None]:
!pip install graphdatascience neo4j dotenv

# Setup

Import our usual suspects

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result

Load env variables

In [None]:
# Read connection settings and API credentials from the workshop env file.
# override=True lets values from ws.env replace variables that are already
# set in the process environment.
load_dotenv('ws.env', override=True)

# Neo4j
HOST = os.getenv('HOST')
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
DATABASE = os.getenv('DATABASE')

# AI
LLM = os.getenv('LLM')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# os.environ only accepts str values: assigning None raises TypeError, which
# would stop the notebook here even for the cells that only need Neo4j.
if OPENAI_API_KEY is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

Connect to neo4j db

In [None]:
# Create the shared driver and check right away that the target database
# can be reached with the supplied credentials.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))
driver.verify_connectivity(database=DATABASE)

## Schema
Keeping this in case we need to add in more indexes

In [None]:
# Node-key constraints for Person/Skill plus the vector index ('ada_v') that
# the semantic-search queries below look up by name.
schema_statements = [
    'create constraint if not exists for (n:Person) require (n.email) is node key',
    'create constraint if not exists for (n:Skill) require (n.name) is node key',
    # Spell out the index configuration: older Neo4j 5.x releases refuse to
    # create a vector index without dimensions and a similarity function.
    # 1536 is the vector size produced by text-embedding-ada-002, and cosine
    # matches the vector.similarity.cosine scoring used in the queries.
    '''
    create vector index ada_v if not exists for (n:Skill) on (n.embedding)
    options {indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
    }}
    ''',
]
# Every statement uses "if not exists", so re-running this cell is harmless.
for statement in schema_statements:
    driver.execute_query(
        statement,
        database_=DATABASE,
        routing_=RoutingControl.WRITE
    )

# List every index in the database as a DataFrame so the constraints and the
# vector index can be inspected.
schema_result_df = driver.execute_query(
    'show indexes',
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)
schema_result_df.head(100)


# Basic search

In [None]:
# NOTE(review): 'Contineous' looks misspelled; the exact match on s.name below
# only finds skills spelled exactly like this - confirm whether that is
# intentional for the exact-vs-semantic comparison.
skills = ['Contineous Delivery', 'Cloud Native', 'Security']

# Exact lookup: rank persons by how many of the requested skill names they
# KNOW, and list all of each person's skills alongside.
exact_match_query = '''
    match (p:Person)-[:KNOWS]->(s:Skill)
    where s.name in $skills
    return 
        count(*) as rank, 
        p.email as email, 
        p.name as person_name, 
        collect{ match (p)-[:KNOWS]->(anySkill) return anySkill.name } as skills
    order by rank desc limit 10
    '''
exact_match_df = driver.execute_query(
    exact_match_query,
    skills=skills,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)
exact_match_df.head(10)

# Vector index search

In [None]:
# Import langchain open ai
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
# from openai import OpenAI
# client = OpenAI()
# client.models.list()

In [None]:
# Embedding client shared by the rest of the notebook.
embeddings = OpenAIEmbeddings(
    model='text-embedding-ada-002',
)
# One embedding vector per entry in `skills`, in the same order.
v_skills = embeddings.embed_documents(skills)

## Strategy 1
We retrieve the approximate top 10 nearest nodes to each search vector `v` and keep the first 3 returned. We then pool them into a single list (`skill_list`) and apply the same ranking as before (number of matching skills).

In [None]:
# Strategy 1: for every query vector, ask the 'ada_v' index for its 10 nearest
# nodes and keep the first 3; pool those nodes and rank each person by the
# number of KNOWS matches against the pool.
strategy_1_query = '''
    unwind $v_skills as v
    call db.index.vector.queryNodes('ada_v', 10, toFloatList(v)) yield node
    with v, collect(node)[0..3] as top3
    unwind top3 as s
    with collect(s) as skill_list 
    match (p:Person)-[:KNOWS]->(s)
    where s in skill_list
    return 
        count(*) as rank, 
        p.email as email, 
        p.name as person_name, 
        collect{ match (p)-[:KNOWS]->(anySkill) return anySkill.name } as skills
    order by rank desc limit 10
    '''
strategy_1_df = driver.execute_query(
    strategy_1_query,
    v_skills=v_skills,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)
strategy_1_df.head(10)

## Strategy 2
Same strategy as before for finding the semantically similar skills, but we sort the results afterwards based on cosine similarity.

In [None]:
# Strategy 2: same candidate skills as strategy 1, but each person's rank is
# the summed cosine similarity between every query vector and the embedding of
# each pooled skill the person KNOWS.
strategy_2_query = '''
    unwind $v_skills as v
    call db.index.vector.queryNodes('ada_v', 10, toFloatList(v)) yield node
    with v, collect(node)[0..3] as top3
    unwind top3 as s
    with collect(s) as skill_list 
    match (p:Person)-[:KNOWS]->(s)
    where s in skill_list
    with p, sum(reduce(res=0.0, x in $v_skills | res + vector.similarity.cosine(x,s.embedding))) as score
    return 
        score as rank, 
        p.email as email, 
        p.name as person_name, 
        collect{ match (p)-[:KNOWS]->(anySkill) return anySkill.name } as skills
    order by rank desc limit 10
    '''
strategy_2_df = driver.execute_query(
    strategy_2_query,
    v_skills=v_skills,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)
strategy_2_df.head(10)

## Strategy 3
We can also look at community membership (from our earlier k-means community detection). We could, for instance, imagine the user wanting to explore the community that looks most relevant.


In [None]:
# Strategy 3: score persons as in strategy 2, keep the 10 best, then present
# them ordered by their kmeans5_cluster value so members of the same
# community appear together.
strategy_3_query = '''
    unwind $v_skills as v
    call db.index.vector.queryNodes('ada_v', 10, toFloatList(v)) yield node
    with v, collect(node)[0..3] as top3
    unwind top3 as s
    with collect(s) as skill_list 
    match (p:Person)-[:KNOWS]->(s)
    where s in skill_list
    with p, sum(reduce(res=0.0, x in $v_skills | res + vector.similarity.cosine(x,s.embedding))) as score
    with
        p.kmeans5_cluster as community,
        score as rank, 
        p.email as email, 
        p.name as person_name, 
        collect{ match (p)-[:KNOWS]->(anySkill) return anySkill.name order by anySkill.name} as skills
    order by rank desc limit 10
    return 
        community,
        rank,
        email,
        person_name,
        skills
    order by community
    '''
strategy_3_df = driver.execute_query(
    strategy_3_query,
    v_skills=v_skills,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)
strategy_3_df.head(10)

# Let's create a Retrieval agent

In [None]:
from typing import List, Optional
from pydantic import BaseModel, Field, validator
import functools
from langchain_core.tools import tool

class Skill(BaseModel):
    """
    Represents a professional skill or knowledge of a person.
    """
    # The class docstring and the field description are carried into the
    # model's JSON schema, so their spelling matters beyond documentation.
    name: str = Field(..., description="Shortened name of the skill")

@tool
def retrieve_persons_given_set_of_skills(skills: List[Skill]) -> pd.DataFrame:
    """Given a list of skills this function will retrieve relevant persons. 
    If the question contains multiple skills, only call the function once using the set of
    skills as the argument for the function"""
    # The docstring above doubles as the tool description, so its wording is
    # deliberately left untouched.

    # Plain skill names, echoed so the notebook shows what was extracted.
    skill_names = [skill.name for skill in skills]
    print(skill_names)

    # One embedding vector per requested skill name.
    skill_vectors = embeddings.embed_documents(skill_names)

    # For each vector: 10 nearest nodes from 'ada_v', first 3 kept and pooled.
    # Persons are scored by summed cosine similarity over the pooled skills
    # they KNOW; the 10 best are returned ordered by kmeans5_cluster.
    cypher = ''' 
            unwind $v_skills as v
            call db.index.vector.queryNodes('ada_v', 10, toFloatList(v)) yield node
            with v, collect(node)[0..3] as top3
            unwind top3 as s
            with collect(s) as skill_list 
            match (p:Person)-[:KNOWS]->(s)
            where s in skill_list
            with p, sum(reduce(res=0.0, x in $v_skills | res + vector.similarity.cosine(x,s.embedding))) as score
            with
                p.kmeans5_cluster as community,
                score as rank, 
                p.email as email, 
                p.name as person_name, 
                collect{ match (p)-[:KNOWS]->(anySkill) return anySkill.name order by anySkill.name} as skills
            order by rank desc limit 10
            return 
                community,
                rank,
                email,
                person_name,
                skills
            order by community
        '''
    return driver.execute_query(
        cypher,
        v_skills=skill_vectors,
        result_transformer_=Result.to_df,
        routing_=RoutingControl.READ,
        database_=DATABASE,
    )

In [None]:

# Tools the chat model is allowed to call.
tools = [retrieve_persons_given_set_of_skills]

# Chat model, then a copy of it that knows about the tools above.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
llm_with_tools = llm.bind_tools(tools)

In [None]:
# Send the request to the tool-aware model and keep only the tool calls it
# proposes (not the text of the reply).
question = "I am looking for a senior java developer that also knows cypher"
response = llm_with_tools.invoke(question).tool_calls

In [None]:
# Display the tool calls returned by the model.
response


In [None]:
# Dispatch table: tool name as it appears in a tool call -> the tool to run.
name_to_functions = dict(
    retrieve_persons_given_set_of_skills=retrieve_persons_given_set_of_skills,
)

In [None]:
# Only the first proposed tool call is handled: resolve its tool by name and
# pick up the arguments the model supplied for it.
first_call = response[0]
selected_tool = name_to_functions[first_call['name']]
args = first_call['args']

In [None]:
# Run the selected tool with the arguments taken from the model's tool call.
df_response = selected_tool.invoke(args)

In [None]:
# Show up to the first 10 rows of the tool's result.
df_response.head(10)