In [1]:
from dotenv import load_dotenv

# Run python-dotenv's loader first so that the os.getenv() lookups in the
# following cells can see the values it loads (by default from a .env file).
# The call is the last expression of the cell, so its return value is displayed.
load_dotenv()

True

In [2]:
import os

# API credentials are taken from the process environment; each value is None
# when the corresponding variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
HF_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")

In [3]:
from langchain_community.graphs import Neo4jGraph

# Neo4j connection settings, read from the LOCAL_NEO4J_* environment variables
# (None when a variable is unset).
NEO4J_URI = os.environ.get("LOCAL_NEO4J_URI")
NEO4J_USERNAME = os.environ.get("LOCAL_NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("LOCAL_NEO4J_PASSWORD")

# Database name passed to the graph wrapper.
NEO4J_DATABASE = "graphrag"

# Graph handle shared by every chain built later in the notebook.
graph = Neo4jGraph(
    database=NEO4J_DATABASE,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

### Data Pre-processing

In [4]:
import pandas as pd

# Location of the profiles CSV, relative to the notebook's working directory.
DATASET = "data/profiles.csv"

# Read the file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATASET)
df.head(n=5)

Unnamed: 0,id,name,company,education,languages,industry,country
0,paul-lukes-906608134,Paul Lukes,Toolbox Creative,California College of the Arts,English|Czech,Advertising Services,United States
1,roberto-merola-baa923103,Roberto Merola,Capgemini,Université libre de Bruxelles,English|Italian|French|Dutch|German,IT Services and IT Consulting,Belgium
2,minju-hong-bsn-rn-1a7801239,"Minju Hong, BSN, RN",University of Washington Medical Center,University of Washington School of Nursing,Korean|English,Hospitals and Health Care,United States
3,prateek-chitpur-710a1a12a,Prateek Chitpur,George Mason University,George Mason University Education George Mason...,English|Hindi|Marathi|Kannada|Telugu,Higher Education,United States
4,aadcampos,Alexandre Campos,Serpro - Serviço Federal de Processamento de D...,Unichristus,English,IT Services and IT Consulting,Brazil


### Load data into Neo4j

In [5]:
# Refresh the graph wrapper's schema and print it, to see which node labels
# and relationship types the database currently holds.
graph.refresh_schema()
print(graph.schema)

Node properties:
Person {name: STRING}
Company {name: STRING}
School {name: STRING}
Industry {name: STRING}
Country {name: STRING}
Language {name: STRING}
Relationship properties:

The relationships:
(:Person)-[:SPEAKS]->(:Language)
(:Person)-[:WORKS_IN]->(:Company)
(:Person)-[:LIVES_IN]->(:Country)
(:Person)-[:EDUCATED_AT]->(:School)
(:Company)-[:IS_IN]->(:Industry)


In [6]:
# Import query for the profiles CSV, kept commented out.
# NOTE(review): presumably disabled because the graph is already populated --
# the schema printed by the previous cell already lists every label and
# relationship this query would create. Uncomment both the query and the
# graph.query(...) call to (re)load; MERGE is used throughout, so existing
# nodes and relationships are matched rather than duplicated.
#
# For each CSV row the query merges one Person, Company, School, Industry and
# Country node, splits the `languages` column on '|' into Language nodes, and
# links them with SPEAKS / WORKS_IN / LIVES_IN / EDUCATED_AT / IS_IN.
#
# people_query = """
# LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/martin-fabbri/graph-llm-agents/main/notebooks/data/profiles.csv'
# AS row
# MERGE (person:Person {name: row.name})
# MERGE (company:Company {name: row.company})
# MERGE (school:School {name: row.education})
# MERGE (industry:Industry {name: row.industry})
# MERGE (country:Country {name: row.country})

# FOREACH (lang in split(row.languages, '|') | 
#     MERGE (language:Language {name:trim(lang)})
#     MERGE (person)-[:SPEAKS]->(language))

# MERGE (person)-[:WORKS_IN]->(company)
# MERGE (person)-[:LIVES_IN]->(country)
# MERGE (person)-[:EDUCATED_AT]->(school)
# MERGE (company)-[:IS_IN]->(industry)
# """

# graph.query(people_query)

In [7]:
# Refresh and print the schema again after the (currently commented-out) load
# cell, so the labels and relationships can be checked against the expected model.
graph.refresh_schema()
print(graph.schema)

Node properties:
Person {name: STRING}
Company {name: STRING}
School {name: STRING}
Industry {name: STRING}
Country {name: STRING}
Language {name: STRING}
Relationship properties:

The relationships:
(:Person)-[:SPEAKS]->(:Language)
(:Person)-[:WORKS_IN]->(:Company)
(:Person)-[:LIVES_IN]->(:Country)
(:Person)-[:EDUCATED_AT]->(:School)
(:Company)-[:IS_IN]->(:Industry)


In [8]:
from langchain.chains import GraphCypherQAChain
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model handed to the question-answering chain below.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    google_api_key=GEMINI_API_KEY,
    temperature=0,
)

# Baseline chain: no custom Cypher prompt is supplied here.
# verbose=True makes the chain log its progress while it runs.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
)

#### Question: A worker who graduated from Simon Fraser University is currently employed at?

1. Returns [] (gemini-pro-1.0)

MATCH (p:Person)-[:EDUCATED_AT]->(s:School {name: "Simon Fraser University"})-[:IS_IN]->(i:Industry)<-[:IS_IN]-(c:Company)<-[:WORKS_IN]-(p)
RETURN c.name

2. Returns "Elastic Path" (gemini-pro-1.5)

MATCH (p:Person)-[:EDUCATED_AT]->(s:School {name: 'Simon Fraser University'}), (p)-[:WORKS_IN]->(c:Company) RETURN c.name

3. Returns [] (gemini-flash-1.5)

MATCH (p:Person)-[:EDUCATED_AT]->(s:School {name: "Simon Fraser University"})-[:WORKS_IN]->(c:Company) RETURN c.name



In [9]:
# Evaluation questions put to the baseline chain, one at a time.
questions = [
    "List all companies in advertising services industry!",
    "A worker who graduated from Simon Fraser University is currently employed at?",
    "Where is Paul Lukes working?",
    "A worker residing in Canada who is proficient in Vietnamese?",
    "How many workers in United States speak Urdu?",
]

for question in questions:
    print("====== START ======")
    # The chain returns a mapping; only its "result" entry is printed.
    response = chain.invoke(question)
    print(response["result"])
    print("====== END ====== \n")



[1m> Entering new GraphCypherQAChain chain...[0m


### Prompting Strategies

In [None]:
# Few-shot question -> Cypher pairs for the Cypher-generation prompt.
#
# Literal Cypher braces are written doubled ({{ ... }}) because the example
# text ends up inside a prompt template that is formatted again, which turns
# each doubled brace back into a single one.
#
# Every query follows the graph model printed earlier in the notebook:
#   (:Person)-[:SPEAKS]->(:Language), (:Person)-[:WORKS_IN]->(:Company),
#   (:Person)-[:LIVES_IN]->(:Country), (:Person)-[:EDUCATED_AT]->(:School),
#   (:Company)-[:IS_IN]->(:Industry)
examples = [
    {
        "question": "Which workers speak French?",
        "query": "MATCH (p:Person)-[:SPEAKS]->(l:Language {{name: 'French'}}) RETURN p.name",
    },
    {
        "question": "What industries are workers named Emily associated with?",
        "query": "MATCH (p:Person {{name: 'Emily'}})-[:WORKS_IN]->(c:Company)-[:IS_IN]->(i:Industry) RETURN i.name",
    },
    {
        "question": "Which workers live in Canada and speak German?",
        "query": "MATCH (p:Person)-[:LIVES_IN]->(:Country {{name: 'Canada'}}), (p)-[:SPEAKS]->(:Language {{name: 'German'}}) RETURN p.name",
    },
    {
        "question": "In which countries do workers who speak Spanish live?",
        # The speaker and the resident must be the SAME person. The previous
        # pattern went Person -> Language <- Person, i.e. it required a second,
        # different speaker of the language before any country was returned.
        "query": "MATCH (p:Person)-[:SPEAKS]->(:Language {{name: 'Spanish'}}), (p)-[:LIVES_IN]->(c:Country) RETURN DISTINCT c.name AS Country",
    },
    {
        "question": "What companies do workers named John work in?",
        "query": "MATCH (p:Person {{name: 'John'}})-[:WORKS_IN]->(c:Company) RETURN c.name",
    },
    {
        # Question wording now agrees with the literals used in its query.
        "question": "How many workers in the Hospitals and Health Care industry are able to speak Korean?",
        "query": "MATCH (p:Person)-[:WORKS_IN]->(:Company)-[:IS_IN]->(:Industry {{name: 'Hospitals and Health Care'}}),(p)-[:SPEAKS]->(:Language {{name: 'Korean'}}) RETURN COUNT(DISTINCT p) AS NumberOfWorkers",
    },
    {
        "question": "What companies are located in the technology industry?",
        "query": "MATCH (c:Company)-[:IS_IN]->(:Industry {{name: 'Technology'}}) RETURN c.name",
    },
    {
        "question": "Where do workers named Alice live?",
        "query": "MATCH (p:Person {{name: 'Alice'}})-[:LIVES_IN]->(c:Country) RETURN c.name",
    },
]

In [None]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Layout used to render one few-shot example.
example_prompt = PromptTemplate.from_template(
    template="User input: {question}\nCypher query: {query}"
)

# Few-shot Cypher-generation prompt. Only the first three entries of
# `examples` are included; {schema} and {question} are filled in at call time.
prompt = FewShotPromptTemplate(
    input_variables=["question", "schema"],
    prefix=(
        "You are a Neo4j expert. Given an input question, "
        "create a syntactically correct Cypher query to run.\n\n"
        "Here is the schema information\n{schema}.\n\n"
        "Below are a number of examples of questions and their corresponding Cypher queries."
    ),
    examples=examples[:3],
    example_prompt=example_prompt,
    suffix="User input: {question}\nCypher query: ",
)

In [None]:
# Preview the rendered few-shot prompt; "foo" is a stand-in for the real schema text.
print(prompt.format(question="Where do Michael work?", schema="foo"))

In [None]:
from langchain.chains import GraphCypherQAChain

# Second chain: same graph and LLM as `chain`, but the few-shot `prompt` is
# passed as the Cypher-generation prompt.
chain2 = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    cypher_prompt=prompt,
    verbose=True,
)

In [None]:
# Evaluation questions for the few-shot chain (chain2). They are kept
# word-for-word identical to the list used for the baseline `chain` run, so
# the two chains are compared on exactly the same input. The last question
# previously read "How many worker ...", which differed from the baseline.
questions = [
    "List all companies in advertising services industry!",
    "A worker who graduated from Simon Fraser University is currently employed at?",
    "Where is Paul Lukes working?",
    "A worker residing in Canada who is proficient in Vietnamese?",
    "How many workers in United States speak Urdu?",
]

for q in questions:
    print("====== START ======")
    # The chain returns a mapping; only its "result" entry is printed.
    print(chain2.invoke(q)["result"])
    print("====== END ====== \n")

In [None]:
# Second few-shot set: generic multi-condition questions, each paired with a
# Cypher query that combines two or more relationships of the graph model.
#
# Literal Cypher braces are written doubled ({{ ... }}) because the example
# text ends up inside a prompt template that is formatted again.
#
# Where the dataset preview shows the stored spelling of a value, the example
# uses that spelling ('United States', 'Hospitals and Health Care') so the
# model is not nudged toward filters that match nothing in the graph.
# NOTE(review): the remaining industry names ('Technology', 'Finance',
# 'Education', 'Automotive') are illustrative -- confirm against the data.
examples = [
    {
        "question": "Find all people who work in a particular industry and speak a specific language",
        "query": "MATCH (p:Person)-[:WORKS_IN]->(:Company)-[:IS_IN]->(i:Industry {{name: 'Technology'}}), (p)-[:SPEAKS]->(l:Language {{name: 'English'}}) RETURN p.name AS PersonName",
    },
    {
        # A company's country is derived from where its workers live, since
        # the graph model has no direct Company -> Country relationship.
        "question": "Find all companies in a specific industry located in a particular country",
        "query": "MATCH (c:Company)-[:IS_IN]->(i:Industry {{name: 'Hospitals and Health Care'}}), (p:Person)-[:WORKS_IN]->(c), (p)-[:LIVES_IN]->(co:Country {{name: 'Germany'}}) RETURN DISTINCT c.name AS CompanyName",
    },
    {
        "question": "Find all people who were educated at a specific school and live in a particular country",
        "query": "MATCH (p:Person)-[:EDUCATED_AT]->(s:School {{name: 'Harvard University'}}), (p)-[:LIVES_IN]->(co:Country {{name: 'United States'}}) RETURN p.name AS PersonName",
    },
    {
        "question": "Find all people who work in companies in a specific industry",
        "query": "MATCH (p:Person)-[:WORKS_IN]->(c:Company)-[:IS_IN]->(i:Industry {{name: 'Finance'}}) RETURN p.name AS PersonName, c.name AS CompanyName",
    },
    {
        "question": "Find all languages spoken by people who work in a specific industry",
        "query": "MATCH (p:Person)-[:WORKS_IN]->(:Company)-[:IS_IN]->(i:Industry {{name: 'Education'}}), (p)-[:SPEAKS]->(l:Language) RETURN DISTINCT l.name AS LanguageName",
    },
    {
        "question": "Find all companies in a specific industry where employees speak a specific language",
        "query": "MATCH (p:Person)-[:WORKS_IN]->(c:Company)-[:IS_IN]->(i:Industry {{name: 'Automotive'}}), (p)-[:SPEAKS]->(l:Language {{name: 'Spanish'}}) RETURN DISTINCT c.name AS CompanyName",
    },
]


In [None]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Layout used to render one few-shot example.
example_prompt = PromptTemplate.from_template(
    template="User input: {question}\nCypher query: {query}"
)

# Rebuild the few-shot prompt, this time including every entry of `examples`;
# {schema} and {question} are filled in at call time.
prompt = FewShotPromptTemplate(
    input_variables=["question", "schema"],
    prefix=(
        "You are a Neo4j expert. Given an input question, "
        "create a syntactically correct Cypher query to run.\n\n"
        "Here is the schema information\n{schema}.\n\n"
        "Below are a number of examples of questions and their corresponding Cypher queries."
    ),
    examples=examples,
    example_prompt=example_prompt,
    suffix="User input: {question}\nCypher query: ",
)