In [1]:
import pandas as pd
import json
import os
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.llms import HuggingFaceHub
# Load variables from a .env file into the process environment so the
# os.getenv(...) lookups in the next cell can see them.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Gemini API key (create one at https://aistudio.google.com/app/apikey)
gemini_api = os.getenv("GEMINI_API")

# Hugging Face token (only needed if an open-source LLM is used instead)
hf_api = os.getenv("HF_API")

# Neo4j connection settings are read from the environment (see load_dotenv()
# in the first cell) so that no credential is stored in the notebook itself.
# The URL and user fall back to the local defaults that were previously
# hardcoded here; the password deliberately has no fallback.
# NOTE(review): the password that used to be committed in this cell should be
# treated as exposed and rotated.
neo4j_url = os.getenv("NEO4J_URL", "bolt://localhost:7687")
neo4j_user = os.getenv("NEO4J_USER", "neo4j")
neo4j_password = os.getenv("NEO4J_PASSWORD")
if not neo4j_password:
    # Fail here with a clear message instead of a less obvious driver error.
    raise RuntimeError(
        "NEO4J_PASSWORD is not set; add it to your .env file or environment."
    )

# https://api.python.langchain.com/en/latest/graphs/langchain_community.graphs.neo4j_graph.Neo4jGraph.html
graph = Neo4jGraph(url=neo4j_url, username=neo4j_user, password=neo4j_password)


# 2. Insert data from CSV into Neo4j

In [3]:
# Refresh the schema held by the graph object, then print it for inspection.
graph.refresh_schema()
print(graph.schema)

Node properties are the following:
Occurrence {recordedBy: STRING, catalogNumber: STRING, occurrenceID: STRING, occurrenceStatus: STRING},Event {eventDate: DATE, day: INTEGER, month: INTEGER, year: INTEGER, eventID: STRING},Taxon {scientificName: STRING, taxonRank: STRING, url: STRING, order: STRING, taxonKey: STRING, verbatimScientificName: STRING, class: STRING, genus: STRING, family: STRING, kingdom: STRING, phylum: STRING, infraspecificEpithet: STRING, verbatimScientificNameAuthorship: STRING},Location {decimalLongitude: FLOAT, countryCode: STRING, coordinateUncertaintyInMeters: STRING, locationID: STRING, stateProvince: STRING, decimalLatitude: FLOAT, locality: STRING},Identification {identifiedBy: STRING, dateIdentified: STRING, typeStatus: STRING},Institution {url: STRING, institutionCode: STRING},Dataset {datasetKey: STRING, dataset_url: STRING},Record {basisOfRecord: STRING, collectionCode: STRING, rightsHolder: STRING, license: STRING, datasetKey: STRING}
Relationship propert

# 3. Query through our Knowledge Graph

In [4]:
# Gemini chat model at temperature 0, authenticated with the key read from GEMINI_API.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key = gemini_api ,temperature = 0)
# Baseline chain with no custom Cypher prompt; verbose=True makes each call
# print the generated Cypher and the query context (see the cell output below).
chain = GraphCypherQAChain.from_llm(graph=graph, llm=llm, verbose=True)

In [53]:
# Natural-language questions used to probe the baseline chain.
questions = [
    "Which institutions are related to Argentina according to recorded occurrences?",
    "On what dates Ezequiel Vera recorded occurrences?",
    "How many species occurrences were recorded between 2023-01-01 and 2023-12-31?",
    "What are the scientific names of the occurrences that are associated with a SUBSPECIE?",
]

# Ask the questions one by one; the START marker is printed before the chain
# is invoked so that the chain's own verbose trace lands between the markers.
for question in questions:
    print('====== START ======')
    response = chain.invoke(question)
    print(response['result'])
    print('====== END ====== \n')



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (o:Occurrence)-[:AT_EVENT]->(e:Event)-[:LOCATED_AT]->(l:Location)
WHERE l.countryCode = 'AR'
RETURN o.recordedBy AS institution[0m
Full Context:
[32;1m[1;3m[{'institution': 'Fernando López'}, {'institution': 'Danalí Herr'}, {'institution': 'Néstor Trossero'}, {'institution': 'Mariano Ordano'}, {'institution': 'Alain Le Hérissé'}, {'institution': 'Carlos Schmidtutz'}, {'institution': 'Carlos Schmidtutz'}, {'institution': 'Nicolas Olejnik'}, {'institution': 'Nicolas Olejnik'}, {'institution': 'Guille Ivan Spajic'}][0m

[1m> Finished chain.[0m
Fernando López, Danalí Herr, Néstor Trossero, Mariano Ordano, Alain Le Hérissé, Carlos Schmidtutz, Nicolas Olejnik, Guille Ivan Spajic



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (o:Occurrence)
WHERE o.recordedBy = "Ezequiel Vera"
RETURN o.occurrenceID, o.recordedBy, o.catalogNumber, o.occurrenceStatus, o.eve

# 4. Prompting Strategies

In [54]:
# Few-shot (question, Cypher query) pairs for the biodiversity graph.
# Braces inside the Cypher are doubled so that they come out as single literal
# braces once the few-shot prompt is formatted (see the rendered prompt below).
examples = [
    {"question": question, "query": query}
    for question, query in (
        (
            "How many occurrences are recorded in the dataset identified with ID 50c9509d-22c7-4a22-a47d-8c48425ef4a7?",
            "MATCH (o:Occurrence)-[:IN_DATASET]->(d:Dataset {{datasetKey: '50c9509d-22c7-4a22-a47d-8c48425ef4a7'}}) RETURN count(o) AS numOccurrences",
        ),
        (
            "Which are the occurrences identifiers recorded by Eduardo Luis Beltrocco?",
            "MATCH (o:Occurrence {{recordedBy: 'Eduardo Luis Beltrocco'}}) RETURN o.occurrenceID",
        ),
        (
            "What are the coordinates of the location of the occurrence https://www.inaturalist.org/observations/167230458?",
            "MATCH (o:Occurrence {{occurrenceID: 'https://www.inaturalist.org/observations/167230458'}})-[:AT_EVENT]->(:Event)-[:LOCATED_AT]->(l:Location) RETURN l.decimalLatitude, l.decimalLongitude",
        ),
        (
            "What datasetsKey are associated with the iNaturalist institution?",
            "MATCH (d:Dataset)<-[:IN_DATASET]-(:Occurrence)-[:IS_PART_OF]->(r:Record)-[:BY_INSTITUTION]->(i:Institution {{institutionCode: 'iNaturalist'}}) RETURN DISTINCT d.datasetKey, d.dataset_url",
        ),
        (
            "What species have been recorded in Chubut Argentina?",
            "MATCH (:Location {{countryCode: 'AR', stateProvince: 'Chubut'}})<-[:LOCATED_AT]-(:Event)<-[:AT_EVENT]-(o:Occurrence)<-[:HAS_OCCURRENCE]-(i:Identification)-[:TO_TAXON]->(t:Taxon) RETURN DISTINCT t.scientificName",
        ),
        (
            "In which countries was the species Bombus dahlbomii Guérin-Méneville, 1835 recorded?",
            "MATCH (t:Taxon {{scientificName: 'Bombus dahlbomii Guérin-Méneville, 1835'}})<-[:TO_TAXON]-(:Identification)-[:HAS_OCCURRENCE]->(:Occurrence)-[:AT_EVENT]->(:Event)-[:LOCATED_AT]->(l:Location) RETURN DISTINCT l.countryCode",
        ),
        (
            "What are the occurrences associated with the dataset 50c9509d-22c7-4a22-a47d-8c48425ef4a7?",
            "MATCH (o:Occurrence)-[:IN_DATASET]->(d:Dataset {{datasetKey: '50c9509d-22c7-4a22-a47d-8c48425ef4a7'}}) RETURN o.occurrenceID",
        ),
    )
]


In [55]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Template that renders one few-shot example as a question/Cypher pair.
example_prompt = PromptTemplate.from_template("User input: {question}\nCypher query: {query}")

# Static few-shot prompt: instructions and schema first, then the first seven
# entries of `examples`, then the user's question awaiting a Cypher answer.
prompt = FewShotPromptTemplate(
    input_variables=["question", "schema"],
    prefix=(
        "You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.\n\n"
        "Here is the schema information\n{schema}.\n\n"
        "Below are a number of examples of questions and their corresponding Cypher queries."
    ),
    examples=examples[:7],
    example_prompt=example_prompt,
    suffix="User input: {question}\nCypher query: ",
)

In [56]:
# Render the few-shot prompt with a placeholder schema ("foo") to inspect the final prompt text.
print(prompt.format(question="Which are the occurrences identifiers recorded by Eduardo Luis Beltrocco?", schema="foo"))

You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.

Here is the schema information
foo.

Below are a number of examples of questions and their corresponding Cypher queries.

User input: How many occurrences are recorded in the dataset identified with ID 50c9509d-22c7-4a22-a47d-8c48425ef4a7?
Cypher query: MATCH (o:Occurrence)-[:IN_DATASET]->(d:Dataset {datasetKey: '50c9509d-22c7-4a22-a47d-8c48425ef4a7'}) RETURN count(o) AS numOccurrences

User input: Which are the occurrences identifiers recorded by Eduardo Luis Beltrocco?
Cypher query: MATCH (o:Occurrence {recordedBy: 'Eduardo Luis Beltrocco'}) RETURN o.occurrenceID

User input: What are the coordinates of the location of the occurrence https://www.inaturalist.org/observations/167230458?
Cypher query: MATCH (o:Occurrence {occurrenceID: 'https://www.inaturalist.org/observations/167230458'})-[:AT_EVENT]->(:Event)-[:LOCATED_AT]->(l:Location) RETURN l.decimalLatitude, l.decimalLongitude

Us

In [57]:
# Second chain: same graph and LLM as `chain`, but Cypher generation uses the few-shot `prompt`.
chain2 = GraphCypherQAChain.from_llm(graph=graph, llm=llm, cypher_prompt=prompt, verbose=True)

In [58]:
# Same four questions as for the baseline chain, so the answers can be compared.
questions = [
    "Which institutions are related to Argentina according to recorded occurrences?",
    "On what dates Ezequiel Vera recorded occurrences?",
    "How many species occurrences were recorded between 2023-01-01 and 2023-12-31?",
    "What are the scientific names of the occurrences that are associated with a SUBSPECIE?",
]

# Run each question through the few-shot chain; the START marker is printed
# before the call so the chain's verbose trace appears between the markers.
for question in questions:
    print('====== START ======')
    answer = chain2.invoke(question)
    print(answer['result'])
    print('====== END ====== \n')



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (:Location {countryCode: 'AR'})<-[:LOCATED_AT]-(:Event)<-[:AT_EVENT]-(o:Occurrence)-[:IS_PART_OF]->(r:Record)-[:BY_INSTITUTION]->(i:Institution) RETURN DISTINCT i.institutionCode[0m
Full Context:
[32;1m[1;3m[{'i.institutionCode': 'iNaturalist'}, {'i.institutionCode': 'IANIGLA'}][0m

[1m> Finished chain.[0m
iNaturalist, IANIGLA are related to Argentina according to recorded occurrences.



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (o:Occurrence {recordedBy: 'Ezequiel Vera'})-[:AT_EVENT]->(e:Event) RETURN DISTINCT e.eventDate[0m
Full Context:
[32;1m[1;3m[{'e.eventDate': neo4j.time.Date(2019, 4, 20)}, {'e.eventDate': neo4j.time.Date(2021, 10, 9)}, {'e.eventDate': neo4j.time.Date(2017, 7, 16)}, {'e.eventDate': neo4j.time.Date(2019, 2, 13)}, {'e.eventDate': neo4j.time.Date(2019, 4, 7)}, {'e.eventDate': neo4j.time.Date(2019, 7, 27)}, {'e.eventDate':

In [28]:
# NOTE(review): the KeyError recorded under this cell is what a Cypher example with
# single (undoubled) braces would produce; presumably it comes from an earlier
# version of `examples` — re-run on a fresh kernel to confirm it no longer occurs.
print(prompt.format(question="Where do Michael work?", schema="foo"))

KeyError: 'datasetKey'

In [None]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Template that renders one few-shot example as a question/Cypher pair.
example_prompt = PromptTemplate.from_template("User input: {question}\nCypher query: {query}")

# Rebinds `prompt` to a variant that includes only the first three entries of
# whatever list `examples` currently refers to.
prompt = FewShotPromptTemplate(
    input_variables=["question", "schema"],
    prefix=(
        "You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.\n\n"
        "Here is the schema information\n{schema}.\n\n"
        "Below are a number of examples of questions and their corresponding Cypher queries."
    ),
    examples=examples[:3],
    example_prompt=example_prompt,
    suffix="User input: {question}\nCypher query: ",
)

In [None]:
# Few-shot (question, Cypher query) pairs over Person / Company / Industry /
# Language / Country labels. Braces in the Cypher are doubled so they are
# emitted as single literal braces when the prompt is formatted.
# NOTE(review): this assignment replaces whatever `examples` was bound to
# before, and none of these labels appear in the node list printed by the
# schema cell — this looks like leftover tutorial data; confirm it is intended.
examples = [
    {"question": question, "query": query}
    for question, query in (
        (
            "Which workers speak French?",
            "MATCH (p:Person)-[:SPEAKS]->(l:Language {{name: 'French'}}) RETURN p.name",
        ),
        (
            "What industries are workers named Emily associated with?",
            "MATCH (p:Person {{name: 'Emily'}})-[:WORKS_IN]->(c:Company)-[:IS_IN]->(i:Industry) RETURN i.name",
        ),
        (
            "Which workers live in Canada and speak German?",
            "MATCH (p:Person)-[:LIVES_IN]->(:Country {{name: 'Canada'}}), (p)-[:SPEAKS]->(:Language {{name: 'German'}}) RETURN p.name",
        ),
        (
            "In which countries do workers who speak Spanish live?",
            "MATCH (p:Person)-[:SPEAKS]->(:Language {{name: 'Spanish'}})<-[:SPEAKS]-(worker:Person)-[:LIVES_IN]->(c:Country) RETURN DISTINCT c.name AS Country",
        ),
        (
            "What companies do workers named John work in?",
            "MATCH (p:Person {{name: 'John'}})-[:WORKS_IN]->(c:Company) RETURN c.name",
        ),
        (
            "How many workers in Hospital and Health Care industry able to speak Korea",
            "MATCH (p:Person)-[:WORKS_IN]->(:Company)-[:IS_IN]->(:Industry {{name: 'Hospitals and Health Care'}}),(p)-[:SPEAKS]->(:Language {{name: 'Korean'}}) RETURN COUNT(DISTINCT p) AS NumberOfWorkers",
        ),
        (
            "What companies are located in the technology industry?",
            "MATCH (c:Company)-[:IS_IN]->(:Industry {{name: 'Technology'}}) RETURN c.name",
        ),
        (
            "Where do workers named Alice live?",
            "MATCH (p:Person {{name: 'Alice'}})-[:LIVES_IN]->(c:Country) RETURN c.name",
        ),
    )
]


In [15]:
from langchain_community.vectorstores import Neo4jVector
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_community.embeddings import HuggingFaceEmbeddings

# Selector that picks the k=3 examples most similar to the incoming question.
# Only the "question" field is used as the similarity input, embeddings come
# from HuggingFaceEmbeddings, and Neo4jVector is the vector store class.
# This cell expects neo4j_url, neo4j_user and neo4j_password to already be
# bound in the kernel.
example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples,
    HuggingFaceEmbeddings(),
    Neo4jVector,
    k=3,
    input_keys=["question"],
    url=neo4j_url,
    username=neo4j_user,
    password=neo4j_password,
)

In [None]:
# Show which examples the selector returns for a sample question.
example_selector.select_examples({"question": "Where do Michael work?"})

In [21]:
# Few-shot prompt whose examples are chosen per question by `example_selector`
# (the static variant passed a fixed slice, examples[:3], instead).
dynamic_prompt = FewShotPromptTemplate(
    input_variables=["question", "schema"],
    prefix=(
        "You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.\n\n"
        "Here is the schema information\n{schema}.\n\n"
        "Below are a number of examples of questions and their corresponding Cypher queries."
    ),
    example_selector=example_selector,
    example_prompt=example_prompt,
    suffix="User input: {question}\nCypher query: ",
)

In [27]:
# Render the dynamic prompt with a placeholder schema ("foo") to see which examples were selected for this question.
print(dynamic_prompt.format(question="Where do Michael work?", schema="foo"))

You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.

Here is the schema information
foo.

Below are a number of examples of questions and their corresponding Cypher queries.

User input: What companies do workers named John work in?
Cypher query: MATCH (p:Person {name: 'John'})-[:WORKS_IN]->(c:Company) RETURN c.name

User input: Where do workers named Alice live?
Cypher query: MATCH (p:Person {name: 'Alice'})-[:LIVES_IN]->(c:Country) RETURN c.name

User input: What industries are workers named Emily associated with?
Cypher query: MATCH (p:Person {name: 'Emily'})-[:WORKS_IN]->(c:Company)-[:IS_IN]->(i:Industry) RETURN i.name

User input: Where do Michael work?
Cypher query: 
