### Setup Notebook

In [2]:
# # Cell 1: Uninstall all related packages
# %pip uninstall langchain langchain-core langchain-community langchain-openai langchain-neo4j -y


In [3]:
# # # Cell 2: Restart the Kernel
# # # # This step is critical to clear the old packages from memory
# import IPython
# IPython.Application.instance().kernel.do_shutdown(True)

In [4]:
# Cell 3: Reinstall modern, compatible packages
%pip install langchain-core langchain-community langchain-openai openai neo4j langgraph

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
# import json
# from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()

import os

from operator import itemgetter 

from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, START, END
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser




from typing_extensions import List, TypedDict
from typing import Dict, Any


import psycopg2
from neo4j import GraphDatabase
from decimal import Decimal
from shapely import wkb
from pyproj import CRS, Transformer


  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [6]:

load_dotenv()

keys = ["NEO4J_USER", "NEO4J_PASSWORD","NEO4J_USER_LOCAL", "NEO4J_PASSWORD_LOCAL"]
for key in keys:
    if key not in os.environ:
        raise Exception(f"Key '{key}' not found not in .env")
    
print("Credentials from .env file successfully loaded")

NEO4J_LOCAL_DATABASE = "busoppdse203"

Credentials from .env file successfully loaded


In [None]:
# graph = Neo4jGraph(url="bolt://localhost:7687",    username='neo4j',password='admin123',database=NEO4J_LOCAL_DATABASE )
graph = Neo4jGraph(url="bolt://67.58.49.87:7687",    username='neo4j',password="h2u9l4px" )
graph.query("MATCH (n) RETURN n LIMIT 1;")




  graph = Neo4jGraph(url="bolt://67.58.49.87:7687",    username='neo4j',password="h2u9l4px" )


[{'n': {'code': 'CA', 'name': 'California', 'id': 1}}]

In [25]:

get_schema_runnable = graph.get_schema

In [None]:
STATIC_NEO4J_SCHEMA = graph.get_schema 


### Setup LLM Chat

In [41]:
# Initialize the LLM
model_ver = "gpt-4o"

cypher_model = ChatOpenAI(model="gpt-4o-mini", temperature=0) 

main_qa_model = ChatOpenAI(model="gpt-4o", temperature=0)

#### Generic GraphRAG

In [9]:
# Define state for application
class State(TypedDict):
    question: str
    context: List[dict]
    answer: str


# Retrieve context 
# def retrieve(state: State):
#     context = graph.query("CALL db.schema.visualization()")
#     return {"context": context}



def retrieve(state: State):
    schema_map = graph.query("CALL apoc.meta.schema()")
    
    context = schema_map 
    return {"context": context}

# Create a prompt
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}

Answer:"""




prompt = ChatPromptTemplate.from_template(template)

# Generate the answer based on the question and context
def generate(state: State):
    messages = prompt.invoke({"question": state["question"], "context": state["context"]})
    response = model.invoke(messages)
    return {"answer": response.content}




# Define application steps
workflow = StateGraph(State).add_sequence([retrieve, generate])
workflow.add_edge(START, "retrieve")
app = workflow.compile()




In [10]:
# entity_type = 'BlockGroup'
# question = "How is the graph structured?"



question = "What questions can you answer"
# question = f"What attributes are available on a {entity_type} node"
# question = "Are any cities contained in cities"

# question = 'What is the context'

response = app.invoke({"question": question})
print("Answer:", response["answer"])

Answer: I can answer questions related to the structure and relationships of the data provided in the context. Here are some examples of questions I can answer:

1. What types of nodes are present in the data?
2. How many nodes of each type are there?
3. What properties are associated with each type of node?
4. What types of relationships exist between nodes?
5. How many relationships of each type are there?
6. What are the directions and labels of the relationships?
7. Are there any unique or indexed properties for the nodes?
8. What are the specific properties of a given node type, such as "BlockGroup" or "BusinessLocation"?

If you have a specific question related to these aspects, feel free to ask!


####  Text 2 Cypher

In [11]:
# Test 1: Check if 'query' is callable
print(f"Type of graph.query: {type(graph.query)}")

# Test 2: Run a simple query
try:
    test_result = graph.query("MATCH (n) RETURN count(n) AS count")
    print("Graph query test passed.")
    print(test_result)
except AttributeError as e:
    print(f"Error during query test: {e}")

Type of graph.query: <class 'method'>
Graph query test passed.
[{'count': 77993}]


In [42]:


# from the online course
# cypher_qa = GraphCypherQAChain.from_llm(
#     graph=graph, 
#     llm=model, 
#     cypher_llm=cypher_model,
#     allow_dangerous_requests=True,
#     verbose=True,
# )


# # Create the Cypher QA chain
# cypher_qa = GraphCypherQAChain.from_llm(
#     graph=graph, 
#     llm=model, 
#     allow_dangerous_requests=True,
#     verbose=True, 
# )



In [62]:
neo4j_schema = graph.get_schema
print("Schema successfully retrieved.")
print(neo4j_schema)


Schema successfully retrieved.
Node properties:
State {name: STRING, code: STRING, id: INTEGER}
County {name: STRING, id: INTEGER}
City {name: STRING, id: INTEGER}
Community {name: STRING, id: INTEGER}
Zipcode {id: INTEGER, zipcode: INTEGER}
Business {categories: LIST, num_locations: INTEGER, name: STRING, id: INTEGER}
BusinessLocation {location: POINT, longitude: FLOAT, blockgroup: FLOAT, avg_rating: FLOAT, geom: STRING, confidence: FLOAT, reasoning: STRING, url: STRING, city: STRING, latitude: FLOAT, zip: INTEGER, franchise: STRING, address: STRING, name: STRING, id: INTEGER, categories: LIST}
BlockGroup {ogc_fid: INTEGER, fem35: INTEGER, di100_cy: INTEGER, medhinc_cy: INTEGER, gini_fy: FLOAT, population: FLOAT, statefp: INTEGER, ctblockgroup: INTEGER, fem30: INTEGER, crmcytotc: INTEGER, apportionm: FLOAT, s16_sales: FLOAT, avghinc_cy: INTEGER, male35: INTEGER, fem25: INTEGER, source_cou: STRING, aggregatio: STRING, male30: INTEGER, male25: INTEGER, totpop_cy: INTEGER, n35_bus: FLOAT

In [65]:

# --- 2. Prompt Templates ---



CYPHER_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", 
         "You are an expert Neo4j developer. Use the detailed schema: {schema}. "
         "**CRITICAL RULE:** The `location` attribute is already a Neo4j POINT object. **NEVER** use the `point()` function on `n.location`. "
         "**GEOSPATIAL RULE:** To find neighbors, calculate distance using `point.distance(p1.location, p2.location)` in meters (0.1 degrees is 11132m). "
         "**CYPHER SCOPE RULE:** Always use `WITH` to carry forward necessary node variables (like the original location node, 'bl') into subsequent clauses. "
         "Generate a single, correct, executable Cypher query. **ONLY output the raw Cypher statement.**"),
        ("human", "Question: {question}"),
        ("system", "BEGIN CYPHER QUERY. DO NOT ADD ANY OTHER TEXT."),
    ]
)

ANSWER_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", 
         "You are a helpful assistant. Use the Query Result: {query_result} and Original Question: {question} "
         "to provide a concise answer. If the result is empty or unhelpful, state that the information is unavailable."),
        ("human", "Original Question: {question}"),
    ]
)

# --- 3. Helper Functions for LCEL ---

def prepare_input_with_schema(input_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Injects the static schema string into the incoming question dictionary."""
    return {
        "question": input_dict["question"],
        "schema": STATIC_NEO4J_SCHEMA # Injects the pre-fetched static string
    }

def execute_cypher_query(input_dict: Dict[str, Any]) -> str:
    """Executes the generated Cypher query using the Neo4jGraph object."""
    return graph.query(input_dict['cypher'])

# --- 4. The Final LCEL Chain ---

cypher_qa_chain = (
    # 1. Start by injecting the static schema string
    RunnableLambda(prepare_input_with_schema)
    | {
        # 2. Parallel Step: Generate Cypher, and PASS THROUGH question/schema
        "cypher": CYPHER_PROMPT | cypher_model | StrOutputParser(),
        "question": itemgetter("question"),
        "schema": itemgetter("schema"), 
    }
    # 3. Execution Step: Pass generated Cypher through and execute query
    | RunnablePassthrough.assign(
        query_result=RunnableLambda(execute_cypher_query),
        question=itemgetter("question"),
        generated_cypher=itemgetter("cypher"), 
    )
    # 4. Final Output Formatting: Define the exact keys to return
    | {
        "final_answer": ANSWER_PROMPT | main_qa_model | StrOutputParser(),
        "generated_cypher": itemgetter("generated_cypher"), 
        
        # *** ADDED: Return the raw Cypher query output ***
        "query_output": itemgetter("query_result") 
    }
)

# Example Invocation and Output Access:
# question = "Find a starbucks with a high rating..."
# response = cypher_qa_chain.invoke({"question": question})

# print(f"1. Final Answer: {response['final_answer']}")
# print(f"2. Cypher Query: {response['generated_cypher']}")
# print(f"3. Query Output: {response['query_output']}")


In [66]:

question = "Find a starbucks with a high rating, look for its location, create a boundaing box of +/- 0.1 degrees and return 5 business locations in there"
response = cypher_qa_chain.invoke({"question": question})

print(f"1. Final Answer: {response['final_answer']}")
print(f"2. Cypher Query: {response['generated_cypher']}")
print(f"3. Query Output: {response['query_output']}")


1. Final Answer: The information to find a specific Starbucks with a high rating and its exact location is unavailable. Therefore, I cannot create a bounding box or provide business locations within that area.
2. Cypher Query: MATCH (bl:BusinessLocation)-[:belongs_to]->(b:Business {name: 'Starbucks'})
WHERE bl.avg_rating > 4.5
WITH bl
MATCH (bl2:BusinessLocation)
WHERE point.distance(bl.location, bl2.location) <= 11132 * 0.1
RETURN bl2
LIMIT 5
3. Query Output: []


#### Ask Questions

In [50]:

# Invoke the chain
question = "Find a starbucks with a high rating, look for its location, create a boundaing box of +/- 0.1 degrees and return 5 business locations in there"
response = cypher_qa.invoke({"question": question}) 

print(response["result"])

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'To': expected 'FOREACH', 'ALTER', 'ORDER BY', 'CALL', 'USING PERIODIC COMMIT', 'CREATE', 'LOAD CSV', 'START DATABASE', 'STOP DATABASE', 'DEALLOCATE', 'DELETE', 'DENY', 'DETACH', 'DROP', 'DRYRUN', 'FINISH', 'GRANT', 'INSERT', 'LIMIT', 'MATCH', 'MERGE', 'NODETACH', 'OFFSET', 'OPTIONAL', 'REALLOCATE', 'REMOVE', 'RENAME', 'RETURN', 'REVOKE', 'ENABLE SERVER', 'SET', 'SHOW', 'SKIP', 'TERMINATE', 'UNWIND', 'USE' or 'WITH' (line 1, column 1 (offset: 0))
"To find a Starbucks with a high rating, create a bounding box around its location, and return 5 business locations within that bounding box, you can use the following Cypher query:"
 ^}

In [39]:

# Invoke the chain
question = "What kind of business locations have high ratings in heavily tot populated blockgroups where starbucks" \
" have high ratings? " \
"Find 5 of those businesses that do well but in a blockgroup with few starbucks"
response = cypher_qa.invoke({"question": question})
print(response["result"])

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'To': expected 'FOREACH', 'ALTER', 'ORDER BY', 'CALL', 'USING PERIODIC COMMIT', 'CREATE', 'LOAD CSV', 'START DATABASE', 'STOP DATABASE', 'DEALLOCATE', 'DELETE', 'DENY', 'DETACH', 'DROP', 'DRYRUN', 'FINISH', 'GRANT', 'INSERT', 'LIMIT', 'MATCH', 'MERGE', 'NODETACH', 'OFFSET', 'OPTIONAL', 'REALLOCATE', 'REMOVE', 'RENAME', 'RETURN', 'REVOKE', 'ENABLE SERVER', 'SET', 'SHOW', 'SKIP', 'TERMINATE', 'UNWIND', 'USE' or 'WITH' (line 1, column 1 (offset: 0))
"To answer this question, we need to find business locations with high ratings in blockgroups that have a high total population and where Starbucks also has high ratings. Additionally, we need to ensure that these blockgroups have few Starbucks locations. Here's a Cypher query to achieve this:"
 ^}

In [37]:
print(graph.schema)

Node properties:
State {name: STRING, code: STRING, id: INTEGER}
County {name: STRING, id: INTEGER}
City {name: STRING, id: INTEGER}
Community {name: STRING, id: INTEGER}
Zipcode {id: INTEGER, zipcode: INTEGER}
Business {categories: LIST, num_locations: INTEGER, name: STRING, id: INTEGER}
BusinessLocation {location: POINT, longitude: FLOAT, blockgroup: FLOAT, avg_rating: FLOAT, geom: STRING, confidence: FLOAT, reasoning: STRING, url: STRING, city: STRING, latitude: FLOAT, zip: INTEGER, franchise: STRING, address: STRING, name: STRING, id: INTEGER, categories: LIST}
BlockGroup {ogc_fid: INTEGER, fem35: INTEGER, di100_cy: INTEGER, medhinc_cy: INTEGER, gini_fy: FLOAT, population: FLOAT, statefp: INTEGER, ctblockgroup: INTEGER, fem30: INTEGER, crmcytotc: INTEGER, apportionm: FLOAT, s16_sales: FLOAT, avghinc_cy: INTEGER, male35: INTEGER, fem25: INTEGER, source_cou: STRING, aggregatio: STRING, male30: INTEGER, male25: INTEGER, totpop_cy: INTEGER, n35_bus: FLOAT, x1024_x: FLOAT, s22_bus: FLOA