# Chatting with the Knowledge Graph



## Setup

In [23]:
import os

from dotenv import load_dotenv

from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from neo4j import GraphDatabase

from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import GraphCypherQAChain


# Load from environment
load_dotenv('../.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
GOOGLE_MAPS_API_KEY = os.getenv('GOOGLE_MAPS_API_KEY')

# Global constants
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'


IMPORT_DATA_DIRECTORY = '../data/sample/'

if OPENAI_API_KEY is None:
  raise ValueError("OPENAI_API_KEY is not set. Please add it to your .env file.")

kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)
gdb = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD) )


# Cypher - queries about addresses


In [3]:
# Tell me about a manager named royal bank
gdb.execute_query("""
  CALL db.index.fulltext.queryNodes(
         "fullTextManagerNames", 
         "royal bank") YIELD node, score
  RETURN node.name, score LIMIT 1
""").records

[<Record node.name='Royal Bank of Canada' score=4.431276321411133>]

In [4]:
# What is the location of royal bank?
gdb.execute_query("""
CALL db.index.fulltext.queryNodes(
         "fullTextManagerNames", 
         "royal bank"
  ) YIELD node, score
WITH node as mgr LIMIT 1
MATCH (mgr:Manager)-[:LOCATED_AT]->(addr:Address)
RETURN mgr.name, addr
""").records

[<Record mgr.name='Royal Bank of Canada' addr=<Node element_id='4:7018e0eb-4cdc-47a8-a756-5c2a5bc2343f:3303' labels=frozenset({'Address'}) properties={'country': 'Canada', 'city': 'Toronto', 'location': POINT(-79.3805647 43.6508267), 'state': 'Ontario'}>>]

In [5]:
# Which state has the most investment firms?
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

[{'state': 'New York', 'numManagers': 303},
 {'state': 'California', 'numManagers': 302},
 {'state': 'Massachusetts', 'numManagers': 147},
 {'state': 'Pennsylvania', 'numManagers': 138},
 {'state': 'Texas', 'numManagers': 125},
 {'state': 'Illinois', 'numManagers': 121},
 {'state': 'Florida', 'numManagers': 116},
 {'state': 'Connecticut', 'numManagers': 77},
 {'state': 'Ohio', 'numManagers': 76},
 {'state': 'New Jersey', 'numManagers': 69}]

In [7]:
# Which state has the most public companies listed?
kg.query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numCompanies
    ORDER BY numCompanies DESC
""")

[{'state': 'California', 'numCompanies': 7},
 {'state': 'Delaware', 'numCompanies': 1},
 {'state': 'New York', 'numCompanies': 1},
 {'state': 'Oregon', 'numCompanies': 1}]

In [8]:
# What are the cities in California with the most investment firms?
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

[{'city': 'San Francisco', 'numManagers': 48},
 {'city': 'Los Angeles', 'numManagers': 44},
 {'city': 'San Diego', 'numManagers': 17},
 {'city': 'Pasadena', 'numManagers': 13},
 {'city': 'Menlo Park', 'numManagers': 9},
 {'city': 'Newport Beach', 'numManagers': 9},
 {'city': 'Irvine', 'numManagers': 9},
 {'city': 'Walnut Creek', 'numManagers': 8},
 {'city': 'Palo Alto', 'numManagers': 6},
 {'city': 'Lafayette', 'numManagers': 6}]

In [9]:
# Which city in California has the most companies listed?
kg.query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numCompanies
    ORDER BY numCompanies DESC
""")

[{'city': 'Santa Clara', 'numCompanies': 3},
 {'city': 'San Jose', 'numCompanies': 2},
 {'city': 'Sunnyvale', 'numCompanies': 1},
 {'city': 'Cupertino', 'numCompanies': 1}]

In [10]:
# What are top investment firms in San Francisco?
kg.query("""
  MATCH p=(mgr:Manager)-[:LOCATED_AT]->(address:Address),
         (mgr)-[owns:OWNS_STOCK_IN]->(:Company)
         WHERE address.city = "San Francisco"
  RETURN mgr.managerName as city, sum(owns.value) as totalInvestmentValue
    ORDER BY totalInvestmentValue DESC
    LIMIT 10
""")



[{'city': None, 'totalInvestmentValue': 9393919041000.0}]

In [12]:
# What companies are in Santa Clara?
kg.query("""
  MATCH (com:Company)-[:LOCATED_AT]->(address:Address)
         WHERE address.city = "Santa Clara"
  RETURN com.name
""")

[{'com.name': 'PALO ALTO NETWORKS INC'},
 {'com.name': 'SEAGATE TECHNOLOGY'},
 {'com.name': 'ATLASSIAN CORP PLC'}]

In [16]:
# What companies are near Santa Clara?
kg.query("""
  MATCH (address:Address)
    WHERE address.city = "Santa Clara"
  MATCH (com:Company)-[:LOCATED_AT]->(companyAddress:Address)
    WHERE point.distance(address.location, companyAddress.location) < 10000
  RETURN com.name, com.address
""")

[{'com.name': 'PALO ALTO NETWORKS INC',
  'com.address': '3000 Tannery Way, Santa Clara, CA 95054, USA'},
 {'com.name': 'NETAPP INC',
  'com.address': 'Headquarters Dr, San Jose, CA 95134, USA'},
 {'com.name': 'WESTERN DIGITAL CORP.', 'com.address': 'San Jose, CA, USA'},
 {'com.name': 'SEAGATE TECHNOLOGY',
  'com.address': '2445 Augustine Dr, Santa Clara, CA 95054, USA'},
 {'com.name': 'ATLASSIAN CORP PLC', 'com.address': 'Santa Clara, CA, USA'}]

In [18]:
# Which city in California has the most management firms listed?
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numManagers
    ORDER BY numManagers DESC LIMIT 10
""")

[{'city': 'San Francisco', 'numManagers': 48},
 {'city': 'Los Angeles', 'numManagers': 44},
 {'city': 'San Diego', 'numManagers': 17},
 {'city': 'Pasadena', 'numManagers': 13},
 {'city': 'Menlo Park', 'numManagers': 9},
 {'city': 'Newport Beach', 'numManagers': 9},
 {'city': 'Irvine', 'numManagers': 9},
 {'city': 'Walnut Creek', 'numManagers': 8},
 {'city': 'Palo Alto', 'numManagers': 6},
 {'city': 'Lafayette', 'numManagers': 6}]

In [15]:
# What investment firms are near Santa Clara?
kg.query("""
  MATCH (address:Address)
    WHERE address.city = "Santa Clara"
  MATCH (mgr:Manager)-[:LOCATED_AT]->(managerAddress:Address)
    WHERE point.distance(address.location, managerAddress.location) < 10000
  RETURN mgr.name, mgr.address
""")

[]

In [21]:
# Which investment firms are near Palo Aalto Networks?
kg.query("""
  CALL db.index.fulltext.queryNodes(
         "fullTextCompanyNames", 
         "Palo Aalto Networks"
         ) YIELD node, score
  WITH node as com
  MATCH (com)-[:LOCATED_AT]->(comAddress:Address),
    (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
    WHERE point.distance(comAddress.location, mgrAddress.location) < 20000
  RETURN mgr.name, mgr.address,
    toInteger(point.distance(comAddress.location, mgrAddress.location) / 2000) as distanceKm
    ORDER BY distanceKm ASC
    LIMIT 10
""")

[{'mgr.name': 'SCHARF INVESTMENTS, LLC',
  'mgr.address': '16450 LOS GATOS BLVD, SUITE 207, LOS GATOS, CA, 95032',
  'distanceKm': 6},
 {'mgr.name': 'Comprehensive Financial Management LLC',
  'mgr.address': '720 University Avenue, Suite 200, Los Gatos, CA, 95032',
  'distanceKm': 6},
 {'mgr.name': 'Legacy Capital Group California, Inc.',
  'mgr.address': '459 MONTEREY AVENUE, SUITE 100, LOS GATOS, CA, 95030',
  'distanceKm': 6},
 {'mgr.name': 'AIMZ Investment Advisors, LLC',
  'mgr.address': '4984 EL CAMINO REAL, SUITE 101, LOS ALTOS, CA, 94022',
  'distanceKm': 7},
 {'mgr.name': 'Family CFO Inc',
  'mgr.address': '1064 LAURELES DRIVE, LOS ALTOS, CA, 94022',
  'distanceKm': 7}]

In [22]:
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:

# What are the top investment firms are in San Francisco?
MATCH (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
    WHERE mgrAddress.city = 'San Francisco'
RETURN mgr.managerName

# What companies are in Santa Clara?
MATCH (com:Company)-[:LOCATED_AT]->(comAddress:Address)
    WHERE comAddress.city = 'Santa Clara'
RETURN com.companyName

# What investment firms are near Santa Clara?
  MATCH (address:Address)
    WHERE address.city = "Santa Clara"
  MATCH (mgr:Manager)-[:LOCATED_AT]->(managerAddress:Address)
    WHERE point.distance(address.location, managerAddress.location) < 20 * 1000
  RETURN mgr.managerName, mgr.managerAddress

# Which investment firms are near Palo Aalto Networks?
  CALL db.index.fulltext.queryNodes(
         "fullTextCompanyNames", 
         "Palo Aalto Networks"
         ) YIELD node, score
  WITH node as com
  MATCH (com)-[:LOCATED_AT]->(comAddress:Address),
    (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
    WHERE point.distance(comAddress.location, mgrAddress.location) < 20 * 1000
  RETURN mgr, 
    toInteger(point.distance(comAddress.location, mgrAddress.location) / 1000) as distanceKm
    ORDER BY distanceKm ASC
    LIMIT 10
  
The question is:
{question}"""

In [25]:
import textwrap

CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

cypherChain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0),
    graph=kg,
    verbose=True,
    allow_dangerous_requests=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)

def prettyCypherChain(question: str) -> str:
    response = cypherChain.run(question)
    print(textwrap.fill(response, 60))


In [28]:
prettyCypherChain("What investment firms are in San Francisco?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
    WHERE mgrAddress.city = 'San Francisco'
RETURN mgr.name[0m
Full Context:
[32;1m[1;3m[{'mgr.name': 'OSBORNE PARTNERS CAPITAL MANAGEMENT, LLC'}, {'mgr.name': 'OSTERWEIS CAPITAL MANAGEMENT INC'}, {'mgr.name': 'JACOBS & CO/CA'}, {'mgr.name': 'VAN STRUM & TOWNE INC.'}, {'mgr.name': 'RBF Capital, LLC'}, {'mgr.name': 'ALGERT GLOBAL LLC'}, {'mgr.name': 'WETHERBY ASSET MANAGEMENT INC'}, {'mgr.name': 'Avalon Global Asset Management LLC'}, {'mgr.name': 'Pacific Heights Asset Management LLC'}, {'mgr.name': 'Violich Capital Management, Inc.'}][0m

[1m> Finished chain.[0m
I don't know the answer.


In [27]:
prettyCypherChain("What companies are in Santa Clara?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (com:Company)-[:LOCATED_AT]->(comAddress:Address)
WHERE comAddress.city = 'Santa Clara'
RETURN com.name[0m
Full Context:
[32;1m[1;3m[{'com.name': 'PALO ALTO NETWORKS INC'}, {'com.name': 'SEAGATE TECHNOLOGY'}, {'com.name': 'ATLASSIAN CORP PLC'}][0m

[1m> Finished chain.[0m
PALO ALTO NETWORKS INC, SEAGATE TECHNOLOGY, ATLASSIAN CORP
PLC are in Santa Clara.


In [29]:
prettyCypherChain("What investment firms are near Santa Clara?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (address:Address)
    WHERE address.city = "Santa Clara"
MATCH (mgr:Manager)-[:LOCATED_AT]->(managerAddress:Address)
    WHERE point.distance(address.location, managerAddress.location) < 20 * 1000
RETURN mgr.name, mgr.address[0m
Full Context:
[32;1m[1;3m[{'mgr.name': 'SCHARF INVESTMENTS, LLC', 'mgr.address': '16450 LOS GATOS BLVD, SUITE 207, LOS GATOS, CA, 95032'}, {'mgr.name': 'Comprehensive Financial Management LLC', 'mgr.address': '720 University Avenue, Suite 200, Los Gatos, CA, 95032'}, {'mgr.name': 'Legacy Capital Group California, Inc.', 'mgr.address': '459 MONTEREY AVENUE, SUITE 100, LOS GATOS, CA, 95030'}, {'mgr.name': 'AIMZ Investment Advisors, LLC', 'mgr.address': '4984 EL CAMINO REAL, SUITE 101, LOS ALTOS, CA, 94022'}, {'mgr.name': 'Family CFO Inc', 'mgr.address': '1064 LAURELES DRIVE, LOS ALTOS, CA, 94022'}][0m

[1m> Finished chain.[0m
SCHARF INVESTMENTS, LLC, Comprehensive Fina

In [30]:
prettyCypherChain("Which investment firms are near Palo Aalto Networks?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mCALL db.index.fulltext.queryNodes(
     "fullTextCompanyNames", 
     "Palo Aalto Networks"
     ) YIELD node, score
WITH node as com
MATCH (com)-[:LOCATED_AT]->(comAddress:Address),
  (mgr:Manager)-[:LOCATED_AT]->(mgrAddress:Address)
  WHERE point.distance(comAddress.location, mgrAddress.location) < 20 * 1000
RETURN mgr, 
  toInteger(point.distance(comAddress.location, mgrAddress.location) / 1000) as distanceKm
  ORDER BY distanceKm ASC
  LIMIT 10[0m
Full Context:
[32;1m[1;3m[{'mgr': {'address': '16450 LOS GATOS BLVD, SUITE 207, LOS GATOS, CA, 95032', 'cik': '1463746', 'name': 'SCHARF INVESTMENTS, LLC', 'location': POINT(-121.9652627 37.2298178)}, 'distanceKm': 13}, {'mgr': {'address': '720 University Avenue, Suite 200, Los Gatos, CA, 95032', 'cik': '1799802', 'name': 'Comprehensive Financial Management LLC', 'location': POINT(-121.9302449 37.2260616)}, 'distanceKm': 13}, {'mgr': {'address': '459 MO